In [None]:
import rdflib
import pandas as pd

# Step 1: Load the knowledge graph
g = rdflib.Graph()
g.parse('/content/SampleKG3.ttl', format='turtle')

# Step 2: Extract applicant IDs
# A subject is treated as an applicant when it is a URI containing
# 'Applicant'; its ID is the text after the last underscore of that URI.
# A set is used because the same subject appears in many triples.
applicant_ids = {
    str(subject).split('_')[-1]
    for subject, _, _ in g
    if isinstance(subject, rdflib.URIRef) and 'Applicant' in subject
}

# Sort so the printed sample and the saved file are identical on every run
# (iteration order of a set of strings differs between interpreter sessions).
# The IDs are already plain strings, so no further conversion is needed.
applicant_list = sorted(applicant_ids)

# Print the extracted applicant IDs for verification
print("Extracted Applicant IDs (sample):", applicant_list[:10])
print("Total number of extracted Applicant IDs:", len(applicant_list))

# Save the list of applicants to a CSV file for verification
applicant_df = pd.DataFrame(applicant_list, columns=['Applicant_ID'])
applicant_df.to_csv('/content/applicants.csv', index=False)

# Load the applicant list back.
# The dtype is pinned to str: without it read_csv infers an integer column
# whenever every ID is numeric, and integer IDs never match the string
# SK_ID_CURR values used for filtering later in this cell.
applicant_df = pd.read_csv('/content/applicants.csv', dtype={'Applicant_ID': str})
applicant_list = applicant_df['Applicant_ID'].tolist()

# Print the loaded applicant list for verification
print("Loaded Applicant List (sample):", applicant_list[:10])
print("Total number of loaded Applicant IDs:", len(applicant_list))

# Step 4: Read the full application table from Google Drive
file_path = '/content/drive/MyDrive/home-credit-default-risk/application_train.csv'
df = pd.read_csv(file_path)

# The applicant IDs are compared as text, so cast the key column to str first
sk_id_as_text = df['SK_ID_CURR'].astype(str)
df['SK_ID_CURR'] = sk_id_as_text

# Show the head of the table and its row count as a sanity check
print("Original DataFrame (sample):", df.head())
print("Total number of records in the original DataFrame:", df.shape[0])

# Show the first ten keys so their format can be compared with the applicant IDs
print("SK_ID_CURR (sample):", df['SK_ID_CURR'].iloc[:10].tolist())

# Step 5: Keep only the rows whose key appears in the applicant list
is_known_applicant = df['SK_ID_CURR'].isin(applicant_list)
df_filtered = df.loc[is_known_applicant]

# Show the head of the filtered table and how many rows survived
print("Filtered DataFrame (sample):", df_filtered.head())
print("Number of filtered records:", df_filtered.shape[0])

# Only write and verify the output file when the filter matched something
if df_filtered.empty:
    print("No records found after filtering.")
else:
    # Step 6: Save the filtered records to a new CSV file using pandas to_csv directly
    filtered_file_path = '/content/filtered_records.csv'
    df_filtered.to_csv(filtered_file_path, index=False)
    print(f"Filtered records saved to {filtered_file_path}")

    # Step 7: Verify the saved CSV file by reloading it and checking its size
    df_filtered_saved = pd.read_csv(filtered_file_path)
    print("Number of records in saved CSV:", len(df_filtered_saved))
    print("Saved CSV (sample):", df_filtered_saved.head())

    # Verify that the original DataFrame is not being mistakenly saved.
    # This check lives inside the else branch because df_filtered_saved only
    # exists when a file was written; running it unconditionally raised a
    # NameError whenever the filter produced no rows.
    if len(df_filtered_saved) == len(df):
        print("Error: The original DataFrame is being saved instead of the filtered DataFrame.")
    else:
        print("Success: The filtered DataFrame is correctly saved.")


Extracted Applicant IDs (sample): ['136017', '134255', '394060', '134442', '223725', '232219', '228971', '144058', '135492', '398387']
Total number of extracted Applicant IDs: 7989
Loaded Applicant List (sample): ['136017', '134255', '394060', '134442', '223725', '232219', '228971', '144058', '135492', '398387']
Total number of loaded Applicant IDs: 7989
Original DataFrame (sample):   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0     100002       1         Cash loans           M            N   
1     100003       0         Cash loans           F            N   
2     100004       0    Revolving loans           M            Y   
3     100006       0         Cash loans           F            N   
4     100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
file_path = '/content/filtered_records.csv'
df = pd.read_csv(file_path)

# Select relevant columns (TARGET is the label; the rest are model inputs)
features = [
    'TARGET', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_EMPLOYED',
    'NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE'
]

# Subset the dataframe.
# .copy() makes df_subset an independent frame, so the column assignments
# made during preprocessing no longer trigger SettingWithCopyWarning and
# can never write through to df.
df_subset = df[features].copy()

# Separate numerical and categorical features
numerical_features = [
    'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_EMPLOYED'
]
categorical_features = [
    'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_EDUCATION_TYPE',
    'NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE'
]

# NOTE(review): the imputers and the scaler below are fitted on every row,
# before the cross-validation later in this cell splits the data into folds,
# so held-out folds contribute to the fitted statistics. Consider moving these
# steps into a Pipeline if unbiased fold scores matter.

# Fill missing numeric values with the per-column median
num_imputer = SimpleImputer(strategy='median')
num_imputer.fit(df_subset[numerical_features])
df_subset[numerical_features] = num_imputer.transform(df_subset[numerical_features])

# Fill missing categorical values with the most common value of each column
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(df_subset[categorical_features])
df_subset[categorical_features] = cat_imputer.transform(df_subset[categorical_features])

# Replace each categorical column with integer codes, one encoder per column
for feature in categorical_features:
    le = LabelEncoder()
    le.fit(df_subset[feature])
    df_subset[feature] = le.transform(df_subset[feature])

# Pull out the label (y) and keep every other column as model input (X)
y = df_subset['TARGET']
X = df_subset.drop(columns=['TARGET'])

# Standardize the numeric inputs; the encoded categorical columns are left as-is
scaler = StandardScaler()
scaler.fit(X[numerical_features])
X[numerical_features] = scaler.transform(X[numerical_features])

# Fixed seed so weight initialization and batch shuffling, and therefore the
# reported metrics, are the same on every run of this cell.
RANDOM_STATE = 42

# Initialize the MLP classifier
clf = MLPClassifier(max_iter=10000, random_state=RANDOM_STATE)

# Define a dictionary with the metrics.
# With average='weighted', recall is mathematically equal to accuracy, so
# those two printed values always coincide.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted')
}

# Perform 10-fold cross-validation with the defined metrics
results = cross_validate(clf, X, y, cv=10, scoring=scoring)

# Display the mean of each metric across the folds
print("Accuracy: ", results['test_accuracy'].mean())
print("Precision: ", results['test_precision'].mean())
print("Recall: ", results['test_recall'].mean())
print("F1 Score: ", results['test_f1_score'].mean())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset[numerical_features] = num_imputer.fit_transform(df_subset[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset[categorical_features] = cat_imputer.fit_transform(df_subset[categorical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset[feature] = le

Accuracy:  0.9217574599828733
Precision:  0.8520804120628329
Recall:  0.9217574599828733
F1 Score:  0.8855498983654359
