In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import os
from pathlib import Path 
from imblearn.over_sampling import SMOTE
%matplotlib inline
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200) # So we can see all columns
pd.set_option('display.max_rows',200)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [2]:
df = pd.read_csv('D:/Banque Misr Internship/Loan Datasets/sampled_data.csv')

In [3]:
df.head(50)

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,OCCUPATION_TYPE,ORGANIZATION_TYPE,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_YEAR
0,0,100002,1,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,Laborers,Business Entity Type 3,0.262949,0.139376,2.0,2.0,2.0,2.0,-1134.0,0.0,1.0
1,1,100003,0,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,Core staff,School,0.622246,,1.0,0.0,1.0,0.0,-828.0,0.0,0.0
2,2,100004,0,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,Laborers,Government,0.555912,0.729567,0.0,0.0,0.0,0.0,-815.0,0.0,0.0
3,3,100006,0,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,Laborers,Business Entity Type 3,0.650442,,2.0,0.0,2.0,0.0,-617.0,,
4,4,100007,0,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,Core staff,Religion,0.322738,,0.0,0.0,0.0,0.0,-1106.0,0.0,0.0
5,5,100008,0,M,N,Y,0,99000.0,490495.5,27517.5,454500.0,"Spouse, partner",State servant,Secondary / secondary special,Married,House / apartment,0.035792,-16941,-1588,Laborers,Other,0.354225,0.621226,0.0,0.0,0.0,0.0,-2536.0,0.0,1.0
6,6,100009,0,F,Y,Y,1,171000.0,1560726.0,41301.0,1395000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.035792,-13778,-3130,Accountants,Business Entity Type 3,0.724,0.49206,1.0,0.0,1.0,0.0,-1562.0,0.0,2.0
7,7,100010,0,M,Y,Y,0,360000.0,1530000.0,42075.0,1530000.0,Unaccompanied,State servant,Higher education,Married,House / apartment,0.003122,-18850,-449,Managers,Other,0.714279,0.540654,2.0,0.0,2.0,0.0,-1070.0,0.0,0.0
8,8,100011,0,F,N,Y,0,112500.0,1019610.0,33826.5,913500.0,Children,Pensioner,Secondary / secondary special,Married,House / apartment,0.018634,-20099,365243,,XNA,0.205747,0.751724,1.0,0.0,1.0,0.0,0.0,0.0,1.0
9,9,100012,0,M,N,Y,0,135000.0,405000.0,20250.0,405000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.019689,-14469,-2019,Laborers,Electricity,0.746644,,2.0,0.0,2.0,0.0,-1673.0,,


One-hot encode the categorical features

In [4]:
categorical_columns = [
    'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 
    'ORGANIZATION_TYPE'
]
# One-hot encode the categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Check the resulting DataFrame
df_encoded.head()
df_encoded.columns

Index(['Unnamed: 0', 'SK_ID_CURR', 'TARGET', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
       ...
       'ORGANIZATION_TYPE_Trade: type 4', 'ORGANIZATION_TYPE_Trade: type 5',
       'ORGANIZATION_TYPE_Trade: type 6', 'ORGANIZATION_TYPE_Trade: type 7',
       'ORGANIZATION_TYPE_Transport: type 1',
       'ORGANIZATION_TYPE_Transport: type 2',
       'ORGANIZATION_TYPE_Transport: type 3',
       'ORGANIZATION_TYPE_Transport: type 4', 'ORGANIZATION_TYPE_University',
       'ORGANIZATION_TYPE_XNA'],
      dtype='object', length=125)

In [5]:
# Identify the remaining columns with null values
null_columns = df_encoded.columns[df_encoded.isnull().any()]

# Display columns with nulls
print("Columns with nulls:", null_columns)

Columns with nulls: Index(['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
       'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object')


In [6]:
# Example Strategy: Fill numeric columns with the median
for col in null_columns:
    if df_encoded[col].dtype in ['float64', 'int64']:
        df_encoded[col].fillna(df_encoded[col].median(), inplace=True)
    else:
        # Fill categorical columns with the mode (most frequent value)
        df_encoded[col].fillna(df_encoded[col].mode()[0], inplace=True)

# Verify that there are no more nulls
print("Null values remaining:", df_encoded.isnull().sum().sum())

# The DataFrame is now ready for modeling
df_encoded.head()

Null values remaining: 0


Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_YEAR,CODE_GENDER_M,CODE_GENDER_XNA,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Maternity leave,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Unknown,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA
0,0,100002,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,0.262949,0.139376,2.0,2.0,2.0,2.0,-1134.0,0.0,1.0,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1,100003,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,0.622246,0.535276,1.0,0.0,1.0,0.0,-828.0,0.0,0.0,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2,100004,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,0.555912,0.729567,0.0,0.0,0.0,0.0,-815.0,0.0,0.0,True,False,True,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,3,100006,0,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,0.650442,0.535276,2.0,0.0,2.0,0.0,-617.0,0.0,1.0,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,4,100007,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,0.322738,0.535276,0.0,0.0,0.0,0.0,-1106.0,0.0,0.0,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [7]:
def get_numerical_summary(df):
    total = df.shape[0]
    missing_columns = [col for col in df.columns if df[col].isnull().sum() > 0]
    missing_percent = {}
    for col in missing_columns:
        null_count = df[col].isnull().sum()
        per = (null_count/total) * 100
        missing_percent[col] = per
        print("{} : {} ({}%)".format(col, null_count, round(per, 3)))
    return missing_percent

In [8]:
missing_percent = get_numerical_summary(df_encoded)
print(missing_percent)

{}


Convert Days birth and Days employed to positive values

In [9]:
# # Assuming df is your DataFrame

# # Convert DAYS_BIRTH from days to years (positive values)
# df['AGE_YEARS'] = (-df['DAYS_BIRTH']) // 365

# # Convert DAYS_EMPLOYED from days to years (positive values)
# df['YEARS_EMPLOYED'] = (-df['DAYS_EMPLOYED']) // 365

# # You could also keep the original days if you prefer
# df['DAYS_BIRTH_POSITIVE'] = -df['DAYS_BIRTH']
# df['DAYS_EMPLOYED_POSITIVE'] = -df['DAYS_EMPLOYED']

# # Drop the original negative columns if not needed
# df.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

# # Display the transformed DataFrame
# print(df[['AGE_YEARS', 'YEARS_EMPLOYED']].head())

In [10]:
# # Add new features

# # 1. Debt-to-Income Ratio
# df['Debt_to_Income_Ratio'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']

# # 2. Annuity-to-Income Ratio
# df['Annuity_to_Income_Ratio'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']

# # 3. Credit-to-Annual Income Ratio

# # Handle potential division by zero by adding a small constant (e.g., 1e-5)
# df['YEARS_EMPLOYED'] = df['YEARS_EMPLOYED'].replace(0, 1e-5)

# df['Credit_to_Annual_Income_Ratio'] = df['AMT_CREDIT'] / (df['AMT_INCOME_TOTAL'] * df['YEARS_EMPLOYED'])


# # Display the first few rows to verify the new features
# df[['Debt_to_Income_Ratio', 'Annuity_to_Income_Ratio', 'Credit_to_Annual_Income_Ratio']].head(20)



In [11]:
df.head(
    
)

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,OCCUPATION_TYPE,ORGANIZATION_TYPE,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_YEAR
0,0,100002,1,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,Laborers,Business Entity Type 3,0.262949,0.139376,2.0,2.0,2.0,2.0,-1134.0,0.0,1.0
1,1,100003,0,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,Core staff,School,0.622246,,1.0,0.0,1.0,0.0,-828.0,0.0,0.0
2,2,100004,0,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,Laborers,Government,0.555912,0.729567,0.0,0.0,0.0,0.0,-815.0,0.0,0.0
3,3,100006,0,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,Laborers,Business Entity Type 3,0.650442,,2.0,0.0,2.0,0.0,-617.0,,
4,4,100007,0,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,Core staff,Religion,0.322738,,0.0,0.0,0.0,0.0,-1106.0,0.0,0.0


In [12]:
# Split the data into features (X) and target (y)
x = df_encoded.drop('TARGET', axis=1)
y = df['TARGET']

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [14]:
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [15]:
scaler = StandardScaler()
X_train_smote_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

In [16]:
model = LogisticRegression(max_iter=3000)
model.fit(X_train_smote_scaled, y_train_smote)

In [17]:
# Predict using the models
y_pred_logistic = model.predict(X_test_scaled)

In [18]:
y_proba_logistic = model.predict_proba(X_test_scaled)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba_logistic)


In [19]:
best_f1 = 0
best_threshold = 0.5
for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_proba_logistic >= threshold).astype(int)
    current_f1 = f1_score(y_test, y_pred_adjusted)
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = threshold

# Evaluate the model with the best threshold
y_pred_best = (y_proba_logistic >= best_threshold).astype(int)
best_accuracy = accuracy_score(y_test, y_pred_best)
best_f1 = f1_score(y_test, y_pred_best)

In [20]:
# Evaluate models
logistic_metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_logistic),
    "ROC-AUC": roc_auc,
    "F1 Score": best_f1
}
print(logistic_metrics)

{'Accuracy': 0.9149413575563119, 'ROC-AUC': 0.6660848210977994, 'F1 Score': 0.23365422311277606}


In [21]:
# Decision Tree Model
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_smote, y_train_smote)


In [22]:
# Neural Network Model
nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
nn_model.fit(X_train_smote_scaled, y_train_smote)

In [26]:
# Predictions and evaluation for Decision Tree
y_pred_tree = tree_model.predict(X_test)
print("Decision Tree Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_tree))
print("F1 Score:", f1_score(y_test, y_pred_tree))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_tree))

# Predictions and evaluation for Neural Network
y_pred_nn = nn_model.predict(X_test_scaled)
print("\nNeural Network Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_nn))
print("F1 Score:", f1_score(y_test, y_pred_nn))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_nn))


Decision Tree Performance:
Accuracy: 0.8384351898020682
F1 Score: 0.15037336829504647
ROC-AUC Score: 0.5370387862261763

Neural Network Performance:
Accuracy: 0.9048604938539251
F1 Score: 0.1186866151220002
ROC-AUC Score: 0.528340851805294


In [None]:
# Create new features based on the description above

# 1. Debt-to-Income Ratio
new_sample_df['Debt_to_Income_Ratio'] = new_sample_df['AMT_CREDIT'] / new_sample_df['AMT_INCOME_TOTAL']

# 2. Annuity-to-Income Ratio
new_sample_df['Annuity_to_Income_Ratio'] = new_sample_df['AMT_ANNUITY'] / new_sample_df['AMT_INCOME_TOTAL']

# 3. Credit-to-Goods Ratio
new_sample_df['Credit_to_Goods_Ratio'] = new_sample_df['AMT_CREDIT'] / new_sample_df['AMT_GOODS_PRICE']

# 4. Income Per Child
new_sample_df['Income_Per_Child'] = new_sample_df['AMT_INCOME_TOTAL'] / (new_sample_df['CNT_CHILDREN'] + 1)

# 5. Employment to Age Ratio
new_sample_df['Employment_to_Age_Ratio'] = new_sample_df['DAYS_EMPLOYED'] / (-new_sample_df['DAYS_BIRTH'])

# 6. Income Stability (Income * Employment Length)
new_sample_df['Income_Stability'] = new_sample_df['AMT_INCOME_TOTAL'] * new_sample_df['DAYS_EMPLOYED']

# 7. Credit Percentage of Income
new_sample_df['Credit_Percentage_of_Income'] = new_sample_df['AMT_CREDIT'] / new_sample_df['AMT_INCOME_TOTAL']

# Display the first few rows to check the new features
new_sample_df.head()