In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import os
from pathlib import Path 
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
%matplotlib inline
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200) # So we can see all columns
pd.set_option('display.max_rows',200)

In [2]:
df = pd.read_csv('D:/Banque Misr Internship/Loan Datasets/sampled_data.csv')

In [None]:
# filepath = Path('D:/Banque Misr Internship/Loan Datasets/sampled_data.csv')  
# filepath.parent.mkdir(parents=True, exist_ok=True)  
# df.to_csv(filepath)

In [3]:
df.head(5)


Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,OCCUPATION_TYPE,ORGANIZATION_TYPE,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,Laborers,Business Entity Type 3,0.262949,0.139376,2.0,2.0,2.0,2.0,-1134.0,0.0,1.0
1,100003,0,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,Core staff,School,0.622246,,1.0,0.0,1.0,0.0,-828.0,0.0,0.0
2,100004,0,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,Laborers,Government,0.555912,0.729567,0.0,0.0,0.0,0.0,-815.0,0.0,0.0
3,100006,0,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,Laborers,Business Entity Type 3,0.650442,,2.0,0.0,2.0,0.0,-617.0,,
4,100007,0,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,Core staff,Religion,0.322738,,0.0,0.0,0.0,0.0,-1106.0,0.0,0.0


Convert negative values to positive

In [4]:
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].abs()
df['DAYS_BIRTH'] = df['DAYS_BIRTH'].abs()

In [5]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,OCCUPATION_TYPE,ORGANIZATION_TYPE,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,9461,637,Laborers,Business Entity Type 3,0.262949,0.139376,2.0,2.0,2.0,2.0,-1134.0,0.0,1.0
1,100003,0,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,16765,1188,Core staff,School,0.622246,,1.0,0.0,1.0,0.0,-828.0,0.0,0.0
2,100004,0,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,19046,225,Laborers,Government,0.555912,0.729567,0.0,0.0,0.0,0.0,-815.0,0.0,0.0
3,100006,0,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,19005,3039,Laborers,Business Entity Type 3,0.650442,,2.0,0.0,2.0,0.0,-617.0,,
4,100007,0,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,19932,3038,Core staff,Religion,0.322738,,0.0,0.0,0.0,0.0,-1106.0,0.0,0.0


One-hot encode the categorical features

In [6]:
categorical_columns = [
    'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 
    'ORGANIZATION_TYPE'
]
# One-hot encode the categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Check the resulting DataFrame
df_encoded.head()
df_encoded.columns

Index(['SK_ID_CURR', 'TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       ...
       'ORGANIZATION_TYPE_Trade: type 4', 'ORGANIZATION_TYPE_Trade: type 5',
       'ORGANIZATION_TYPE_Trade: type 6', 'ORGANIZATION_TYPE_Trade: type 7',
       'ORGANIZATION_TYPE_Transport: type 1',
       'ORGANIZATION_TYPE_Transport: type 2',
       'ORGANIZATION_TYPE_Transport: type 3',
       'ORGANIZATION_TYPE_Transport: type 4', 'ORGANIZATION_TYPE_University',
       'ORGANIZATION_TYPE_XNA'],
      dtype='object', length=124)

In [7]:
# Identify the remaining columns with null values
null_columns = df_encoded.columns[df_encoded.isnull().any()]

# Display columns with nulls
print("Columns with nulls:", null_columns)

Columns with nulls: Index(['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
       'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object')


Remove Nulls

In [8]:
df_cleaned = df_encoded.dropna()


In [9]:
def get_numerical_summary(df):
    total = df.shape[0]
    missing_columns = [col for col in df.columns if df[col].isnull().sum() > 0]
    missing_percent = {}
    for col in missing_columns:
        null_count = df[col].isnull().sum()
        per = (null_count/total) * 100
        missing_percent[col] = per
        print("{} : {} ({}%)".format(col, null_count, round(per, 3)))
    return missing_percent

In [10]:
missing_percent = get_numerical_summary(df_cleaned)
print(missing_percent)

{}


In [11]:
# Split the data into features (X) and target (y)
x = df_cleaned.drop('TARGET', axis=1)
y = df_cleaned['TARGET']

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [13]:
from imblearn.over_sampling import ADASYN


In [14]:
adasyn = ADASYN(sampling_strategy='minority', n_neighbors=5, random_state=42)
X_res, y_res = adasyn.fit_resample(X_train, y_train)

In [15]:
# Split the resampled data into training and validation sets
X_train_res, X_val_res, y_train_res, y_val_res = train_test_split(X_res, y_res, test_size=0.3, random_state=42)


In [21]:
# Initialize the Logistic Regression model
logreg = LogisticRegression(C=0.1,max_iter=5000, random_state=42)
#Saga handles large datasets and can be used for L1 (lasso) regularization.
# Train the model on the resampled training data
logreg.fit(X_train_res, y_train_res)

# Predict on the validation set
y_pred_logreg = logreg.predict(X_val_res)

# Evaluate the model
print("Logistic Regression Model")
print("Accuracy:", accuracy_score(y_val_res, y_pred_logreg))
print(f1_score(y_val_res, y_pred_logreg))
print(roc_auc_score(y_val_res, y_pred_logreg))


Logistic Regression Model
Accuracy: 0.799991613904147
0.8029251363411006
0.799945913984376


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


3

In [None]:
# Evaluate models
logistic_metrics = {
    "F1 Score": best_f1,
    "ROC": roc_auc_score(y_test, y_pred_logreg)
}
print(logistic_metrics)

In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_smote_scaled, y_train_smote)

In [None]:
# Predict using the models
y_pred_logistic = model.predict(X_test_scaled)

In [None]:
y_proba_logistic = model.predict_proba(X_test_scaled)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba_logistic)


In [None]:
best_f1 = 0
best_threshold = 0.5
for threshold in np.arange(0.1, 1.0, 0.1):
    y_pred_adjusted = (y_proba_logistic >= threshold).astype(int)
    current_f1 = f1_score(y_test, y_pred_adjusted)
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = threshold

# Evaluate the model with the best threshold
y_pred_best = (y_proba_logistic >= best_threshold).astype(int)
best_accuracy = accuracy_score(y_test, y_pred_best)
best_f1 = f1_score(y_test, y_pred_best)

In [None]:
# Evaluate models
logistic_metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_logistic),
    "ROC-AUC": roc_auc,
    "F1 Score": best_f1
}
print(logistic_metrics)

In [22]:
# Decision Tree Model
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_res, y_train_res)


In [23]:
# Neural Network Model
nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000, random_state=42)
nn_model.fit(X_train_res, y_train_res)

In [24]:
# Predictions and evaluation for Decision Tree
y_pred_tree = tree_model.predict(X_val_res)
print("Decision Tree Performance:")
print("Accuracy:", accuracy_score(y_val_res, y_pred_tree))
print(f1_score(y_val_res, y_pred_tree))
print(roc_auc_score(y_val_res, y_pred_tree))

# Predictions and evaluation for Neural Network
y_pred_nn = nn_model.predict(X_val_res)
print("\nNeural Network Performance:")
print("Accuracy:", accuracy_score(y_val_res, y_pred_nn))
print(f1_score(y_val_res, y_pred_nn))
print(roc_auc_score(y_val_res, y_pred_nn))


Decision Tree Performance:
Accuracy: 0.8992620235649293
0.9003422171523385
0.8992335110794214

Neural Network Performance:
Accuracy: 0.5020650761038199
0.6684349762325233
0.5001264438610129
