In [1]:
from sklearn.feature_selection import mutual_info_classif           # Information Gain for classification task
from sklearn.feature_selection import chi2                          # chi-square feature selection method
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE                           # Recurssive feature elimination algorithm
from sklearn.feature_selection import SequentialFeatureSelector     # Forward feature selectioin method
from sklearn.linear_model import LassoCV , RidgeCV                  # lasso and ridge cross validation algorithms for feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel               # to select features according to prefit LassoCV and RidgeCV
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np
from sklearn.preprocessing import StandardScaler


### Dataset Loading ###

In [2]:
PATH = './data/ld3/Loan_default.csv'
Target_Variable = 'Default'

frame = pd.read_csv(PATH)

In [3]:
frame.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [4]:
frame.isnull().sum()

LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

In [5]:
for i in frame.columns[frame.isna().sum() > 0]:
    frame[i] = frame[i].interpolate('linear')

In [6]:
frame.isnull().sum()

LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

In [7]:

cat_vars = ['Education','EmploymentType','MaritalStatus','HasMortgage','HasDependents','LoanPurpose','HasCoSigner']

for i in cat_vars:
    le = LabelEncoder()
    frame[i] = le.fit_transform(frame[i])

In [8]:
[frame[i].iloc[0:10] for i in frame.columns]

[0    I38PQUQS96
 1    HPSK72WA7R
 2    C1OZ6DPJ8Y
 3    V2KKSFM3UN
 4    EY08JDHTZP
 5    A9S62RQ7US
 6    H8GXPAOS71
 7    0HGZQKJ36W
 8    1R0N3LGNRJ
 9    CM9L1GTT2P
 Name: LoanID, dtype: object,
 0    56
 1    69
 2    46
 3    32
 4    60
 5    25
 6    38
 7    56
 8    36
 9    40
 Name: Age, dtype: int64,
 0     85994
 1     50432
 2     84208
 3     31713
 4     20437
 5     90298
 6    111188
 7    126802
 8     42053
 9    132784
 Name: Income, dtype: int64,
 0     50587
 1    124440
 2    129188
 3     44799
 4      9139
 5     90448
 6    177025
 7    155511
 8     92357
 9    228510
 Name: LoanAmount, dtype: int64,
 0    520
 1    458
 2    451
 3    743
 4    633
 5    720
 6    429
 7    531
 8    827
 9    480
 Name: CreditScore, dtype: int64,
 0     80
 1     15
 2     26
 3      0
 4      8
 5     18
 6     80
 7     67
 8     83
 9    114
 Name: MonthsEmployed, dtype: int64,
 0    4
 1    1
 2    3
 3    3
 4    4
 5    2
 6    1
 7    4
 8    1
 9    4
 Name: NumC

In [9]:
frame = frame.dropna()
frame

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,0,0,0,1,1,4,1,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,2,0,1,0,0,4,1,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,2,3,0,1,1,0,0,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,1,0,1,0,0,1,0,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,0,3,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255342,8C6S86ESGC,19,37979,210682,541,109,4,14.11,12,0.85,0,0,1,0,0,4,0,0
255343,98R4KDHNND,32,51953,189899,511,14,2,11.55,24,0.21,1,1,0,0,0,3,0,1
255344,XQK1UUUNGP,56,84820,208294,597,70,3,5.29,60,0.50,1,2,1,1,1,0,1,0
255345,JAO28CPL4H,42,85109,60575,809,40,1,20.90,48,0.44,1,1,2,1,1,4,0,0


In [10]:

X = frame.drop([Target_Variable,'LoanID'],axis=1)
Y = frame[Target_Variable]


In [11]:
# Identify the minority class label
unique, counts = np.unique(Y, return_counts=True)
minority_class = unique[np.argmin(counts)]

# Get the data belonging to the minority class
minority_X = X[Y == minority_class]
minority_y = Y[Y == minority_class]

# Get the majority class data (optional, for combining later)
majority_X = X[Y != minority_class]
majority_y = Y[Y != minority_class]

# Upsample the minority class to match the size of the majority class
n_samples = len(majority_X)  # Number of samples in the majority class
upsampled_minority = resample(minority_X, replace=True, n_samples=n_samples, random_state=42)
upsampled_minority_y = np.full(len(upsampled_minority), minority_class)  # Assign class labels

# Combine the upsampled data with the majority class (optional)
X_combined = np.concatenate((upsampled_minority, majority_X))
y_combined = np.concatenate((upsampled_minority_y, majority_y))


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)

X = pd.DataFrame(X_scaled,columns=X.columns)
Y = y_combined





print(X.shape)
print(Y.shape)

(451388, 16)
(451388,)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

### Chi-Square Feature Selection ###

In [13]:
selector = VarianceThreshold(threshold=1)  # Remove features with variance < 0.01
selector.fit(X)
print(selector.get_support())

[ True  True False  True False False  True False False  True  True  True
 False  True False  True]


In [14]:
selected_features = X.columns[selector.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Age', 'Income', 'CreditScore', 'InterestRate', 'Education',
       'EmploymentType', 'MaritalStatus', 'HasDependents', 'HasCoSigner'],
      dtype='object')


In [15]:
clf = LogisticRegression()
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.6583623681684222

In [16]:
clf = RandomForestClassifier()
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9745366174584953

### Mutual Info ###

In [17]:
fs = mutual_info_classif(X,Y)

In [18]:
selected_features = X.columns[fs > 0.05] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Income', 'LoanAmount'], dtype='object')


In [19]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.5763599379695084

In [20]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.946589329949852

### Forward Selection Method ###

In [21]:
clf = LogisticRegression(n_jobs=16)
sfs = SequentialFeatureSelector(clf,n_features_to_select=5)
sfs.fit(X, Y)
sfs.get_support()

array([ True,  True,  True, False,  True, False,  True, False, False,
       False, False, False, False, False, False, False])

In [22]:
selected_features = X.columns[sfs.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Age', 'Income', 'LoanAmount', 'MonthsEmployed', 'InterestRate'], dtype='object')


In [23]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.6719164333809975

In [24]:
clf = RandomForestClassifier(n_jobs=16)
sfs = SequentialFeatureSelector(clf,n_features_to_select=5,n_jobs=16)
sfs.fit(X, Y)
sfs.get_support()



array([False,  True,  True,  True, False, False, False,  True,  True,
       False, False, False, False, False, False, False])

In [25]:
selected_features = X.columns[sfs.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Income', 'LoanAmount', 'CreditScore', 'LoanTerm', 'DTIRatio'], dtype='object')


In [26]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9813572862331246

### Recurssive Feature Elimination ###

In [27]:
estimator = LogisticRegression(n_jobs=16)
selector = RFE(estimator, n_features_to_select=5, step=0.2)
selector.fit(X, Y)
selector.support_

array([ True,  True,  True, False,  True, False,  True, False, False,
       False, False, False, False, False, False, False])

In [28]:
selected_features = X.columns[selector.support_] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Age', 'Income', 'LoanAmount', 'MonthsEmployed', 'InterestRate'], dtype='object')


In [29]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.6719164333809975

In [30]:
estimator = RandomForestClassifier(n_jobs=16)
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, Y)
selector.support_

array([ True,  True,  True,  True, False, False,  True, False, False,
       False, False, False, False, False, False, False])

In [31]:
selected_features = X.columns[selector.support_] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'InterestRate'], dtype='object')


In [32]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9715089387012533

### Lasso Regx Feature Selection ###

In [33]:
clf = LassoCV()
clf.fit(X, Y)
clf.coef_

array([-0.12278952, -0.06745614,  0.05933812, -0.02430023, -0.06974756,
        0.02008826,  0.09489895, -0.00040208,  0.01294537, -0.01715871,
        0.02912888, -0.0046818 , -0.01669751, -0.02425506, -0.00669729,
       -0.02658928])

In [34]:
selected_features = X.columns[clf.coef_ != 0]
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education',
       'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents',
       'LoanPurpose', 'HasCoSigner'],
      dtype='object')


In [35]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.6792674494323941

In [36]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9814378453131398

### Ridge Regression For Feature Selection ###

In [37]:
clf = RidgeCV()
clf.fit(X, Y)
clf.coef_

array([-0.12290064, -0.06757247,  0.05945294, -0.02442953, -0.06986721,
        0.02021796,  0.09501237, -0.0005334 ,  0.0130745 , -0.01729007,
        0.02925456, -0.00481032, -0.01682624, -0.02438288, -0.00682944,
       -0.02671432])

In [38]:
selected_features = X.columns[clf.coef_ > 0.01]
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['LoanAmount', 'NumCreditLines', 'InterestRate', 'DTIRatio',
       'EmploymentType'],
      dtype='object')


In [39]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.6054820453950416

In [40]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9691995784074813