In [1]:
from sklearn.feature_selection import mutual_info_classif           # Information Gain for classification task
from sklearn.feature_selection import chi2                          # chi-square feature selection method
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE                           # Recurssive feature elimination algorithm
from sklearn.feature_selection import SequentialFeatureSelector     # Forward feature selectioin method
from sklearn.linear_model import LassoCV , RidgeCV                  # lasso and ridge cross validation algorithms for feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel               # to select features according to prefit LassoCV and RidgeCV
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np
from sklearn.preprocessing import StandardScaler


### Dataset Loading ###

In [2]:
PATH = './data/ld5/train.csv'
Target_Variable = 'Loan Status'

frame = pd.read_csv(PATH)

In [3]:
frame.head()

Unnamed: 0,ID,Loan Amount,Funded Amount,Funded Amount Investor,Term,Batch Enrolled,Interest Rate,Grade,Sub Grade,Employment Duration,...,Recoveries,Collection Recovery Fee,Collection 12 months Medical,Application Type,Last week Pay,Accounts Delinquent,Total Collection Amount,Total Current Balance,Total Revolving Credit Limit,Loan Status
0,65087372,10000,32236,12329.36286,59,BAT2522922,11.135007,B,C4,MORTGAGE,...,2.498291,0.793724,0,INDIVIDUAL,49,0,31,311301,6619,0
1,1450153,3609,11940,12191.99692,59,BAT1586599,12.237563,C,D3,RENT,...,2.377215,0.974821,0,INDIVIDUAL,109,0,53,182610,20885,0
2,1969101,28276,9311,21603.22455,59,BAT2136391,12.545884,F,D4,MORTGAGE,...,4.316277,1.020075,0,INDIVIDUAL,66,0,34,89801,26155,0
3,6651430,11170,6954,17877.15585,59,BAT2428731,16.731201,C,C3,MORTGAGE,...,0.10702,0.749971,0,INDIVIDUAL,39,0,40,9189,60214,0
4,14354669,16890,13226,13539.92667,59,BAT5341619,15.0083,C,D4,MORTGAGE,...,1294.818751,0.368953,0,INDIVIDUAL,18,0,430,126029,22579,0


### Data Preprocessing

In [4]:
frame.isnull().sum()

ID                              0
Loan Amount                     0
Funded Amount                   0
Funded Amount Investor          0
Term                            0
Batch Enrolled                  0
Interest Rate                   0
Grade                           0
Sub Grade                       0
Employment Duration             0
Home Ownership                  0
Verification Status             0
Payment Plan                    0
Loan Title                      0
Debit to Income                 0
Delinquency - two years         0
Inquires - six months           0
Open Account                    0
Public Record                   0
Revolving Balance               0
Revolving Utilities             0
Total Accounts                  0
Initial List Status             0
Total Received Interest         0
Total Received Late Fee         0
Recoveries                      0
Collection Recovery Fee         0
Collection 12 months Medical    0
Application Type                0
Last week Pay 

In [5]:
for i in frame.columns[frame.isna().sum() > 0]:
    frame[i] = frame[i].interpolate('linear')

In [6]:
frame.isnull().sum()

ID                              0
Loan Amount                     0
Funded Amount                   0
Funded Amount Investor          0
Term                            0
Batch Enrolled                  0
Interest Rate                   0
Grade                           0
Sub Grade                       0
Employment Duration             0
Home Ownership                  0
Verification Status             0
Payment Plan                    0
Loan Title                      0
Debit to Income                 0
Delinquency - two years         0
Inquires - six months           0
Open Account                    0
Public Record                   0
Revolving Balance               0
Revolving Utilities             0
Total Accounts                  0
Initial List Status             0
Total Received Interest         0
Total Received Late Fee         0
Recoveries                      0
Collection Recovery Fee         0
Collection 12 months Medical    0
Application Type                0
Last week Pay 

In [7]:

cat_vars = ['Batch Enrolled','Grade','Sub Grade','Employment Duration','Verification Status','Payment Plan','Loan Title','Initial List Status','Application Type']

for i in cat_vars:
    le = LabelEncoder()
    frame[i] = le.fit_transform(frame[i])

In [8]:
[frame[i].iloc[0:10] for i in frame.columns]

[0    65087372
 1     1450153
 2     1969101
 3     6651430
 4    14354669
 5    50509046
 6    32737431
 7    63151650
 8     4279662
 9     4431034
 Name: ID, dtype: int64,
 0    10000
 1     3609
 2    28276
 3    11170
 4    16890
 5    34631
 6    30844
 7    20744
 8     9299
 9    19232
 Name: Loan Amount, dtype: int64,
 0    32236
 1    11940
 2     9311
 3     6954
 4    13226
 5    30203
 6    19773
 7    10609
 8    11238
 9     8962
 Name: Funded Amount, dtype: int64,
 0    12329.362860
 1    12191.996920
 2    21603.224550
 3    17877.155850
 4    13539.926670
 5     8635.931613
 6    15777.511830
 7     7645.014802
 8    13429.456610
 9     7004.097481
 Name: Funded Amount Investor, dtype: float64,
 0    59
 1    59
 2    59
 3    59
 4    59
 5    36
 6    59
 7    58
 8    59
 9    58
 Name: Term, dtype: int64,
 0    16
 1     4
 2    11
 3    15
 4    32
 5    29
 6    31
 7    17
 8    32
 9    10
 Name: Batch Enrolled, dtype: int64,
 0    11.135007
 1    12.237563
 2

In [9]:
frame = frame.dropna()
frame

Unnamed: 0,ID,Loan Amount,Funded Amount,Funded Amount Investor,Term,Batch Enrolled,Interest Rate,Grade,Sub Grade,Employment Duration,...,Recoveries,Collection Recovery Fee,Collection 12 months Medical,Application Type,Last week Pay,Accounts Delinquent,Total Collection Amount,Total Current Balance,Total Revolving Credit Limit,Loan Status
0,65087372,10000,32236,12329.36286,59,16,11.135007,1,13,0,...,2.498291,0.793724,0,0,49,0,31,311301,6619,0
1,1450153,3609,11940,12191.99692,59,4,12.237563,2,17,2,...,2.377215,0.974821,0,0,109,0,53,182610,20885,0
2,1969101,28276,9311,21603.22455,59,11,12.545884,5,18,0,...,4.316277,1.020075,0,0,66,0,34,89801,26155,0
3,6651430,11170,6954,17877.15585,59,15,16.731201,2,12,0,...,0.107020,0.749971,0,0,39,0,40,9189,60214,0
4,14354669,16890,13226,13539.92667,59,32,15.008300,2,18,0,...,1294.818751,0.368953,0,0,18,0,430,126029,22579,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67458,16164945,13601,6848,13175.28583,59,21,9.408858,2,3,0,...,564.614852,0.865230,0,0,69,0,48,181775,34301,1
67459,35182714,8323,11046,15637.46301,59,7,9.972104,2,7,2,...,2.015494,1.403368,0,0,14,0,37,22692,8714,0
67460,16435904,15897,32921,12329.45775,59,5,19.650943,0,27,0,...,5.673092,1.607093,0,0,137,0,17,176857,42330,0
67461,5300325,16567,4975,21353.68465,59,14,13.169095,3,22,1,...,1.157454,0.207608,0,0,73,0,61,361339,39075,0


In [10]:

X = frame.drop([Target_Variable],axis=1)
Y = frame[Target_Variable]


In [11]:
unique, counts = np.unique(Y, return_counts=True)
minority_class = unique[np.argmin(counts)]

minority_X = X[Y == minority_class]
minority_y = Y[Y == minority_class]

majority_X = X[Y != minority_class]
majority_y = Y[Y != minority_class]

n_samples = len(majority_X)  # Number of samples in the majority class
upsampled_minority = resample(minority_X, replace=True, n_samples=n_samples, random_state=42)
upsampled_minority_y = np.full(len(upsampled_minority), minority_class)  # Assign class labels

X_combined = np.concatenate((upsampled_minority, majority_X))
y_combined = np.concatenate((upsampled_minority_y, majority_y))


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)

X = pd.DataFrame(X_scaled,columns=X.columns)
Y = y_combined


print(X.shape)
print(Y.shape)

(122444, 34)
(122444,)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

### Variance Threshhold For Feature Selection

In [13]:
selector = VarianceThreshold(threshold=1)  # Remove features with variance < 0.01
selector.fit(X)
print(selector.get_support())

[ True False False  True  True False False False False  True  True  True
 False False  True  True False False  True False  True  True  True False
 False False  True False  True  True False False  True  True]


In [14]:
selected_features = X.columns[selector.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['ID', 'Funded Amount Investor', 'Term', 'Employment Duration',
       'Home Ownership', 'Verification Status', 'Debit to Income',
       'Delinquency - two years', 'Public Record', 'Revolving Utilities',
       'Total Accounts', 'Initial List Status', 'Collection Recovery Fee',
       'Application Type', 'Last week Pay', 'Total Current Balance',
       'Total Revolving Credit Limit'],
      dtype='object')


### Logistic Regression With Variance Threshhold

In [15]:
clf = LogisticRegression()
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.5204048803425149

### Random Forest With Variance Threshhold

In [16]:
clf = RandomForestClassifier()
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9991833098225555

### Mutual Information Gain For Feature Selection

In [17]:
fs = mutual_info_classif(X,Y)

In [18]:
selected_features = X.columns[fs > 0.05] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['ID', 'Loan Amount', 'Funded Amount', 'Funded Amount Investor',
       'Interest Rate', 'Home Ownership', 'Debit to Income',
       'Revolving Balance', 'Revolving Utilities', 'Total Received Interest',
       'Total Received Late Fee', 'Recoveries', 'Collection Recovery Fee',
       'Total Current Balance', 'Total Revolving Credit Limit'],
      dtype='object')


### Logistic Regression With Mutual Information Gain

In [19]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.5084762541143861

### Random Forest With Mutual information Gain

In [20]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9991833098225555

### Forward Feature Selection (FFS) Method For Logistic Regression

In [21]:
clf = LogisticRegression(n_jobs=16)
sfs = SequentialFeatureSelector(clf,n_features_to_select=5)
sfs.fit(X, Y)
sfs.get_support()

array([False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False,  True, False, False,
       False, False, False, False,  True,  True, False])

In [22]:
selected_features = X.columns[sfs.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Employment Duration', 'Initial List Status', 'Total Received Late Fee',
       'Total Collection Amount', 'Total Current Balance'],
      dtype='object')


### Logistic Regression With FFS

In [23]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.5182517880565248

### Forward Feature Selection For Random Forest

In [24]:
clf = RandomForestClassifier(n_jobs=16)
sfs = SequentialFeatureSelector(clf,n_features_to_select=5,n_jobs=16)
sfs.fit(X, Y)
sfs.get_support()

array([False, False, False, False, False, False,  True, False, False,
       False,  True, False, False, False,  True, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False])

In [25]:
selected_features = X.columns[sfs.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Interest Rate', 'Home Ownership', 'Debit to Income',
       'Revolving Balance', 'Total Current Balance'],
      dtype='object')


### Random Forest With FFS

In [26]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9980448932115723

### Recurssive Feature Elimination For Logistic Regression

In [27]:
estimator = LogisticRegression(n_jobs=16)
selector = RFE(estimator, n_features_to_select=5, step=0.2)
selector.fit(X, Y)
selector.support_

array([False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False,  True, False, False,
        True, False, False, False,  True, False, False, False, False,
       False, False, False, False, False,  True, False])

In [28]:
selected_features = X.columns[selector.support_] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Grade', 'Delinquency - two years', 'Public Record',
       'Initial List Status', 'Total Current Balance'],
      dtype='object')


### Logistic Regression With RFE

In [29]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.5200089093473903

### Recurssive Feature Elimination For Random Forest

In [30]:
estimator = RandomForestClassifier(n_jobs=16)
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, Y)
selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True,  True, False, False, False, False,  True,  True,
       False, False, False, False, False, False,  True])

In [31]:
selected_features = X.columns[selector.support_] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Revolving Balance', 'Revolving Utilities', 'Recoveries',
       'Collection Recovery Fee', 'Total Revolving Credit Limit'],
      dtype='object')


### Random Forest With RFE

In [32]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9973271957829088

### Lasso Regression Feature Selection

In [33]:
clf = LassoCV()
clf.fit(X, Y)
clf.coef_

array([-0.00033894, -0.00312509,  0.0012187 ,  0.00016662, -0.00446759,
       -0.00572114,  0.00134265,  0.00856905,  0.00297809,  0.00728517,
        0.00225636,  0.00199381,  0.        , -0.00072998, -0.00188578,
        0.00738815, -0.00124467, -0.00557758,  0.01008322, -0.00075726,
        0.00451508,  0.00023449, -0.01109231,  0.00091927,  0.00669987,
        0.00077059, -0.00248957, -0.00152359, -0.00068712,  0.00596207,
        0.        ,  0.0067743 ,  0.00855814,  0.00204257])

In [34]:
selected_features = X.columns[clf.coef_ != 0]
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['ID', 'Loan Amount', 'Funded Amount', 'Funded Amount Investor', 'Term',
       'Batch Enrolled', 'Interest Rate', 'Grade', 'Sub Grade',
       'Employment Duration', 'Home Ownership', 'Verification Status',
       'Loan Title', 'Debit to Income', 'Delinquency - two years',
       'Inquires - six months', 'Open Account', 'Public Record',
       'Revolving Balance', 'Revolving Utilities', 'Total Accounts',
       'Initial List Status', 'Total Received Interest',
       'Total Received Late Fee', 'Recoveries', 'Collection Recovery Fee',
       'Collection 12 months Medical', 'Application Type', 'Last week Pay',
       'Total Collection Amount', 'Total Current Balance',
       'Total Revolving Credit Limit'],
      dtype='object')


### Logistic Regression With Lasso Regression For Feature Selection

In [35]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.5234489073675354

### Random Forest With Lasso Regression For Feature Selection

In [36]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9991833098225555

### Ridge Regression For Feature Selection ###

In [37]:
clf = RidgeCV()
clf.fit(X, Y)
clf.coef_

array([-0.00035147, -0.00313674,  0.00123081,  0.0001784 , -0.00447885,
       -0.00573236,  0.00135381,  0.00857788,  0.00298889,  0.00729617,
        0.0022687 ,  0.002008  ,  0.        , -0.00074157, -0.00189751,
        0.00739913, -0.00125769, -0.00558838,  0.01009363, -0.0007702 ,
        0.00452763,  0.00024501, -0.01110261,  0.00093102,  0.0067115 ,
        0.0007824 , -0.00250202, -0.00153597, -0.00069961,  0.00597329,
        0.        ,  0.00678548,  0.00856973,  0.00205304])

In [38]:
selected_features = X.columns[clf.coef_ > 0.01]
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Public Record'], dtype='object')


### Logistic Regression With Ridge Regression For Feature Selection

In [39]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.5020169772564159

### Random Forest With Ridge Regression For Feature Selection

In [40]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.5019179845076348