In [1]:
from sklearn.feature_selection import mutual_info_classif           # Information Gain for classification task
from sklearn.feature_selection import chi2                          # chi-square feature selection method
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE                           # Recurssive feature elimination algorithm
from sklearn.feature_selection import SequentialFeatureSelector     # Forward feature selectioin method
from sklearn.linear_model import LassoCV , RidgeCV                  # lasso and ridge cross validation algorithms for feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel               # to select features according to prefit LassoCV and RidgeCV
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np
from sklearn.preprocessing import StandardScaler


### Dataset Loading ###

In [2]:
PATH = './data/ld4/Anonymize_Loan_Default_data.csv'
Target_Variable = 'repay_fail'

frame = pd.read_csv(PATH)

In [3]:
frame.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,home_ownership,...,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,repay_fail
2,2,2,0.0,0.0,0.0,36 months,0.0,0.0,< 1 year,RENT,...,1.0,0.0,0.0,0.0,0.0,Jan-07,0.0,Jan-07,Jan-07,1
3,545583,703644,2500.0,2500.0,2500.0,36 months,13.98,85.42,4 years,RENT,...,10.0,3075.291779,3075.29,2500.0,575.29,Jul-13,90.85,Aug-13,Jun-16,0
4,532101,687836,5000.0,5000.0,5000.0,36 months,15.95,175.67,4 years,RENT,...,15.0,2948.76,2948.76,1909.02,873.81,Nov-11,175.67,,Mar-12,1
5,877788,1092507,7000.0,7000.0,7000.0,36 months,9.91,225.58,10+ years,MORTGAGE,...,20.0,8082.39188,8082.39,7000.0,1082.39,Mar-14,1550.27,,Mar-14,0
6,875406,1089981,2000.0,2000.0,2000.0,36 months,5.42,60.32,10+ years,RENT,...,15.0,2161.663244,2161.66,2000.0,161.66,Feb-14,53.12,,Jun-16,0


In [4]:
frame.isnull().sum()

id                            0
member_id                     0
loan_amnt                     1
funded_amnt                   1
funded_amnt_inv               1
term                          0
int_rate                      0
installment                   1
emp_length                  993
home_ownership                0
annual_inc                    2
verification_status           0
issue_d                       0
loan_status                   0
purpose                       0
zip_code                      0
addr_state                    0
dti                           0
delinq_2yrs                   1
earliest_cr_line              0
inq_last_6mths                1
mths_since_last_delinq    24363
open_acc                      1
pub_rec                       1
revol_bal                     4
revol_util                   59
total_acc                     1
total_pymnt                   1
total_pymnt_inv               1
total_rec_prncp               1
total_rec_int                 1
last_pym

In [5]:
for i in frame.columns[frame.isna().sum() > 0]:
    frame[i] = frame[i].interpolate('linear')

  frame[i] = frame[i].interpolate('linear')


In [6]:
frame = frame.drop(['next_pymnt_d'],axis=1)

frame.isnull().sum()

id                          0
member_id                   0
loan_amnt                   0
funded_amnt                 0
funded_amnt_inv             0
term                        0
int_rate                    0
installment                 0
emp_length                993
home_ownership              0
annual_inc                  0
verification_status         0
issue_d                     0
loan_status                 0
purpose                     0
zip_code                    0
addr_state                  0
dti                         0
delinq_2yrs                 0
earliest_cr_line            0
inq_last_6mths              0
mths_since_last_delinq      2
open_acc                    0
pub_rec                     0
revol_bal                   0
revol_util                 59
total_acc                   0
total_pymnt                 0
total_pymnt_inv             0
total_rec_prncp             0
total_rec_int               0
last_pymnt_d               71
last_pymnt_amnt             0
last_credi

In [7]:

cat_vars = ['term','emp_length','home_ownership','verification_status','loan_status','purpose','zip_code','addr_state','earliest_cr_line']

for i in cat_vars:
    le = LabelEncoder()
    frame[i] = le.fit_transform(frame[i])

In [8]:
[frame[i].iloc[0:10] for i in frame.columns]

[2           2
 3      545583
 4      532101
 5      877788
 6      875406
 7      506439
 8      981465
 9      749050
 10    1016373
 11     786870
 Name: id, dtype: int64,
 2           2
 3      703644
 4      687836
 5     1092507
 6     1089981
 7      652909
 8     1204637
 9      948200
 10    1243872
 11     990345
 Name: member_id, dtype: int64,
 2         0.0
 3      2500.0
 4      5000.0
 5      7000.0
 6      2000.0
 7      3600.0
 8      8000.0
 9      6000.0
 10    25600.0
 11    19750.0
 Name: loan_amnt, dtype: float64,
 2         0.0
 3      2500.0
 4      5000.0
 5      7000.0
 6      2000.0
 7      3600.0
 8      8000.0
 9      6000.0
 10    25600.0
 11    19750.0
 Name: funded_amnt, dtype: float64,
 2         0.00000
 3      2500.00000
 4      5000.00000
 5      7000.00000
 6      2000.00000
 7      3600.00000
 8      8000.00000
 9      6000.00000
 10    25472.82947
 11    19750.00000
 Name: funded_amnt_inv, dtype: float64,
 2     0
 3     0
 4     0
 5     0
 6     

In [9]:
frame = frame.dropna()
frame

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,home_ownership,...,revol_util,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,repay_fail
4,532101,687836,5000.0,5000.0,5000.00000,0,15.95,175.67,4,4,...,99.90%,15.0,2948.760000,2948.76,1909.02,873.81,Nov-11,175.67,Mar-12,1
5,877788,1092507,7000.0,7000.0,7000.00000,0,9.91,225.58,1,0,...,47.20%,20.0,8082.391880,8082.39,7000.00,1082.39,Mar-14,1550.27,Mar-14,0
6,875406,1089981,2000.0,2000.0,2000.00000,0,5.42,60.32,1,4,...,0%,15.0,2161.663244,2161.66,2000.00,161.66,Feb-14,53.12,Jun-16,0
7,506439,652909,3600.0,3600.0,3600.00000,0,10.25,116.59,1,0,...,0%,25.0,4206.031191,4206.03,3600.00,606.03,May-13,146.75,Jun-16,0
8,981465,1204637,8000.0,8000.0,8000.00000,0,6.03,243.49,11,0,...,13.60%,49.0,8724.971815,8724.97,8000.00,724.97,Apr-14,1423.66,Apr-14,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38476,849205,1060907,3000.0,3000.0,3000.00000,0,11.99,99.63,3,4,...,73.50%,22.0,3586.619764,3586.62,3000.00,586.62,Aug-14,100.30,Jun-16,0
38477,852914,1065048,10400.0,10400.0,10400.00000,0,13.49,352.88,9,4,...,39.70%,33.0,12703.534030,12703.53,10400.00,2303.53,Sep-14,393.08,Jun-16,0
38478,519553,671637,16000.0,10550.0,10531.35818,1,14.96,250.77,1,0,...,62.20%,25.0,14202.267530,14163.31,10550.00,3652.27,Jun-13,5439.96,Feb-16,0
38479,825638,1034448,10000.0,10000.0,10000.00000,0,16.89,355.99,0,4,...,53.30%,12.0,12815.178320,12815.18,10000.00,2815.18,Aug-14,380.63,Sep-15,0


In [10]:
frame['revol_util'] = frame['revol_util'].apply(lambda x : float(x.strip('�').strip('%')))

In [11]:

X = frame.drop([Target_Variable,'issue_d','last_pymnt_d','last_credit_pull_d'],axis=1)
Y = frame[Target_Variable]


In [12]:
# Identify the minority class label
unique, counts = np.unique(Y, return_counts=True)
minority_class = unique[np.argmin(counts)]

# Get the data belonging to the minority class
minority_X = X[Y == minority_class]
minority_y = Y[Y == minority_class]

# Get the majority class data (optional, for combining later)
majority_X = X[Y != minority_class]
majority_y = Y[Y != minority_class]

# Upsample the minority class to match the size of the majority class
n_samples = len(majority_X)  # Number of samples in the majority class
upsampled_minority = resample(minority_X, replace=True, n_samples=n_samples, random_state=42)
upsampled_minority_y = np.full(len(upsampled_minority), minority_class)  # Assign class labels

# Combine the upsampled data with the majority class (optional)
X_combined = np.concatenate((upsampled_minority, majority_X))
y_combined = np.concatenate((upsampled_minority_y, majority_y))


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)

X = pd.DataFrame(X_scaled,columns=X.columns)
Y = y_combined





print(X.shape)
print(Y.shape)

(65216, 31)
(65216,)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

### Chi-Square Feature Selection ###

In [14]:
selector = VarianceThreshold(threshold=1)  # Remove features with variance < 0.01
selector.fit(X)
print(selector.get_support())

[ True False False False False False False False  True  True  True  True
  True  True False False False  True  True  True False False  True False
 False  True  True False False  True False]


In [15]:
selected_features = X.columns[selector.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['id', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'loan_status', 'purpose', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'pub_rec', 'total_acc',
       'total_pymnt', 'total_rec_int'],
      dtype='object')


In [16]:
clf = LogisticRegression()
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9832264659418269

In [17]:
clf = RandomForestClassifier()
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9999070718334727

### Mutual Info ###

In [18]:
fs = mutual_info_classif(X,Y)

In [19]:
selected_features = X.columns[fs > 0.05] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['id', 'member_id', 'funded_amnt_inv', 'int_rate', 'installment',
       'annual_inc', 'loan_status', 'dti', 'revol_bal', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'last_pymnt_amnt'],
      dtype='object')


In [20]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9946566304246818

In [21]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9999535359167364

### Forward Selection Method ###

In [22]:
clf = LogisticRegression(n_jobs=16)
sfs = SequentialFeatureSelector(clf,n_features_to_select=5)
sfs.fit(X, Y)
sfs.get_support()

array([False, False, False, False, False, False, False, False,  True,
        True, False, False,  True, False,  True,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [23]:
selected_features = X.columns[sfs.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['emp_length', 'home_ownership', 'loan_status', 'zip_code',
       'addr_state'],
      dtype='object')


In [24]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.985270885605427

In [25]:
clf = RandomForestClassifier(n_jobs=16)
sfs = SequentialFeatureSelector(clf,n_features_to_select=5,n_jobs=16)
sfs.fit(X, Y)
sfs.get_support()

array([False,  True,  True,  True, False,  True, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [26]:
selected_features = X.columns[sfs.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['member_id', 'loan_amnt', 'funded_amnt', 'term', 'loan_status'], dtype='object')


In [27]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

1.0

### Recurssive Feature Elimination ###

In [28]:
estimator = LogisticRegression(n_jobs=16)
selector = RFE(estimator, n_features_to_select=5, step=0.2)
selector.fit(X, Y)
selector.support_

array([False, False, False,  True,  True, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True, False, False])

In [29]:
selected_features = X.columns[selector.support_] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['funded_amnt', 'funded_amnt_inv', 'installment', 'total_pymnt_inv',
       'total_rec_prncp'],
      dtype='object')


In [30]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9822507201932906

In [31]:
estimator = RandomForestClassifier(n_jobs=16)
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, Y)
selector.support_

array([False, False, False,  True, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True, False,  True])

In [32]:
selected_features = X.columns[selector.support_] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['funded_amnt', 'loan_status', 'total_pymnt_inv', 'total_rec_prncp',
       'last_pymnt_amnt'],
      dtype='object')


In [33]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

1.0

### Lasso Regx Feature Selection ###

In [34]:
clf = LassoCV()
clf.fit(X, Y)
clf.coef_

array([-0.00000000e+00, -2.69444430e-02,  8.12422908e-03,  1.10812052e-02,
       -1.40942248e-02,  0.00000000e+00, -0.00000000e+00,  4.93304254e-02,
       -1.59869717e-04,  2.51938947e-04, -2.37585086e-03,  0.00000000e+00,
       -4.16692095e-01,  6.10298782e-04,  1.22998305e-03, -0.00000000e+00,
        2.35672193e-03, -0.00000000e+00, -1.45522311e-03,  4.66774432e-02,
       -3.52319370e-03, -4.56616179e-04,  2.39173822e-03,  1.23895418e-02,
        4.14709472e-03, -3.18674355e-03, -0.00000000e+00,  2.25949203e-02,
       -1.16389672e-01,  4.33754157e-03,  7.10447886e-03])

In [35]:
selected_features = X.columns[clf.coef_ != 0]
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'installment', 'emp_length', 'home_ownership', 'annual_inc',
       'loan_status', 'purpose', 'zip_code', 'dti', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec',
       'revol_bal', 'revol_util', 'total_acc', 'total_pymnt_inv',
       'total_rec_prncp', 'total_rec_int', 'last_pymnt_amnt'],
      dtype='object')


In [36]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9944243100083635

In [37]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9998606077502091

### Ridge Regression For Feature Selection ###

In [38]:
clf = RidgeCV()
clf.fit(X, Y)
clf.coef_

array([ 1.70560559e-02, -4.49558116e-02,  7.22219496e-03,  4.99308835e-02,
       -6.14645971e-02,  2.46331475e-03, -3.34320879e-03,  5.53074265e-02,
       -5.79000623e-04,  1.11408361e-03, -2.82836808e-03,  9.16177879e-04,
       -4.16955230e-01,  1.01229379e-03,  1.72235936e-03,  2.08834883e-04,
        2.76325149e-03,  2.53836851e-05, -1.73831427e-03,  4.66401698e-02,
       -3.91702609e-03, -9.31496698e-04,  2.97427511e-03,  1.21006074e-02,
        5.13584976e-03, -3.31731368e-03, -7.49200601e-02,  9.43035773e-02,
       -1.13857051e-01,  9.87850522e-03,  9.06937242e-03])

In [39]:
selected_features = X.columns[clf.coef_ > 0.01]
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['id', 'funded_amnt', 'installment', 'inq_last_6mths', 'revol_bal',
       'total_pymnt_inv'],
      dtype='object')


In [40]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.8616299600408884

In [41]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9851779574388997