In [1]:
from sklearn.feature_selection import mutual_info_classif           # Information Gain for classification task
from sklearn.feature_selection import chi2                          # chi-square feature selection method
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE                           # Recurssive feature elimination algorithm
from sklearn.feature_selection import SequentialFeatureSelector     # Forward feature selectioin method
from sklearn.linear_model import LassoCV , RidgeCV                  # lasso and ridge cross validation algorithms for feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel               # to select features according to prefit LassoCV and RidgeCV
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np
from sklearn.preprocessing import StandardScaler


### Dataset Loading ###

In [2]:
PATH = './data/ld1/Loan_Default.csv'
Target_Variable = 'Status'

frame = pd.read_csv(PATH)

In [3]:
frame.head()

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


### Data Preprocessing

In [4]:
frame.isnull().sum()

ID                               0
year                             0
loan_limit                    3344
Gender                           0
approv_in_adv                  908
loan_type                        0
loan_purpose                   134
Credit_Worthiness                0
open_credit                      0
business_or_commercial           0
loan_amount                      0
rate_of_interest             36439
Interest_rate_spread         36639
Upfront_charges              39642
term                            41
Neg_ammortization              121
interest_only                    0
lump_sum_payment                 0
property_value               15098
construction_type                0
occupancy_type                   0
Secured_by                       0
total_units                      0
income                        9150
credit_type                      0
Credit_Score                     0
co-applicant_credit_type         0
age                            200
submission_of_applic

In [5]:
for i in frame.columns[frame.isna().sum() > 0]:
    frame[i] = frame[i].interpolate('linear')

  frame[i] = frame[i].interpolate('linear')


In [6]:
frame.isnull().sum()

ID                              0
year                            0
loan_limit                   3344
Gender                          0
approv_in_adv                 908
loan_type                       0
loan_purpose                  134
Credit_Worthiness               0
open_credit                     0
business_or_commercial          0
loan_amount                     0
rate_of_interest                2
Interest_rate_spread            2
Upfront_charges                 2
term                            0
Neg_ammortization             121
interest_only                   0
lump_sum_payment                0
property_value                  0
construction_type               0
occupancy_type                  0
Secured_by                      0
total_units                     0
income                          0
credit_type                     0
Credit_Score                    0
co-applicant_credit_type        0
age                           200
submission_of_application     200
LTV           

Encoding Categorical variables

In [7]:

cat_vars = ['loan_limit', 'Gender', 'approv_in_adv', 'loan_type','loan_purpose', 'Credit_Worthiness', 'open_credit','business_or_commercial', 'term', 'Neg_ammortization','interest_only', 'construction_type', 'occupancy_type', 'Secured_by',  'credit_type', 'co-applicant_credit_type','submission_of_application', 'Region', 'Security_Type', 'dtir1','lump_sum_payment','total_units','age']

for i in cat_vars:
    le = LabelEncoder()
    frame[i] = le.fit_transform(frame[i])

In [8]:
[frame[i].iloc[0:10] for i in frame.columns]

[0    24890
 1    24891
 2    24892
 3    24893
 4    24894
 5    24895
 6    24896
 7    24897
 8    24898
 9    24899
 Name: ID, dtype: int64,
 0    2019
 1    2019
 2    2019
 3    2019
 4    2019
 5    2019
 6    2019
 7    2019
 8    2019
 9    2019
 Name: year, dtype: int64,
 0    0
 1    0
 2    0
 3    0
 4    0
 5    0
 6    0
 7    2
 8    0
 9    0
 Name: loan_limit, dtype: int64,
 0    3
 1    2
 2    2
 3    2
 4    1
 5    1
 6    1
 7    0
 8    1
 9    3
 Name: Gender, dtype: int64,
 0    0
 1    0
 2    1
 3    0
 4    1
 5    1
 6    1
 7    0
 8    0
 9    0
 Name: approv_in_adv, dtype: int64,
 0    0
 1    1
 2    0
 3    0
 4    0
 5    0
 6    0
 7    0
 8    0
 9    2
 Name: loan_type, dtype: int64,
 0    0
 1    0
 2    0
 3    3
 4    0
 5    0
 6    2
 7    3
 8    2
 9    2
 Name: loan_purpose, dtype: int64,
 0    0
 1    0
 2    0
 3    0
 4    0
 5    0
 6    0
 7    0
 8    0
 9    0
 Name: Credit_Worthiness, dtype: int64,
 0    0
 1    0
 2    0
 3    0
 

In [9]:
frame = frame.dropna()
frame

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
2,24892,2019,0,2,1,0,0,0,0,1,...,3,834,0,1,1,80.019685,3,1,0,356
3,24893,2019,0,2,0,0,3,0,0,1,...,3,587,0,2,0,69.376900,0,1,0,308
4,24894,2019,0,1,1,0,0,0,0,1,...,1,602,1,0,0,91.886544,0,1,0,273
5,24895,2019,0,1,1,0,0,0,0,1,...,3,864,1,1,0,70.089286,0,1,0,285
6,24896,2019,0,1,1,0,2,0,0,1,...,3,860,1,3,1,79.109589,0,1,0,334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,173555,2019,0,3,0,0,2,0,0,1,...,0,659,1,3,1,71.792763,3,1,0,380
148666,173556,2019,0,2,0,0,0,0,0,1,...,0,569,0,0,0,74.428934,3,1,0,37
148667,173557,2019,0,2,0,0,3,0,0,1,...,0,702,1,2,0,61.332418,0,1,0,390
148668,173558,2019,0,0,0,0,3,0,0,1,...,3,737,1,3,1,70.683453,0,1,0,160


In [10]:

X = frame.drop([Target_Variable],axis=1)
Y = frame[Target_Variable]


Balancing the difference in distribution using upsampling technique

In [11]:
unique, counts = np.unique(Y, return_counts=True)
minority_class = unique[np.argmin(counts)]

minority_X = X[Y == minority_class]
minority_y = Y[Y == minority_class]


majority_X = X[Y != minority_class]
majority_y = Y[Y != minority_class]


n_samples = len(majority_X)
upsampled_minority = resample(minority_X, replace=True, n_samples=n_samples, random_state=42)
upsampled_minority_y = np.full(len(upsampled_minority), minority_class)


X_combined = np.concatenate((upsampled_minority, majority_X))
y_combined = np.concatenate((upsampled_minority_y, majority_y))


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)

X = pd.DataFrame(X_scaled,columns=X.columns)
Y = y_combined


print(X.shape)
print(Y.shape)

(224062, 33)
(224062,)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

### Variance Threshhold Feature Selection ###

In [13]:
selector = VarianceThreshold(threshold=1)  # Remove features with variance < 0.01
selector.fit(X)
print(selector.get_support())

[False False  True False False False  True False False False  True  True
 False  True False False False False False False  True False  True  True
  True False False  True False  True False False  True]


In [14]:
selected_features = X.columns[selector.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['loan_limit', 'loan_purpose', 'loan_amount', 'rate_of_interest',
       'Upfront_charges', 'occupancy_type', 'total_units', 'income',
       'credit_type', 'age', 'LTV', 'dtir1'],
      dtype='object')


### Logistic Regression With Variance Threshhold

In [15]:
clf = LogisticRegression()
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.5911064226883597

### Random Forest With Variance Threshhold

In [16]:
clf = RandomForestClassifier()
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9678662717572118

### Mutual Information Gain Feature Selection

In [17]:
fs = mutual_info_classif(X,Y)

In [18]:
selected_features = X.columns[fs > 0.05] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['ID', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges',
       'property_value', 'credit_type', 'LTV', 'dtir1'],
      dtype='object')


### Logistic Regression With Mutual Information Gain

In [19]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.6117580232888384

### Random Forest With Mutual Information Gain

In [20]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9688535453943009

### Forward Feature Selection for Logistic Regression (FFS) Method

In [21]:
clf = LogisticRegression(n_jobs=16)
sfs = SequentialFeatureSelector(clf,n_features_to_select=5)
sfs.fit(X, Y)
sfs.get_support()

array([False,  True, False, False, False, False, False,  True, False,
       False, False, False, False, False, False,  True, False, False,
       False,  True, False, False, False, False,  True, False, False,
       False, False, False, False, False, False])

In [22]:
selected_features = X.columns[sfs.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['year', 'Credit_Worthiness', 'Neg_ammortization', 'construction_type',
       'credit_type'],
      dtype='object')


### Logistic Regression With FFS

In [23]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.654670615761215

### Forward Feature Selection (FFS) Method for Random Forest 

In [24]:
clf = RandomForestClassifier(n_jobs=16)
sfs = SequentialFeatureSelector(clf,n_features_to_select=5,n_jobs=16)
sfs.fit(X, Y)
sfs.get_support()



array([False, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True, False, False, False, False,
       False, False, False, False, False, False,  True,  True, False,
       False, False, False, False, False, False])

In [25]:
selected_features = X.columns[sfs.get_support()] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['rate_of_interest', 'Interest_rate_spread', 'Upfront_charges',
       'credit_type', 'Credit_Score'],
      dtype='object')


### Random Forest With FFS

In [26]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9798352740698665

### Recursive Feature Elimination (RFE) for Logistic Regression

In [27]:
estimator = LogisticRegression(n_jobs=16)
selector = RFE(estimator, n_features_to_select=5, step=0.2)
selector.fit(X, Y)
selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False,  True,
       False, False, False, False, False, False,  True, False,  True,
       False,  True, False, False, False, False])

In [28]:
selected_features = X.columns[selector.support_] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['Neg_ammortization', 'lump_sum_payment', 'credit_type',
       'co-applicant_credit_type', 'submission_of_application'],
      dtype='object')


### Logistic Regression With RFE

In [29]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.6325448668533019

### Recursive Feature Elimination (RFE) for Random Forest

In [30]:
estimator = RandomForestClassifier(n_jobs=16)
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, Y)
selector.support_

array([False, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False,  True, False, False, False])

In [31]:
selected_features = X.columns[selector.support_] 
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['rate_of_interest', 'Interest_rate_spread', 'Upfront_charges',
       'credit_type', 'LTV'],
      dtype='object')


### Random Forest With RFE

In [32]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9773468035325462

### Lasso Regression for  Feature Selection

In [33]:
clf = LassoCV()
clf.fit(X, Y)
clf.coef_

array([-1.04068289e-03,  0.00000000e+00,  2.02212252e-02,  1.28218298e-02,
       -6.84943926e-03, -1.31266053e-02, -1.41520539e-02,  1.26125866e-02,
       -8.63711134e-03, -4.90824613e-02,  1.99606905e-02,  5.00763849e-04,
       -2.82477567e-02, -8.74756060e-03, -5.80594345e-06, -7.39015648e-02,
       -1.37805741e-02, -8.39940100e-02, -1.00759938e-02, -8.51224233e-03,
       -1.92690237e-02,  4.56912338e-05,  1.73005310e-02, -3.40582130e-02,
        5.16923690e-02,  1.16248600e-03,  8.10028683e-02,  1.38019690e-02,
        6.87152966e-02,  1.14180895e-02,  8.75516858e-03, -4.78062998e-04,
        2.29424544e-02])

In [34]:
selected_features = X.columns[clf.coef_ != 0]
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['ID', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type',
       'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'loan_amount', 'rate_of_interest',
       'Interest_rate_spread', 'Upfront_charges', 'term', 'Neg_ammortization',
       'interest_only', 'lump_sum_payment', 'property_value',
       'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
       'income', 'credit_type', 'Credit_Score', 'co-applicant_credit_type',
       'age', 'submission_of_application', 'LTV', 'Region', 'Security_Type',
       'dtir1'],
      dtype='object')


### Logistic Regression With Lasso Feature Selection

In [35]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.6643945848717221

### Random Forest With Lasso Feature Selection

In [36]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.9742767882500913

### Ridge Regression For Feature Selection ###

In [37]:
clf = RidgeCV()
clf.fit(X, Y)
clf.coef_

array([-0.00112659,  0.        ,  0.02028799,  0.0128845 , -0.00692476,
       -0.01334941, -0.01426027,  0.01268939, -0.00873509, -0.04932564,
        0.02033588,  0.00077099, -0.02861792, -0.00884032, -0.00019734,
       -0.07390715, -0.01390749, -0.08407266, -0.0103361 , -0.00303921,
       -0.01936973,  0.00303921,  0.01737062, -0.03422534,  0.0517613 ,
        0.00124363,  0.0810806 ,  0.01393533,  0.06885138,  0.0114624 ,
        0.00878821, -0.00303921,  0.02298394])

In [38]:
selected_features = X.columns[clf.coef_ > 0.01]
print("\nSelected Features:") 
print(selected_features)


Selected Features:
Index(['loan_limit', 'Gender', 'Credit_Worthiness', 'loan_amount',
       'total_units', 'credit_type', 'co-applicant_credit_type', 'age',
       'submission_of_application', 'LTV', 'dtir1'],
      dtype='object')


### Logistic Regression With Ridge Regression Feature Selection 

In [39]:
clf = LogisticRegression(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.6118526933636278

### Random Forest With Ridge Regression Feature Selection 

In [40]:
clf = RandomForestClassifier(n_jobs=16)
clf.fit(X_train[selected_features],y_train)
score = clf.score(X_test[selected_features],y_test)
score

0.930809699625377