In [1]:
from sklearn.feature_selection import mutual_info_classif           # Information Gain for classification task
from sklearn.feature_selection import chi2                          # chi-square feature selection method
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE                           # Recurssive feature elimination algorithm
from sklearn.feature_selection import SequentialFeatureSelector     # Forward feature selectioin method
from sklearn.linear_model import LassoCV , RidgeCV                  # lasso and ridge cross validation algorithms for feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel               # to select features according to prefit LassoCV and RidgeCV
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np
from sklearn.preprocessing import StandardScaler


### Dataset Loading ###

In [2]:
# Location of the input CSV and the name of the target column.
PATH = './data/ld2/Default_Fin.csv'
Target_Variable = 'Defaulted?'

# Read the whole file into a DataFrame.
frame = pd.read_csv(filepath_or_buffer=PATH)

In [3]:
# Show the first five rows as a quick sanity check of the load.
frame.head(n=5)

Unnamed: 0,Index,Employed,Bank Balance,Annual Salary,Defaulted?
0,1,1,8754.36,532339.56,0
1,2,0,9806.16,145273.56,0
2,3,1,12882.6,381205.68,0
3,4,1,6351.0,428453.88,0
4,5,1,9427.92,461562.0,0


In [4]:
# Count missing values per column.
frame.isna().sum()

Index            0
Employed         0
Bank Balance     0
Annual Salary    0
Defaulted?       0
dtype: int64

In [5]:
# Linearly interpolate every column that still contains missing values.
null_counts = frame.isna().sum()
for column in null_counts[null_counts > 0].index:
    frame[column] = frame[column].interpolate(method='linear')

In [6]:
# Re-check the per-column missing-value counts after interpolation.
frame.isna().sum()

Index            0
Employed         0
Bank Balance     0
Annual Salary    0
Defaulted?       0
dtype: int64

In [7]:

# NOTE(review): disabled label-encoding step. None of the columns named below
# appear in this frame's column listing (Index, Employed, Bank Balance,
# Annual Salary, Defaulted?), so this is presumably left over from a different
# dataset — confirm and remove if no longer needed.
# cat_vars = ['loan_limit', 'Gender', 'approv_in_adv', 'loan_type','loan_purpose', 'Credit_Worthiness', 'open_credit','business_or_commercial', 'term', 'Neg_ammortization','interest_only', 'construction_type', 'occupancy_type', 'Secured_by',  'credit_type', 'co-applicant_credit_type','submission_of_application', 'Region', 'Security_Type', 'dtir1','lump_sum_payment','total_units','age']

# for i in cat_vars:
#     le = LabelEncoder()
#     frame[i] = le.fit_transform(frame[i])

In [8]:
# Show the first ten values of each column, one Series per column.
[frame[column].head(10) for column in frame.columns]

[0     1
 1     2
 2     3
 3     4
 4     5
 5     6
 6     7
 7     8
 8     9
 9    10
 Name: Index, dtype: int64,
 0    1
 1    0
 2    1
 3    1
 4    1
 5    0
 6    1
 7    0
 8    1
 9    1
 Name: Employed, dtype: int64,
 0     8754.36
 1     9806.16
 2    12882.60
 3     6351.00
 4     9427.92
 5    11035.08
 6     9906.12
 7     9704.04
 8    13932.72
 9        0.00
 Name: Bank Balance, dtype: float64,
 0    532339.56
 1    145273.56
 2    381205.68
 3    428453.88
 4    461562.00
 5     89898.72
 6    298862.76
 7    211205.40
 8    449622.36
 9    351303.24
 Name: Annual Salary, dtype: float64,
 0    0
 1    0
 2    0
 3    0
 4    0
 5    0
 6    0
 7    0
 8    0
 9    0
 Name: Defaulted?, dtype: int64]

In [9]:
# Discard any row that still has a missing value, then display the result.
frame = frame.dropna(axis=0, how='any')
frame

Unnamed: 0,Index,Employed,Bank Balance,Annual Salary,Defaulted?
0,1,1,8754.36,532339.56,0
1,2,0,9806.16,145273.56,0
2,3,1,12882.60,381205.68,0
3,4,1,6351.00,428453.88,0
4,5,1,9427.92,461562.00,0
...,...,...,...,...,...
9995,9996,1,8538.72,635908.56,0
9996,9997,1,9095.52,235928.64,0
9997,9998,1,10144.92,703633.92,0
9998,9999,1,18828.12,440029.32,0


In [10]:

# Features are every column except the target; the target column is the label.
# NOTE(review): 'Index' stays in X as a feature; in the preview it looks like a
# plain row counter — confirm that keeping it is intended.
X = frame.drop(columns=[Target_Variable])
Y = frame[Target_Variable]


In [11]:
# Balance the classes by oversampling the minority class with replacement,
# then standardize every feature.
# NOTE(review): both the resampling and the scaler fit happen before the
# train/test split in the following cell, so copies of the same minority row
# can end up in both splits — confirm this is intended.

# Find the label with the fewest rows.
unique, counts = np.unique(Y, return_counts=True)
minority_class = unique[counts.argmin()]

# Partition features and labels into minority / majority groups.
is_minority = Y == minority_class
minority_X, minority_y = X[is_minority], Y[is_minority]
majority_X, majority_y = X[~is_minority], Y[~is_minority]

# Draw (with replacement) as many minority rows as there are majority rows.
n_samples = len(majority_X)
upsampled_minority = resample(
    minority_X,
    replace=True,
    n_samples=n_samples,
    random_state=42,
)
upsampled_minority_y = np.full(len(upsampled_minority), minority_class)

# Stack the oversampled minority block on top of the majority block.
X_combined = np.concatenate([upsampled_minority, majority_X])
y_combined = np.concatenate([upsampled_minority_y, majority_y])

# Standardize the combined feature matrix and restore the column labels.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)
X = pd.DataFrame(X_scaled, columns=X.columns)
Y = y_combined

# Report the resulting shapes, one per line.
print(X.shape, Y.shape, sep="\n")

(19334, 4)
(19334,)


In [12]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

### Variance Threshold Feature Selection ###

In [13]:
# Variance filter: keep only the features whose variance exceeds 0.5.
# NOTE(review): X was standardized with StandardScaler in an earlier cell, so
# this threshold may not discriminate between features — confirm.
selector = VarianceThreshold(threshold=0.5).fit(X)
print(selector.get_support())

[ True  True  True  True]


In [14]:
# Translate the selector's boolean support mask into column names and show them.
support_mask = selector.get_support()
selected_features = X.columns[support_mask]
print("\nSelected Features:", selected_features, sep="\n")


Selected Features:
Index(['Index', 'Employed', 'Bank Balance', 'Annual Salary'], dtype='object')


In [15]:
# Fit logistic regression on the selected columns and report hold-out accuracy.
train_subset = X_train[selected_features]
test_subset = X_test[selected_features]
clf = LogisticRegression().fit(train_subset, y_train)
score = clf.score(test_subset, y_test)
score

0.8810531264692054

In [16]:
# Fit a random forest on the selected columns and report hold-out accuracy.
# The seed is pinned (42, the same value used for the resampling and the
# train/test split) because forest training is randomized and the score would
# otherwise differ on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train[selected_features], y_train)
score = clf.score(X_test[selected_features], y_test)
score

0.9918508070835292

### Mutual Info ###

In [17]:
# Estimate the mutual information between each feature and the class label.
# The estimator is randomized, so the seed is pinned to make the scores (and
# the threshold-based selection that follows) reproducible across runs.
# NOTE(review): the scores are computed on the full X, Y, which includes the
# rows held out as X_test — confirm this is acceptable for the comparison.
fs = mutual_info_classif(X, Y, random_state=42)

In [18]:
# Keep the features whose mutual-information score exceeds 0.05 and show them.
mi_mask = fs > 0.05
selected_features = X.columns[mi_mask]
print("\nSelected Features:", selected_features, sep="\n")


Selected Features:
Index(['Index', 'Bank Balance', 'Annual Salary'], dtype='object')


In [19]:
# Fit logistic regression on the selected columns and report hold-out accuracy.
train_subset = X_train[selected_features]
test_subset = X_test[selected_features]
clf = LogisticRegression(n_jobs=16).fit(train_subset, y_train)
score = clf.score(test_subset, y_test)
score

0.8827769942015358

In [20]:
# Fit a random forest on the selected columns and report hold-out accuracy.
# The seed is pinned (42, the same value used for the resampling and the
# train/test split) so the reported score is reproducible.
clf = RandomForestClassifier(n_jobs=16, random_state=42)
clf.fit(X_train[selected_features], y_train)
score = clf.score(X_test[selected_features], y_test)
score

0.9926343833254976

### Forward Selection Method ###

In [22]:
# Forward sequential selection down to three features, scored with
# logistic regression; the final expression shows the boolean support mask.
clf = LogisticRegression(n_jobs=16)
sfs = SequentialFeatureSelector(
    estimator=clf,
    n_features_to_select=3,
    direction='forward',
).fit(X, Y)
sfs.get_support()

array([False,  True,  True,  True])

In [23]:
# Map the sequential selector's support mask to column names and show them.
support_mask = sfs.get_support()
selected_features = X.columns[support_mask]
print("\nSelected Features:", selected_features, sep="\n")


Selected Features:
Index(['Employed', 'Bank Balance', 'Annual Salary'], dtype='object')


In [24]:
# Fit logistic regression on the selected columns and report hold-out accuracy.
train_subset = X_train[selected_features]
test_subset = X_test[selected_features]
clf = LogisticRegression(n_jobs=16).fit(train_subset, y_train)
score = clf.score(test_subset, y_test)
score

0.8810531264692054

In [27]:
# Sequential feature selection down to three features, scored with a random
# forest. The forest's seed is pinned (42, as elsewhere in this notebook) so
# the cross-validated scores, and therefore the chosen features, are
# reproducible across runs.
clf = RandomForestClassifier(n_jobs=16, random_state=42)
sfs = SequentialFeatureSelector(clf, n_features_to_select=3, n_jobs=16)
sfs.fit(X, Y)
sfs.get_support()

array([False,  True,  True,  True])

In [28]:
# Map the sequential selector's support mask to column names and show them.
support_mask = sfs.get_support()
selected_features = X.columns[support_mask]
print("\nSelected Features:", selected_features, sep="\n")


Selected Features:
Index(['Employed', 'Bank Balance', 'Annual Salary'], dtype='object')


In [29]:
# Fit a random forest on the selected columns and report hold-out accuracy.
# The seed is pinned (42, the same value used for the resampling and the
# train/test split) so the reported score is reproducible.
clf = RandomForestClassifier(n_jobs=16, random_state=42)
clf.fit(X_train[selected_features], y_train)
score = clf.score(X_test[selected_features], y_test)
score

0.9902836545995926

### Recursive Feature Elimination ###

In [31]:
# Recursive feature elimination with logistic regression as the ranking model,
# pruning until three features remain; the final expression shows the mask.
estimator = LogisticRegression(n_jobs=16)
selector = RFE(
    estimator=estimator,
    n_features_to_select=3,
    step=0.2,
).fit(X, Y)
selector.support_

array([False,  True,  True,  True])

In [32]:
# Map the RFE support mask to column names and show them.
support_mask = selector.support_
selected_features = X.columns[support_mask]
print("\nSelected Features:", selected_features, sep="\n")


Selected Features:
Index(['Employed', 'Bank Balance', 'Annual Salary'], dtype='object')


In [33]:
# Fit logistic regression on the selected columns and report hold-out accuracy.
train_subset = X_train[selected_features]
test_subset = X_test[selected_features]
clf = LogisticRegression(n_jobs=16).fit(train_subset, y_train)
score = clf.score(test_subset, y_test)
score

0.8810531264692054

In [35]:
# Recursive feature elimination with a random forest as the ranking model,
# removing one feature per step until three remain. The forest's seed is
# pinned (42, as elsewhere in this notebook) so the importance ranking, and
# therefore the surviving features, are reproducible across runs.
estimator = RandomForestClassifier(n_jobs=16, random_state=42)
selector = RFE(estimator, n_features_to_select=3, step=1)
selector = selector.fit(X, Y)
selector.support_

array([ True, False,  True,  True])

In [36]:
# Map the RFE support mask to column names and show them.
support_mask = selector.support_
selected_features = X.columns[support_mask]
print("\nSelected Features:", selected_features, sep="\n")


Selected Features:
Index(['Index', 'Bank Balance', 'Annual Salary'], dtype='object')


In [37]:
# Fit a random forest on the selected columns and report hold-out accuracy.
# The seed is pinned (42, the same value used for the resampling and the
# train/test split) so the reported score is reproducible.
clf = RandomForestClassifier(n_jobs=16, random_state=42)
clf.fit(X_train[selected_features], y_train)
score = clf.score(X_test[selected_features], y_test)
score

0.9932612443190723

### Lasso Regression Feature Selection ###

In [38]:
# Fit a cross-validated lasso on the labels and show the learned coefficients;
# the next cell treats non-zero coefficients as selected features.
clf = LassoCV().fit(X, Y)
clf.coef_

array([-0.        ,  0.02393141,  0.37283512,  0.        ])

In [39]:
# Keep the features whose lasso coefficient was not shrunk to zero.
nonzero_mask = clf.coef_ != 0
selected_features = X.columns[nonzero_mask]
print("\nSelected Features:", selected_features, sep="\n")


Selected Features:
Index(['Employed', 'Bank Balance'], dtype='object')


In [40]:
# Fit logistic regression on the selected columns and report hold-out accuracy.
train_subset = X_train[selected_features]
test_subset = X_test[selected_features]
clf = LogisticRegression(n_jobs=16).fit(train_subset, y_train)
score = clf.score(test_subset, y_test)
score

0.8812098417175991

In [41]:
# Fit a random forest on the selected columns and report hold-out accuracy.
# The seed is pinned (42, the same value used for the resampling and the
# train/test split) so the reported score is reproducible.
clf = RandomForestClassifier(n_jobs=16, random_state=42)
clf.fit(X_train[selected_features], y_train)
score = clf.score(X_test[selected_features], y_test)
score

0.9849553361542078

### Ridge Regression For Feature Selection ###

In [42]:
# Fit a cross-validated ridge regression on the labels and show the learned
# coefficients; the next cell thresholds them to pick features.
clf = RidgeCV().fit(X, Y)
clf.coef_

array([-0.0061876 ,  0.03794714,  0.38752327,  0.00112366])

In [43]:
# Keep the features whose ridge coefficient is large in magnitude.
# The comparison uses the absolute value: a strongly negative coefficient
# marks a feature as just as influential as a strongly positive one, and a
# signed `> 0.01` test would silently drop it.
coef_magnitude = np.abs(clf.coef_)
selected_features = X.columns[coef_magnitude > 0.01]
print("\nSelected Features:")
print(selected_features)


Selected Features:
Index(['Employed', 'Bank Balance'], dtype='object')


In [44]:
# Fit logistic regression on the selected columns and report hold-out accuracy.
train_subset = X_train[selected_features]
test_subset = X_test[selected_features]
clf = LogisticRegression(n_jobs=16).fit(train_subset, y_train)
score = clf.score(test_subset, y_test)
score

0.8812098417175991

In [45]:
# Fit a random forest on the selected columns and report hold-out accuracy.
# The seed is pinned (42, the same value used for the resampling and the
# train/test split) so the reported score is reproducible.
clf = RandomForestClassifier(n_jobs=16, random_state=42)
clf.fit(X_train[selected_features], y_train)
score = clf.score(X_test[selected_features], y_test)
score

0.9851120514026015