# Feature Selection

In [1]:
# Import the libraries needed
import os

import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from shap import LinearExplainer, summary_plot

In [2]:
# Read the two datasets from the pickle files
dir_path = os.getcwd()

X_train = pd.read_pickle(dir_path + "\\data\\X_train.pkl")
X_test = pd.read_pickle(dir_path + "\\data\\X_test.pkl") 

X_train_norm = pd.read_pickle(dir_path + "\\data\\X_train_norm.pkl")
X_test_norm = pd.read_pickle(dir_path + "\\data\\X_test_norm.pkl")

X_train_log = pd.read_pickle(dir_path + "\\data\\X_train_log.pkl")
X_test_log = pd.read_pickle(dir_path + "\\data\\X_test_log.pkl")

y_train = pd.read_pickle(dir_path + "\\data\\y_train.pkl")
y_test = pd.read_pickle(dir_path + "\\data\\y_test.pkl")

In [3]:
# Get columns names
column_names = list(X_train.columns)

In [4]:
# Check the data was imported correctly
print("X train norm:\n", X_train_norm.head(2))
print("X train log:\n", X_train_log.head(2))
print("y train:\n", y_train.head(2))

X train norm:
        LIMIT_BAL  SEX_MALE  AGE  PAY_9  PAY_8  PAY_7  PAY_6  PAY_5  PAY_4  \
21177   0.292929         1   31      0      0      0      0      0      0   
23942   0.010101         1   24      0      0      0      0      0      0   

       BILL_AMT_9  ...  EDUCATION_1  EDUCATION_2  EDUCATION_3  EDUCATION_4  \
21177    0.218131  ...            0            0            1            0   
23942    0.160438  ...            0            1            0            0   

       EDUCATION_5  EDUCATION_6  MARRIAGE_0  MARRIAGE_1  MARRIAGE_2  \
21177            0            0           0           0           1   
23942            0            0           0           0           1   

       MARRIAGE_3  
21177           0  
23942           0  

[2 rows x 32 columns]
X train log:
        LIMIT_BAL  SEX_MALE  AGE  PAY_9  PAY_8  PAY_7  PAY_6  PAY_5  PAY_4  \
21177   0.470005         1   31      0      0      0      0      0      0   
23942   0.039223         1   24      0      0      0 

In [5]:
# Split training into train-validation

### Test the performances of a Logistic Regression model with normalized and natural log transformed data, to choose one of the two to be used with the other models

In [6]:
# Start testing models, starting with a simple logistic regression
log_reg_norm = LogisticRegression(random_state=42, max_iter=10000)
log_reg_log = LogisticRegression(random_state=42, max_iter=10000)

log_reg_norm.fit(X_train_norm, y_train)
log_reg_log.fit(X_train_log, y_train)

print("Accuracy with normalized transformed data: ", log_reg_norm.score(X_test_norm, y_test))
print("Accuracy with natural log transformed data: ", log_reg_log.score(X_test_log, y_test))

y_pred_norm = log_reg_norm.predict(X_test_norm)
y_pred_log = log_reg_log.predict(X_test_log)
print("sklearn f1_score on normalized data: ", f1_score(y_test, y_pred_norm))
print("sklearn f1_score on natural log transformed data: ", f1_score(y_test, y_pred_log))

Accuracy with normalized transformed data:  0.8104
Accuracy with natural log transformed data:  0.8112
sklearn f1_score on normalized data:  0.3518687329079307
sklearn f1_score on natural log transformed data:  0.3557779799818016


From the F1-score and the accuracy it is clear the dataset is very unbalanced. 

We'll use SMOTENC to add samples from the least numerous classes and try again with a naive logistic regression.
We'll use SMOTENC and not SMOTE as we have some categorical features encoded as numerical bianry variables, and we don't want to denaturalize them.

Try to oversample the dataset to reach a balanced dataset

In [7]:
# Apply SMOTENC to try to balance the dataset
# We also tried to use RandomOverSampler, but SMOTENC seem to give better results, and we prefer it also because it generates new samples and doesn't just resample.
value_counts = y_train.value_counts()
tot_values = len(y_train)
cat_columns = [1, 3, 4, 5, 6, 7, 8, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]

print("Number of samples for each class: \n" + str(value_counts))
print("Percentage of class 1 on the total: " + str(round(value_counts[1]/tot_values*100)) + "%")

smote = SMOTENC(cat_columns, k_neighbors=5, sampling_strategy="auto")
#smote= RandomOverSampler(sampling_strategy='auto')
X_train_norm_aug, y_train_norm_aug = smote.fit_resample(X_train_norm, y_train)
X_train_log_aug, y_train_log_aug = smote.fit_resample(X_train_log, y_train)
X_train_aug, y_train_aug = smote.fit_resample(X_train, y_train)

value_counts = y_train_norm_aug.value_counts()
print("Number of samples for each class: \n" + str(value_counts))


# Shuffle the dataset
shuffle_idx = np.random.permutation(len(X_train_norm_aug)) 
X_train_norm_aug = pd.DataFrame(np.array(X_train_norm_aug, dtype=float)[shuffle_idx], columns=column_names) # Normalized
y_train_norm_aug = pd.Series(np.array(y_train_norm_aug, dtype=int)[shuffle_idx])

X_train_log_aug = pd.DataFrame(np.array(X_train_log_aug, dtype=float)[shuffle_idx], columns=column_names) # log transformed
y_train_log_aug = pd.Series(np.array(y_train_log_aug, dtype=int)[shuffle_idx])

X_train_aug = pd.DataFrame(np.array(X_train_aug, dtype=float)[shuffle_idx], columns=column_names) # simple
y_train_aug = pd.Series(np.array(y_train_aug, dtype=int)[shuffle_idx])

Number of samples for each class: 
0    17491
1     5009
Name: default payment next month, dtype: int64
Percentage of class 1 on the total: 22%
Number of samples for each class: 
0    17491
1    17491
Name: default payment next month, dtype: int64


Try to perform undersampling, despite we have just 5009 samples in the least numerous class. Probably it won't produce good results.

In [8]:
# Apply RandomUnderSampler to undersample the dataset
""" We used RandomUnderSampler and not other methods like Nearmiss as we have categorical features encoded as binary numerical features, 
and as it creates new samples based on distances of the given data, it can generate unwanted data for these categorical features.
RandomUnderSampler instead is safe from these problems as it just resamples data from the given data, in our case without replacement, 
trying to keep the data as similar to the original as possible. """
value_counts = y_train.value_counts()
tot_values = len(y_train)

print("Number of samples for each class: \n" + str(value_counts))
print("Percentage of class 1 on the total: " + str(round(value_counts[1]/tot_values*100)) + "%")

rand_unders_sampl = RandomUnderSampler(sampling_strategy="auto")
X_train_norm_und, y_train_norm_und = rand_unders_sampl.fit_resample(X_train_norm, y_train)
X_train_log_und, y_train_log_und = rand_unders_sampl.fit_resample(X_train_log, y_train)
X_train_und, y_train_und = rand_unders_sampl.fit_resample(X_train, y_train)

value_counts = y_train_norm_und.value_counts()
print("Number of samples for each class: \n" + str(value_counts))


# Shuffle the dataset
shuffle_idx = np.random.permutation(len(X_train_norm_und)) 
X_train_norm_und = pd.DataFrame(np.array(X_train_norm_und, dtype=float)[shuffle_idx], columns=column_names) # Normalized
y_train_norm_und = pd.Series(np.array(y_train_norm_und, dtype=int)[shuffle_idx])

X_train_log_und = pd.DataFrame(np.array(X_train_log_und, dtype=float)[shuffle_idx], columns=column_names) # log transformed
y_train_log_und = pd.Series(np.array(y_train_log_und, dtype=int)[shuffle_idx])

X_train_und = pd.DataFrame(np.array(X_train_und, dtype=float)[shuffle_idx], columns=column_names) # simple
y_train_und = pd.Series(np.array(y_train_und, dtype=int)[shuffle_idx])

Number of samples for each class: 
0    17491
1     5009
Name: default payment next month, dtype: int64
Percentage of class 1 on the total: 22%
Number of samples for each class: 
0    5009
1    5009
Name: default payment next month, dtype: int64


Try to mix over and under sampling to reach a balanced dataset with not too many artificial samples

In [9]:
# Apply SMOTENC to try to balance the dataset by oversampling
value_counts = y_train.value_counts()
tot_values = len(y_train)

print("Number of samples for each class: \n" + str(value_counts))
print("Percentage of class 1 on the total: " + str(round(value_counts[1]/tot_values*100)) + "%")

smote2 = SMOTENC(cat_columns, k_neighbors=5, sampling_strategy=0.3) # We tried many different percentages: np.linspace(0.0, 1.0, 10)
#smote2= RandomOverSampler(sampling_strategy=0.3)
X_train_norm_bal, y_train_norm_bal = smote2.fit_resample(X_train_norm, y_train)
X_train_log_bal, y_train_log_bal = smote2.fit_resample(X_train_log, y_train)
X_train_bal, y_train_bal = smote2.fit_resample(X_train, y_train)

value_counts = y_train_bal.value_counts()
print("Number of samples for each class: \n" + str(value_counts))

Number of samples for each class: 
0    17491
1     5009
Name: default payment next month, dtype: int64
Percentage of class 1 on the total: 22%
Number of samples for each class: 
0    17491
1     5247
Name: default payment next month, dtype: int64


In [10]:
# Apply RandomUnderSampler to undersample and balance the dataset
value_counts = y_train_bal.value_counts()
tot_values = len(y_train_bal)

print("Number of samples for each class: \n" + str(value_counts))
print("Percentage of class 1 on the total: " + str(round(value_counts[1]/tot_values*100)) + "%")

rand_unders_sampl = RandomUnderSampler(sampling_strategy="auto")
X_train_norm_bal, y_train_norm_bal = rand_unders_sampl.fit_resample(X_train_norm_bal, y_train_norm_bal)
X_train_log_bal, y_train_log_bal = rand_unders_sampl.fit_resample(X_train_log_bal, y_train_log_bal)
X_train_bal, y_train_bal = rand_unders_sampl.fit_resample(X_train_bal, y_train_bal)

value_counts = y_train_norm_bal.value_counts()
print("Number of samples for each class: \n" + str(value_counts))


# Shuffle the dataset
shuffle_idx = np.random.permutation(len(X_train_norm_bal)) 
X_train_norm_bal = pd.DataFrame(np.array(X_train_norm_bal, dtype=float)[shuffle_idx], columns=column_names) # Normalized
y_train_norm_bal = pd.Series(np.array(y_train_norm_bal, dtype=int)[shuffle_idx])

X_train_log_bal = pd.DataFrame(np.array(X_train_log_bal, dtype=float)[shuffle_idx], columns=column_names) # log transformed
y_train_log_bal = pd.Series(np.array(y_train_log_bal, dtype=int)[shuffle_idx])

X_train_bal = pd.DataFrame(np.array(X_train_bal, dtype=float)[shuffle_idx], columns=column_names) # simple
y_train_bal = pd.Series(np.array(y_train_bal, dtype=int)[shuffle_idx])

Number of samples for each class: 
0    17491
1     5247
Name: default payment next month, dtype: int64
Percentage of class 1 on the total: 23%
Number of samples for each class: 
0    5247
1    5247
Name: default payment next month, dtype: int64


### Test the performances of a Logistic Regression model with normalized and natural log transformed data. Both of them in 3 versions: oversampled, undersampled, a mix of the 2. 

In [11]:
# Test a simple logistic regression with the different datasets
# Create the different Logistic Regression models
log_reg_norm_aug = LogisticRegression(random_state=42, max_iter=10000)
log_reg_norm_und = LogisticRegression(random_state=42, max_iter=10000)
log_reg_norm_bal = LogisticRegression(random_state=42, max_iter=10000)
log_reg_log_aug = LogisticRegression(random_state=42, max_iter=10000)
log_reg_log_und = LogisticRegression(random_state=42, max_iter=10000)
log_reg_log_bal = LogisticRegression(random_state=42, max_iter=10000)

# Fit the different Logistic regressions
log_reg_norm_aug.fit(X_train_norm_aug, y_train_norm_aug)
log_reg_norm_und.fit(X_train_norm_und, y_train_norm_und)
log_reg_norm_bal.fit(X_train_norm_bal, y_train_norm_bal)
log_reg_log_aug.fit(X_train_log_aug, y_train_log_aug)
log_reg_log_und.fit(X_train_log_und, y_train_log_und)
log_reg_log_bal.fit(X_train_log_bal, y_train_log_bal)

"""Results using RandomOverSampler instead of SMOTENC:
Oversampled data:
Accuracy with normalized augmented data:  0.6885333333333333
Accuracy with natural log transformed augmented data:  0.6896
sklearn f1_score on normalized augmented data:  0.4729241877256318
sklearn f1_score on natural log transformed augmented data:  0.4733031674208145

Undersampled data:
Accuracy with normalized augmented data:  0.6958666666666666
Accuracy with natural log transformed augmented data:  0.6893333333333334
sklearn f1_score on normalized augmented data:  0.4757527005286141
sklearn f1_score on natural log transformed augmented data:  0.47069513857337575

Balanced data:
Accuracy with normalized augmented data:  0.6986666666666667
Accuracy with natural log transformed augmented data:  0.6924
sklearn f1_score on normalized augmented data:  0.47294776119402987
sklearn f1_score on natural log transformed augmented data:  0.4734078977402419
"""

# Test the Logistic regressions
print("Oversampled data:")
print("Accuracy with normalized augmented data: ", log_reg_norm_aug.score(X_test_norm, y_test))
print("Accuracy with natural log transformed augmented data: ", log_reg_log_aug.score(X_test_log, y_test))
y_pred_norm = log_reg_norm_aug.predict(X_test_norm)
y_pred_log = log_reg_log_aug.predict(X_test_log)
print("sklearn f1_score on normalized augmented data: ", f1_score(y_test, y_pred_norm))
print("sklearn f1_score on natural log transformed augmented data: ", f1_score(y_test, y_pred_log))

print("\nUndersampled data:")
print("Accuracy with normalized augmented data: ", log_reg_norm_und.score(X_test_norm, y_test))
print("Accuracy with natural log transformed augmented data: ", log_reg_log_und.score(X_test_log, y_test))
y_pred_norm = log_reg_norm_und.predict(X_test_norm)
y_pred_log = log_reg_log_und.predict(X_test_log)
print("sklearn f1_score on normalized augmented data: ", f1_score(y_test, y_pred_norm))
print("sklearn f1_score on natural log transformed augmented data: ", f1_score(y_test, y_pred_log))

print("\nBalanced data:")
print("Accuracy with normalized augmented data: ", log_reg_norm_bal.score(X_test_norm, y_test))
print("Accuracy with natural log transformed augmented data: ", log_reg_log_bal.score(X_test_log, y_test))
y_pred_norm_bal = log_reg_norm_bal.predict(X_test_norm)
y_pred_log_bal = log_reg_log_bal.predict(X_test_log)
print("sklearn f1_score on normalized augmented data: ", f1_score(y_test, y_pred_norm_bal))
print("sklearn f1_score on natural log transformed augmented data: ", f1_score(y_test, y_pred_log))

Oversampled data:
Accuracy with normalized augmented data:  0.6905333333333333
Accuracy with natural log transformed augmented data:  0.6918666666666666
sklearn f1_score on normalized augmented data:  0.470212280301301
sklearn f1_score on natural log transformed augmented data:  0.47584486278067584

Undersampled data:
Accuracy with normalized augmented data:  0.696
Accuracy with natural log transformed augmented data:  0.7010666666666666
sklearn f1_score on normalized augmented data:  0.4739270881402861
sklearn f1_score on natural log transformed augmented data:  0.47592332865825154

Balanced data:
Accuracy with normalized augmented data:  0.6898666666666666
Accuracy with natural log transformed augmented data:  0.6890666666666667
sklearn f1_score on normalized augmented data:  0.4706417842512517
sklearn f1_score on natural log transformed augmented data:  0.47592332865825154


### Test the performances of a Support Vector Classifier with rbf as kernel (non-linear) model with normalized and natural log transformed data. Both of them in 3 versions: oversampled, undersampled, a mix of the 2. 

In [12]:
# Test an SVC model having non-linear kernel with the different datasets
# Create the different SVC models
svc_norm_aug = SVC(kernel='rbf', gamma='scale', max_iter=-1)
svc_norm_und = SVC(kernel='rbf', gamma='scale', max_iter=-1)
svc_norm_bal = SVC(kernel='rbf', gamma='scale', max_iter=-1)
svc_log_aug = SVC(kernel='rbf', gamma='scale', max_iter=-1)
svc_log_und = SVC(kernel='rbf', gamma='scale', max_iter=-1)
svc_log_bal = SVC(kernel='rbf', gamma='scale', max_iter=-1)
# svcs = [svc_norm_aug, svc_norm_und, svc_norm_bal, svc_log_aug, svc_log_und, svc_log_bal]

# Fit the different SVCs
svc_norm_aug.fit(X_train_norm_aug, y_train_norm_aug)
svc_norm_und.fit(X_train_norm_und, y_train_norm_und)
svc_norm_bal.fit(X_train_norm_bal, y_train_norm_bal)
svc_log_aug.fit(X_train_log_aug, y_train_log_aug)
svc_log_und.fit(X_train_log_und, y_train_log_und)
svc_log_bal.fit(X_train_log_bal, y_train_log_bal)

"""Results using RandomOverSampler instead of SMOTENC:
Oversampled data:
Accuracy with normalized augmented data:  0.7794666666666666
Accuracy with natural log transformed augmented data:  0.7789333333333334
sklearn f1_score on normalized augmented data:  0.5044937088076693
sklearn f1_score on natural log transformed augmented data:  0.5044829647340108

Undersampled data:
Accuracy with normalized augmented data:  0.7869333333333334
Accuracy with natural log transformed augmented data:  0.7794666666666666
sklearn f1_score on normalized augmented data:  0.505569306930693
sklearn f1_score on natural log transformed augmented data:  0.5024067388688328

Balanced data:
Accuracy with normalized augmented data:  0.7809333333333334
Accuracy with natural log transformed augmented data:  0.7808
sklearn f1_score on normalized augmented data:  0.5043740573152338
sklearn f1_score on natural log transformed augmented data:  0.5051173991571343
"""

# Test the SVCs
print("Oversampled data:")
print("Accuracy with normalized augmented data: ", svc_norm_aug.score(X_test_norm, y_test))
print("Accuracy with natural log transformed augmented data: ", svc_log_aug.score(X_test_log, y_test))
y_pred_norm = svc_norm_aug.predict(X_test_norm)
y_pred_log = svc_log_aug.predict(X_test_log)
print("sklearn f1_score on normalized augmented data: ", f1_score(y_test, y_pred_norm))
print("sklearn f1_score on natural log transformed augmented data: ", f1_score(y_test, y_pred_log))

print("\nUndersampled data:")
print("Accuracy with normalized augmented data: ", svc_norm_und.score(X_test_norm, y_test))
print("Accuracy with natural log transformed augmented data: ", svc_log_und.score(X_test_log, y_test))
y_pred_norm = svc_norm_und.predict(X_test_norm)
y_pred_log = svc_log_und.predict(X_test_log)
print("sklearn f1_score on normalized augmented data: ", f1_score(y_test, y_pred_norm))
print("sklearn f1_score on natural log transformed augmented data: ", f1_score(y_test, y_pred_log))

print("\nBalanced data:")
print("Accuracy with normalized augmented data: ", svc_norm_bal.score(X_test_norm, y_test))
print("Accuracy with natural log transformed augmented data: ", svc_log_bal.score(X_test_log, y_test))
y_pred_norm = svc_norm_bal.predict(X_test_norm)
y_pred_log = svc_log_bal.predict(X_test_log)
print("sklearn f1_score on normalized augmented data: ", f1_score(y_test, y_pred_norm))
print("sklearn f1_score on natural log transformed augmented data: ", f1_score(y_test, y_pred_log))

Oversampled data:
Accuracy with normalized augmented data:  0.7826666666666666
Accuracy with natural log transformed augmented data:  0.778
sklearn f1_score on normalized augmented data:  0.5066585956416464
sklearn f1_score on natural log transformed augmented data:  0.5037257824143069

Undersampled data:
Accuracy with normalized augmented data:  0.7826666666666666
Accuracy with natural log transformed augmented data:  0.7858666666666667
sklearn f1_score on normalized augmented data:  0.5027455765710799
sklearn f1_score on natural log transformed augmented data:  0.5003111387678905

Balanced data:
Accuracy with normalized augmented data:  0.7878666666666667
Accuracy with natural log transformed augmented data:  0.7830666666666667
sklearn f1_score on normalized augmented data:  0.5010975227343994
sklearn f1_score on natural log transformed augmented data:  0.5038121378469046


We tested out many different percenteges for sampling_strategy in the making of the balanced dataset using both over and under sampling, but still using only oversampling performs better than the mix and than using undersampling.

The results from the normalized and the natural log transformed dataset are very similar, but usually the log transformed one performs slightly better, so we'll keep it.

We'll later on consider normalizing the already natural log transformed data for using certain models.

In [13]:
# Save the oversampled natural log transformed dataset
current_path = os.getcwd()
X_train_log_aug.to_pickle(current_path + "/data/X_train_log_aug.pkl")
y_train_log_aug.to_pickle(current_path + "/data/y_train_log_aug.pkl")

# X_train_norm_aug.to_pickle(current_path + "/data/X_train_norm_aug.pkl")
# y_train_norm_aug.to_pickle(current_path + "/data/y_train_norm_aug.pkl")
# X_train_log_aug.to_pickle(current_path + "/data/X_train_log_aug.pkl")
# y_train_log_aug.to_pickle(current_path + "/data/y_train_log_aug.pkl")