# Improving Classification Models using Feature Selection and Sampling Methods

Logistic Regression is evaluated with 3 feature-selection algorithms and 5 sampling methods to compare prediction accuracy before and after each technique is applied. The accuracy obtained with each technique is stored in its own variable and reported alongside the original (baseline) accuracy at the end, as can be seen below.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.stats import pearsonr
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import CondensedNearestNeighbour

In [2]:
import warnings

# Silence every warning from this point on so the notebook output stays
# uncluttered. This is a blanket filter: it also hides warnings raised by the
# libraries used in the cells below.
warnings.filterwarnings(action='ignore')

In [3]:
# Read the raw table from disk
data = pd.read_csv('dataset.csv')

# Column 1 holds the target; the features are every column from index 2 up to
# (but not including) the last one. Column 0 and the final column are unused.
Y = data.iloc[:, 1]
X = data.iloc[:, 2:-1]

In [4]:
# Replace each distinct target value with an integer code, numbered in the
# order the values are returned by Y.unique()
y_labels = Y.unique()
y_dict = {label: code for code, label in enumerate(y_labels)}
Y = Y.map(y_dict)

In [5]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

### Logistic Regression Classification is used for Analysis

In [6]:
# Baseline: logistic regression fitted on the training split using every
# feature column, then scored on the held-out test split
lr_full = LogisticRegression(random_state=42).fit(X_train, Y_train)
Y_pred_full = lr_full.predict(X_test)
acc_full = accuracy_score(y_true=Y_test, y_pred=Y_pred_full)

### Feature Selection using Pearson Correlation

In [7]:
# Score every feature by the absolute Pearson correlation with the target.
# Only the training split is used: ranking features on the whole dataset would
# let the held-out test rows influence which features are selected (data
# leakage) and bias the accuracy later measured on X_test. It also keeps this
# step consistent with the RFE and random-forest selectors, which are fitted
# on X_train / Y_train only.
# X_train is a row subset of X, so the list stays position-aligned with
# X.columns, which the selection step indexes into.
correlations = []
for col in X_train.columns:
    corr, _ = pearsonr(X_train[col], Y_train)
    correlations.append(abs(corr))

In [8]:
# Keep the k features with the largest absolute correlation to the target
k = 5
ranked_positions = sorted(
    range(len(correlations)),
    key=correlations.__getitem__,
    reverse=True,
)
pearson_k_features = X.columns[ranked_positions[:k]]
X_train_pearson = X_train[pearson_k_features]
X_test_pearson = X_test[pearson_k_features]

In [9]:
# Logistic regression restricted to the Pearson-selected columns, scored on
# the same columns of the test split
lr_pearson = LogisticRegression(random_state=42).fit(X_train_pearson, Y_train)
Y_pred_pearson = lr_pearson.predict(X_test_pearson)
acc_pearson = accuracy_score(y_true=Y_test, y_pred=Y_pred_pearson)

### Feature Selection using Recursive Feature Elimination 

In [10]:
# Recursive feature elimination around a logistic regression estimator:
# one feature is removed per round (step=1) until 5 features remain.
# The selector is fitted on the training split only.
estimator = LogisticRegression(random_state=42)
selector = RFE(estimator, n_features_to_select=5, step=1).fit(X_train, Y_train)

In [11]:
# Translate the boolean mask of retained features into column names and
# restrict both splits to those columns
rfe_k_features = X.columns[selector.get_support()]
X_train_rfe, X_test_rfe = X_train[rfe_k_features], X_test[rfe_k_features]

In [12]:
# Logistic regression restricted to the RFE-selected columns, scored on the
# same columns of the test split
lr_rfe = LogisticRegression(random_state=42).fit(X_train_rfe, Y_train)
Y_pred_rfe = lr_rfe.predict(X_test_rfe)
acc_rfe = accuracy_score(y_true=Y_test, y_pred=Y_pred_rfe)

### Feature Selection using Random Forests

In [13]:
from sklearn.feature_selection import SelectFromModel

# Rank features with a 100-tree random forest fitted on the training split and
# keep the ones that pass the 0.1 importance threshold; then project both
# splits onto the retained features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
sfm = SelectFromModel(rf, threshold=0.1).fit(X_train, Y_train)
X_train_rf, X_test_rf = sfm.transform(X_train), sfm.transform(X_test)

In [14]:
# Fit a fresh 100-tree random forest on the features kept by SelectFromModel.
# NOTE(review): this experiment is scored with a RandomForestClassifier, while
# the baseline and the Pearson / RFE experiments use LogisticRegression —
# confirm that comparing it against the logistic-regression baseline is intended.
rf_new = RandomForestClassifier(n_estimators=100, random_state=42)
rf_new.fit(X_train_rf, Y_train)

RandomForestClassifier(random_state=42)

In [15]:
# Score the random forest trained on the selected features against the
# correspondingly reduced test split
Y_pred_rf = rf_new.predict(X_test_rf)
acc_rf = accuracy_score(y_true=Y_test, y_pred=Y_pred_rf)

### Random Undersampling

In [16]:
# Resample the training split only; the test split is left untouched
rus = RandomUnderSampler(random_state=42)
X_train_undersampled, Y_train_undersampled = rus.fit_resample(X=X_train, y=Y_train)

In [17]:
# Logistic regression trained on the randomly under-sampled training data
lr_undersampling = LogisticRegression(random_state=42)
lr_undersampling.fit(X_train_undersampled, Y_train_undersampled)

LogisticRegression(random_state=42)

In [18]:
# Accuracy of the model trained on under-sampled data, measured on the
# original (un-resampled) test split
Y_pred_undersampled = lr_undersampling.predict(X_test)
acc_undersampling = accuracy_score(y_true=Y_test, y_pred=Y_pred_undersampled)

# NOTE(review): despite the "before" naming, these values come from the same
# resampled-data model and the same test split as the ones above, so the two
# accuracies are always equal.
Y_pred_beforeus = lr_undersampling.predict(X_test)
accuracy_before_under = accuracy_score(y_true=Y_test, y_pred=Y_pred_beforeus)

### Random Oversampling

In [19]:
# Resample the training split only; the test split is left untouched
ros = RandomOverSampler(random_state=42)
X_train_oversampled, Y_train_oversampled = ros.fit_resample(X=X_train, y=Y_train)

In [20]:
# Logistic regression trained on the randomly over-sampled training data
lr_oversampling = LogisticRegression(random_state=42)
lr_oversampling.fit(X_train_oversampled, Y_train_oversampled)

LogisticRegression(random_state=42)

In [21]:
# Accuracy of the model trained on over-sampled data, measured on the
# original (un-resampled) test split
Y_pred_oversampled = lr_oversampling.predict(X_test)
acc_oversampling = accuracy_score(y_true=Y_test, y_pred=Y_pred_oversampled)

# NOTE(review): despite the "before" naming, these values come from the same
# resampled-data model and the same test split as the ones above, so the two
# accuracies are always equal.
Y_pred_beforeos = lr_oversampling.predict(X_test)
accuracy_before_over = accuracy_score(y_true=Y_test, y_pred=Y_pred_beforeos)

### SMOTE Sampling

In [22]:
# Resample the training split only; the test split is left untouched
smote = SMOTE(random_state=42)
X_train_smotesampled, Y_train_smotesampled = smote.fit_resample(X=X_train, y=Y_train)

In [23]:
# Logistic regression trained on the SMOTE-resampled training data
lr_smotesampling = LogisticRegression(random_state=42)
lr_smotesampling.fit(X_train_smotesampled, Y_train_smotesampled)

LogisticRegression(random_state=42)

In [24]:
# Accuracy of the model trained on SMOTE-resampled data, measured on the
# original (un-resampled) test split
Y_pred_smotesampled = lr_smotesampling.predict(X_test)
acc_smotesampling = accuracy_score(y_true=Y_test, y_pred=Y_pred_smotesampled)

# NOTE(review): despite the "before" naming, these values come from the same
# resampled-data model and the same test split as the ones above, so the two
# accuracies are always equal.
Y_pred_beforesmote = lr_smotesampling.predict(X_test)
accuracy_before_smote = accuracy_score(y_true=Y_test, y_pred=Y_pred_beforesmote)

### ADASYN Sampling

In [25]:
# Resample the training split only; the test split is left untouched
adasyn = ADASYN(random_state=42)
X_train_adasynsampled, Y_train_adasynsampled = adasyn.fit_resample(X=X_train, y=Y_train)

In [26]:
# Logistic regression trained on the ADASYN-resampled training data
lr_adasynsampling = LogisticRegression(random_state=42)
lr_adasynsampling.fit(X_train_adasynsampled, Y_train_adasynsampled)

LogisticRegression(random_state=42)

In [27]:
# Accuracy of the model trained on ADASYN-resampled data, measured on the
# original (un-resampled) test split
Y_pred_adasynsampled = lr_adasynsampling.predict(X_test)
acc_adasynsampling = accuracy_score(y_true=Y_test, y_pred=Y_pred_adasynsampled)

# NOTE(review): despite the "before" naming, these values come from the same
# resampled-data model and the same test split as the ones above, so the two
# accuracies are always equal.
Y_pred_beforeadasyn = lr_adasynsampling.predict(X_test)
accuracy_before_adasyn = accuracy_score(y_true=Y_test, y_pred=Y_pred_beforeadasyn)

### A-SUWO Sampling (implemented here as CondensedNearestNeighbour + ADASYN)

In [29]:
# Two-stage resampling of the training split (the test split is untouched):
#   1. CondensedNearestNeighbour under-samples the data using a 1-NN rule.
#   2. ADASYN over-samples the condensed result.
# The CNN step is seeded like every other sampler in this notebook; without
# random_state its selection varies between runs, making the reported
# accuracy non-reproducible.
# NOTE(review): this CNN + ADASYN pipeline is what the notebook labels
# "A-SUWO"; confirm it is an acceptable stand-in for that method.
cnn = CondensedNearestNeighbour(n_neighbors=1, random_state=42)
X_train_asuwosampled, Y_train_asuwosampled = cnn.fit_resample(X_train, Y_train)
adasyn = ADASYN(random_state=42)
X_train_asuwosampled, Y_train_asuwosampled = adasyn.fit_resample(X_train_asuwosampled, Y_train_asuwosampled)

In [30]:
# Logistic regression trained on the CNN + ADASYN resampled training data
lr_asuwosampling = LogisticRegression(random_state=42)
lr_asuwosampling.fit(X_train_asuwosampled, Y_train_asuwosampled)

LogisticRegression(random_state=42)

In [31]:
# Accuracy of the model trained on the CNN + ADASYN resampled data, measured
# on the original (un-resampled) test split
Y_pred_asuwosampled = lr_asuwosampling.predict(X_test)
acc_asuwosampling = accuracy_score(y_true=Y_test, y_pred=Y_pred_asuwosampled)

# NOTE(review): despite the "before" naming, these values come from the same
# resampled-data model and the same test split as the ones above, so the two
# accuracies are always equal.
Y_pred_beforeasuwo = lr_asuwosampling.predict(X_test)
accuracy_before_asuwo = accuracy_score(y_true=Y_test, y_pred=Y_pred_beforeasuwo)

### Classification Accuracy Metrics Before/After Feature Selection & Sampling Methods

In [33]:
# Summary report: baseline first, then the feature-selection experiments,
# then the sampling experiments. Accuracies are shown as percentages with
# two decimals; a blank line separates the three groups.
print(f"Original Accuracy of Logistic Regression Model: {acc_full * 100:.2f}%")
print()
print(f"Accuracy after Pearson Feature Selection: {acc_pearson * 100:.2f}%")
print(f"Accuracy after RFE Feature Selection: {acc_rfe * 100:.2f}%")
print(f"Accuracy after Random Forest Feature Selection: {acc_rf * 100:.2f}%")
print()
print(f"Accuracy after Random Undersampling: {acc_undersampling * 100:.2f}%")
print(f"Accuracy after Random Oversampling: {acc_oversampling * 100:.2f}%")
print(f"Accuracy after SMOTE Sampling: {acc_smotesampling * 100:.2f}%")
print(f"Accuracy after ADASYN Sampling: {acc_adasynsampling * 100:.2f}%")
print(f"Accuracy after A-SUWO Sampling: {acc_asuwosampling * 100:.2f}%")

Original Accuracy of Logistic Regression Model: 96.49%

Accuracy after Pearson Feature Selection: 97.37%
Accuracy after RFE Feature Selection: 97.37%
Accuracy after Random Forest Feature Selection: 95.61%

Accuracy after Random Undersampling: 96.49%
Accuracy after Random Oversampling: 96.49%
Accuracy after SMOTE Sampling: 97.37%
Accuracy after ADASYN Sampling: 97.37%
Accuracy after A-SUWO Sampling: 93.86%
