# Logistic Regression Modelling

In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

### Read the CSV and Perform Basic Data Cleaning

In [1]:
# Ignore future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Import depedencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import os

In [2]:
# Load libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.feature_selection import SelectFromModel

In [3]:
# File path 
data_file = os.path.join("","data","featureData","feature_dataframe.csv")
model_result = os.path.join("","data","results","LogisticRegression.csv")

In [4]:
# Read data
df = pd.read_csv(data_file)

In [5]:
# Display Sample data
df.head()

Unnamed: 0,age,job,education,contact,duration,campaign,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,...,jun,mar,may,nov,oct,sep,mon,thu,tue,wed
0,56,4,2,0,261,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
1,57,7,5,0,149,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
2,37,7,5,0,226,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
3,40,10,3,0,151,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
4,56,7,5,0,307,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0


In [6]:
# Check data size
df.shape

(41188, 36)

Let's seperate categorical features and numerical features. Scaling will be applied only on numerical features

In [7]:
# Get Catagorial variables
categorical = df.nunique()[df.nunique() < 3].keys().tolist()

# Target 
target = ['y']

# Remove target from the list
categorical.remove(target[0])

# Get numberical features
numerical = [col for col in df.columns
if col not in target+categorical]

In [8]:
from sklearn.preprocessing import StandardScaler

# Get Scaler object
scaler = StandardScaler()

# Fit and transform data
scaled_numerical = scaler.fit_transform(df[numerical])

# Build a DataFrame
scaled_df = pd.DataFrame(scaled_numerical, columns=numerical)

In [9]:
# Drop non-scaled numerical columns
df = df.drop(columns=numerical, axis=1)

# Merge the non-numerical with the scaled numerical data
df_scaled = df.merge(right=scaled_df,how='left', left_index=True,right_index=True)

In [10]:
# Display results
df_scaled.head()

Unnamed: 0,contact,y,marital_married,marital_single,marital_unknown,default_unknown,default_yes,housing_unknown,housing_yes,loan_unknown,...,age,job,education,duration,campaign,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,0,0,1,0,0,0,0,0,0,0,...,1.533034,-1.575318,-1.650047,0.010471,-0.565922,0.648092,0.722722,0.886447,0.71246,0.33168
1,0,0,1,0,0,1,0,0,0,0,...,1.628993,-0.375817,0.007943,-0.421501,-0.565922,0.648092,0.722722,0.886447,0.71246,0.33168
2,0,0,1,0,0,0,0,0,1,0,...,-0.290186,-0.375817,0.007943,-0.12452,-0.565922,0.648092,0.722722,0.886447,0.71246,0.33168
3,0,0,1,0,0,0,0,0,0,0,...,-0.002309,0.823683,-1.097383,-0.413787,-0.565922,0.648092,0.722722,0.886447,0.71246,0.33168
4,0,0,1,0,0,0,0,0,0,0,...,1.533034,-0.375817,0.007943,0.187888,-0.565922,0.648092,0.722722,0.886447,0.71246,0.33168


In [11]:
# Let's seperate dependent and independent variables
X= df_scaled.drop(columns = ['y'])
y = df_scaled['y']

In [12]:
# Get feature list
feature_name = X.columns.tolist()

### Pre-processing model performace

In [13]:
# Shape of independent variable or features
X.shape

(41188, 35)

In [14]:
# Split data to train and test and check size of train data.
# Using 75-/25 split with random state as 420 (hyper parameter)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=420, test_size=0.25)

# Print shape of train data
print(X_train.shape)

(30891, 35)


In [15]:
# Build Model
base_model = LogisticRegression(solver='newton-cg', multi_class='auto')

In [16]:
# Train model
base_model.fit(X_train, y_train)

LogisticRegression(solver='newton-cg')

In [17]:
# Predict 
pred_base = base_model.predict(X_test)

In [18]:
# Get train and test scores
training_score = round(base_model.score(X_train, y_train)*100,3)
test_score = round(accuracy_score(y_test, pred_base)*100,3)

In [19]:
# Evaluate predictions
print(accuracy_score(y_test, pred_base))
print(confusion_matrix(y_test, pred_base))
print(classification_report(y_test, pred_base))

0.9127901330484607
[[8902  245]
 [ 653  497]]
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      9147
           1       0.67      0.43      0.53      1150

    accuracy                           0.91     10297
   macro avg       0.80      0.70      0.74     10297
weighted avg       0.90      0.91      0.90     10297



In [20]:
print(f"Training Data Score: {training_score} %")
print(f"Testing Data Score: {test_score} %")

Training Data Score: 91.072 %
Testing Data Score: 91.279 %


##### Overall Model accuracy looks good  but F1 score for class 1 is very poor. This is due to imbalanced data. 

#### Let' use different methods to handle imbalnced data and test model results

In [21]:
from sklearn.metrics import classification_report
def logisticModel(X_train, y_train, X_test, y_test):
    
    # Build model
    model = LogisticRegression(solver='newton-cg', multi_class='auto')
    
    # Fir model
    model.fit(X_train, y_train)
    
    # Scores and accurancy
    model_train_score = round(model.score(X_train, y_train)*100,3)
    model_test_score = round(model.score(X_test, y_test)*100,3)

    print("\nTraining model score: ",model_train_score)
    print("Testing model score: ",model_test_score)
    
    # Get accurancy
    accuracy = model.score(X_test, y_test)
    print("\nAccuracy", accuracy, "\n")
    
    # Get predictions
    y_pred = model.predict(X_test)

    # Get confusion matrix
    logi_matri = classification_report(y_test,y_pred)
    print(logi_matri)

In [22]:
# Get Model results
def getModelResult(X,y):
    
    # Split data into train-test -- statify y will insure there will be eqal represenation for each class
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)
    
    # Get model results 
    logisticModel(X_train, y_train, X_test, y_test)

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
import shap

def featureSelection(X,y,method):
    # Split data into train-test -- statify y will insure there will be eqal represenation for each class
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)
    
    # Get Random Forest Regressor instance and fit training data
    rf = RandomForestRegressor(n_estimators=200)
    rf.fit(X_train, y_train)
    
    if method == 1:
        # Using RF to select features
        sorted_idx = rf.feature_importances_.argsort()
        
        # Select X and y
        X=X[X.columns[rf.feature_importances_>0.001]]
        
        # Get results witout feature selection
        print("\n-- Model performace after Random Forest feature selection -- ")
        getModelResult(X,y)
    
    elif method==2:
        
        # Get feature importance
        perm_importance = permutation_importance(rf, X_test, y_test)

        # Get Index of feature
        sorted_idx = perm_importance.importances_mean.argsort()

        # Recreate dependent and independent data set using selected features
        X=X[X.columns[perm_importance.importances_mean>0]]

        # Get results witout feature selection
        print("\n-- Model performace after Permutation feature selection -- ")
        getModelResult(X,y)
    
    elif method==3:
        explainer = shap.TreeExplainer(rf)
        shap_values = explainer.shap_values(X_test)
        
        # Get feature names
        vals= np.abs(shap_values).mean(0)
        feature_importance = pd.DataFrame(list(zip(X.columns,vals)),columns=['col_name','feature_importance_vals'])
        feature_importance.sort_values(by=['feature_importance_vals'],ascending=False,inplace=True)
        feature_importance = feature_importance[feature_importance.feature_importance_vals > 0.002 ]
        
        # Recreate dependent and independent data set using selected features
        X= X[feature_importance.col_name]

        # Get results witout feature selection
        print("\n -- Model performace after SHAP feature selection -- ")
        getModelResult(X,y)


#### Method 1: Undersampling

In [24]:
# Class count
count_class_0, count_class_1 = df.y.value_counts()

# Divide by class
df_class_0 = df[df['y'] == 0]
df_class_1 = df[df['y'] == 1]

In [25]:
# Undersample 0-class and concat the DataFrames of both class
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.y.value_counts())

Random under-sampling:
1    4640
0    4640
Name: y, dtype: int64


In [26]:
# Get features and dependent variable data 
X = df_test_under.drop('y',axis='columns')
y = df_test_under['y']

In [27]:
# Get results witout feature selection
print("-- Model performace without feature selection -- ")
getModelResult(X,y)

-- Model performace without feature selection -- 

Training model score:  68.831
Testing model score:  68.157

Accuracy 0.681573275862069 

              precision    recall  f1-score   support

           0       0.64      0.85      0.73       928
           1       0.78      0.51      0.62       928

    accuracy                           0.68      1856
   macro avg       0.71      0.68      0.67      1856
weighted avg       0.71      0.68      0.67      1856



##### Model accuracy is dropped significantly with under sampled data and F1 score for each class has wide gap.

In [28]:
# Get Model performance for undersampled data using feature selection
featureSelection(X,y,1)
featureSelection(X,y,2)
featureSelection(X,y,3)


-- Model performace after Random Forest feature selection -- 

Training model score:  68.831
Testing model score:  68.157

Accuracy 0.681573275862069 

              precision    recall  f1-score   support

           0       0.64      0.85      0.73       928
           1       0.78      0.51      0.62       928

    accuracy                           0.68      1856
   macro avg       0.71      0.68      0.67      1856
weighted avg       0.71      0.68      0.67      1856


-- Model performace after Permutation feature selection -- 

Training model score:  68.952
Testing model score:  68.373

Accuracy 0.6837284482758621 

              precision    recall  f1-score   support

           0       0.64      0.86      0.73       928
           1       0.79      0.50      0.61       928

    accuracy                           0.68      1856
   macro avg       0.71      0.68      0.67      1856
weighted avg       0.71      0.68      0.67      1856


 -- Model performace after SHAP feature 

#### Method 2: Over sampling method

In [29]:
# Oversample 1-class and concat the DataFrames of both classes
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.y.value_counts())

Random over-sampling:
1    36548
0    36548
Name: y, dtype: int64


In [30]:
# Get features and dependent variable data 
X = df_test_over.drop('y',axis='columns')
y = df_test_over['y']

In [31]:
# Get model results for oversampled data
print("-- Model performace without feature selection -- ")
getModelResult(X,y)

-- Model performace without feature selection -- 

Training model score:  68.763
Testing model score:  68.536

Accuracy 0.6853625170998632 

              precision    recall  f1-score   support

           0       0.64      0.83      0.72      7310
           1       0.76      0.54      0.63      7310

    accuracy                           0.69     14620
   macro avg       0.70      0.69      0.68     14620
weighted avg       0.70      0.69      0.68     14620



In [32]:
# Get Model performance for oversampled data using feature selection
featureSelection(X,y,1)
featureSelection(X,y,2)
featureSelection(X,y,3)


-- Model performace after Random Forest feature selection -- 

Training model score:  68.734
Testing model score:  68.509

Accuracy 0.6850889192886457 

              precision    recall  f1-score   support

           0       0.64      0.83      0.72      7310
           1       0.76      0.54      0.63      7310

    accuracy                           0.69     14620
   macro avg       0.70      0.69      0.68     14620
weighted avg       0.70      0.69      0.68     14620


-- Model performace after Permutation feature selection -- 

Training model score:  68.763
Testing model score:  68.536

Accuracy 0.6853625170998632 

              precision    recall  f1-score   support

           0       0.64      0.83      0.72      7310
           1       0.76      0.54      0.63      7310

    accuracy                           0.69     14620
   macro avg       0.70      0.69      0.68     14620
weighted avg       0.70      0.69      0.68     14620


 -- Model performace after SHAP feature

### Method 3: SMOTE

In [33]:
X = df_scaled.drop('y',axis='columns')
y = df_scaled['y']

In [34]:
from imblearn.over_sampling import SMOTE

# Get smote object
smote = SMOTE(sampling_strategy='minority')

# Fit data
X_smote, y_smote = smote.fit_sample(X, y)

# Check SMOTE results 
y_smote.value_counts()

1    36548
0    36548
Name: y, dtype: int64

In [35]:
# Get model results for oversampled data
print("-- Model performace without feature selection -- ")
getModelResult(X_smote, y_smote)

-- Model performace without feature selection -- 

Training model score:  88.706
Testing model score:  88.543

Accuracy 0.8854309165526676 

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      7310
           1       0.87      0.91      0.89      7310

    accuracy                           0.89     14620
   macro avg       0.89      0.89      0.89     14620
weighted avg       0.89      0.89      0.89     14620



In [36]:
# Get Model performance for SMOTE data using feature selection
featureSelection(X_smote, y_smote,1)
featureSelection(X_smote, y_smote,2)
featureSelection(X_smote, y_smote,3)


-- Model performace after Random Forest feature selection -- 

Training model score:  88.556
Testing model score:  88.406

Accuracy 0.88406292749658 

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      7310
           1       0.87      0.90      0.89      7310

    accuracy                           0.88     14620
   macro avg       0.88      0.88      0.88     14620
weighted avg       0.88      0.88      0.88     14620


-- Model performace after Permutation feature selection -- 

Training model score:  88.706
Testing model score:  88.543

Accuracy 0.8854309165526676 

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      7310
           1       0.87      0.91      0.89      7310

    accuracy                           0.89     14620
   macro avg       0.89      0.89      0.89     14620
weighted avg       0.89      0.89      0.89     14620


 -- Model performace after SHAP feature s

In [42]:
feature_test_score = 88.543

#### Model overall accuracy improved and F1score also looks consitant with model accuracy.

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [37]:
# Let's use smote data for grid search
len(X_smote),len(y_smote)

(73096, 73096)

In [38]:
def featureCSVSelection(X,y):
    # Split data into train-test -- statify y will insure there will be equal represenation for each class
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50, stratify=y)
    
    # Get Random Forest Regressor instance and fit training data
    rf= RandomForestRegressor(n_estimators=200)
    rf.fit(X_train, y_train)
    
    explainer = shap.TreeExplainer(rf)
    shap_values = explainer.shap_values(X_test)

    # Get feature names
    vals= np.abs(shap_values).mean(0)
    feature_importance = pd.DataFrame(list(zip(X.columns,vals)),columns=['col_name','feature_importance_vals'])
    feature_importance.sort_values(by=['feature_importance_vals'],ascending=False,inplace=True)
    feature_importance = feature_importance[feature_importance.feature_importance_vals > 0.002 ]

    # Recreate dependent and independent data set using selected features
    X= X[feature_importance.col_name]

    # Get results witout feature selection
    print("\n -- Model performace after SHAP feature selection -- ")
    getCVModelResult(X,y)

In [39]:
# Use grid search to tune the model
from sklearn.model_selection import GridSearchCV

def getCVModelResult(X,y):
    
    # Split data into train-test -- statify y will insure there will be eqal represenation for each class
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)
    

    # Create regularization penalty space
    penalty = ['l2']

    # Create regularization hyperparameter space
    C = np.logspace(0, 4, 10)

    # Create hyperparameter options
    hyperparameters = dict(C=C, penalty=penalty)
    
    # Create a based model
    lr_model = LogisticRegression(solver='newton-cg', multi_class='auto',max_iter = 4000)

    # Instantiate the grid search model
    grid_search = GridSearchCV(lr_model, hyperparameters, cv=5, verbose=0)
    
    
    # Fit model
    grid_search.fit(X_train, y_train)
    
    # Scores and accurancy
    model_train_score = round(grid_search.score(X_train, y_train)*100,3)
    model_test_score = round(grid_search.score(X_test, y_test)*100,3)

    print("\nTraining model score: ",model_train_score)
    print("Testing model score: ",model_test_score)
    
    # Get accurancy
    accuracy = grid_search.score(X_test, y_test)
    print("\nAccuracy", accuracy, "\n")
    
    # Get predictions
    y_pred = grid_search.predict(X_test)

    # Get confusion matrix
    csv_matri = classification_report(y_test,y_pred)
    print(csv_matri)
    
    # Print grid param and best score
    print(grid_search.best_params_)
    print(grid_search.best_score_)

In [40]:
# Get Model performance for SMOTE data using feature selection
featureCSVSelection(X_smote, y_smote)


 -- Model performace after SHAP feature selection -- 

Training model score:  88.358
Testing model score:  88.311

Accuracy 0.8831053351573187 

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      7310
           1       0.87      0.90      0.89      7310

    accuracy                           0.88     14620
   macro avg       0.88      0.88      0.88     14620
weighted avg       0.88      0.88      0.88     14620

{'C': 1.0, 'penalty': 'l2'}
0.8834394090217096


In [41]:
test_tuned_score = 88.311

##### Train and test results of tuned model did not improve accuracy of the model. 

### Model Evaluation

In [97]:
# Save results in csv file

In [43]:
# Save results in csv file
evaluations = {'': ['Base Model', 'Select Features Model', 'Tuned Model'],
               'Accuracy': [f"{test_score}%", f"{feature_test_score}%", f"{test_tuned_score}%"]}

evaluations_df = pd.DataFrame(evaluations)
evaluations_df = evaluations_df.set_index('')

evaluations_df.to_csv(model_result)
evaluations_df

Unnamed: 0,Accuracy
,
Base Model,91.279%
Select Features Model,88.543%
Tuned Model,88.311%


### Summary: 
- Base model accuracy of  91.279% was reduced to 88.543% with feature selection and slightly decreased to 88.311% with tuned model. Base model is biased towards one class while model with feature selection and tuned model are consistent with F1 score.