## KNeighborsClassifier Model

In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

### Read the CSV and Perform Basic Data Cleaning

In [2]:
# Import depedencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [3]:
# Load libraries
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler,LabelEncoder

In [66]:
# File path 
data_file = os.path.join("","data","featureData","feature_dataframe.csv")
model_result = os.path.join("","data","results","KNNClassifier.csv")

In [5]:
# Read data
df = pd.read_csv(data_file)

In [6]:
# Display Sample data
df.head()

Unnamed: 0,age,job,education,contact,duration,campaign,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,...,jun,mar,may,nov,oct,sep,mon,thu,tue,wed
0,56,4,2,0,261,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
1,57,7,5,0,149,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
2,37,7,5,0,226,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
3,40,10,3,0,151,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
4,56,7,5,0,307,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0


In [7]:
# Check data size
df.shape

(41188, 36)

- KNN is a distance based algorithm. If data contains features with varied scale, it would be difficult for the model to calculate distance for each and every point. Even if there is some distance calculated in training data, when you are fitting the same model in test data it will give incorrect predictions.

- In order to avoid these kind of scenarios, normalization is used, the data is either min max scaled or z scaled and you get all features in same scale for the analysis.

Let's seperate categorical features and numerical features. Scaling will be applied only on numerical features

In [8]:
df.columns

Index(['age', 'job', 'education', 'contact', 'duration', 'campaign',
       'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m',
       'nr.employed', 'y', 'marital_married', 'marital_single',
       'marital_unknown', 'default_unknown', 'default_yes', 'housing_unknown',
       'housing_yes', 'loan_unknown', 'loan_yes', 'poutcome_nonexistent',
       'poutcome_success', 'aug', 'dec', 'jul', 'jun', 'mar', 'may', 'nov',
       'oct', 'sep', 'mon', 'thu', 'tue', 'wed'],
      dtype='object')

In [9]:
# Get Catagorial variables
categorical = df.nunique()[df.nunique() < 3].keys().tolist()

# Target 
target = ['y']

# Remove target from the list
categorical.remove(target[0])

# Get numberical features
numerical = [col for col in df.columns
if col not in target+categorical]

In [10]:
# Included ranked features for scaling

In [11]:
minmax_scale = MinMaxScaler()

In [12]:
scaled_numerical = minmax_scale.fit_transform(df[numerical])

In [13]:
# Build a DataFrame
scaled_numerical = pd.DataFrame(scaled_numerical, columns=numerical)

In [14]:
# Drop non-scaled numerical columns
df = df.drop(columns=numerical, axis=1)

# Merge the non-numerical with the scaled numerical data
df_scaled = df.merge(right=scaled_numerical,how='left', left_index=True,right_index=True)

In [15]:
df_scaled.head()

Unnamed: 0,contact,y,marital_married,marital_single,marital_unknown,default_unknown,default_yes,housing_unknown,housing_yes,loan_unknown,...,age,job,education,duration,campaign,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,0,0,1,0,0,0,0,0,0,0,...,0.481481,0.363636,0.285714,0.05307,0.0,0.9375,0.698753,0.60251,0.957379,0.859735
1,0,0,1,0,0,1,0,0,0,0,...,0.493827,0.636364,0.714286,0.030297,0.0,0.9375,0.698753,0.60251,0.957379,0.859735
2,0,0,1,0,0,0,0,0,1,0,...,0.246914,0.636364,0.714286,0.045954,0.0,0.9375,0.698753,0.60251,0.957379,0.859735
3,0,0,1,0,0,0,0,0,0,0,...,0.283951,0.909091,0.428571,0.030704,0.0,0.9375,0.698753,0.60251,0.957379,0.859735
4,0,0,1,0,0,0,0,0,0,0,...,0.481481,0.636364,0.714286,0.062424,0.0,0.9375,0.698753,0.60251,0.957379,0.859735


In [16]:
# Let's seperate dependent and independent variables
X= df_scaled.drop('y', axis=1)
y = df_scaled['y']

In [17]:
# Get feature list
feature_name = X.columns.tolist()

In [18]:
# Shape of independent variable or features
X.shape

(41188, 35)

### Pre-processing model performace

In [19]:
# Split data to train and test and check size of train data.
# Using 70-/30 split with random state as 420 (hyper parameter)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,random_state=420)

# Print shape of train data
print(X_train.shape)

(28831, 35)


In [20]:
# Build Model
base_model = KNeighborsClassifier(n_neighbors=35)

In [21]:
##### Train model
base_model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=35)

In [22]:
# Predict 
pred_base = base_model.predict(X_test)

In [23]:
# Get train and test scores
training_score = round(base_model.score(X_train, y_train)*100,3)
test_score = round(accuracy_score(y_test, pred_base)*100,3)

In [24]:
# Evaluate predictions
print(accuracy_score(y_test, pred_base))
print(confusion_matrix(y_test, pred_base))
print(classification_report(y_test, pred_base))

0.899166464352189
[[10865   116]
 [ 1130   246]]
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     10981
           1       0.68      0.18      0.28      1376

    accuracy                           0.90     12357
   macro avg       0.79      0.58      0.61     12357
weighted avg       0.88      0.90      0.87     12357



In [25]:
print(f"Training Data Score: {training_score} %")
print(f"Testing Data Score: {test_score} %")

Training Data Score: 89.782 %
Testing Data Score: 89.917 %


#### Training results and test results are close. But F1 score is very low for class 1. This could be due to imbalanced data/

In [26]:
# Let' use different methods to handle imbalnced data and test model results

In [27]:
def KNNModel(X_train, y_train, X_test, y_test):
    
    # Build model
    model = KNeighborsClassifier(n_neighbors=35,weights='uniform',algorithm='auto',
                                 leaf_size=30, p=2, metric='minkowski')
    
    
    # Fit model
    model.fit(X_train, y_train)
    
    # Scores and accurancy
    model_train_score = round(model.score(X_train, y_train)*100,3)
    model_test_score = round(model.score(X_test, y_test)*100,3)

    print("\nTraining model score: ",model_train_score)
    print("Testing model score: ",model_test_score)
    
    # Get accurancy
    accuracy = model.score(X_test, y_test)
    print("\nAccuracy", accuracy, "\n")
    
    # Get predictions
    y_pred = model.predict(X_test)

    # Get confusion matrix
    knn_matri = classification_report(y_test,y_pred)
    print(knn_matri)

In [28]:
# Get Model results
def getModelResult(X,y):
    
    # Split data into train-test -- statify y will insure there will be eqal represenation for each class
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)
    
    # Get model results 
    KNNModel(X_train, y_train, X_test, y_test)

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
import shap

def featureSelection(X,y,method):
    # Split data into train-test -- statify y will insure there will be eqal represenation for each class
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)
    
    # Get Random Forest Regressor instance and fit training data
    rf = RandomForestRegressor(n_estimators=200)
    rf.fit(X_train, y_train)
    
    if method == 1:
        # Using RF to select features
        sorted_idx = rf.feature_importances_.argsort()
        
        # Select X and y
        X=X[X.columns[rf.feature_importances_>0.001]]
        
        # Get results witout feature selection
        print("\n-- Model performace after Random Forest feature selection -- ")
        getModelResult(X,y)
    
    elif method==2:
        
        # Get feature importance
        perm_importance = permutation_importance(rf, X_test, y_test)

        # Get Index of feature
        sorted_idx = perm_importance.importances_mean.argsort()

        # Recreate dependent and independent data set using selected features
        X=X[X.columns[perm_importance.importances_mean>0]]

        # Get results witout feature selection
        print("\n-- Model performace after Permutation feature selection -- ")
        getModelResult(X,y)
    
    elif method==3:
        explainer = shap.TreeExplainer(rf)
        shap_values = explainer.shap_values(X_test)
        
        # Get feature names
        vals= np.abs(shap_values).mean(0)
        feature_importance = pd.DataFrame(list(zip(X.columns,vals)),columns=['col_name','feature_importance_vals'])
        feature_importance.sort_values(by=['feature_importance_vals'],ascending=False,inplace=True)
        feature_importance = feature_importance[feature_importance.feature_importance_vals > 0.002 ]
        
        # Recreate dependent and independent data set using selected features
        X= X[feature_importance.col_name]

        # Get results witout feature selection
        print("\n -- Model performace after SHAP feature selection -- ")
        getModelResult(X,y)

#### Method 1: Undersampling

In [30]:
# Class count
count_class_0, count_class_1 = df_scaled.y.value_counts()

# Divide by class
df_class_0 = df_scaled[df_scaled['y'] == 0]
df_class_1 = df_scaled[df_scaled['y'] == 1]

In [31]:
# Undersample 0-class and concat the DataFrames of both class
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.y.value_counts())

Random under-sampling:
1    4640
0    4640
Name: y, dtype: int64


In [32]:
# Get features and dependent variable data 
X = df_test_under.drop('y',axis='columns')
y = df_test_under['y']

In [33]:
# Get results witout feature selection
print("-- Model performace without feature selection -- ")
getModelResult(X,y)

-- Model performace without feature selection -- 

Training model score:  75.673
Testing model score:  74.407

Accuracy 0.744073275862069 

              precision    recall  f1-score   support

           0       0.70      0.85      0.77       928
           1       0.81      0.64      0.71       928

    accuracy                           0.74      1856
   macro avg       0.76      0.74      0.74      1856
weighted avg       0.76      0.74      0.74      1856



In [34]:
# Get Model performance for undersampled data using feature selection
featureSelection(X,y,1)
featureSelection(X,y,2)
featureSelection(X,y,3)


-- Model performace after Random Forest feature selection -- 

Training model score:  75.35
Testing model score:  74.515

Accuracy 0.7451508620689655 

              precision    recall  f1-score   support

           0       0.71      0.84      0.77       928
           1       0.80      0.65      0.72       928

    accuracy                           0.75      1856
   macro avg       0.75      0.75      0.74      1856
weighted avg       0.75      0.75      0.74      1856


-- Model performace after Permutation feature selection -- 

Training model score:  74.502
Testing model score:  73.06

Accuracy 0.7306034482758621 

              precision    recall  f1-score   support

           0       0.70      0.81      0.75       928
           1       0.78      0.65      0.71       928

    accuracy                           0.73      1856
   macro avg       0.74      0.73      0.73      1856
weighted avg       0.74      0.73      0.73      1856


 -- Model performace after SHAP feature s

##### Model accuracy for train and test data is close for the under sampled data though there is big gap in F1 for each class. 

#### Method 2: Oversampling

In [35]:
# Oversample 1-class and concat the DataFrames of both classes
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.y.value_counts())

Random over-sampling:
1    36548
0    36548
Name: y, dtype: int64


In [36]:
# Get features and dependent variable data 
X = df_test_over.drop('y',axis='columns')
y = df_test_over['y']

In [37]:
# Get model results for oversampled data
print("-- Model performace without feature selection -- ")
getModelResult(X,y)

-- Model performace without feature selection -- 

Training model score:  80.129
Testing model score:  78.714

Accuracy 0.787140902872777 

              precision    recall  f1-score   support

           0       0.78      0.81      0.79      7310
           1       0.80      0.77      0.78      7310

    accuracy                           0.79     14620
   macro avg       0.79      0.79      0.79     14620
weighted avg       0.79      0.79      0.79     14620



In [38]:
# Get Model performance for oversampled data using feature selection
featureSelection(X,y,1)
featureSelection(X,y,2)
featureSelection(X,y,3)


-- Model performace after Random Forest feature selection -- 

Training model score:  80.118
Testing model score:  78.741

Accuracy 0.7874145006839945 

              precision    recall  f1-score   support

           0       0.78      0.81      0.79      7310
           1       0.80      0.77      0.78      7310

    accuracy                           0.79     14620
   macro avg       0.79      0.79      0.79     14620
weighted avg       0.79      0.79      0.79     14620


-- Model performace after Permutation feature selection -- 

Training model score:  80.129
Testing model score:  78.714

Accuracy 0.787140902872777 

              precision    recall  f1-score   support

           0       0.78      0.81      0.79      7310
           1       0.80      0.77      0.78      7310

    accuracy                           0.79     14620
   macro avg       0.79      0.79      0.79     14620
weighted avg       0.79      0.79      0.79     14620


 -- Model performace after SHAP feature 

##### Model with over sampling gave slighlty better result than under sampling with feature selection. Also F1 score is fairly close for each class.

### Method 3: SMOTE

In [39]:
X = df_scaled.drop('y',axis='columns')
y = df_scaled['y']

In [40]:
from imblearn.over_sampling import SMOTE

# Get smote object
smote = SMOTE(sampling_strategy='minority')

# Fit data
X_smote, y_smote = smote.fit_sample(X, y)

# Check SMOTE results 
y_smote.value_counts()

1    36548
0    36548
Name: y, dtype: int64

In [41]:
# Get model results for oversampled data
print("-- Model performace without feature selection -- ")
getModelResult(X_smote, y_smote)

-- Model performace without feature selection -- 

Training model score:  83.391
Testing model score:  82.23

Accuracy 0.822298221614227 

              precision    recall  f1-score   support

           0       0.84      0.79      0.82      7310
           1       0.81      0.85      0.83      7310

    accuracy                           0.82     14620
   macro avg       0.82      0.82      0.82     14620
weighted avg       0.82      0.82      0.82     14620



In [42]:
# Get Model performance for SMOTE data using feature selection
featureSelection(X_smote, y_smote,1)
featureSelection(X_smote, y_smote,2)
featureSelection(X_smote, y_smote,3)


-- Model performace after Random Forest feature selection -- 

Training model score:  83.388
Testing model score:  82.373

Accuracy 0.823734610123119 

              precision    recall  f1-score   support

           0       0.84      0.80      0.82      7310
           1       0.81      0.85      0.83      7310

    accuracy                           0.82     14620
   macro avg       0.82      0.82      0.82     14620
weighted avg       0.82      0.82      0.82     14620


-- Model performace after Permutation feature selection -- 

Training model score:  83.391
Testing model score:  82.23

Accuracy 0.822298221614227 

              precision    recall  f1-score   support

           0       0.84      0.79      0.82      7310
           1       0.81      0.85      0.83      7310

    accuracy                           0.82     14620
   macro avg       0.82      0.82      0.82     14620
weighted avg       0.82      0.82      0.82     14620


 -- Model performace after SHAP feature se

In [64]:
test_selectFeature = 87.127

##### Model with SMOTE technique and SHAP feature selection gave much better results and also F1 score also looks good.

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [43]:
# Let's use smote data for grid search
len(X_smote),len(y_smote)

(73096, 73096)

In [44]:
def train_test(X,y):
    # Split data into train-test -- statify y will insure there will be equal represenation for each class
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50, stratify=y)
    return X_train, X_test, y_train, y_test

In [45]:
# Split data into train-test -- statify y will insure there will be equal represenation for each class
X_train, X_test, y_train, y_test = train_test(X_smote, y_smote)

In [46]:
from sklearn.model_selection import GridSearchCV

estimator_KNN = KNeighborsClassifier(algorithm='auto')

parameters_KNN = {
    'n_neighbors': (1,10, 1),
    'leaf_size': (20,40,1),
    'p': (1,2),
    'weights': ('uniform', 'distance'),
    'metric': ('minkowski', 'chebyshev')}
    
                   
# with GridSearch
grid_search_KNN = GridSearchCV(estimator=estimator_KNN, param_grid=parameters_KNN,
                                scoring = 'accuracy', n_jobs = -1, cv = 5)

In [47]:
grid_search_KNN.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'leaf_size': (20, 40, 1),
                         'metric': ('minkowski', 'chebyshev'),
                         'n_neighbors': (1, 10, 1), 'p': (1, 2),
                         'weights': ('uniform', 'distance')},
             scoring='accuracy')

In [48]:
# Parameter setting that gave the best results on the hold out data.
print(grid_search_KNN.best_params_ ) 

# Mean cross-validated score of the best_estimator
print('Best Score - KNN:', grid_search_KNN.best_score_ )

{'leaf_size': 20, 'metric': 'minkowski', 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
Best Score - KNN: 0.9270983103960735


In [58]:
# Predictions
pred_tuned = grid_search_KNN.predict(X_test)

In [59]:
# Get train and test scores
training_scoreTuned = round(grid_search_KNN.score(X_train, y_train)*100,3)
test_scoreTuned = round(accuracy_score(y_test, pred_tuned)*100,3)

In [60]:
# Evaluate predictions
print(accuracy_score(y_test, pred_tuned))
print(confusion_matrix(y_test, pred_tuned))
print(classification_report(y_test, pred_tuned))

0.9361833105335158
[[6610  700]
 [ 233 7077]]
              precision    recall  f1-score   support

           0       0.97      0.90      0.93      7310
           1       0.91      0.97      0.94      7310

    accuracy                           0.94     14620
   macro avg       0.94      0.94      0.94     14620
weighted avg       0.94      0.94      0.94     14620



In [61]:
print(f"Training Data Score: {training_scoreTuned} %")
print(f"Testing Data Score: {test_scoreTuned} %")

Training Data Score: 100.0 %
Testing Data Score: 93.618 %


#### Significant improvement is the model accuracy and F1 score. 

### Model Evaluation

In [None]:
# Save results in csv file

In [67]:
evaluations = {'': ['Base Model', 'Select Features Model', 'Tuned Model'],
               'Accuracy': [f"{test_score}%", f"{test_selectFeature}%", f"{test_scoreTuned}%"]}

evaluations_df = pd.DataFrame(evaluations)
evaluations_df = evaluations_df.set_index('')

evaluations_df.to_csv(model_result)
evaluations_df

Unnamed: 0,Accuracy
,
Base Model,89.917%
Select Features Model,87.127%
Tuned Model,93.618%


## Summary: 
- Base model accuracy of 89.91% was reduced to 87.127% with feature selection and improved further to 93.618% with tuned model (using gridSearch).