# K-Nearest Neighbours (KNN) Model

## Imports

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the '-clean' training and testing CSVs (relative paths), in that order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/train/customer_churn_dataset-training-clean.csv',
        '../../data/test/customer_churn_dataset-testing-clean.csv',
    )
)

## Selecting Features and Target (Train and Test)

In [4]:
# Columns used as model inputs, and the label column; declared once so the
# training and test frames are always sliced with the same columns in the
# same order.
feature_cols = [
    'age', 'gender', 'tenure', 'usage_frequency', 'support_calls',
    'payment_delay', 'subscription_type', 'contract_length', 'total_spend',
    'last_interaction',
]
target_col = 'churn'

# Training features / labels
X_train, y_train = df_train[feature_cols], df_train[target_col]

# Test features / labels
X_test, y_test = df_test[feature_cols], df_test[target_col]

# Shape sanity check, printed as: X_train, X_test, y_train, y_test
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(353644, 10)
(151562, 10)
(353644,)
(151562,)


## Scaling

In [5]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Learn the per-feature min/max from the training features only, then apply
# that same fitted scaler to both the training and the test features.
scaler = MinMaxScaler()
X_train_scal = scaler.fit(X_train).transform(X_train)
X_test_scal = scaler.transform(X_test)

# Global min / max of each scaled array (train first, then test).
for scaled in (X_train_scal, X_test_scal):
    print(scaled.min())
    print(scaled.max())

0.0
1.0
0.0
1.0


## Fitting KNN Model

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline classifier: no arguments are passed, so every hyperparameter is
# left at the library default.
knn = KNeighborsClassifier()

In [7]:
# Fit the baseline KNN on the MinMax-scaled training features and labels.
knn.fit(X_train_scal, y_train)

## (Baseline) Training Metrics

In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, multilabel_confusion_matrix, classification_report

In [9]:
# Predict on the same scaled data the model was fitted on; metrics computed
# from these predictions therefore describe fit to the training data.
y_pred_train = knn.predict(X_train_scal)


In [10]:
# Accuracy and confusion matrix for the training-set predictions
# (confusion_matrix rows are true labels, columns are predicted labels).
train_accuracy = accuracy_score(y_train, y_pred_train)
train_confusion = confusion_matrix(y_train, y_pred_train)
print("accuracy_score", train_accuracy)
print("confusion_matrix\n", train_confusion)

accuracy_score 0.912281842757123
confusion_matrix
 [[139780  17520]
 [ 13501 182843]]


In [11]:
# Per-class precision / recall / F1 for the training-set predictions.
train_report = classification_report(y_train, y_pred_train)
print(train_report)

              precision    recall  f1-score   support

           0       0.91      0.89      0.90    157300
           1       0.91      0.93      0.92    196344

    accuracy                           0.91    353644
   macro avg       0.91      0.91      0.91    353644
weighted avg       0.91      0.91      0.91    353644



## (Baseline) Testing Metrics 

In [12]:
# Baseline KNN evaluated on the scaled test features.
y_pred_test = knn.predict(X_test_scal)
baseline_test_accuracy = accuracy_score(y_test, y_pred_test)
baseline_test_confusion = confusion_matrix(y_test, y_pred_test)
print("accuracy_score", baseline_test_accuracy)
print("confusion_matrix\n", baseline_test_confusion)

accuracy_score 0.8897612858104275
confusion_matrix
 [[59036  8378]
 [ 8330 75818]]


In [13]:
# Per-class precision / recall / F1 for the baseline test-set predictions.
baseline_test_report = classification_report(y_test, y_pred_test)
print(baseline_test_report)


              precision    recall  f1-score   support

           0       0.88      0.88      0.88     67414
           1       0.90      0.90      0.90     84148

    accuracy                           0.89    151562
   macro avg       0.89      0.89      0.89    151562
weighted avg       0.89      0.89      0.89    151562



## KNN Hyperparameter Randomized Search

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

In [15]:
# Two-step pipeline: a scaler followed by the KNN classifier. The step names
# ('scaler', 'estimator') are the prefixes the search grid keys refer to.
pipeline_steps = [
    ('scaler', MinMaxScaler()),
    ('estimator', KNeighborsClassifier()),
]
pipe = Pipeline(steps=pipeline_steps)


In [16]:
# Search space sampled by the randomized search:
#   scaler                 - swap the 'scaler' step (None disables the step)
#   estimator__n_neighbors - k from 1 to 9 inclusive
#   estimator__weights     - uniform vs. distance-weighted voting
params = dict(
    scaler=[MinMaxScaler(), StandardScaler(), None],
    estimator__n_neighbors=np.arange(1, 10),
    estimator__weights=['uniform', 'distance'],
)

In [17]:
# Randomized search over `params` with 3-fold cross-validation, ranking the
# sampled candidates by ROC AUC.
#
# random_state pins which parameter combinations get sampled, so a
# Restart & Run All evaluates the same candidates and selects the same best
# model instead of a different one on every run.
#
# NOTE(review): the search is fitted on X_train_scal, which was already
# MinMax-scaled using the whole training set. As a consequence the
# `scaler=None` candidate is not truly "unscaled", the CV folds share scaling
# statistics, and the resulting pipeline expects MinMax-scaled inputs.
# Fitting on the raw X_train (and predicting on the raw X_test) would let the
# pipeline own all scaling - confirm and change both places together.
knn_rsc = RandomizedSearchCV(
    pipe,
    params,
    cv=3,
    scoring='roc_auc',
    verbose=3,
    random_state=42,
)
knn_rsc.fit(X_train_scal, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END estimator__n_neighbors=2, estimator__weights=uniform, scaler=StandardScaler();, score=0.897 total time=  12.7s
[CV 2/3] END estimator__n_neighbors=2, estimator__weights=uniform, scaler=StandardScaler();, score=0.899 total time=  12.6s
[CV 3/3] END estimator__n_neighbors=2, estimator__weights=uniform, scaler=StandardScaler();, score=0.899 total time=  14.0s
[CV 1/3] END estimator__n_neighbors=6, estimator__weights=uniform, scaler=MinMaxScaler();, score=0.929 total time=  24.0s
[CV 2/3] END estimator__n_neighbors=6, estimator__weights=uniform, scaler=MinMaxScaler();, score=0.930 total time=  22.1s
[CV 3/3] END estimator__n_neighbors=6, estimator__weights=uniform, scaler=MinMaxScaler();, score=0.930 total time=  22.0s
[CV 1/3] END estimator__n_neighbors=8, estimator__weights=distance, scaler=StandardScaler();, score=0.930 total time=  21.6s
[CV 2/3] END estimator__n_neighbors=8, estimator__weights=distance, scaler=S

In [18]:
# Pipeline for the best-scoring candidate found by the randomized search.
knn_best =  knn_rsc.best_estimator_

In [19]:
# best_score_ is the mean cross-validated ROC AUC of the winning candidate
# (computed on held-out folds of the training data), and best_params_ is the
# parameter combination that achieved it.
best_cv_score = knn_rsc.best_score_
best_cv_params = knn_rsc.best_params_
print('best model train roc_auc score', best_cv_score)
print('best model params', best_cv_params)

best model train roc_auc score 0.9305657004670812
best model params {'scaler': StandardScaler(), 'estimator__weights': 'distance', 'estimator__n_neighbors': 8}


In [20]:
# Evaluate the tuned pipeline on the test set. It is given X_test_scal because
# the search that produced knn_best was itself fitted on X_train_scal.
y_pred = knn_best.predict(X_test_scal)
tuned_test_report = classification_report(y_test, y_pred)
tuned_test_confusion = confusion_matrix(y_test, y_pred)
print(tuned_test_report)
print(tuned_test_confusion)

              precision    recall  f1-score   support

           0       0.87      0.88      0.87     67414
           1       0.90      0.90      0.90     84148

    accuracy                           0.89    151562
   macro avg       0.89      0.89      0.89    151562
weighted avg       0.89      0.89      0.89    151562

[[59093  8321]
 [ 8592 75556]]


## Save best KNN model

In [28]:
import pickle

# Serialize the tuned pipeline to disk.
# NOTE(review): knn_best was fitted on X_train_scal (already MinMax-scaled),
# so whoever loads this file presumably has to apply the same MinMax scaling
# before calling predict - confirm against the consumer of this model.
model_path = '../../models/trained_model_04_KNN.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(knn_best, model_file)