In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [4]:
df=pd.read_csv('data\PreprocessedBankChurners.csv')
df.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,...,Education_Level_Doctorate,Education_Level_Graduate,Education_Level_High School,Education_Level_Post-Graduate,Education_Level_Uneducated,Education_Level_Unknown,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Unknown
0,0,50,3,39,5,1,3,12691.0,777,11914.0,...,0,0,1,0,0,0,0,1,0,0
1,0,60,5,44,6,1,2,8256.0,864,7392.0,...,0,1,0,0,0,0,0,0,1,0
2,0,60,3,36,4,1,0,3418.0,0,3418.0,...,0,1,0,0,0,0,0,1,0,0
3,0,50,4,34,3,4,1,3313.0,2517,796.0,...,0,0,1,0,0,0,0,0,0,1
4,0,50,3,21,5,1,0,4716.0,0,4716.0,...,0,0,0,0,1,0,0,1,0,0


In [5]:
X = df.iloc[:, df.columns != 'Attrition_Flag']
y = df['Attrition_Flag']
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [6]:
X_train.columns

Index(['Customer_Age', 'Dependent_count', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Gender_F', 'Gender_M', 'Income_Category_$120K +',
       'Income_Category_$40K - $60K', 'Income_Category_$60K - $80K',
       'Income_Category_$80K - $120K', 'Income_Category_Less than $40K',
       'Income_Category_Unknown', 'Card_Category_Blue', 'Card_Category_Gold',
       'Card_Category_Platinum', 'Card_Category_Silver',
       'Education_Level_College', 'Education_Level_Doctorate',
       'Education_Level_Graduate', 'Education_Level_High School',
       'Education_Level_Post-Graduate', 'Education_Level_Uneducated',
       'Education_Level_Unknown', 'Marital_Status_Divorced',
       'Marital_Status_Married', 'Marital_Status_Single',
       'Marit

In [7]:
non_categ_columns= ['Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
X_train_non_categ = X_train[non_categ_columns]

In [8]:
scaler = StandardScaler()
scaler.fit(X_train_non_categ)
X_train_non_categ_scaled = scaler.transform(X_train_non_categ)

In [9]:
X_test_non_categ = X_test[non_categ_columns]
X_test_non_categ_scaled = scaler.transform(X_test_non_categ)

In [10]:
categ_columns= X_train.columns.difference(non_categ_columns)
categ_columns

Index(['Card_Category_Blue', 'Card_Category_Gold', 'Card_Category_Platinum',
       'Card_Category_Silver', 'Contacts_Count_12_mon', 'Customer_Age',
       'Dependent_count', 'Education_Level_College',
       'Education_Level_Doctorate', 'Education_Level_Graduate',
       'Education_Level_High School', 'Education_Level_Post-Graduate',
       'Education_Level_Uneducated', 'Education_Level_Unknown', 'Gender_F',
       'Gender_M', 'Income_Category_$120K +', 'Income_Category_$40K - $60K',
       'Income_Category_$60K - $80K', 'Income_Category_$80K - $120K',
       'Income_Category_Less than $40K', 'Income_Category_Unknown',
       'Marital_Status_Divorced', 'Marital_Status_Married',
       'Marital_Status_Single', 'Marital_Status_Unknown',
       'Months_Inactive_12_mon', 'Months_on_book', 'Total_Relationship_Count'],
      dtype='object')

In [40]:
X_train_non_categ_scaled = pd.DataFrame(X_train_non_categ_scaled)
X_train_non_categ_scaled.columns = non_categ_columns

In [30]:
X_train_categ = X_train[categ_columns]
X_train_db_noncategscaled = pd.DataFrame(X_train_non_categ_scaled)

In [32]:
X_train_db_noncategscaled.reset_index(drop=True, inplace=True)
X_train_categ.reset_index(drop=True, inplace=True)

In [33]:
X_train_update =pd.concat([X_train_db_noncategscaled, X_train_categ], axis=1, ignore_index=True)

In [34]:
X_train_update

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,-0.690795,0.210621,-0.710069,-0.346001,-0.051462,0.671345,0.722395,1.049190,1,0,...,0,1,0,0,1,0,0,2,36,5
1,-0.484076,0.493552,-0.528606,0.057277,-0.744701,-0.684882,0.432636,0.345465,1,0,...,0,0,0,0,0,0,1,2,34,3
2,-0.793956,-1.427920,-0.666352,-0.220846,-0.596234,-0.684882,-1.473891,-0.996353,1,0,...,0,0,0,0,1,0,0,3,38,2
3,0.555584,1.360798,0.433866,2.184919,3.142057,1.603751,-0.096488,-0.391077,1,0,...,1,0,0,0,0,1,0,2,28,1
4,1.850914,0.451727,1.811442,-0.165221,2.610144,1.349458,0.352848,-0.777579,1,0,...,1,0,0,0,0,0,1,3,36,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7590,-0.495872,1.668332,-0.645757,0.224151,-0.679527,-0.896793,0.508226,1.220563,1,0,...,0,0,0,0,0,1,0,2,38,3
7591,-0.077252,-1.427920,0.050753,-0.971778,-0.631305,-0.981557,-1.679661,-0.996353,1,0,...,0,0,1,1,0,0,0,3,36,4
7592,-0.697410,0.475100,-0.740405,0.200974,0.072748,0.374670,1.209525,1.442984,1,0,...,0,1,0,0,0,0,1,1,32,3
7593,-0.559707,0.671922,-0.620275,-1.180370,-0.861313,-0.981557,-1.679661,0.750198,1,0,...,0,0,0,0,0,1,0,1,28,4


In [41]:
X_test_categ = X_test[categ_columns]
X_test_db_noncategscaled = pd.DataFrame(X_test_non_categ_scaled)

X_test_db_noncategscaled.reset_index(drop=True, inplace=True)
X_test_categ.reset_index(drop=True, inplace=True)

X_test_update =pd.concat([X_test_db_noncategscaled, X_test_categ], axis=1, ignore_index=True)

In [42]:
knn = KNeighborsClassifier()
knn.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [43]:
param_grid={'n_neighbors':np.arange(1,20)} 
knn_cv = GridSearchCV(knn, param_grid, cv=5)
knn_cv.fit(X_train_update, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])})

In [44]:
knn_cv.best_params_

{'n_neighbors': 7}

In [48]:
knn_best_param = KNeighborsClassifier(n_neighbors=5)
knn_best_param.fit(X_train_update, y_train)
y_pred = knn_best_param.predict(X_test_update)

In [49]:
print(confusion_matrix(y_test, y_pred))

[[2067   46]
 [ 247  172]]


In [50]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.98      0.93      2113
           1       0.79      0.41      0.54       419

    accuracy                           0.88      2532
   macro avg       0.84      0.69      0.74      2532
weighted avg       0.88      0.88      0.87      2532



In [None]:
y_pred_prob = knn_best_param.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Gradient Boosting Case Study')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show();

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred_prob)

### Same with non-standard values

In [16]:
knn_best_param = KNeighborsClassifier(n_neighbors=5)
knn_best_param.fit(X_train, y_train)
y_pred = knn_best_param.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.96      0.93      2113
           1       0.71      0.55      0.62       419

    accuracy                           0.89      2532
   macro avg       0.81      0.75      0.78      2532
weighted avg       0.88      0.89      0.88      2532

