In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
%matplotlib inline

In [3]:
df = pd.read_csv('../../data/Customer Churn Data.csv')

In [4]:
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [5]:
df.columns

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

In [6]:
df['international plan'] = (df['international plan']=='yes').astype(int)
df['voice mail plan'] = (df['voice mail plan']=='yes').astype(int)
## using sparse to compite the matrix to one row. 
ohe = OneHotEncoder(sparse = False)
ohe_states = pd.DataFrame(ohe.fit_transform(pd.DataFrame(df['state'])), columns = ohe.get_feature_names())
df = pd.concat([df, ohe_states], axis = 1)
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,x0_SD,x0_TN,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY
0,KS,128,415,382-4657,0,1,25,265.1,110,45.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,OH,107,415,371-7191,0,1,26,161.6,123,27.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,NJ,137,415,358-1921,0,0,0,243.4,114,41.38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,OH,84,408,375-9999,1,0,0,299.4,71,50.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OK,75,415,330-6626,1,0,0,166.7,113,28.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df = df.drop(['phone number', 'state', 'area code'], axis=1).copy()

In [8]:
df.head()

### 'area code' needs to dummy code. 

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,x0_SD,x0_TN,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY
0,128,0,1,25,265.1,110,45.07,197.4,99,16.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,107,0,1,26,161.6,123,27.47,195.5,103,16.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,137,0,0,0,243.4,114,41.38,121.2,110,10.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,84,1,0,0,299.4,71,50.9,61.9,88,5.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,75,1,0,0,166.7,113,28.34,148.3,122,12.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
X = df.drop('churn', axis=1)
y = df['churn']

In [10]:
X_train, X_hold, y_train, y_hold = train_test_split(X, y)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train)

In [11]:
ss = StandardScaler()
X_train1 = ss.fit_transform(X_train1)
X_test1 = ss.transform(X_test1)

In [12]:
estimators = [('knn', KNeighborsClassifier(n_neighbors = 20)),   
              ('rf', RandomForestClassifier(n_estimators = 100)),
              ('log', LogisticRegression(solver = 'liblinear')),
              ('grad', GradientBoostingClassifier())]
stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 5)
stack.fit(X_train1, y_train1);
stack.score(X_train1, y_train1)

0.9893276414087513

In [13]:
def metrics(y_true, y_pred):
    print('Accuracy: ' + str(accuracy_score(y_true, y_pred)))
    print('Precision: ' + str(precision_score(y_true, y_pred)))
    print('Recall: ' + str(recall_score(y_true, y_pred)))
    print('F1: ' + str(f1_score(y_true, y_pred)))
    print('\n')

In [14]:
for i in stack.estimators_:
    print(i)
    metrics(y_test1, i.predict(X_test1))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')
Accuracy: 0.8496
Precision: 0.0
Recall: 0.0
F1: 0.0


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Accuracy: 0.9168
Precision: 0.9565217391304348
Recall: 0.46808510638297873
F1: 0.6285714285714286


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scali

  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
stack.final_estimator_.coef_

array([[0.13203913, 3.17699295, 1.14329825, 5.28500273]])

In [16]:
stack.estimators_[3].feature_importances_

array([8.59944729e-03, 1.07202030e-01, 3.72669994e-02, 3.18270511e-02,
       1.74237496e-01, 8.81540947e-03, 1.66550156e-01, 7.15812262e-02,
       5.46336575e-03, 6.34269165e-02, 1.39242082e-02, 4.78029496e-03,
       1.30498214e-02, 2.71147443e-02, 7.46244285e-02, 3.44185799e-02,
       1.35506298e-01, 0.00000000e+00, 7.60422753e-05, 0.00000000e+00,
       0.00000000e+00, 9.37574535e-04, 2.49836635e-04, 0.00000000e+00,
       1.66502575e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       2.51046233e-04, 2.98447973e-03, 0.00000000e+00, 2.50025981e-04,
       4.38640367e-04, 1.18528655e-04, 0.00000000e+00, 0.00000000e+00,
       1.04915222e-04, 4.59650174e-04, 8.81351942e-04, 1.23871913e-04,
       2.54029344e-04, 5.14051229e-08, 2.46869935e-04, 4.51216181e-04,
       0.00000000e+00, 9.67968754e-05, 0.00000000e+00, 0.00000000e+00,
       1.74166350e-03, 0.00000000e+00, 0.00000000e+00, 1.48804799e-03,
       1.11153939e-03, 0.00000000e+00, 4.27155571e-04, 0.00000000e+00,
      

In [17]:
stack.estimators_[1].feature_importances_

array([0.03313678, 0.0607442 , 0.01527416, 0.0212524 , 0.13639128,
       0.03652764, 0.14288708, 0.05256888, 0.03177297, 0.05358365,
       0.03900568, 0.03501115, 0.03716595, 0.04192182, 0.03921058,
       0.04008091, 0.09014508, 0.00125613, 0.0006642 , 0.00156663,
       0.00237227, 0.00259348, 0.00113846, 0.00110316, 0.00106191,
       0.00104092, 0.00137689, 0.00132381, 0.00043055, 0.00088409,
       0.00159415, 0.00124455, 0.00358137, 0.00102678, 0.00188056,
       0.00135571, 0.00174791, 0.00375001, 0.00341812, 0.0022434 ,
       0.00302544, 0.00067446, 0.00181075, 0.00338105, 0.0024261 ,
       0.00155425, 0.00153051, 0.00020451, 0.00356128, 0.0019093 ,
       0.00172198, 0.00313364, 0.00249936, 0.00191307, 0.00175322,
       0.00078447, 0.00077558, 0.00257453, 0.00179283, 0.00052149,
       0.00721896, 0.00114574, 0.00222064, 0.00088031, 0.00140612,
       0.00189151, 0.00166633, 0.0006873 ])

In [20]:
stack.estimators_[2].predict_proba(X_test1)

array([[0.9900777 , 0.0099223 ],
       [0.99869942, 0.00130058],
       [0.98725013, 0.01274987],
       ...,
       [0.74016174, 0.25983826],
       [0.9248706 , 0.0751294 ],
       [0.97356288, 0.02643712]])

In [21]:
from sklearn.metrics import roc_curve, auc