In [1]:
#Import Libraries

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
#Load data
df = pd.read_csv('../../data/Customer Churn Data.csv')
#Turn International Plan from a categorical variable to binary (yes = 1, no = 0)
df['international plan'] = (df['international plan'] == 'yes').astype(int)
#Turn Voice Mail Plan from a categorical variable to binary (yes = 1, no = 0)
df['voice mail plan'] = (df['voice mail plan'] == 'yes').astype(int)
#Initiate OneHotEncoder
ohe = OneHotEncoder(sparse = False)
#Create an ohe_states DF where you split the state column into new columns with the state name 
ohe_states = pd.DataFrame(ohe.fit_transform(pd.DataFrame(df['state'])), columns = ohe.get_feature_names())
#Combine the 2 dataframes 
df = pd.concat([df, ohe_states], axis = 1)
#Drop state and area code (irrelevant)
df = df.drop(['state'], axis = 1)

In [12]:
df

Unnamed: 0,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,...,x0_SD,x0_TN,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY
0,128,415,382-4657,0,1,25,265.1,110,45.07,197.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,107,415,371-7191,0,1,26,161.6,123,27.47,195.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,137,415,358-1921,0,0,0,243.4,114,41.38,121.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,84,408,375-9999,1,0,0,299.4,71,50.90,61.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,75,415,330-6626,1,0,0,166.7,113,28.34,148.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,192,415,414-4276,0,1,36,156.2,77,26.55,215.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3329,68,415,370-3271,0,0,0,231.1,57,39.29,153.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3330,28,510,328-8230,0,0,0,180.8,109,30.74,288.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3331,184,510,364-6381,1,0,0,213.8,105,36.35,159.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
#Set target variable as churn
y = df['churn']
#Copy X
X = df.copy()
#Drop churn and phone number from X (could have dropped phone number earlier)
X.drop(['churn', 'area code','phone number'], axis = 1, inplace = True)

In [10]:
X ['customer acquisition cost'] 
X [' customer retention cost']
X ['customer lifetime revenue']
X ['customer lifetime value']

#Customer acquisition/retention is expensive because cell phone market is a zero-sum game. AT&T's loss, leads to Verizon's win. 
#TMUS 3.1 in Q4 2011 vs. 0.89 in Q4 2019 (quarter = 3 months)
#Customer acquisition cost: $350 
#Small marginal cost 
#70% decline in churn rate = 500% increase in stock value 
#Sprint: 100%
#Verizon: 60%
#AT&T: 30% 
#Companies try bundling to improve churn rate (T-Mobile & Netflix, Sprint & Hulu)
#In 2012, T-Mobile spent a lot of money to 
#CLTV = ARPU x (1/Churn Rate) + Referral Rate 
#Profit margin is 7.1%
#6-7x more to acquire a customer than retain a customer 
#What features lead to low churn, target those customers + use them in retention 
#Solution: Network extenders 

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,x0_SD,x0_TN,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY
0,128,0,1,25,265.1,110,45.07,197.4,99,16.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,107,0,1,26,161.6,123,27.47,195.5,103,16.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,137,0,0,0,243.4,114,41.38,121.2,110,10.30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,84,1,0,0,299.4,71,50.90,61.9,88,5.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,75,1,0,0,166.7,113,28.34,148.3,122,12.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,192,0,1,36,156.2,77,26.55,215.5,126,18.32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3329,68,0,0,0,231.1,57,39.29,153.4,55,13.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3330,28,0,0,0,180.8,109,30.74,288.8,58,24.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3331,184,1,0,0,213.8,105,36.35,159.6,84,13.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
#Split the initial data into train and holdout (holdout is for final evaluation)
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y)
#Split train into a train and test set (to build your model)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train)

#Initiate a standard scaler 
ss = StandardScaler()
#Scale X_train and X_test 
X_train1 = ss.fit_transform(X_train1)
X_test1 = ss.transform(X_test1)

In [5]:
#Set our estimators, 4 classification models
#Questions: What is solver "Liblinear"?

estimators = [('knn', KNeighborsClassifier(n_neighbors = 20)),   
              ('rf', RandomForestClassifier(n_estimators = 100)),
              ('log', LogisticRegression(solver = 'liblinear')),
              ('grad', GradientBoostingClassifier())]

#Initiate a stack classifier

stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 5)

#Fit the model to our sub-train data 

stack.fit(X_train1, y_train1);

#Calculate accuracy score 

stack.score(X_train1, y_train1)

0.9845250800426895

In [8]:
#Evaluate metrics of our model based on the sub-test data 
#Accuracy is # of predictions our model got right (correct/total)
#Precision is when it guessed true, how many times was it correct (# of correct positive/total positive) 
#Recall is how many actual positives were guessed correctly (true positives/true positives + false negatives)
#Since false negatives are considered actual positives 
#F1 score is balance between precision and recall 

metrics(y_test1, stack.predict(X_test1))

Accuracy: 0.9456
Precision: 0.8117647058823529
Recall: 0.7931034482758621
F1: 0.8023255813953488


In [7]:
#Create a function that prints the scores 

def metrics(y_true, y_pred):
    print('Accuracy: ' + str(accuracy_score(y_true, y_pred)))
    print('Precision: ' + str(precision_score(y_true, y_pred)))
    print('Recall: ' + str(recall_score(y_true, y_pred)))
    print('F1: ' + str(f1_score(y_true, y_pred)))

In [9]:
#Write a for loop to print metrics for each model 

for i in stack.estimators_:
    metrics(y_test1, i.predict(X_test1))

Accuracy: 0.8608
Precision: 0.0
Recall: 0.0
F1: 0.0
Accuracy: 0.9376
Precision: 0.9285714285714286
Recall: 0.5977011494252874
F1: 0.7272727272727274
Accuracy: 0.8688
Precision: 0.5714285714285714
Recall: 0.22988505747126436
F1: 0.32786885245901637
Accuracy: 0.9424
Precision: 0.8
Recall: 0.7816091954022989
F1: 0.7906976744186047


  _warn_prf(average, modifier, msg_start, len(result))
