In [2]:
#import packages
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

#some settings
sns.set_palette("Paired")

In [3]:
import datetime

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#Scaler & Classifiers
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

#Metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

In [4]:
df_model = pd.read_csv('data/TransformedTelcoCustomerChurn.csv')
df_model

Unnamed: 0,tenure,MonthlyCharges,servicesCount,PhoneService_Yes,MultipleLines_Yes,OnlineSecurity_Yes,OnlineBackup_Yes,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,Churn,InternetService_DSL,InternetService_FiberOptic,Contract_MonthToMonth,Contract_OneYear,PaymentMethod_BankTransfer,PaymentMethod_ElectronicCheck,PaymentMethod_MailedCheck
0,1,29.85,2,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0
1,34,56.95,4,1,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1
2,2,53.85,4,1,0,1,1,0,0,0,0,1,1,0,1,0,0,0,1
3,45,42.30,4,0,0,1,0,1,1,0,0,0,1,0,0,1,1,0,0
4,2,70.70,2,1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,24,84.80,8,1,1,1,0,1,1,1,1,0,1,0,0,1,0,0,1
7039,72,103.20,7,1,1,0,1,1,0,1,1,0,0,1,0,1,0,0,0
7040,11,29.60,2,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0
7041,4,74.40,3,1,1,0,0,0,0,0,0,1,0,1,1,0,0,0,1


In [5]:
X = df_model.drop(columns = ['Churn'])
y = df_model['Churn'].copy()

In [6]:
# Train, Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
#gridSearch scoring, equal for all the classifier used foward
scores = {'f1_binary': make_scorer(f1_score, average='binary'),'AUC': 'roc_auc', 'Accuracy': 'accuracy'}

In [65]:
## Random Forest Classifier
time_start = datetime.datetime.now()
clf = RandomForestClassifier()
model = clf.fit(X_train, y_train)

y_pred = model.predict(X_test)
time_end = datetime.datetime.now()

print("Random Forest Classifier")
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
print("auc: {}".format(auc(false_positive_rate, true_positive_rate)))
print("f1-score: {}".format(f1_score(y_test, y_pred, average='binary')))
print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Time: {}".format((time_end-time_start).total_seconds()))

Random Forest Classifier
auc: 0.6815183407211677
f1-score: 0.5345132743362832
accuracy: 0.7737634408602151
Time: 0.558717


In [71]:
## Random Forest Classifier
time_start = datetime.datetime.now()

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier())
])
parameters = {
    'scaler__with_mean': [True, False],
    'scaler__with_std': [True, False],
    'clf__n_estimators': [32,64,100],
    'clf__max_depth': [6,7,8,9,10,11,12]
}

cv = GridSearchCV(pipeline, param_grid=parameters,scoring=scores,refit='f1_binary')
cv.fit(X_train, y_train)

y_pred = cv.predict(X_test)
time_end = datetime.datetime.now()

print("Random Forest Classifier")
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
print("auc: {}".format(auc(false_positive_rate, true_positive_rate)))
print("f1-score: {}".format(f1_score(y_test, y_pred, average='binary')))
print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Time: {}".format((time_end-time_start).total_seconds()))

Random Forest Classifier
auc: 0.721408424007897
f1-score: 0.5987708516242318
accuracy: 0.8034408602150538
Time: 113.099823


In [74]:
## Gradient-Boosted Tree
time_start = datetime.datetime.now()
clf = GradientBoostingClassifier()
model = clf.fit(X_train, y_train)

y_pred = model.predict(X_test)
time_end = datetime.datetime.now()

print("Gradient-Boosted Classifier")
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
print("auc: {}".format(auc(false_positive_rate, true_positive_rate)))
print("f1-score: {}".format(f1_score(y_test, y_pred, average='binary')))
print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Time: {}".format((time_end-time_start).total_seconds()))

Gradient-Boosted Classifier
auc: 0.7175715669090077
f1-score: 0.5937219730941704
accuracy: 0.8051612903225807
Time: 0.709242


In [101]:
## Gradient-Boosted Tree
time_start = datetime.datetime.now()

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', GradientBoostingClassifier())
])
parameters = {
    'scaler__with_mean': [True, False],
    'scaler__with_std': [True, False],
    'clf__n_estimators': [32,64,100],
    'clf__max_depth': [6,7,8,9,10,11,12]
}

cv = GridSearchCV(pipeline, param_grid=parameters,scoring=scores,refit='f1_binary')
cv.fit(X_train, y_train)

y_pred = cv.predict(X_test)
time_end = datetime.datetime.now()

print("Gradient-Boosted Classifier")
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
print("auc: {}".format(auc(false_positive_rate, true_positive_rate)))
print("f1-score: {}".format(f1_score(y_test, y_pred, average='binary')))
print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Time: {}".format((time_end-time_start).total_seconds()))

Gradient-Boosted Classifier
auc: 0.7123558246287003
f1-score: 0.5845881310894596
accuracy: 0.7982795698924732
Time: 490.082485


In [102]:
cv.best_params_

{'clf__max_depth': 6,
 'clf__n_estimators': 64,
 'scaler__with_mean': True,
 'scaler__with_std': False}

In [76]:
## Desicion Tree Classifier
time_start = datetime.datetime.now()
clf = DecisionTreeClassifier()
model = clf.fit(X_train, y_train)

#y_pred = clf.predict(X_test)
y_pred = model.predict(X_test)
time_end = datetime.datetime.now()

print("Desicion Tree Classifier")
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
print("auc: {}".format(auc(false_positive_rate, true_positive_rate)))
print("f1-score: {}".format(f1_score(y_test, y_pred, average='binary')))
print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Time: {}".format((time_end-time_start).total_seconds()))

Desicion Tree Classifier
auc: 0.6483120268439246
f1-score: 0.48587570621468923
accuracy: 0.7260215053763441
Time: 0.122994


In [77]:
## Desicion Tree Classifier
time_start = datetime.datetime.now()

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier())
])
parameters = {
    'scaler__with_mean': [True, False],
    'scaler__with_std': [True, False],
    'clf__max_depth': [1,2,3,4,5,6,7,8],
    'clf__min_samples_split': [0.1,0.2,0.3,0.4],
    'clf__min_samples_leaf': [0.2]
}

cv = GridSearchCV(pipeline, param_grid=parameters,scoring=scores,refit='f1_binary')
cv.fit(X_train, y_train)

y_pred = cv.predict(X_test)
time_end = datetime.datetime.now()

print("Desicion Tree Classifier")
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
print("auc: {}".format(auc(false_positive_rate, true_positive_rate)))
print("f1-score: {}".format(f1_score(y_test, y_pred, average='binary')))
print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Time: {}".format((time_end-time_start).total_seconds()))

Desicion Tree Classifier
auc: 0.7324437279725555
f1-score: 0.604274134119381
accuracy: 0.7690322580645161
Time: 11.092814


In [91]:
## Logistic Regression
time_start = datetime.datetime.now()
clf = LogisticRegression(max_iter = 1000)
model = clf.fit(X_train, y_train)

#y_pred = clf.predict(X_test)
y_pred = model.predict(X_test)
time_end = datetime.datetime.now()

print("Logistic Regression")
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
print("auc: {}".format(auc(false_positive_rate, true_positive_rate)))
print("f1-score: {}".format(f1_score(y_test, y_pred, average='binary')))
print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Time: {}".format((time_end-time_start).total_seconds()))

Logistic Regression
auc: 0.7405167042626741
f1-score: 0.6276870163370593
accuracy: 0.813763440860215
Time: 0.655809


In [9]:
## Logistic Regression
time_start = datetime.datetime.now()

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression())
])
parameters = {
    'scaler__with_mean': [True, False],
    'scaler__with_std': [True, False],
    'clf__solver': ['newton-cg','lbfgs','liblinear'],
    'clf__max_iter': [1000]
}

cv = GridSearchCV(pipeline, param_grid=parameters,scoring=scores,refit='AUC')
cv.fit(X_train, y_train)

y_pred = cv.predict(X_test)
time_end = datetime.datetime.now()

print("Logistic Regression")
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
print("auc: {}".format(auc(false_positive_rate, true_positive_rate)))
print("f1-score: {}".format(f1_score(y_test, y_pred, average='binary')))
print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Time: {}".format((time_end-time_start).total_seconds()))

Logistic Regression
auc: 0.7380404347875044
f1-score: 0.6237113402061856
accuracy: 0.8116129032258065
Time: 4.008724


In [98]:
## SVC
time_start = datetime.datetime.now()
clf = SVC()
model = clf.fit(X_train, y_train)

#y_pred = clf.predict(X_test)
y_pred = model.predict(X_test)
time_end = datetime.datetime.now()

print("Support Vectors Classifier")
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
print("auc: {}".format(auc(false_positive_rate, true_positive_rate)))
print("f1-score: {}".format(f1_score(y_test, y_pred, average='binary')))
print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Time: {}".format((time_end-time_start).total_seconds()))

Support Vectors Classifier
auc: 0.6710291484785815
f1-score: 0.5145145145145146
accuracy: 0.7913978494623656
Time: 1.04142


In [99]:
## SVC
time_start = datetime.datetime.now()

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC())
])
parameters = {
    'scaler__with_mean': [True, False],
    'scaler__with_std': [True, False],
    'clf__kernel': ['linear','rbf','poly','sigmoid']
}

cv = GridSearchCV(pipeline, param_grid=parameters,scoring=scores,refit='f1_binary')
cv.fit(X_train, y_train)

y_pred = cv.predict(X_test)
time_end = datetime.datetime.now()

print("Support Vectors Classifier")
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
print("auc: {}".format(auc(false_positive_rate, true_positive_rate)))
print("f1-score: {}".format(f1_score(y_test, y_pred, average='binary')))
print("accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("Time: {}".format((time_end-time_start).total_seconds()))

Support Vectors Classifier
auc: 0.7372442564435553
f1-score: 0.6225279449699054
accuracy: 0.8111827956989247
Time: 71.510578
