In [None]:
## Decision Tree Modeling in Python

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate, ShuffleSplit, LeaveOneOut
from sklearn import metrics
from matplotlib import pyplot as plt
# Seed NumPy's global RNG so stochastic steps that fall back on it are
# repeatable when the notebook is run top to bottom.
np.random.seed(66)

# Load the customer-churn dataset directly from GitHub (needs network access).
churn = pd.read_csv('https://raw.githubusercontent.com/yhat/demo-churn-pred/master/model/churn.csv')

# Encode both yes/no plan flags as 1/0 through the same `.map` call.
# `.replace({"yes": 1, "no": 0})` relied on pandas silently downcasting the
# object column to integers, which is deprecated (FutureWarning in pandas 2.2);
# without that downcast the column would stay non-numeric and be left out of
# any `select_dtypes('number')` feature selection.
yes_no_codes = {'yes': 1, 'no': 0}
churn["Int'l Plan"] = churn["Int'l Plan"].map(yes_no_codes)
churn['VMail Plan'] = churn['VMail Plan'].map(yes_no_codes)

# Columns that are still object-typed after the encoding above.
churn.select_dtypes('object').columns

In [None]:
## Model Training/Testing

# Features: every numeric column of `churn`.
num_vars = churn.select_dtypes('number').columns
X = churn[num_vars]
# Target: 1 for 'True.' (customer churned), 0 for 'False.'.
y = churn['Churn?'].map({'True.': 1, 'False.': 0})

# Hold out 30% of the rows for testing; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=16)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Count of predictions per class. Explicit bin edges centre one bar on each
# class label (0 and 1); `bins=2` put the bars on [0, 0.5] and [0.5, 1], so the
# ticks at 0 and 1 landed on the outer edges instead of under the bars.
fig, ax = plt.subplots()
ax.hist(y_pred, bins=[-0.5, 0.5, 1.5], rwidth=0.8)
ax.set_xticks(range(0, 2))
ax.set_xlabel('Predicted class (1 = churn)')
ax.set_ylabel('Number of test customers')
ax.set_title('Distribution of predicted classes')
plt.show()

In [None]:
## Performance Reporting

# Share of test customers whose class was predicted correctly.
print(f"Accuracy: {round(metrics.accuracy_score(y_test, y_pred)*100, 2)}%")

# Confusion matrix: rows are the true class, columns the predicted class.
# `labels=[0, 1]` pins the 2x2 shape even if one class never shows up in the
# predictions (relabelling a `pd.crosstab` result with two fixed labels would
# raise a length mismatch in that case).
class_names = ['Stay', 'Leave']  # 0 -> Stay, 1 -> Leave
df_confusion = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]),
    index=pd.MultiIndex.from_product([['Real'], class_names]),
    columns=pd.MultiIndex.from_product([['Predict'], class_names]),
)
# A bare `df_confusion` in the middle of a cell is never rendered, so print it.
print(df_confusion)

# Per-class precision, recall and F1.
print(metrics.classification_report(y_test, y_pred))

In [None]:
## Hyperparameters' Grid Search

# Candidate values for each decision-tree hyperparameter; the search below
# evaluates every combination of them.
param_grid = dict(
    criterion=['gini', 'entropy'],
    min_samples_split=[2, 10, 20, 30],
    max_depth=[4, 5, 6, 10, 15, 20],
    min_samples_leaf=[1, 5, 10],
    max_leaf_nodes=[2, 5, 10, 20],
)

# Exhaustive grid search with 5-fold cross-validation on the training split.
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid)

In [None]:
## Repeated Hold-Out Method

# Repeated hold-out: ten seeded random re-splits of the training data, each
# keeping 30% aside for validation.
bstrap = ShuffleSplit(
    n_splits=10,
    test_size=0.3,
    random_state=16,
)
# Same hyperparameter grid as before, scored on the repeated hold-out splits.
grid_bstrap = GridSearchCV(estimator=clf, param_grid=param_grid, cv=bstrap)
grid_bstrap.fit(X_train, y_train)

In [None]:
## Hyperparameters for Best Performing Model

# Mean cross-validated accuracy of the best parameter combination, in percent.
best_accuracy_pct = round(grid_bstrap.best_score_ * 100, 2)
print(f"Accuracy: {best_accuracy_pct}%")

# One line per hyperparameter of the winning combination.
best_params = grid_bstrap.best_params_
for name in best_params:
    print(f"Hyperparameter: {name}; Value: {best_params[name]}")

In [None]:
## Leave One Out

# Leave-one-out CV: each sample is used once as a single-item test set,
# so the classifier is refit once per row of X.
loocv = LeaveOneOut()
lv_score = cross_val_score(estimator=clf, X=X, y=y, cv=loocv)

# Average of the per-sample scores, rounded to two decimals.
loo_accuracy = round(lv_score.mean(), 2)
print(f"Leave One Out accuracy is {loo_accuracy}")

In [None]:
## ROC-AUC Curve
from sklearn.metrics import roc_auc_score, roc_curve
import seaborn as sns

# Predicted probability of the positive class (second column) from the
# 5-fold grid-search model.
y_pred_prob = grid.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC = {auc}")
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

fig, ax = plt.subplots()
# estimator=None / sort=False draw the ROC points exactly as computed.
# lineplot's default (estimator='mean' plus a bootstrap confidence band)
# averages all TPR values that share an FPR, which flattens the vertical
# steps of the curve and misrepresents it.
sns.lineplot(x=fpr, y=tpr, estimator=None, sort=False, ax=ax)
# Diagonal reference line: performance of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--', color='grey')
ax.set_xlabel('1 - Specificity')
ax.set_ylabel('Sensitivity')
ax.set_title('ROC curve')
plt.show()