In [None]:
## Decision Tree Modeling in Python

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
# Seed NumPy's global RNG once, up front, so that anything drawing from the
# global random state later in the notebook is repeatable on a fresh
# Restart Kernel -> Run All.
np.random.seed(66)

In [None]:
# Load dataset and preprocessing.
# The target column is 'Churn?'; the remaining columns are candidate features
# (21 variables per the original note - TODO confirm whether that count includes the target).
# Decision tree induction in sklearn needs numeric features, so object-typed
# columns are either converted to integers or dropped below.
CHURN_CSV_URL = 'https://raw.githubusercontent.com/yhat/demo-churn-pred/master/model/churn.csv'
YES_NO_TO_INT = {"yes": 1, "no": 0}

# Keep the raw frame untouched so this cell can be re-run without side effects.
churn_raw = pd.read_csv(CHURN_CSV_URL)
print(churn_raw.columns)

# Encode both yes/no plan flags the same way. `.map` is used for both columns:
# it yields a numeric column directly and avoids the downcasting FutureWarning
# that `.replace` emits on newer pandas. Any value other than "yes"/"no" becomes NaN.
churn = churn_raw.assign(**{
    "Int'l Plan": churn_raw["Int'l Plan"].map(YES_NO_TO_INT),
    'VMail Plan': churn_raw['VMail Plan'].map(YES_NO_TO_INT),
})

# Show which columns are still object-typed after the encoding step.
print(churn.select_dtypes('object').columns)

# sklearn expects all numerical attributes, so drop the remaining text identifiers.
churn = churn.drop(columns=['State', 'Phone'])
churn.head()

In [None]:
## Model Training
# X holds every column except the target ('Churn?'); y holds only the target,
# which is what the model predicts.
# The rows are split into training (70%) and testing (30%) sets with a fixed random seed of 1.
RANDOM_STATE = 1

X = churn.drop(columns='Churn?')
y = churn['Churn?']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)
print(f"train data size is {X_train.shape}")

# Create the (not yet tuned) decision tree classifier.
# An explicit random_state keeps the fitted trees repeatable even when cells are
# re-run and the global NumPy RNG has already been consumed.
clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
clf

In [None]:
# Summary of the training features (column names, non-null counts, dtypes).
# Use it to confirm that every variable has a numeric dtype before fitting.
X_train.info()

In [None]:
## Model Hyperparameter Fine-Tuning
# Candidate settings explored for the decision tree:
#   criterion         - impurity measure used to score candidate splits
#   min_samples_split - minimum number of samples needed to split an internal node
#   max_depth         - maximum depth of the decision tree
#   min_samples_leaf  - minimum number of samples required at a leaf node
#   max_leaf_nodes    - maximum number of leaf nodes in the decision tree
# NOTE(review): a binary tree with at most 20 leaves cannot be deeper than 19 levels,
# so the max_depth values 20/25/30 never bind with this max_leaf_nodes list.
param_grid = dict(
    criterion=['gini', 'entropy'],
    min_samples_split=[2, 10, 20],
    max_depth=[5, 10, 20, 25, 30],
    min_samples_leaf=[1, 5, 10],
    max_leaf_nodes=[2, 5, 10, 20],
)

# GridSearchCV tries every combination in param_grid and scores each one by
# cross-validation: cv=10 splits the training data into 10 folds, and accuracy
# is the performance metric used to rank the combinations.
grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=10)

# Fit on the training set; this iterates through all hyperparameter combinations above.
grid.fit(X_train, y_train)


In [None]:
## Output Best Model Hyperparameters
# Prints the best mean cross-validated accuracy, then one "name: value" line per
# winning hyperparameter. In the author's recorded run the entropy criterion won
# with an accuracy of about 0.94 - re-check after re-running.
best_settings = grid.best_params_

print(grid.best_score_)
for name in best_settings:
    print(f"{name}: {best_settings[name]}")

In [None]:
# Visualization: relative frequency of each class in the target column.
# Per the original note, more than 80% of the rows are False (no churn), so the
# classes are imbalanced - keep that in mind when reading the accuracy score.
import matplotlib.pyplot as plt

# Explicit figure/axes instead of the pyplot state machine, so the bar chart is
# drawn on exactly this Axes.
fig, ax = plt.subplots(figsize=(16, 8))
churn['Churn?'].value_counts(normalize=True).plot.bar(rot=45, ax=ax)
# The trailing semicolon suppresses the repr of the returned artists in the cell output.
ax.set(title="Churn class distribution", xlabel="Churn Status", ylabel="Distribution");