In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## 3 DECISIONTREES/DECISIONTREES/DECISIONTREES DECISIONTREES 2 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [None]:
#=================================================-
#### Slide 4: Loading packages  ####

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

#import graphviz
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from matplotlib.legend_handler import HandlerLine2D




In [None]:
#=================================================-
#### Slide 5: Directory settings  ####

# Set 'main_dir' to location of the project folder
# (two levels above the directory this notebook is run from).
home_dir = Path(".").resolve()
main_dir = home_dir.parent.parent
print(main_dir)

# data_dir / plot_dir stay plain strings because later cells build file
# paths from them by string concatenation.
data_dir = str(main_dir / "data")
print(data_dir)

plot_dir = str(main_dir / "plots")
# exist_ok=True creates the folder only when it is missing, replacing the
# separate exists()/makedirs() pair, so re-running the cell is always safe.
os.makedirs(plot_dir, exist_ok=True)
print(plot_dir)





In [None]:
#=================================================-
#### Slide 6: Load the dataset  ####

# Read the stroke dataset from the data folder and preview the first rows.
csv_path = f"{data_dir}/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)
print(df.head())




In [None]:
#=================================================-
#### Slide 7: Subset data  ####

# Keep only the columns used below: the predictors, the target ('stroke')
# and the row identifier ('id').
selected_columns = [
    'age', 'avg_glucose_level', 'heart_disease', 'ever_married',
    'hypertension', 'Residence_type', 'gender', 'smoking_status',
    'work_type', 'stroke', 'id',
]
df = df[selected_columns]
print(df.head())




In [None]:
#=================================================-
#### Slide 8: Data prep: check for NAs  ####

 # Check for NAs.
print(df.isnull().sum())
percent_missing = df.isnull().sum() * 100 / len(df)
print(percent_missing)




In [None]:
#=================================================-
#### Slide 9: Data prep: check for NAs  ####

# Drop every column in which 50% or more of the values are NaN.
perc = 50.0
n_rows = df.shape[0]
# dropna(thresh=...) keeps a column only if it holds at least this many
# non-NaN values.
min_count = int(n_rows * ((100 - perc) / 100) + 1)
df = df.dropna(axis=1, thresh=min_count)
print(df.shape)
# Function to impute NA in both numeric and categorical columns
def fillna(df):
    """Return a copy of ``df`` with missing values imputed.

    Numeric columns are filled with their column mean; all other columns are
    filled with their most frequent value (the first mode).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; a filled copy is returned.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with NaNs replaced wherever a fill value exists.
        Columns that are entirely NaN have no mean/mode and are left as-is.
    """
    # Work on a copy so the caller's frame is never mutated behind its back.
    filled = df.copy()

    numeric_columns = filled.select_dtypes(include='number').columns
    if len(numeric_columns) > 0:
        column_means = filled[numeric_columns].mean().to_dict()
        filled[numeric_columns] = filled[numeric_columns].fillna(column_means)

    categorical_columns = filled.select_dtypes(exclude='number').columns
    if len(categorical_columns) > 0:
        modes = filled[categorical_columns].mode()
        # mode() is empty when there is nothing to count; indexing row 0
        # would then raise IndexError, so only fill when a mode exists.
        if not modes.empty:
            filled[categorical_columns] = filled[categorical_columns].fillna(modes.iloc[0])

    return filled

# Impute the remaining missing values and continue with the returned frame.
df = fillna(df)




In [None]:
#=================================================-
#### Slide 10: Data prep: target  ####

print(df['stroke'].dtypes)
# Identify the two unique classes:
# values above the column mean become 1, everything else 0.
threshold = df['stroke'].mean()
stroke_binary = np.where(df['stroke'] > threshold, 1, 0)
df['stroke'] = stroke_binary
# The smaller of the two classes maps to False, the other one to True.
unique_values = sorted(df['stroke'].unique())
lowest_class = unique_values[0]
df['stroke'] = df['stroke'].ne(lowest_class).to_numpy()
# Check class again.
print(df['stroke'].dtypes)




In [None]:
#=================================================-
#### Slide 11: Summarize the data  ####

# Print pandas' summary statistics for the prepared frame.
print(df.describe())




In [None]:
#=================================================-
#### Slide 15: Decision Tree: splitting the data  ####

# Separate the predictors (X) from the target (y).
# 'id' is dropped from X together with the target column 'stroke'.
columns_to_drop_from_X = ['stroke', 'id']
X = df.drop(columns=columns_to_drop_from_X)
y = np.array(df['stroke'])




In [None]:
#=================================================-
#### Slide 16: Data prep: numeric variables  ####

# One-hot encode the listed columns as float indicators; drop_first=True
# removes the first level of each column.
categorical_features = [
    'heart_disease', 'ever_married', 'hypertension', 'Residence_type',
    'gender', 'smoking_status', 'work_type',
]
X = pd.get_dummies(X, columns=categorical_features, drop_first=True, dtype=float)
print(X.dtypes)




In [None]:
#=================================================-
#### Slide 17: Decision Tree: running the algorithm  ####

# Implement the decision tree on X.
# The classifier uses default settings and is fitted on the complete X / y;
# no train/test split is involved in this cell.
clf = tree.DecisionTreeClassifier()
clf_fit = clf.fit(X, y)

# Look at our generated model:
print(clf_fit)




In [None]:
#=================================================-
#### Slide 18: Visualize: plot_tree  ####

# Set figure size
fig = plt.figure(figsize=(25,20))
# Visualize the fitted tree `clf_fit`.
# feature_names is passed as a plain list: some scikit-learn releases
# validate this parameter as a list and reject a pandas Index.
tree.plot_tree(clf_fit,
               feature_names=list(X.columns),
               filled=True)
# Save figure
plt.savefig(str(plot_dir)+'/tree.png',format='png',bbox_inches = "tight")




In [None]:
#=================================================-
#### Slide 21: Split into train and test sets  ####

# Split into train (70%) and test (30%).
# random_state makes the split reproducible across kernel restarts;
# stratify=y keeps the class proportions of y the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)




In [None]:
#=================================================-
#### Slide 22: Fit Decision Tree and predict  ####

# Implement the decision tree on X_train.
# `clf` and `clf_fit` are rebound here to a new default-parameter classifier
# trained only on the training split.
clf = tree.DecisionTreeClassifier()
clf_fit = clf.fit(X_train, y_train)

# Predict on X_test.
y_predict = clf_fit.predict(X_test)
# Show the first 20 predicted labels.
y_predict[:20]


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################


In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## 3 DECISIONTREES/DECISIONTREES/DECISIONTREES DECISIONTREES 3 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [None]:
#=================================================-
#### Slide 5: Evaluate the model (cont'd)  ####

# Evaluate the first model on the test split:
# confusion matrix (kept for the plot) and overall accuracy.
cm_tree = confusion_matrix(y_true=y_test, y_pred=y_predict)
acc_score = accuracy_score(y_true=y_test, y_pred=y_predict)
print(acc_score)




In [None]:
#=================================================-
#### Slide 6: Plot confusion matrix  ####

# Draw the confusion matrix as a heat map on a cleared figure.
plt.clf()
plt.imshow(cm_tree, interpolation='nearest', cmap=plt.cm.Wistia)
plt.title('Confusion Matrix - Test Data')
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Label both axes with the class names.
classNames = ['Negative','Positive']
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)

# Annotate each cell with its role (rows = true label, columns = predicted
# label) and the corresponding count.
s = [['TN','FP'], ['FN', 'TP']]
for row, row_labels in enumerate(s):
    for col, cell_label in enumerate(row_labels):
        plt.text(col, row, cell_label + " = " + str(cm_tree[row][col]))
plt.show()




In [None]:
#=================================================-
#### Slide 7: Plot ROC and calculate AUC  ####


# Calculate metrics for ROC (fpr, tpr) and calculate AUC.
# roc_curve needs scores, not hard class labels: with 0/1 predictions the
# curve collapses to a single operating point. Use the predicted probability
# of the positive class (second column; classes_ is ordered [False, True]).
y_score = clf_fit.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr, tpr)

# Plot ROC.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()




In [None]:
#=================================================-
#### Slide 9: Decision Tree: build  ####

# Set up a decision tree classifier with default parameters.
clf = tree.DecisionTreeClassifier()
print(clf)




In [None]:
#=================================================-
#### Slide 10: Decision Tree: fit  ####

# Fit the model on the training split; fit() returns the fitted estimator.
clf_fit = clf.fit(X_train, y_train)




In [None]:
#=================================================-
#### Slide 11: Decision Tree: predict  ####

# Predict on X_test and print the first 20 predicted labels.
y_predict = clf_fit.predict(X_test)
print(y_predict[:20])




In [None]:
#=================================================-
#### Slide 12: Decision Tree: accuracy score  ####

# Compute test model accuracy score (share of test rows predicted correctly).
tree_accuracy_score = metrics.accuracy_score(y_test, y_predict)
print("Accuracy on test data: ", tree_accuracy_score)




In [None]:
#=================================================-
#### Slide 13: Decision Tree: train accuracy  ####

# Accuracy of the fitted tree on the data it was trained on.
acc_train_tree = clf_fit.score(X_train, y_train)
print("Train Accuracy:", acc_train_tree)





In [None]:
#=================================================-
#### Slide 14: Decision Tree: accuracy  ####

# Record the test accuracy (rounded to 4 decimals) of this model for later use.
model_final_tree = dict(
    metrics="accuracy",
    values=round(tree_accuracy_score, 4),
    model='tree_all_variables',
)
print(model_final_tree)




In [None]:
#=================================================-
#### Slide 20: Cross-validation scores  ####

# 10-fold cross-validation of a default decision tree on the full X / y.
clf = tree.DecisionTreeClassifier()
cv_scores = cross_val_score(clf, X, y, cv=10)

# Print each cv score (accuracy) and average them.
print(cv_scores)
mean = np.mean(cv_scores)
print("cv_scores mean:{}".format(mean))
print("Optimal cv score is:", round(mean, 4))


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################


In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## 3 DECISIONTREES/DECISIONTREES/DECISIONTREES DECISIONTREES 4 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [None]:
#=================================================-
#### Slide 4: Define an optimal number function  ####

# Define function that will determine the optimal number for each parameter.
def optimal_parameter(values, test_results):
    """Return the candidate value whose test score is highest.

    Parameters
    ----------
    values : indexable sequence (list, range, NumPy array, ...)
        Candidate parameter values, in the order they were evaluated.
    test_results : iterable of scores
        One score per candidate, aligned with ``values``. Lists, tuples and
        NumPy arrays are all accepted.

    Returns
    -------
    The element of ``values`` at the position of the best score. On ties the
    first (earliest) candidate wins.

    Raises
    ------
    ValueError
        If ``test_results`` is empty.
    """
    # Materialize as a list so .index() is available for any iterable
    # (NumPy arrays, for example, have no .index method).
    scores = list(test_results)
    if not scores:
        raise ValueError("test_results is empty; no parameter value can be selected")
    best_index = scores.index(max(scores))
    return values[best_index]




In [None]:
#=================================================-
#### Slide 6: Optimize: max depth  ####

# Max depth: try tree depths 1 through 32.
max_depths = range(1, 33)
train_results = []
test_results = []

for depth in max_depths:
    dt = DecisionTreeClassifier(max_depth=depth).fit(X_train, y_train)
    # score() is the accuracy of the tree's predictions on the given data.
    train_results.append(dt.score(X_train, y_train))
    test_results.append(dt.score(X_test, y_test))

# Store optimal max_depth.
# NOTE(review): the value is selected by test-set accuracy, so the test set
# is not an independent check of the tuned model.
optimal_max_depth = optimal_parameter(max_depths, test_results)




In [None]:
#=================================================-
#### Slide 7: Plot: max depth  ####

# Train vs. test accuracy for tree depths 1 - 32.
line1 = plt.plot(max_depths, train_results, 'b', label="Train accuracy")[0]
line2 = plt.plot(max_depths, test_results, 'r', label="Test accuracy")[0]

# The legend entry of the train line is drawn with two marker points.
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel('Tree depth')
plt.ylabel('Accuracy')
plt.show()




In [None]:
#=================================================-
#### Slide 9: Optimize: min samples split  ####

# Min_samples_split: ten evenly spaced float candidates from 0.1 to 1.0.
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
train_results = []
test_results = []

for split_fraction in min_samples_splits:
    dt = DecisionTreeClassifier(min_samples_split=split_fraction).fit(X_train, y_train)
    # score() is the accuracy of the tree's predictions on the given data.
    train_results.append(dt.score(X_train, y_train))
    test_results.append(dt.score(X_test, y_test))

# Store optimal min_samples_split.
# NOTE(review): the value is selected by test-set accuracy, so the test set
# is not an independent check of the tuned model.
optimal_min_samples_split = optimal_parameter(min_samples_splits, test_results)




In [None]:
#=================================================-
#### Slide 10: Plot: min samples split  ####

# Train vs. test accuracy for each min_samples_split candidate.
line1 = plt.plot(min_samples_splits, train_results, 'b', label="Train accuracy")[0]
line2 = plt.plot(min_samples_splits, test_results, 'r', label="Test accuracy")[0]

# The legend entry of the train line is drawn with two marker points.
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel('min samples split')
plt.ylabel('Accuracy')
plt.show()





In [None]:
#=================================================-
#### Slide 12: Optimize: min samples leaf  ####

# Min_samples_leaf: five evenly spaced float candidates from 0.1 to 0.5.
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
train_results = []
test_results = []

for leaf_fraction in min_samples_leafs:
    dt = DecisionTreeClassifier(min_samples_leaf=leaf_fraction).fit(X_train, y_train)
    # score() is the accuracy of the tree's predictions on the given data.
    train_results.append(dt.score(X_train, y_train))
    test_results.append(dt.score(X_test, y_test))

# Store optimal min_samples_leaf.
# NOTE(review): the value is selected by test-set accuracy, so the test set
# is not an independent check of the tuned model.
optimal_min_samples_leafs = optimal_parameter(min_samples_leafs, test_results)




In [None]:
#=================================================-
#### Slide 13: Plot: min samples leaf  ####

# Train vs. test accuracy for each min_samples_leaf candidate.
line1 = plt.plot(min_samples_leafs, train_results, 'b', label="Train accuracy")[0]
line2 = plt.plot(min_samples_leafs, test_results, 'r', label="Test accuracy")[0]

# The legend entry of the train line is drawn with two marker points.
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel('min samples leafs')
plt.ylabel('Accuracy')
plt.show()




In [None]:
#=================================================-
#### Slide 15: Optimize: max features  ####

# Max_features: try every feature count from 1 up to and including the
# total number of columns in X. (range() excludes its stop value, so the
# "+ 1" is required for the all-features case to be evaluated.)
max_features = list(range(1, X.shape[1] + 1))
train_results = []
test_results = []

for max_feature in max_features:
    dt = DecisionTreeClassifier(max_features=max_feature)
    dt.fit(X_train, y_train)

    # Add accuracy score to previous train results
    train_pred = dt.predict(X_train)
    acc_train = accuracy_score(y_train, train_pred)
    train_results.append(acc_train)

    # Add accuracy score to previous test results
    y_pred = dt.predict(X_test)
    acc_test = accuracy_score(y_test, y_pred)
    test_results.append(acc_test)

# Store optimal max_features.
# NOTE(review): the value is selected by test-set accuracy, so the test set
# is not an independent check of the tuned model.
optimal_max_features = optimal_parameter(max_features, test_results)




In [None]:
#=================================================-
#### Slide 16: Plot: max features  ####

# Train vs. test accuracy for each max_features candidate.
line1 = plt.plot(max_features, train_results, 'b', label="Train accuracy")[0]
line2 = plt.plot(max_features, test_results, 'r', label="Test accuracy")[0]

# The legend entry of the train line is drawn with two marker points.
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel('max features')
plt.ylabel('Accuracy')
plt.show()




In [None]:
#=================================================-
#### Slide 18: Optimized model  ####

# Report the value selected for each tuned hyperparameter.
optimal_summary = (
    ("The optimal max depth is:", optimal_max_depth),
    ("The optimal min samples split is:", optimal_min_samples_split),
    ("The optimal min samples leaf is:", optimal_min_samples_leafs),
    ("The optimal max features is:", optimal_max_features),
)
for message, value in optimal_summary:
    print(message, value)




In [None]:
#=================================================-
#### Slide 19: Build optimized model  ####

# Set the seed of NumPy's global random generator.
np.random.seed(1)

# Collect the tuned hyperparameters and build the Decision Tree from them.
optimized_params = {
    'max_depth': optimal_max_depth,
    'min_samples_split': optimal_min_samples_split,
    'min_samples_leaf': optimal_min_samples_leafs,
    'max_features': optimal_max_features,
}
clf_optimized = tree.DecisionTreeClassifier(**optimized_params)

# The printed model now shows the tuned values instead of the defaults.
print(clf_optimized)

# Fit the optimized tree on X_train.
clf_optimized_fit = clf_optimized.fit(X_train, y_train)




In [None]:
#=================================================-
#### Slide 20: Predict with optimized model  ####

# Predict the test split with the optimized tree and measure its accuracy.
y_predict_optimized = clf_optimized_fit.predict(X_test)
acc_score_tree_optimized = accuracy_score(y_true=y_test, y_pred=y_predict_optimized)
print(acc_score_tree_optimized)




In [None]:
#=================================================-
#### Slide 21: Train accuracy  ####

# Accuracy of the optimized tree on the data it was trained on.
acc_train_tree_optimized = clf_optimized_fit.score(X_train, y_train)
print("Train Accuracy:", acc_train_tree_optimized)




In [None]:
#=================================================-
#### Slide 23: Predict and save results  ####

# Record the test accuracy (rounded to 4 decimals) of the optimized model.
# This rebinds `model_final_tree` to a new summary dict.
model_final_tree = dict(
    metrics="accuracy",
    values=round(acc_score_tree_optimized, 4),
    model='tree_all_variables_optimized',
)
print(model_final_tree)




In [None]:
#=================================================-
#### Slide 25: Exercise  ####




#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################
