In [None]:
from IPython.display import display, HTML

# Notebook-wide display tweaks, injected as CSS through IPython's HTML display
display(HTML("<style>.container { width:100% !important; }</style>")) # Increase cell width
display(HTML("<style>.rendered_html { font-size: 16px; }</style>")) # Increase font size

# Larger figures

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  

import warnings

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import auc, roc_curve, f1_score, accuracy_score, precision_recall_curve, confusion_matrix, classification_report
from sklearn.metrics import recall_score, precision_score
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score

# Practice #1

Classification trees with Titanic data file

## Data Loading

Variable description:

- **survived:	Survival (target variable) - 0 = No, 1 = Yes**
- pclass:	Ticket class - 1 = 1st, 2 = 2nd, 3 = 3rd
- sex:	    Sex
- age:	    Age in years
- sibsp:	# of siblings / spouses aboard the Titanic
- parch:	# of parents / children aboard the Titanic
- ticket:	Ticket number
- fare:	    Passenger fare
- cabin:	Cabin number
- embarked:	Port of Embarkation - C = Cherbourg, Q = Queenstown, S = Southampton


Let's load the dataset and review null values

In [None]:
import openpyxl

# Read data
filename = 'titanic.xlsx'
df = pd.read_excel(filename, 1) # second argument is the 0-based sheet index: the workbook has two sheets, we load the 2nd one
df.head()

In [None]:
# Dataset dimensions (rows, columns) followed by the list of column labels
print(f"Records and variables:  {df.shape}")
print(f"Column names:  {df.columns.to_list()}")

In [None]:
df.isna().sum()  # number of missing values per column

4 variables with null values, mostly cabin and age. <BR>
We can discard cabin since it appears to be meaningless (and also ticket). <BR> 
We will need to delete records with missing values, otherwise the model will fail

In [None]:
pd.unique(df['cabin'])  # distinct cabin identifiers (including the missing marker)

In [None]:
# Dropping cabin and ticket: neither is used as a predictor.
# Re-assigning (instead of inplace=True) and ignoring labels that are already
# gone keeps this cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['cabin', 'ticket'], errors='ignore')

Show the percentage of records in each class of the `survived` variable

In [None]:
from collections import Counter

def class_perc(data):
    """Print the share of each distinct value in ``data`` as a percentage.

    One line per distinct value is printed, in order of first appearance,
    formatted as ``Class '<value>': <pct>%`` with two decimals.
    Nothing is printed when ``data`` is empty.
    """
    total = len(data)

    for label, count in Counter(data).items():
        share = (count / total) * 100
        print(f"Class '{label}': {share:.2f}%")

# Class balance of the target before any row with missing values is removed
class_perc(df['survived'])

In [None]:
df.isna().sum()  # missing values per column once cabin and ticket are gone

In [None]:
# Keep only the records where embarked, age and fare are all present
required_columns = ['embarked', 'age', 'fare']
df = df.dropna(subset=required_columns)
df.describe()

We are going to split the data into training and test sets, keeping the same class distribution of the `survived` variable in each set (a stratified split)

In [None]:
from sklearn.model_selection import train_test_split

def split_df(dataframe, seed=None, percentage=0.8):
    """Split a data frame into stratified training and test partitions.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data holding the predictors plus a 'survived' target column.
    seed : int or None, default None
        Forwarded to ``train_test_split`` as ``random_state``.
    percentage : float, default 0.8
        Fraction of records assigned to the training partition; the test
        partition receives the remaining ``1 - percentage``.

    Returns
    -------
    The value returned by ``train_test_split(X, y, ...)``, i.e.
    ``X_train, X_test, y_train, y_test``.

    Raises
    ------
    KeyError
        If ``dataframe`` has no 'survived' column.
    """
    # Work on the argument itself, not on any notebook-level variable
    X = dataframe.loc[:, dataframe.columns != 'survived']
    y = dataframe['survived']

    # stratify=y keeps the class proportions of the target in both partitions
    return train_test_split(X, y, test_size=1-percentage, random_state=seed, stratify=y)

# percentage=0.5: half of the records go to training and the other half to test
X_train, X_test, Y_train, Y_test = split_df(df, seed=42, percentage=0.5)

In [None]:
# Size of each partition, then the class balance of the target in each one
for title, features in (("Training data set: ", X_train), ("Testing data set: ", X_test)):
    print(title, features.shape)
for target in (Y_train, Y_test):
    class_perc(target)

## Classification Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with default hyper-parameters (only the seed is fixed).
# fit() returns the estimator, so the last line still displays it as cell output.
titanic_tree = DecisionTreeClassifier(random_state=42)
titanic_tree = titanic_tree.fit(X_train, Y_train)
titanic_tree

Let's take a look at the performance of the classifier (initially using accuracy)

In [None]:
from sklearn.metrics import mean_squared_error
# Score the unpruned tree on the held-out test partition
predictions = titanic_tree.predict(X_test)
test_accuracy = accuracy_score(Y_test, predictions)
print(f"Accuracy = {test_accuracy:.4f}")

Now we will take a look at the tree itself. This is a bit more involved, since the Graphviz description that sklearn exports still has to be rendered as an image. To that end, we will make use of an external library: `pydotplus`.

In [None]:
# ! pip install pydotplus

# ! pip install graphviz 
# you may need to install this library directly from https://graphviz.gitlab.io/_pages/Download/Download_windows.html 
# and then uncomment following two lines
#import os
#os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz2.38/bin/'(installation folder)

from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

def plot_tree(tree, feature_names):
    """Render a fitted decision tree as a PNG image for display in the notebook.

    Parameters
    ----------
    tree : fitted decision tree estimator
        Passed unchanged to ``export_graphviz``.
    feature_names : iterable of str or None
        Names used to label the splits. Any iterable (list, tuple,
        pandas Index, ...) is accepted and converted to a plain list;
        ``None`` is forwarded as-is.

    Returns
    -------
    IPython.display.Image
        Image built from the PNG bytes produced by pydotplus.
    """
    # Normalise to a plain list so callers can pass X.columns directly
    names = None if feature_names is None else list(feature_names)

    dot_data = StringIO()
    export_graphviz(tree, out_file=dot_data, feature_names=names,
                    filled=True, rounded=True, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png())

In [None]:
# Draw the unpruned tree, labelling the splits with the training column names
plot_tree(titanic_tree, X_train.columns)

In [None]:
# Size of the unpruned tree: total node count and number of leaves
n_nodes = titanic_tree.tree_.node_count
n_leaves = titanic_tree.get_n_leaves()
print("Number of nodes: ", n_nodes)
print("Number of leaves: ", n_leaves)

We will plot the feature importance

In [None]:
# One bar per predictor; the height is the importance reported by the fitted tree
# (average reduction in impurity from the splits that use that feature)
fig, ax = plt.subplots(figsize=(10,10))
ax.bar(X_train.columns, titanic_tree.feature_importances_)
ax.set_title('Feature Importance', fontsize=16);

### Pruning the tree

Now we will prune the tree to see if we can improve performance.

There are different Pruning Parameters:

 - max_leaf_nodes: Reduce the number of leaf nodes
 - min_samples_leaf: Restrict the size of sample leaf. Minimum sample size in terminal nodes can be fixed to 30, 100, 300 or 5% of total 
 - max_depth: Reduce the depth of the tree to build a generalized tree. Set the depth of the tree to 3, 5, 10 depending after verification on test data
 - etc..

Let's focus on the depth of the tree. We will test different depth thresholds via CV by using the `GridSearchCV` provided by sklearn.

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate depths 1..15, each scored by accuracy with 5-fold cross-validation
param_grid = {'max_depth': range(1,16)}

titanic_tree_pruned_cv = GridSearchCV(
    estimator=titanic_tree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
    verbose=1,
)

titanic_tree_pruned_cv.fit(X_train,Y_train);

In [None]:
# Report the selected depth, then the mean CV accuracy of every candidate
print("Best parameters set found on development set:")
print()
print(titanic_tree_pruned_cv.best_params_)
print()
print("Grid scores on development set:")
print()
# means / stds are kept as notebook-level names: the plotting cell reuses them
means = titanic_tree_pruned_cv.cv_results_['mean_test_score']
stds = titanic_tree_pruned_cv.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, titanic_tree_pruned_cv.cv_results_['params']):
    # Interval shown is +/- two standard deviations across the folds
    print("Accuracy = %0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
# Read the depths from the search results instead of repeating the grid by hand,
# so the x axis always lines up with the scores being plotted
depths = [params['max_depth'] for params in titanic_tree_pruned_cv.cv_results_['params']]

fig, ax = plt.subplots(figsize=(13,10))
# Error bars show one standard deviation of the fold scores
ax.errorbar(depths, means, yerr=stds, fmt='--o')
ax.set_title('Accuracy for different Depths', fontsize=20)
ax.set_xlabel("Depth", fontsize=16)
ax.set_ylabel("Accuracy", fontsize=16);

As can be seen in the plot, the optimal value for the depth of the decision tree is 3.

### Test data set

Let's now calculate the accuracy on the test data set

In [None]:
# Refit with max_depth=3 and score on the held-out test partition
titanic_tree_pruned = DecisionTreeClassifier(max_depth=3, random_state=42)
tree = titanic_tree_pruned.fit(X_train, Y_train)
predictions = titanic_tree_pruned.predict(X_test)
pruned_accuracy = accuracy_score(Y_test, predictions)
print(f"Accuracy = {pruned_accuracy:.4f}")

from sklearn.metrics import recall_score, precision_score
# Probability assigned to the second class (column 1) for every test record
prob_pred = tree.predict_proba(X_test)[:, 1]
# Cut-offs 0.0, 0.1, ..., 0.9; a record is labelled positive when its probability exceeds the cut-off
thresholds = np.arange(0.0, 1.0, step=0.1)
recall_scores = []
precis_scores = []
for cutoff in thresholds:
    labels_at_cutoff = prob_pred > cutoff
    recall_scores.append(metrics.recall_score(Y_test, labels_at_cutoff))
    precis_scores.append(metrics.precision_score(Y_test, labels_at_cutoff))


Great! We have increased the accuracy with a smaller tree.

Now, we represent the different values for the metrics obtained using different thresholds

In [None]:
# Recall and precision as a function of the decision threshold
fig, ax = plt.subplots(1, 1)
for scores, legend in ((recall_scores, "Recall @ t"), (precis_scores, "Precision @ t")):
    ax.plot(thresholds, scores, label=legend)
# Vertical marker at 0.5 for reference
ax.axvline(0.5, c="gray", linestyle="--", label="Default Threshold")
ax.set_xlabel("Threshold")
ax.set_ylabel("Metric @ Threshold")
ax.set_box_aspect(1)
ax.legend()
plt.show()

In [None]:
# Class predictions with a custom 0.3 cut-off on the probability of class 1 (survived).
# - Only column 1 of predict_proba is thresholded, giving one boolean per test record
#   (the same column used to build prob_pred).
# - The estimator is referenced by its own name because `tree` is re-bound to the
#   sklearn.tree module by the later `from sklearn import tree` cell, which would
#   break this cell when it is re-run.
pred = titanic_tree_pruned.predict_proba(X_test)[:, 1] > 0.3
pred

In [None]:
# Size of the pruned tree: total node count and number of leaves
pruned_nodes = titanic_tree_pruned.tree_.node_count
pruned_leaves = titanic_tree_pruned.get_n_leaves()
print("Number of nodes: ", pruned_nodes)
print("Number of leaves: ", pruned_leaves)

And we can represent the set of rules

In [None]:
from sklearn import tree

# Pass the training column names so each rule names the real variable instead of
# the generic feature_<index> placeholder used when no names are given
text_representation = tree.export_text(titanic_tree_pruned,
                                       feature_names=list(X_train.columns))
print(text_representation)
# Keep a copy of the rules on disk
with open("decision_tree.log", "w") as fout:
    fout.write(text_representation)

### Visualization

We are going to dive a bit deeper into the tree.<BR>


In [None]:
import graphviz
# DOT description of the pruned tree (out_file=None returns it as a string)
dot_data = tree.export_graphviz(
    titanic_tree_pruned,
    out_file=None,
    feature_names=X_train.columns,
    class_names=["No","Yes"],
    filled=True,
)

# Wrap the DOT text in a graphviz object; leaving it as the last expression draws it
graph = graphviz.Source(dot_data, format="png")
graph

In [None]:
# Save it into a high resolution image
# NOTE(review): the output name is spelled "graphivz" (probably meant "graphviz");
# left untouched because it determines the name of the file written to disk
graph.render("titanic_tree_graphivz")

## Alternative visualization: dtreeviz
We need to install the powerful library *dtreeviz* for this

In [None]:
## ! pip install dtreeviz
import dtreeviz

# Adapter around the pruned tree and its training data, reused by the cells below
viz_cmodel = dtreeviz.model(
    titanic_tree_pruned,
    X_train=X_train,
    y_train=Y_train,
    target_name='survived',
    feature_names=X_train.columns,
)
viz_cmodel.view(scale=1.5)

Display now frequencies for each node.<BR>

In [None]:
# Number of training records that end up in each leaf
viz_cmodel.leaf_sizes(figsize=(12,6))

We can see that node id=6 is the one with the largest number of people

In [None]:
# Compare it with target classes: how the `survived` classes are distributed inside each leaf
viz_cmodel.ctree_leaf_distributions(figsize=(12,6))

Let's explore that node

In [None]:
# Statistics for node 6, the node singled out in the leaf-size chart
viz_cmodel.node_stats(node_id=6)

And highlight it in the tree so that we can take a closer look at it

In [None]:
# Training record at position 1, handed to the viewer as the sample to trace through the tree
x = X_train.iloc[1]
viz_cmodel.view(x=x, scale=1.5)

## Model performance

Finally, let's analyze the model performance (on the test data set)

In [None]:
import seaborn as sns

# Confusion matrix of the latest `predictions` against the test labels
conf_mat = confusion_matrix(Y_test, predictions)

fig, ax = plt.subplots()
# confusion_matrix places the actual classes on the rows and the predicted classes
# on the columns, so for binary labels the row-major (flattened) order is TN, FP, FN, TP
label_names = ['TN','FP','FN','TP']
label_counts = ['{0:0.0f}'.format(value) for value in conf_mat.flatten()]
labels = [f'{v1}\n{v2}' for v1, v2 in zip(label_names,label_counts)]
labels = np.asarray(labels).reshape(2,2)

# Draw heatmap using confusion matrix, on the axes created above
sns.heatmap(conf_mat, annot=labels, fmt='', ax=ax)
ax.set_title('Confusion matrix')
# Heatmap columns (x axis) are predictions, rows (y axis) are the true labels
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Actual Values')
