# Decisiton Tree interpretability notebook

In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.tree import plot_tree
from dtreeviz.trees import *
from pycaret import classification

### Exploratory data analysis
Import to specify correctly the data path. Initally we can make an easy exploration.

In [2]:
data_folder_path = os.path.join('..', 'data')
data_file = 'ds.csv'

In [3]:
df = pd.read_csv(os.path.join(data_folder_path, data_file))
df.describe()
df.gender.value_counts()

1    2700
0    1511
Name: gender, dtype: int64

In [None]:
playerTypes = pd.get_dummies(df['PlayerType'])
df = pd.concat([df.drop("PlayerType", axis=1), playerTypes], axis=1)
df.head()

### Classification Set-up
Definition of main model hyperparameters. Numeric features and target with full description available.

In [4]:
classification_setup = classification.setup(
    data=df,
    target='gender',
    numeric_features=[c for c in df.columns if c not in ['gender', 'matchPeriod', 'PlayerType']]
)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,3472
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(4211, 33)"
4,Missing Values,False
5,Numeric Features,30
6,Categorical Features,2
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Initial exploration to understand general performance metrics for different classification algorithms (Focus at Accuracy and AUC)

In [None]:
classification.compare_models()

Decision tree implementation, prunning of the tree at 40 samples per leaf and looking at the Entropy gain of each split.

Model evaluated with a 10-fold cross validation.

In [5]:
classification.set_config('seed', 7940)
dt_model = classification.create_model('dt', min_samples_leaf=40, criterion='entropy')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8508,0.9235,0.9206,0.8571,0.8878,0.6664,0.6702
1,0.8441,0.9101,0.8889,0.8705,0.8796,0.6585,0.6588
2,0.8407,0.926,0.8889,0.866,0.8773,0.6503,0.6508
3,0.8407,0.9208,0.8519,0.8944,0.8726,0.6603,0.6617
4,0.8441,0.9241,0.8836,0.8743,0.8789,0.6599,0.66
5,0.8542,0.9298,0.8942,0.8802,0.8871,0.6814,0.6816
6,0.8542,0.9197,0.8836,0.8883,0.8859,0.6841,0.6841
7,0.8741,0.936,0.9101,0.8958,0.9029,0.7242,0.7244
8,0.8265,0.918,0.8677,0.8632,0.8654,0.6214,0.6214
9,0.8469,0.9177,0.873,0.8871,0.88,0.6688,0.6689


In [None]:
dt_model

In [None]:
tuned_dt_model = classification.tune_model(dt_model)

In [None]:
dt_model = classification.load_model('./static_models/dt_6_10')

In [None]:
dt_model

### Decision Tree implementation
- Feature importance as an aggregated from each split.
- Full Tree visualization.

In [None]:
classification_setup[0].columns

In [None]:
plot_options = ["auc","threshold","pr","confusion_matrix","error","class_report","boundary","rfe","learning","manifold","calibration","vc","dimension","feature","parameter"]
classification.plot_model(dt_model, plot='feature', save=True)
#Importance of the features measured by how much the nod purity is imporved on average.

In [None]:
interpretation_options = ['summary', 'correlation', 'reason']
classification.interpret_model(dt_model, interpretation_options[0])

In [None]:
classification.interpret_model(dt_model, interpretation_options[1], feature='Clearances')

In [7]:
viz = dtreeviz(dt_model, classification_setup[0], df.gender, target_name='gender', feature_names=classification_setup[0].columns, class_names=['Female', 'Male'], orientation='TD', fontname='serif')
viz.view()

In [None]:
classification.plot_model(dt_model, plot='boundary')

In [None]:

plot_tree(dt_model, filled=True)

### Scientific Reporting

In [None]:
coef_df = pd.DataFrame({'Feature': classification_setup[0].columns, 'Coefficients': dt_model.feature_importances_})

In [None]:
coef_df.sort_values(by=['Coefficients'], ascending=False).head(10).to_latex(index=False)