readings:
https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py

https://roamanalytics.com/2016/10/28/are-categorical-variables-getting-lost-in-your-random-forests/

In [None]:
!pip install seaborn

In [None]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

### Titanic data-set
Can you predict who survived and who did not?

In [None]:
# Read the Titanic train and test CSV files from the local data folder.
train = pd.read_csv("../data/titanic/train.csv")
test = pd.read_csv("../data/titanic/test.csv")

In [None]:
# Number of missing values per training column, largest count first.
total = (
    train.isna()
    .sum()
    .sort_values(ascending=False)
)
# Shapes of the two frames, shown one after the other.
display(train.shape)
display(test.shape)

In [None]:
# Show the per-column missing-value counts computed in the previous cell.
total

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes).
train.info()

In [None]:
# These object-typed columns have to be dealt with before modelling;
# look at their first rows to decide how.
train[['Name', 'Ticket', 'Cabin']].head()

In [None]:
# Name is unique, Cabin has too many NaNs, and Ticket is close to unique as well,
# so all three columns are removed from both frames.
unused_columns = ['Name', 'Ticket', 'Cabin']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

Next, we replace the string-variables with numeric (integer) variables and fill missing values in age with the most frequent age.

In [None]:
# Build one integer code per distinct value, using the values seen in train AND test so
# that both frames share the same encoding.
# The values are sorted by their string form before they are numbered: the iteration
# order of a set of strings is not stable across interpreter sessions, so numbering the
# raw set would give different codes (and different downstream results) from run to run.
# key=str keeps the sort well-defined if a missing value (NaN) is among the values.
unique_embarked = set(train.Embarked.unique()).union(test.Embarked.unique())
embarked = {value: code for code, value in enumerate(sorted(unique_embarked, key=str))}
unique_sex = set(train.Sex.unique()).union(test.Sex.unique())
sex = {value: code for code, value in enumerate(sorted(unique_sex, key=str))}

data = [train, test]
for dataset in data:
    # Replace the string values by their integer codes.
    dataset['Embarked'] = dataset['Embarked'].map(embarked)
    dataset['Sex'] = dataset['Sex'].map(sex)
    # Fill missing ages with the most frequent age of the same frame.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mode().values[0])

In [None]:
# Show the value -> integer code mapping that was applied to the Embarked column.
embarked

Not necessary, but we cast our categorical variables to the data type 'category'.

In [None]:
# Split the column names by dtype: object-typed columns are treated as categorical,
# all remaining columns as numerical.
is_object = train.dtypes == 'O'
categorical = list(train.columns[is_object])
numerical = list(train.columns[~is_object])

# Cast the categorical columns to the 'category' dtype in both frames.
train[categorical] = train[categorical].astype('category')
test[categorical] = test[categorical].astype('category')

In [None]:
# Show which columns were classified as categorical in the previous cell.
categorical

### we fit a RandomForest Classifier

In [None]:
# Separate the target from the features, then fit a random forest with default
# settings on the remaining columns.
survived = train['Survived']
train = train.drop(columns=['Survived'])
rf = RandomForestClassifier()
rf.fit(X=train, y=survived)

### The RandomForest Classifier comes with a feature_importances_ attribute

In [None]:
# Horizontal bar chart of the forest's impurity-based feature importances,
# ordered by importance (argsort is ascending, so the largest bar gets the highest y).
tree_feature_importances = rf.feature_importances_
feature_names = train.columns
sorted_idx = tree_feature_importances.argsort()
y_ticks = np.arange(0, len(feature_names))
fig, ax = plt.subplots()
ax.barh(y_ticks, tree_feature_importances[sorted_idx])
# Fix the tick positions BEFORE assigning the labels: matplotlib documents that
# set_yticklabels should only be used after set_yticks, otherwise the labels can
# end up at unexpected positions.
ax.set_yticks(y_ticks)
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_title("Random Forest Feature Importances (Mean Decrease Impurity)")
fig.tight_layout()
plt.show()

### Why is PassengerId such an important variable?

We could even enhance the information contained in PassengerId by target-encoding this variable.<br> No information is lost, results should not be worse.

By the way, what exactly are we doing in the for-loop?

In [None]:
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
k = 5  # number of cross-validation folds

# cols is given as a list, the same form used for the OneHotEncoder in this notebook.
encoder = CatBoostEncoder(cols=['PassengerId'])
folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=1234)

# Out-of-fold target encoding of PassengerId:
#   * in every iteration the encoder is fitted on the training part of the split only
#     and then applied to the held-out part, so the encoded value of a training row
#     never comes from an encoder that was fitted on that row;
#   * the test frame is encoded once per fold and the k results are averaged.
test_PassId = np.zeros(test.shape[0])
train_trg = train.copy()
test_trg = test.copy()
for train_fold_idx, test_fold_idx in folds.split(train, survived):
    # split() yields row POSITIONS, so rows are selected with .iloc; label-based
    # selection would only be correct for a default 0..n-1 index.
    encoder.fit(train.iloc[train_fold_idx], survived.iloc[train_fold_idx])
    held_out = encoder.transform(train.iloc[test_fold_idx])
    # The assignment aligns on index labels; transform() is expected to keep the
    # labels of the rows it was given.
    train_trg.loc[held_out.index, :] = held_out
    test_PassId += (encoder.transform(test)['PassengerId']/k)
test_trg['PassengerId'] = test_PassId

In [None]:
# Compare the original test frame with its target-encoded copy.
display(test.head())
display(test_trg.head())

In [None]:
# Refit the forest on the frame with the target-encoded PassengerId and plot its
# impurity-based feature importances, ordered by importance.
rf = RandomForestClassifier()
rf.fit(train_trg, survived)
tree_feature_importances = rf.feature_importances_
# Take the labels from the frame the model was actually fitted on.
feature_names = train_trg.columns
sorted_idx = tree_feature_importances.argsort()
y_ticks = np.arange(0, len(feature_names))
fig, ax = plt.subplots()
ax.barh(y_ticks, tree_feature_importances[sorted_idx])
# Fix the tick positions BEFORE assigning the labels: matplotlib documents that
# set_yticklabels should only be used after set_yticks, otherwise the labels can
# end up at unexpected positions.
ax.set_yticks(y_ticks)
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_title("Random Forest Feature Importances (Mean Decrease Impurity)")
fig.tight_layout()
plt.show()

### Surprise!
What exactly happened? The relevant information about the target should still be contained in the mean-encoded PassengerId-variable.

In [None]:
# Distinct values of the encoded PassengerId column, first for train, then for test.
display(train_trg['PassengerId'].unique())
display(test_trg['PassengerId'].unique())

### Can we make the old PassengerId-variable shine even brighter?

In [None]:
# One-hot encode Age and Sex; the encoder is fitted on the training frame only and
# then applied to both frames.
encoder = OneHotEncoder(cols=['Age', 'Sex'])
encoder.fit(train)
train_oh = encoder.transform(train)
test_oh = encoder.transform(test)

In [None]:
# Refit the forest on the one-hot encoded frame and plot its impurity-based feature
# importances, ordered by importance. The figure is tall because the one-hot encoding
# adds many columns.
rf = RandomForestClassifier()
rf.fit(train_oh, survived)
tree_feature_importances = rf.feature_importances_
feature_names = train_oh.columns
sorted_idx = tree_feature_importances.argsort()
y_ticks = np.arange(0, len(feature_names))
fig, ax = plt.subplots(figsize=(4, 12))
ax.barh(y_ticks, tree_feature_importances[sorted_idx])
# Fix the tick positions BEFORE assigning the labels: matplotlib documents that
# set_yticklabels should only be used after set_yticks, otherwise the labels can
# end up at unexpected positions.
ax.set_yticks(y_ticks)
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_title("Random Forest Feature Importances (Mean Decrease Impurity)")
fig.tight_layout()
plt.show()

# Gini-coefficient (gini impurity) for splitting variables

\begin{equation*}
\text{gini}=1-\sum_{i=1}^{n} P_i^2
\end{equation*}
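where $i$ is the class index, $n$ is the number of classes and $P_i$ is the proportion of the node's elements that belong to class $i$.
The Gini impurity measures the purity of a node. If all elements within one node fall in one class, $P_i=1$ for that class and the impurity is at its minimum (0).<br>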
Gini impurity favours splitting repeatedly on continuous variables and thus cutting out regions of very low impurity. The continuous variable itself might even contain no real information at all (PassengerId).

# Permutation variable importance

In [None]:
# First rows of the feature frame and of the target series.
display(train.head())
display(survived.head())

In [None]:
from sklearn.inspection import permutation_importance

# Fit a fresh forest and measure permutation importance on the same data it was
# fitted on (30 shuffles per feature, fixed seed for the shuffling).
rf_model = RandomForestClassifier().fit(train, survived)
result = permutation_importance(
    rf_model, train, survived, n_repeats=30, random_state=0
)

# Order the features by their mean importance (ascending).
sorted_idx = result.importances_mean.argsort()
ordered_importances = result.importances[sorted_idx].T
ordered_labels = train.columns[sorted_idx]

# One horizontal box per feature, summarising its 30 repeats.
fig, ax = plt.subplots()
ax.boxplot(ordered_importances, vert=False, labels=ordered_labels)
ax.set_title("Permutation Importances")
fig.tight_layout()
plt.show()

In [None]:
import dtreeviz
from sklearn import tree
from dtreeviz.trees import *

# Fit one shallow decision tree on the same features and target, then render it.
# NOTE(review): `dtreeviz(...)` is called as a function here; presumably that name is
# provided by the star import from dtreeviz.trees in this cell — confirm against the
# installed dtreeviz version.
classifier = tree.DecisionTreeClassifier(max_depth=5)  # limit depth of tree
classifier.fit(train, survived)

viz = dtreeviz(classifier, 
               train, 
               survived,
               target_name='survived',
               feature_names=train.columns, 
               class_names=["died", "survived"]  # need class_names for classifier
              )  
              
# NOTE(review): view() presumably opens the rendering in an external viewer — confirm.
viz.view() 

In [None]:
# Show the tree visualisation object as the cell output.
viz

# experiment
How well would a random variable perform?
  - some values are repeating but random
  - all values are unique and random

In [None]:
# simple random variable not all values unique (vs. PassengerId)

from numpy.random import randint
# Add a column of random integers drawn with replacement from [0, n_rows), so some
# values repeat, then fit a forest and plot its impurity-based importances.
train_rand = train.copy()
train_rand['my_rand_var'] = randint(0, train.shape[0], train.shape[0])
rf_rand = RandomForestClassifier()
rf_rand.fit(train_rand, survived)
tree_feature_importances = rf_rand.feature_importances_
feature_names = train_rand.columns
sorted_idx = tree_feature_importances.argsort()
y_ticks = np.arange(0, len(feature_names))
fig, ax = plt.subplots()
ax.barh(y_ticks, tree_feature_importances[sorted_idx])
# Fix the tick positions BEFORE assigning the labels: matplotlib documents that
# set_yticklabels should only be used after set_yticks, otherwise the labels can
# end up at unexpected positions.
ax.set_yticks(y_ticks)
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_title("Random Forest Feature Importances (MDI)")
fig.tight_layout()
plt.show()

### Cardinality of Variable in relation to number of observations
If the ratio is 1.0, that means that we have as many different values in the variable as there are observations. The number of possible splits that just randomly lead to pure nodes is very high.

In [None]:
# Ratio of distinct values to number of rows for Age, the random variable and
# PassengerId, shown in that order.
n_rows = train.shape[0]
for column in ('Age', 'my_rand_var', 'PassengerId'):
    display(len(train_rand[column].unique()) / n_rows)

When we repeatedly execute the cell below, in most cases the variable 'my_unique_rand_var' has a higher feature_importance than the 'my_unique_rand_var_sorted' variable.<br>
Why is this?<br>
Values of both variables are unique. There should be no sorting in the target-variable within the data-frame.

In [None]:
# simple random variable all values unique (vs. PassengerId)

from numpy.random import choice
# Add two columns whose values are all distinct: a random permutation of 0..n_rows-1
# and the same numbers in ascending order. Then fit a forest and plot its
# impurity-based importances.
train_rand_all_unique = train.copy()
train_rand_all_unique['my_unique_rand_var'] = choice(np.arange(train.shape[0]), train.shape[0], replace=False)
train_rand_all_unique['my_unique_rand_var_sorted'] = np.arange(train.shape[0])
rf_rand_all_unique = RandomForestClassifier()
rf_rand_all_unique.fit(train_rand_all_unique, survived)
tree_feature_importances = rf_rand_all_unique.feature_importances_
feature_names = train_rand_all_unique.columns
sorted_idx = tree_feature_importances.argsort()
y_ticks = np.arange(0, len(feature_names))
fig, ax = plt.subplots()
ax.barh(y_ticks, tree_feature_importances[sorted_idx])
# Fix the tick positions BEFORE assigning the labels: matplotlib documents that
# set_yticklabels should only be used after set_yticks, otherwise the labels can
# end up at unexpected positions.
ax.set_yticks(y_ticks)
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_title("Random Forest Feature Importances (MDI)")
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations of all columns, rounded to two decimals and drawn as an
# annotated heatmap.
correlations = train_rand_all_unique.corr().round(2)
sns.heatmap(correlations, annot=True)

In [None]:
from sklearn.inspection import permutation_importance

# Fit a fresh forest on the frame that includes the two unique-valued variables and
# measure permutation importance on that same data (30 shuffles per feature, fixed
# seed for the shuffling).
rf_model = RandomForestClassifier().fit(train_rand_all_unique, survived)
result = permutation_importance(
    rf_model, train_rand_all_unique, survived, n_repeats=30, random_state=0
)

# Order the features by their mean importance (ascending).
sorted_idx = result.importances_mean.argsort()
ordered_importances = result.importances[sorted_idx].T
ordered_labels = train_rand_all_unique.columns[sorted_idx]

# One horizontal box per feature, summarising its 30 repeats.
fig, ax = plt.subplots()
ax.boxplot(ordered_importances, vert=False, labels=ordered_labels)
ax.set_title("Permutation Importances")
fig.tight_layout()
plt.show()

In [None]:
# First rows of the frame that contains the two added unique-valued variables.
train_rand_all_unique.head()

### Interpretation
For the interpretation of the importance-scores, this is highly relevant. As we see, PassengerId and my_unique_rand_var_sorted are highly redundant. Nonetheless, both have very high importance-scores; the accuracy of the model will not degrade if we remove one of the variables - even though both have high importance.