<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import ml_inspector # From https://gitlab.nist.gov/gitlab/nam4/ml_inspector

In [None]:
%load_ext watermark
%watermark -t -m -h -v --iversions

In [None]:
# Load the breast-cancer dataset. The returned object is kept as `data`
# because its feature names are needed by later cells.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# First round: fit a 100-tree random forest on ALL features.
# `fit` returns the estimator itself, so construction and training are chained.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
clf.score(X_test, y_test)  # 97% on the held-out split

In [None]:
# Run pfi (presumably permutation feature importance -- confirm against the
# ml_inspector docs) for the full-feature model on the held-out data.
# Observation: EVERY feature comes out as irrelevant, because many of the
# features are highly correlated with one another.
ml_inspector.model.InspectModel.pfi(
    clf,
    X_test,
    y_test,
    n_repeats=30,
    feature_names=data.feature_names.tolist(),
)

In [None]:
# Inspect multicollinearity among the features.
# - The entire dataset X is passed (not only the training split): this step is
#   unsupervised, so the labels are never used.
# - feature_names=None makes the call return feature indices; pass
#   data.feature_names.tolist() instead to work with names.
selected_features, cluster_id_to_feature_ids = ml_inspector.data.InspectData.cluster_collinear(
    X,
    figsize=(12, 8),
    display=True,
    t=2,
    feature_names=None,
)

In [None]:
# Fit again using only the features selected by the collinearity clustering.
#
# The reduced train/test arrays are rebuilt from the full matrix X instead of
# slicing X_train/X_test in place. In-place slicing made this cell
# non-idempotent: a second run would index the already-reduced arrays with the
# original column indices (IndexError, or silently the wrong columns).
# Re-using the same random_state on the same number of rows reproduces the row
# partition of the first split, so y_train/y_test are unchanged and the score
# stays directly comparable with the full-feature model.
X_train, X_test, y_train, y_test = train_test_split(
    X[:, selected_features], y, random_state=42  # must match the first split
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)  # 96%, almost identical as expected

In [None]:
# Repeat pfi, now for the model refit on the selected features only.
# The top feature is 'mean radius', which according to the dendrogram above is
# highly correlated with other "size" metrics.
ml_inspector.model.InspectModel.pfi(
    clf,
    X_test,
    y_test,
    n_repeats=30,
    feature_names=data.feature_names[selected_features].tolist(),
)