# How to use the `MeanDecreaseImpurity` class

In [1]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, IsolationForest)

from eml.importances.mdi import MeanDecreaseImpurity

In [34]:
# Build an IsolationForest with its default hyper-parameters.
clf = IsolationForest()

In [35]:
# Load the dataset in this cell: in notebook order `iris` is otherwise only
# assigned by a later cell, so a fresh Restart & Run All would raise NameError
# here. The fit call itself is unchanged.
iris = load_iris()
clf.fit(iris.data, iris.target)

IsolationForest(behaviour='deprecated', bootstrap=False, contamination='auto',
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=None, random_state=None, verbose=0, warm_start=False)

In [36]:
# Inspect the feature count stored on the fitted forest (displays 4 here).
# NOTE(review): newer scikit-learn releases expose this as `n_features_in_` —
# confirm against the installed version before upgrading.
clf.n_features_

4

In [3]:
# Load the iris dataset; other cells read its .data, .target and .feature_names.
iris = load_iris()

In [4]:
# scikit-learn classifiers whose built-in `feature_importances_` act as the
# reference values that MeanDecreaseImpurity must reproduce.
estimators = (DecisionTreeClassifier(), ExtraTreeClassifier(),
              RandomForestClassifier(), AdaBoostClassifier(),
              ExtraTreesClassifier(), GradientBoostingClassifier())

# NOTE: `e` and `mdi` stay bound to the last estimator after the loop and are
# read by later cells, so these names are kept as-is.
for e in estimators:
    e.fit(iris.data, iris.target)
    try:
        mdi = MeanDecreaseImpurity(features=iris.feature_names, use_precompute=False)
        mdi.fit(e)
        importances = mdi.predict(X=None)
    except Exception:
        # Show which estimator type failed, then re-raise with a bare `raise`
        # so the original exception and traceback propagate untouched.
        print(type(e))
        raise
    # err_msg names the estimator in the failure report if the values disagree.
    np.testing.assert_allclose(importances, e.feature_importances_,
                               err_msg=type(e).__name__)

In [5]:
# Show the estimator held by `mdi`; after the loop this is the last one fitted
# (the GradientBoostingClassifier, per the displayed repr).
mdi.estimator

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [11]:
# Call predict without data (X=None) — the same call the validation loop
# compares against the estimator's `feature_importances_`.
mdi.predict(X=None)

array([0.00522039, 0.01304223, 0.18960321, 0.79213417])

In [15]:
# Predict for one sample (the first iris row reshaped to a single-row 2-D
# array) and sum the returned values; the displayed sum is 1.0.
mdi.predict(X=iris.data[0].reshape((1, -1))).sum()

1.0

In [12]:
# Predict with the full iris data matrix as X; the displayed values match the
# output of the X=None call shown in this notebook.
mdi.predict(X=iris.data)

array([0.00522039, 0.01304223, 0.18960321, 0.79213417])

In [9]:
# Create a fresh, unfitted MeanDecreaseImpurity with the same arguments used in
# the validation loop.
mdi = MeanDecreaseImpurity(features=iris.feature_names, use_precompute=False)

In [39]:
# Rebind `clf` to a RandomForestClassifier with default settings; the cells
# below inspect one of its trees.
clf = RandomForestClassifier()

In [40]:
# Fit the forest on the iris features and labels.
clf.fit(iris.data, iris.target)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [41]:
# First iris sample reshaped to a (1, 4) array: one row, four columns.
inference = iris.data[0].reshape((1, 4))

In [45]:
# Take the first fitted tree out of the forest's `estimators_` list.
tree = clf.estimators_[0]

In [46]:
# Decision path of the single sample through that tree, converted to a dense
# array with `.toarray()`.
dp = tree.decision_path(inference).toarray()

In [47]:
# Display the dense decision path (one row, one column per tree node).
dp

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [48]:
# Display the tree's `n_node_samples` array (one entry per node).
tree.tree_.n_node_samples

array([94, 30, 64, 33, 32, 28,  4,  2,  1,  1,  2,  1, 31])

In [49]:
# Display the tree's `weighted_n_node_samples` array for comparison with the
# unweighted counts.
tree.tree_.weighted_n_node_samples

array([150.,  48., 102.,  54.,  52.,  48.,   4.,   2.,   1.,   1.,   2.,
         2.,  48.])

In [50]:
# Element-wise ratio of weighted to unweighted per-node sample counts.
nodes_weight = tree.tree_.weighted_n_node_samples / tree.tree_.n_node_samples

In [52]:
# Multiply the per-node ratio by the decision path: nodes with a 0 in `dp`
# become 0, the others keep their ratio.
nodes_weight * dp

array([[1.59574468, 1.6       , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])

In [10]:
# Fit `mdi` on `e`, the loop variable left bound by the validation cell.
# NOTE(review): this relies on leftover kernel state from that loop.
mdi.fit(e)

In [11]:
# Reference importances reported by the estimator itself.
e.feature_importances_

array([0.00743728, 0.00977304, 0.27665255, 0.70613713])

In [12]:
# The displayed result equals `e.feature_importances_` shown in this notebook.
# NOTE(review): other cells call `predict` with X=None or a data array, while
# this one passes the estimator positionally — confirm this still matches the
# current `predict` signature.
mdi.predict(e)

array([0.00743728, 0.00977304, 0.27665255, 0.70613713])

In [112]:
# NOTE(review): `forest` is not assigned in any visible cell, and `weighted` is
# not used by the other `predict` calls in this notebook — this cell appears to
# depend on earlier kernel state; confirm or update.
mdi.predict(forest, weighted=True)

array([0.11030335, 0.02928489, 0.45879072, 0.40162104])

In [113]:
# Reference importances of `forest`; the displayed values equal the weighted
# `mdi.predict(forest, ...)` output shown in this notebook.
# NOTE(review): `forest` is not assigned in any visible cell.
forest.feature_importances_

array([0.11030335, 0.02928489, 0.45879072, 0.40162104])

In [114]:
# Same weighted call for a single tree.
# NOTE(review): passes an estimator where other cells pass data or None —
# confirm against the current `predict` signature.
mdi.predict(tree, weighted=True)

array([0.02666667, 0.        , 0.05072262, 0.92261071])

In [115]:
# Reference importances of the single tree; the displayed values equal the
# weighted `mdi.predict(tree, ...)` output shown in this notebook.
tree.feature_importances_

array([0.02666667, 0.        , 0.05072262, 0.92261071])