# Setup

In [1]:
from __future__ import division

In [2]:
import numpy as np

In [3]:
import scipy as sp

In [4]:
import matplotlib.pyplot as pltp

In [5]:
from warnings import warn

In [6]:
from sklearn.utils.fixes import euler_gamma

In [7]:
from sklearn.ensemble import IsolationForest

In [8]:
from scipy.sparse import issparse

In [9]:
import numbers

In [10]:
from sklearn.externals import six

In [11]:
from sklearn.tree import ExtraTreeRegressor

In [12]:
from sklearn.utils import check_random_state, check_array

In [13]:
from sklearn.ensemble.bagging import BaseBagging

In [14]:
__all__ = ["IsolationForest"]

In [15]:
# Types treated as "integer" by the isinstance checks in this file:
# Python integral numbers plus NumPy integer scalars.
INTEGER_TYPES = (numbers.Integral, np.integer)

# Isolation Forest의 차별화된 점
1. A small sample size produces better iTrees because the swamping and masking effects are reduced.
2. iForest has a linear time complexity with a low constant and a low memory requirement.

# Create on Class

In [16]:
## Create on class
class IsolationForest(BaseBagging):
    """Isolation-forest estimator built on top of ``BaseBagging``.

    Each base estimator is an ``ExtraTreeRegressor`` created with
    ``splitter='random'`` and ``max_features=1``.

    Parameters
    ----------
    n_estimators, max_samples, max_features, bootstrap, n_jobs, \
random_state, verbose
        Passed through unchanged to ``BaseBagging.__init__``
        (``bootstrap_features`` is always ``False``). ``random_state`` is
        additionally handed to the base ``ExtraTreeRegressor``.
    contamination : default 0.1
        Stored on the instance as ``self.contamination``.
    """

    def __init__(self,
                 n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        # The tree-level ``max_features=1`` below is independent of the
        # ensemble-level ``max_features`` argument (``self.max_features``).
        isolation_tree = ExtraTreeRegressor(
            max_features=1,
            splitter='random',
            random_state=random_state)

        bagging_options = dict(
            bootstrap=bootstrap,
            bootstrap_features=False,
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose)

        super(IsolationForest, self).__init__(
            base_estimator=isolation_tree, **bagging_options)
        self.contamination = contamination



# Fit Estimator

In [17]:
def _set_oob_score(self, X, y):
        raise NotImplementedError("OOB score not supported by iforest")

## Fit Estimator
def fit(self, X, y=None, sample_weight=None):
    """Fit the isolation forest on ``X`` and compute the outlier threshold.

    Parameters
    ----------
    X : array-like or sparse matrix
        Training samples. Validated with ``check_array``; sparse input is
        accepted in CSC format and has its indices sorted once up front.
    y : ignored
        Overwritten with uniform random values before the trees are built.
    sample_weight : optional
        Forwarded unchanged to ``BaseBagging._fit``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.max_samples`` is a string other than ``'auto'``, or a
        non-integer value outside ``(0, 1]``.

    Notes
    -----
    Sets ``self.max_samples_`` (the resolved per-tree sample count) and
    ``self.threshold_`` (the decision-function cut-off derived from
    ``self.contamination``).

    NOTE(review): this function is defined at module level and takes the
    estimator as ``self``; it has to be bound to ``IsolationForest`` to be
    usable as a method.
    """
    # Imported locally so that the ``scipy.stats`` submodule is guaranteed to
    # be loaded; a bare ``import scipy`` does not promise that.
    from scipy.stats import scoreatpercentile

    X = check_array(X, accept_sparse=['csc'])
    if issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()

    # The statements below must run for dense input as well: they used to be
    # nested under the sparse-only branch, leaving ``n_samples`` undefined
    # (and ``y`` as None) for any dense ``X``.
    rnd = check_random_state(self.random_state)
    y = rnd.uniform(size=X.shape[0])

    # ensure that max_sample is in [1, n_samples]:
    n_samples = X.shape[0]

    if isinstance(self.max_samples, six.string_types):
        if self.max_samples == 'auto':
            max_samples = min(256, n_samples)
        else:
            # Separators added between the concatenated literals; the message
            # previously read "...supported.Valid choices ... int orfloat".
            raise ValueError('max_samples (%s) is not supported. '
                             'Valid choices are: "auto", int or '
                             'float' % self.max_samples)

    elif isinstance(self.max_samples, INTEGER_TYPES):
        if self.max_samples > n_samples:
            warn("max_samples (%s) is greater than the "
                 "total number of samples (%s). max_samples "
                 "will be set to n_samples for estimation."
                 % (self.max_samples, n_samples))
            max_samples = n_samples
        else:
            max_samples = self.max_samples
    else:  # float: interpreted as a fraction of the training set
        if not (0. < self.max_samples <= 1.):
            raise ValueError("max_samples must be in (0, 1], got %r"
                             % self.max_samples)
        max_samples = int(self.max_samples * X.shape[0])

    self.max_samples_ = max_samples
    # Tree depth is capped at ceil(log2(max_samples)), with a floor of 1.
    max_depth = int(np.ceil(np.log2(max(max_samples, 2))))
    super(IsolationForest, self)._fit(X, y, max_samples,
                                      max_depth=max_depth,
                                      sample_weight=sample_weight)

    # Threshold such that a ``contamination`` fraction of the training
    # scores falls at or below it.
    self.threshold_ = -scoreatpercentile(
        -self.decision_function(X), 100. * (1. - self.contamination))

    return self

# Predict the model

In [18]:
## Predict the model
## Predict if a particular sample is an outlier or not.

def predict(self, X):
    """Label each sample of ``X`` as inlier (+1) or outlier (-1).

    ``X`` is validated with ``check_array`` (CSR sparse input accepted).
    A sample is labelled -1 when its ``decision_function`` value is less
    than or equal to ``self.threshold_``; every other sample is labelled +1.

    Returns
    -------
    numpy.ndarray of int, one entry per row of ``X``.
    """
    X = check_array(X, accept_sparse='csr')
    flagged = self.decision_function(X) <= self.threshold_
    labels = np.full(X.shape[0], 1, dtype=int)
    labels[flagged] = -1
    return labels


# Decision Function : Average anomaly score of X of the base classifiers.

When a leaf contains several samples (n_left), the average path length of an isolation tree built on n_left samples is added to the depth of that leaf.

In [19]:
## Average anomaly score of X of the base classifiers
def decision_function(self, X):
    """Average anomaly score of ``X`` over the base estimators.

    For every fitted tree the function records, per sample, the row sum of
    the tree's decision-path indicator minus one, and adds
    ``_average_path_length`` of the sample count stored for the leaf the
    sample lands in. The per-sample mean over all trees is normalised by
    ``_average_path_length(self.max_samples_)`` and mapped through
    ``2 ** (-x)``.

    Returns
    -------
    numpy.ndarray
        ``0.5 - score`` for each row of ``X``. The sign is flipped so that
        bigger means less abnormal; the 0.5 offset (a value with a special
        role in the original paper) gives meaning to a result of 0.
    """
    # Code structure follows ForestClassifier/predict_proba.
    X = check_array(X, accept_sparse='csr')
    row_count = X.shape[0]
    table_shape = (row_count, self.n_estimators)

    leaf_sizes = np.zeros(table_shape, order="f")
    path_depths = np.zeros(table_shape, order="f")

    # Columns only need to be selected per tree when the trees were fitted
    # on fewer features than X provides.
    use_feature_subsets = self._max_features != X.shape[1]

    fitted = zip(self.estimators_, self.estimators_features_)
    for col, (tree, features) in enumerate(fitted):
        tree_input = X[:, features] if use_feature_subsets else X
        leaf_ids = tree.apply(tree_input)
        visited_nodes = tree.decision_path(tree_input)
        leaf_sizes[:, col] = tree.tree_.n_node_samples[leaf_ids]
        path_depths[:, col] = np.ravel(visited_nodes.sum(axis=1)) - 1

    path_depths += _average_path_length(leaf_sizes)

    normaliser = _average_path_length(self.max_samples_)
    scores = 2 ** (-path_depths.mean(axis=1) / normaliser)

    return 0.5 - scores

# Average path Length

![image.png](attachment:image.png)

In [20]:
## The average path length in a n_samples iTree, which is equal to
## the average path length of an unsuccessful BST search since the
## latter has the same structure as an isolation tree.
## average_path_length : array, same shape as n_samples_leaf

def _average_path_length(n_samples_leaf):
    """Average path length of an isolation tree holding ``n_samples_leaf`` samples.

    Parameters
    ----------
    n_samples_leaf : integer scalar or numpy.ndarray
        Sample count(s). Anything that is not an ``INTEGER_TYPES`` instance
        is handled as an array (it must provide ``shape`` and ``reshape``).

    Returns
    -------
    float or numpy.ndarray
        ``1.`` wherever the count is <= 1, otherwise
        ``2 * (log(n - 1) + euler_gamma) - 2 * (n - 1) / n``. An array
        result has the same shape as the input.
    """
    def _estimate(count):
        # Closed-form expression shared by the scalar and array branches.
        return 2. * (np.log(count - 1.) + euler_gamma) - 2. * (
            count - 1.) / count

    if isinstance(n_samples_leaf, INTEGER_TYPES):
        return 1. if n_samples_leaf <= 1 else _estimate(n_samples_leaf)

    original_shape = n_samples_leaf.shape
    flat_counts = n_samples_leaf.reshape((1, -1))

    # Start from the value used for counts <= 1 and overwrite only the
    # remaining entries, so the logarithm is never evaluated on counts <= 1.
    lengths = np.ones(flat_counts.shape)
    above_one = np.logical_not(flat_counts <= 1)
    lengths[above_one] = _estimate(flat_counts[above_one])

    return lengths.reshape(original_shape)
    

![image.png](attachment:image.png)