In [1]:
""" Proof of concept for the random-cut-hyperplanes idea """
import sys
import numpy as np
from scipy.stats import scoreatpercentile
from sklearn.metrics import confusion_matrix

In [2]:
def _gen_hard_data(n, p, infection_pct, variance=10.0, mu=5.0):
    X = np.random.randn(n, p)

    # hard data
    # Weight it to the number of features
    is_anomaly = np.random.rand(n, p) < (infection_pct / p)
    X[is_anomaly] = variance * np.random.randn() + mu

    y = np.zeros(shape=(n,))

    tmp = np.array([np.any(r) for r in is_anomaly])
    y[tmp] = 1.0

    return (X, y)

def _gen_easy_data(n, p, infection_pct, variance=10.0, mu=5.0):
    X = np.random.randn(n, p)
    is_anomaly = np.random.choice(n, size=int(infection_pct*n), replace=False)
    X[is_anomaly] = variance * np.random.randn(is_anomaly.shape[0], p) + mu
    y = np.zeros(shape=(n,))
    y[is_anomaly] = 1.0
    return (X, y)

In [3]:
def run_plane_simul(points, y):
    """Fit RandomHyperplanes on ``points`` and print the mean depth per class.

    Parameters
    ----------
    points : array-like
        Data passed to both ``fit`` and ``get_depths``.
    y : ndarray
        Labels aligned with ``points``; 1.0 marks an anomaly, 0.0 a normal
        point.

    Returns
    -------
    tuple
        ``(None, depths, None, y)``. The two ``None`` slots sit where the
        caller unpacks scores and predicted labels; they are not computed
        while the threshold-based evaluation below is disabled.
    """
    print("Beginning plane fit...")
    model = RandomHyperplanes(n_estimators=N_ESTIMATORS).fit(points)
    print("done fitting")

    # Disabled evaluation, summarised from the former commented-out code:
    #   scores = model.decision_function(points)
    #   flag scores >= scoreatpercentile(scores, 100 - SCORE_AT) as anomalies
    #   print confusion_matrix(y, y_pred), normalised by its row sums

    depths = model.get_depths(points)
    anomalous_idx = np.nonzero(y == 1.0)
    normal_idx = np.nonzero(y == 0.0)
    print("Average anomalous depth:", np.mean(depths[anomalous_idx]))
    print("Average non-anomalous depth:", np.mean(depths[normal_idx]))
    return (None, depths, None, y)


def run_iforest_simul(points, y):
    """Fit IsolationForest on ``points`` and print the mean depth per class.

    NOTE(review): ``IsolationForest`` is not imported in the cells visible
    here, and this function relies on a ``get_depths`` method that
    scikit-learn's estimator does not appear to provide. Confirm which
    implementation is intended before re-enabling the call to this function.

    Parameters
    ----------
    points : array-like
        Data passed to both ``fit`` and ``get_depths``.
    y : ndarray
        Labels aligned with ``points``; 1.0 marks an anomaly, 0.0 a normal
        point.

    Returns
    -------
    tuple
        ``(None, depths, None, y)``. The two ``None`` slots sit where the
        caller would unpack scores and predicted labels; they are not
        computed while the threshold-based evaluation below is disabled.
    """
    print("Beginning iforest fit...")
    forest = IsolationForest(n_estimators=N_ESTIMATORS).fit(points)
    print("done fitting")

    # Disabled evaluation, summarised from the former commented-out code:
    #   scores = forest.decision_function(points)
    #   flag scores >= scoreatpercentile(scores, 100 - SCORE_AT) as anomalies
    #   print confusion_matrix(y, y_pred), normalised by its row sums

    depths = forest.get_depths(points)
    is_anomalous = np.nonzero(y == 1.0)
    is_normal = np.nonzero(y == 0.0)
    print("Average anomalous depth:", np.mean(depths[is_anomalous]))
    print("Average non-anomalous depth:", np.mean(depths[is_normal]))
    return (None, depths, None, y)

In [4]:
from planes import RandomHyperplanes

# --- Experiment configuration -------------------------------------------
N_ESTIMATORS = 5   # n_estimators handed to the model(s) being compared
SCORE_AT = 2.5     # percentile cut-off; only used by the disabled thresholding code
SEED = 0           # seed for NumPy's global RNG so a re-run draws the same data

n = 1000 # number of entries
p = 2    # features

infection_pct = 0.05

# The data generators draw from NumPy's legacy global RNG, so seed it right
# before generating data to make Restart & Run All reproducible.
# NOTE(review): whether the fit itself becomes deterministic depends on how
# planes.RandomHyperplanes draws its randomness -- TODO confirm.
np.random.seed(SEED)
X, y = _gen_easy_data(n, p, infection_pct)

scores_r, depths_r, y_pred_r, y_r = run_plane_simul(X, y)
# Isolation-forest comparison is currently disabled:
# print("\nDone plane simul-----\n")
# scores_i, depths_i, y_pred_i, y_i = run_iforest_simul(X, y)

Beginning plane fit...
done fitting
Average anomalous depth: 11.68
Average non-anomalous depth: 31.0010526316


In [5]:
# X[np.where(y==1.0)]

In [6]:
# Fit a fresh RandomHyperplanes model on X at notebook scope, so that the
# fitted model and its depths can be inspected directly in later cells.
rhp = RandomHyperplanes(n_estimators=N_ESTIMATORS).fit(X)
depths = rhp.get_depths(X)

In [7]:
### print(np.unique(depths))