In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import random
import os
import pickle

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve

In [3]:
import eif

In [4]:
random.seed(42)
np.random.seed(42)

## Data Preparation

- Load the data

In [5]:
df = pd.read_parquet('OLDTv3 Datasets')
df.set_index('hashed_tln', inplace=True)

df.head()

Unnamed: 0_level_0,META_for_month,is_impacted,1_total_P,1_total_Q,1_total_KC,1_total_CC,1_total_KC_per_day,1_total_S,1_max_P,1_min_P,...,weather_sum10_heatindex_12pm_pressure,weather_sum10_heatindex_12pm_cloudcover,weather_sum10_heatindex_12pm_HeatIndexC,weather_sum10_heatindex_12pm_WindChillC,total_P,total_Q,total_KC,total_CC,total_KC_per_day,total_S
hashed_tln,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4534789187331872806,2020-09,0,1316.754028,645.172974,1001.0,3.0,32.290321,1466.318359,665.301025,18.938999,...,10090.0,238.0,396.0,335.0,1280.036042,627.182001,941.0,3.0,31.366666,1425.429565
-8784325762925109637,2021-01,0,2252.943115,1103.880981,2134.0,10.16,68.838715,2508.845459,917.642029,82.764,...,10106.0,447.0,342.0,305.0,2022.800987,991.116993,1617.0,10.16,52.16129,2252.562256
-3488696552317893546,2020-12,0,14301.871094,7007.523926,10366.0,43.049999,345.533325,15926.358398,2906.98999,0.0,...,10100.0,298.0,363.0,317.0,14077.274963,6897.480029,9776.0,43.05,315.354839,15676.25293
9097892585216145639,2020-12,0,18760.558594,9192.15918,14890.0,45.32,496.333313,20891.490234,4774.932129,89.026001,...,10100.0,298.0,363.0,317.0,18409.270348,9020.040039,12748.0,45.32,411.225811,20500.300781
6405709742834628611,2020-04,0,8687.954102,4256.86084,5666.0,88.919998,182.7742,9674.782227,2220.969971,5.009,...,10121.0,222.0,386.0,340.0,8650.27107,4238.395975,6120.0,88.919999,204.000001,9632.818359


- Remove Weather
- FillNA with mean

In [6]:
data = df.drop([c for c in df.columns if 'weather' in c], axis=1)


In [7]:
data.fillna(data.mean(),inplace=True)

In [8]:
data.count().isnull()

META_for_month      False
is_impacted         False
1_total_P           False
1_total_Q           False
1_total_KC          False
                    ...  
total_Q             False
total_KC            False
total_CC            False
total_KC_per_day    False
total_S             False
Length: 524, dtype: bool

### Convert date from string to float

In [9]:
l = data['META_for_month']
[i.replace("-","") for i in l]

['202009',
 '202101',
 '202012',
 '202012',
 '202004',
 '202010',
 '202004',
 '202012',
 '202004',
 '202009',
 '202003',
 '202101',
 '202012',
 '202011',
 '202009',
 '202005',
 '202011',
 '202007',
 '202011',
 '202012',
 '202009',
 '202010',
 '202003',
 '202003',
 '202010',
 '202102',
 '202102',
 '202101',
 '202005',
 '202007',
 '202011',
 '202012',
 '202003',
 '202102',
 '202005',
 '202007',
 '202102',
 '202006',
 '202102',
 '202010',
 '202004',
 '202103',
 '202006',
 '202103',
 '202010',
 '202006',
 '202003',
 '202006',
 '202004',
 '202004',
 '202005',
 '202010',
 '202103',
 '202004',
 '202006',
 '202010',
 '202008',
 '202101',
 '202006',
 '202008',
 '202004',
 '202103',
 '202003',
 '202005',
 '202005',
 '202006',
 '202102',
 '202011',
 '202103',
 '202011',
 '202102',
 '202009',
 '202012',
 '202012',
 '202008',
 '202006',
 '202007',
 '202006',
 '202004',
 '202007',
 '202004',
 '202008',
 '202003',
 '202008',
 '202011',
 '202004',
 '202007',
 '202004',
 '202009',
 '202011',
 '202012',

In [10]:
data['META_for_month'] = [i.replace("-","") for i in l]

In [11]:
data['META_for_month']

hashed_tln
4534789187331872806     202009
-8784325762925109637    202101
-3488696552317893546    202012
9097892585216145639     202012
6405709742834628611     202004
                         ...  
2016640256515698290     202102
-6112154507571650299    202007
-3283261322135907284    202004
-5202372100688753902    202003
-691718465068812655     202102
Name: META_for_month, Length: 223604, dtype: object

## Split dataframes by year 2020 and 2021

In [12]:
df1 = data[data['META_for_month'] <= '202012']
print(df1)

                     META_for_month  is_impacted     1_total_P     1_total_Q  \
hashed_tln                                                                     
4534789187331872806          202009            0   1316.754028    645.172974   
-3488696552317893546         202012            0  14301.871094   7007.523926   
9097892585216145639          202012            0  18760.558594   9192.159180   
6405709742834628611          202004            0   8687.954102   4256.860840   
6405709742834628611          202010            0   7917.811035   3879.510010   
...                             ...          ...           ...           ...   
-4253228264645747286         202011            0   3025.643066   1482.483032   
-3241915922368082053         202004            0   7741.348145   5810.564941   
-6112154507571650299         202007            0   2133.650146   1045.430054   
-3283261322135907284         202004            0   5082.576172   2490.320068   
-5202372100688753902         202003     

In [13]:
df2 = data[data['META_for_month'] >= '202101']
print(df2)

                     META_for_month  is_impacted     1_total_P     1_total_Q  \
hashed_tln                                                                     
-8784325762925109637         202101            0   2252.943115   1103.880981   
7418352048354131283          202101            0  31046.421875  15211.903320   
8916298948368098698          202102            0   5893.199707   2887.506104   
-7851381369700126444         202102            0  34799.750000   4374.969238   
6817948976443589203          202101            0  17051.699219   8354.865234   
...                             ...          ...           ...           ...   
4050269268982024314          202102            0   3190.730957   1563.369995   
1483199892431355253          202103            0   9788.250977   4795.975586   
-7371851919761512748         202101            0  12463.164062   6106.608887   
2016640256515698290          202102            0   2695.545898   1320.744019   
-691718465068812655          202102     

In [14]:
df1.drop('META_for_month', axis=1, inplace=True)
df2.drop('META_for_month', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


## Train-Val-Test Split

In [15]:
df1_X = df1.drop('is_impacted',axis=1)
df1_y = df1['is_impacted']

In [16]:
df2_X = df2.drop('is_impacted',axis=1)
df2_y = df2['is_impacted']

In [17]:
RANDOM_SEED = 42

Original Split

In [18]:
def getCounts(df, col='is_impacted'):
    df_counts = df[col].value_counts().to_frame()
    df_counts['pct'] = df[col].value_counts(normalize=True)
    return df_counts

In [19]:
getCounts(df1)

Unnamed: 0,is_impacted,pct
0,167948,0.970719
1,5066,0.029281


In [20]:
train, val = train_test_split(df1,
                                                        test_size=0.4,
                                                        shuffle=True,
                                                        random_state=RANDOM_SEED)

New Splits

In [21]:
getCounts(train)

Unnamed: 0,is_impacted,pct
0,100745,0.970494
1,3063,0.029506


In [22]:
getCounts(val)

Unnamed: 0,is_impacted,pct
0,67203,0.971057
1,2003,0.028943


Separate the Features and Target variables

In [23]:
X_train = train.drop('is_impacted', axis=1)
y_train = train['is_impacted']

X_val = val.drop('is_impacted', axis=1)
y_val = val['is_impacted']

X_test = df2.drop('is_impacted', axis=1)
y_test = df2['is_impacted']

## Feature Selection

In [24]:
from sklearn.feature_selection import VarianceThreshold

In [25]:
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

In [26]:
#X_train = sel.fit_transform(X=X_train)

In [27]:
#X_train

In [28]:
#X_val = sel.fit_transform(X=X_val)

In [29]:
#X_val

In [30]:
#X_train[:,0] = X_train[:,0].astype(float)

In [31]:
#X_train

In [32]:
#X_val[:,0] = X_val[:,0].astype(float)

In [33]:
#X_val

## Functions

In [34]:
def plotConfMat(clf, X, y, **kwargs):
    y_pred = clf.predict(X)
    
    # map predictions to 0, 1
    if 'mapper' in kwargs:
        y_pred = kwargs['mapper'](y_pred)
    
    # plot the confusion matrix
    confmat = confusion_matrix(y_true=y, y_pred=y_pred)
    
    fig, ax = plt.subplots(figsize=(4,4))
    ax.matshow(confmat, cmap='Blues', alpha=0.3)
    
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('Actual Label')
    ax.grid(False)
    ax.vlines(x=0.5, ymin=-0.5, ymax=1.5, color=(0.8, 0.8, 0.8))
    ax.hlines(y=0.5, xmin=-0.5, xmax=1.5, color=(0.8, 0.8, 0.8))
    
    # design
    if 'title' in kwargs:
        fig.suptitle(kwargs['title'], )
        print(kwargs['title'])
        
    if 'ticklabels' in kwargs:
        ticklabels = kwargs['ticklabels']
        ax.set_xticklabels(['']+ticklabels)
        ax.set_yticklabels(['']+ticklabels)
        print(classification_report(y, y_pred, target_names=kwargs['ticklabels']))
    else:
        print(classification_report(y, y_pred))

    plt.tight_layout()
    
    return fig

In [35]:
def mapPreds(x):
    if x == -1:
        return 1
    elif x == 1:
        return 0
    else:
        return x
    
mapPreds = np.vectorize(mapPreds)

## Modelling

In [36]:
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE

### AdaBoost

In [68]:
def adaboost(X_train, X_val, y_train):
    model = AdaBoostClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return y_pred

Vanilla Adaboost (no resampling)

In [69]:
y_baseline = adaboost(X_train, X_val, y_train)

In [39]:
#plotConfMat(y_baseline, X_val, y_val,
         #   title='Adaboost', mapper=mapPreds,
          #  ticklabels=['Not Impacted', 'Impacted']);

In [40]:
print('Vanilla AdaBoost')
print(classification_report(y_baseline, y_val))

Vanilla AdaBoost
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     68522
           1       0.17      0.48      0.25       684

    accuracy                           0.97     69206
   macro avg       0.58      0.73      0.62     69206
weighted avg       0.99      0.97      0.98     69206



In [70]:
from sklearn.metrics import confusion_matrix

In [71]:
confusion_matrix(y_val,y_baseline)

array([[66850,   353],
       [ 1672,   331]], dtype=int64)

In [43]:
y_baseline = adaboost(X_train, X_test, y_train)

In [44]:
print('Vanilla AdaBoost')
print(classification_report(y_baseline, y_test))

Vanilla AdaBoost
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     50570
           1       0.00      0.00      0.00        20

    accuracy                           1.00     50590
   macro avg       0.50      0.50      0.50     50590
weighted avg       1.00      1.00      1.00     50590



In [45]:
confusion_matrix(y_test,y_baseline)


array([[50554,    20],
       [   16,     0]], dtype=int64)

### SMOTE (Synthetic Minority Oversampling Technique) Adaboost

In [46]:
#SMOTE
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
y_smote = adaboost(X_train_sm, X_val, y_train_sm)

In [47]:
#plotConfMat(y_smote, X_val, y_val,
      #      title='Adaboost', mapper=mapPreds,
       #     ticklabels=['Not Impacted', 'Impacted']);

In [48]:
print('SMOTE AdaBoost')
print(classification_report(y_smote, y_val))

SMOTE AdaBoost
              precision    recall  f1-score   support

           0       0.95      0.98      0.97     65147
           1       0.48      0.24      0.32      4059

    accuracy                           0.94     69206
   macro avg       0.72      0.61      0.64     69206
weighted avg       0.93      0.94      0.93     69206



In [49]:
confusion_matrix(y_val,y_smote)

array([[64103,  3100],
       [ 1044,   959]], dtype=int64)

In [50]:
#SMOTE
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
y_smote = adaboost(X_train_sm, X_test, y_train_sm)

In [51]:
print('SMOTE AdaBoost')
print(classification_report(y_smote, y_test))

SMOTE AdaBoost
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     50064
           1       0.00      0.00      0.00       526

    accuracy                           0.99     50590
   macro avg       0.49      0.50      0.50     50590
weighted avg       0.98      0.99      0.98     50590



In [52]:
confusion_matrix(y_test,y_smote)

array([[50048,   526],
       [   16,     0]], dtype=int64)

### RUS (Randomly Undersampling) Adaboost

In [53]:
from sklearn.utils import resample

In [55]:
#RUS
X_maj = df1[df1.is_impacted == 0]
X_min = df1[df1.is_impacted == 1]
X_maj_rus = resample(X_maj, replace=False, n_samples=len(X_min), random_state=44)
X_rus = pd.concat([X_maj_rus, X_min])
X_train_rus = X_rus.drop(['is_impacted'], axis=1)
y_train_rus = X_rus.is_impacted
y_rus = adaboost(X_train_rus, X_val, y_train_rus)

In [56]:
confusion_matrix(y_val,y_rus)

array([[57238,  9965],
       [  383,  1620]], dtype=int64)

In [58]:
print('RUS Adaboost')
print(classification_report(y_rus, y_val))

RUS Adaboost
              precision    recall  f1-score   support

           0       0.85      0.99      0.92     57621
           1       0.81      0.14      0.24     11585

    accuracy                           0.85     69206
   macro avg       0.83      0.57      0.58     69206
weighted avg       0.84      0.85      0.80     69206



In [59]:
#RUS
X_maj = df1[df1.is_impacted == 0]
X_min = df1[df1.is_impacted == 1]
X_maj_rus = resample(X_maj, replace=False, n_samples=len(X_min), random_state=44)
X_rus = pd.concat([X_maj_rus, X_min])
X_train_rus = X_rus.drop(['is_impacted'], axis=1)
y_train_rus = X_rus.is_impacted
y_rus = adaboost(X_train_rus, X_test, y_train_rus)

In [60]:
print('RUS Adaboost')
print(classification_report(y_rus, y_test))

RUS Adaboost
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     45405
           1       0.38      0.00      0.00      5185

    accuracy                           0.90     50590
   macro avg       0.64      0.50      0.47     50590
weighted avg       0.84      0.90      0.85     50590



In [61]:
confusion_matrix(y_test,y_rus)

array([[45395,  5179],
       [   10,     6]], dtype=int64)

### One Class SVM

In [62]:
clf_svm = OneClassSVM(gamma='scale', nu=0.05, verbose=True)
svm_model = Pipeline([('scaler', scaler), ('pca', pca), ('clf', clf_svm)], verbose=True)
svm_model.fit(X_train)

NameError: name 'scaler' is not defined

In [None]:
# plotConfMat(svm_model, X_train, y_train, title='OneClassSVM Train Set')
plotConfMat(svm_model, X_val, y_val,
            title='OneClassSVM Validation Set', mapper=mapPreds,
            ticklabels=['Not Impacted', 'Impacted']);

In [None]:
#os.makedirs('models', exist_ok=True)

#with open('models/oneClassSVM.pkl', 'wb') as fp:
   # pickle.dump(svm_model, fp)

### Isolation Forest

In [None]:
iso_forest = IsolationForest(random_state=RANDOM_SEED, contamination=0.03, n_jobs=-1, verbose=True)
if_model = Pipeline([('scaler', scaler), ('pca', pca), ('clf', iso_forest)], verbose=True)
if_model.fit(X_train)

In [None]:
#with open('models/iso_forest.pkl', 'wb') as fp:
  #  pickle.dump(if_model, fp)

In [None]:
plotConfMat(if_model, X_val, y_val,
            title='Isolation Forest Validation Set', mapper=mapPreds,
            ticklabels=['Not Impacted', 'Impacted']);
plt.tight_layout()

### Extended Isolation Forest Level 1

In [None]:
ext_iso_forest_lvl1 = eif.iForest(X_train.values, ntrees=100, sample_size=256, ExtensionLevel=1)

We define the threshold using the anomaly scores

In [None]:
anomaly_scores = ext_iso_forest_lvl1.compute_paths(X_in=X_val.values)

In [None]:
df_yval = y_val.to_frame()
df_yval['anomaly_score_pred'] = anomaly_scores
df_yval.head()

Since EIF does not return a prediction, we create our own threshold using the validation set

We scale the anomaly scores using y_val (sklearn precision_recall_curve accepts 0-1 range) then plot the precision-recall curve to get the threshold

In [None]:
yval_scaler = MinMaxScaler()

yval_anomaly_scaled = yval_scaler.fit_transform(df_yval.anomaly_score_pred.values.reshape(-1, 1))

In [None]:
df_yval['impacted_proba'] = yval_anomaly_scaled
df_yval['not_impacted_proba'] = 1-yval_anomaly_scaled

In [None]:
p, r, t = precision_recall_curve(y_true=df_yval.is_impacted, probas_pred=df_yval['impacted_proba'])

In [None]:
fig, ax = plt.subplots()

ax.step(x=r, y=p)
ax.set_facecolor('white')
ax.set(xlabel='Recall',
       ylabel='Precision',
       title='Precision-Recall Curve for Extended Isolation Forest Level 1 - Val Set Predictions')

ax.grid(True, linewidth=0.2)

inverse transform the threshold

In [None]:
thresh_idx = np.argmax(p[:-203])
print(f'Optimal Precision {p[thresh_idx]:.02%}')
print(f'Recall @ Optimal Precision {r[thresh_idx]:.02%}')

anomaly_thresh = yval_scaler.inverse_transform(np.array(t[thresh_idx]).reshape(1,-1))
print(f'Anomaly Score Threshold {anomaly_thresh[0,0]:.04f}')

Plot Threshold for Optimal Precision

In [None]:
fig, ax = plt.subplots()

sns.histplot(data=df_yval.loc[df_yval.is_impacted==0, 'anomaly_score_pred'],
             ax=ax,
             color='blue',
             binwidth=0.01,
             label='Not Impacted')
sns.histplot(data=df_yval.loc[df_yval.is_impacted==1, 'anomaly_score_pred'],
             ax=ax, color='orange',
             binwidth=0.01,
             label='Impacted')
ax.vlines(x=anomaly_thresh[0,0], ymin=0, ymax=600, ls='--', colors='red')
ax.set_ylim(0, 600)
ax.grid(lw=0.2)
ax.legend(facecolor='white')
ax.set_facecolor('white')

For Threshold at Max Precision

In [None]:
df_yval['y_pred'] = df_yval.anomaly_score_pred.apply(lambda x: 1 if x > anomaly_thresh else 0)

In [None]:
def plotConfMat_EIF(y_true, y_pred, **kwargs):
    # plot the confusion matrix
    confmat = confusion_matrix(y_true=y_true, y_pred=y_pred)
    
    fig, ax = plt.subplots(figsize=(4,4))
    ax.matshow(confmat, cmap='Blues', alpha=0.3)
    
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('Actual Label')
    ax.grid(False)
    ax.vlines(x=0.5, ymin=-0.5, ymax=1.5, color=(0.8, 0.8, 0.8))
    ax.hlines(y=0.5, xmin=-0.5, xmax=1.5, color=(0.8, 0.8, 0.8))
    
    # design
    if 'title' in kwargs:
        fig.suptitle(kwargs['title'], )
        print(kwargs['title'])
        
    if 'ticklabels' in kwargs:
        ticklabels = kwargs['ticklabels']
        ax.set_xticklabels(['']+ticklabels)
        ax.set_yticklabels(['']+ticklabels)
        print(classification_report(y_true, y_pred, target_names=kwargs['ticklabels']))
    else:
        print(classification_report(y_true, y_pred))

    plt.tight_layout()
    
    return fig

In [None]:
plotConfMat_EIF(df_yval.is_impacted, df_yval.y_pred, 
                title='Extended Isolation Forest Level 1 Validation Set - Max Precision', 
                ticklabels=['Not Impacted', 'Impacted']);

In [None]:
#with open('models/extended_isoForest_1_p.pkl', 'wb') as fp:
 #   pickle.dump({'model':ext_iso_forest_lvl1, 'threshold':anomaly_thresh[0,0]}, fp)

In [None]:
#with open('models/extended_isoForest_1_r.pkl', 'wb') as fp:
   # pickle.dump({'model':ext_iso_forest_lvl1, 'threshold':anomaly_thresh_r[0,0]}, fp)

**Predict on the Test Set**

- We can now use the threshold taken above, no need to scale the anomaly scores

For Threshold at Optimal Precision

In [None]:
df_ytest = y_test.to_frame()

test_anomaly_scores = ext_iso_forest_lvl1.compute_paths(X_in=X_test.values)

df_ytest['anomaly_score_pred'] = test_anomaly_scores
df_ytest['y_pred'] = df_ytest.anomaly_score_pred.apply(lambda x: 1 if x > anomaly_thresh else 0)

In [None]:
plotConfMat_EIF(df_ytest.is_impacted, df_ytest.y_pred, 
                title='Extended Isolation Forest Level 1 Test Set - Max Precision', 
                ticklabels=['Not Impacted', 'Impacted']);

**Extended Isolation Forest Max Extended Level**

In [None]:
ext_iso_forest_maxLvl = eif.iForest(X_train.values, ntrees=100, sample_size=256, ExtensionLevel=min_pcs-1)

In [None]:
anomaly_scores = ext_iso_forest_maxLvl.compute_paths(X_in=X_val.values)

df_yval = y_val.to_frame()
df_yval['anomaly_score_pred'] = anomaly_scores
df_yval.head()

In [None]:
yval_scaler = MinMaxScaler()

yval_anomaly_scaled = yval_scaler.fit_transform(df_yval.anomaly_score_pred.values.reshape(-1, 1))

df_yval['impacted_proba'] = yval_anomaly_scaled
df_yval['not_impacted_proba'] = 1-yval_anomaly_scaled

In [None]:
p, r, t = precision_recall_curve(y_true=df_yval.is_impacted, probas_pred=df_yval['impacted_proba'])

fig, ax = plt.subplots()

ax.step(x=r, y=p)
ax.set_facecolor('white')
ax.set(xlabel='Recall',
       ylabel='Precision',
       title='Precision-Recall Curve for Extended Isolation Forest Max Level - Val Set Predictions')

ax.grid(True, linewidth=0.2)

inverse transform the threshold

In [None]:
thresh_idx = np.argmax(p[:-228])
print(f'Optimal Precision {p[thresh_idx]:.02%}')
print(f'Recall @ Optimal Precision {r[thresh_idx]:.02%}')

anomaly_thresh = yval_scaler.inverse_transform(np.array(t[thresh_idx]).reshape(1,-1))
print(f'Anomaly Score Threshold {anomaly_thresh[0,0]:.04f}')

Plot Threshold for Optimal Precision

In [None]:
fig, ax = plt.subplots()

sns.histplot(data=df_yval.loc[df_yval.is_impacted==0, 'anomaly_score_pred'],
             ax=ax,
             color='blue',
             binwidth=0.01,
             label='Not Impacted')
sns.histplot(data=df_yval.loc[df_yval.is_impacted==1, 'anomaly_score_pred'],
             ax=ax, color='orange',
             binwidth=0.01,
             label='Impacted')
ax.vlines(x=anomaly_thresh[0,0], ymin=0, ymax=600, ls='--', colors='red')
ax.set_ylim(0, 600)
ax.grid(lw=0.2)
ax.legend(facecolor='white')
ax.set_facecolor('white')

For Threshold at Optimal Precision

In [None]:
df_yval['y_pred'] = df_yval.anomaly_score_pred.apply(lambda x: 1 if x > anomaly_thresh else 0)

plotConfMat_EIF(df_yval.is_impacted, df_yval.y_pred, 
                title='Extended Isolation Forest Max Level Validation Set - Max Precision', 
                ticklabels=['Not Impacted', 'Impacted']);

In [None]:
#with open('models/extended_isoForest_max_p.pkl', 'wb') as fp:
   # pickle.dump({'model':ext_iso_forest_maxLvl, 'threshold':anomaly_thresh[0,0]}, fp)

In [None]:
#with open('models/extended_isoForest_max_r.pkl', 'wb') as fp:
  #  pickle.dump({'model':ext_iso_forest_maxLvl, 'threshold':anomaly_thresh_r[0,0]}, fp)

**Predict on Test Set**

For Threshold at Optimal Precision

In [None]:
df_ytest = y_test.to_frame()

test_anomaly_scores = ext_iso_forest_maxLvl.compute_paths(X_in=X_test.values)

df_ytest['anomaly_score_pred'] = test_anomaly_scores
df_ytest['y_pred'] = df_ytest.anomaly_score_pred.apply(lambda x: 1 if x > anomaly_thresh else 0)

In [None]:
plotConfMat_EIF(df_ytest.is_impacted, df_ytest.y_pred, 
                title='Extended Isolation Forest Max Level Test Set - Max Precision', 
                ticklabels=['Not Impacted', 'Impacted']);