# An example of how to test our IForestASD Implementation & Compare IForestASD against HSTrees

## Install skmultiflow if needed
You need to install git

In [43]:
try:
    import skmultiflow
except ImportError as e:
    print("scikit-multiflow package installation")
    !pip install -U git+https://github.com/scikit-multiflow/scikit-multiflow

## Importations and configurations

In [42]:
%matplotlib notebook
import matplotlib as plt
plt.interactive(True)
from source import functions
func = functions.Comparison()
import datetime

## General parameters for the evaluation

In [61]:
window_sizes = [50, 100, 500, 1000]
n_estimators = [30, 50, 100]
anomaly_threshold = 0.5
max_sample = 10000 # We have gotten the size of the min dataset (Shuttle) to evaluate all dataset on the same basis.
n_wait = max_sample # The evaluation step size
# Used metric in the evaluation. Attention to use the metrics availlable in skmultiflow
metrics=['accuracy', 'f1', 'precision', 'recall', 'true_vs_predicted', 'kappa', 'kappa_m', 'running_time', 'model_size'] 


## Using Shuttle dataset

Dataset Name : Shuttle
Instances : 49097
Attributes : 9
Anomaly Percentage : 7.15%

In [79]:
dataset_name = "Shuttle"
test_name = dataset_name+'_'+str(datetime.datetime.now())
drift_rate = 7.15
stream = func.get_dataset(dataset_name=dataset_name)
for window in window_sizes:
    for n_estimator in n_estimators:
        print("")
        print("******************************** Window = "+str(window)+" and n_estimator = "+str(n_estimator)+" ********************************")
        func.run_comparison(stream=stream, n_features=stream.n_features, window = window, estimators = n_estimator, anomaly = anomaly_threshold, drift_rate = drift_rate, result_folder=test_name, max_sample=max_sample, n_wait=n_wait, metrics=metrics)


******************************** Window = 50 and n_estimator = 30 ********************************


TypeError: run_comparison() got an unexpected keyword argument 'n_features'

## Using SMTP dataset
   
Dataset Name : Smtp
Instances : 95156
Attributes : 3
Anomaly Percentage : 0.03%

In [38]:
dataset_name = "SMTP"
#n_features=stream.n_features
test_name = dataset_name+'_'+str(datetime.datetime.now())
drift_rate = 0.03
stream = func.get_dataset(dataset_name=dataset_name)
for window in window_sizes:
    for n_estimator in n_estimators:
        print("")
        print("******************************** Window = "+str(window)+" and n_estimator = "+str(n_estimator)+" ********************************")
        func.run_comparison(stream=stream, stream_n_features=stream.n_features, window = window, estimators = n_estimator, anomaly = anomaly_threshold, drift_rate = drift_rate, result_folder=test_name, max_sample=max_sample, n_wait=n_wait, metrics=metrics)
    


******************************** Window = 50 and n_estimator = 30 ********************************


TypeError: __init__() got an unexpected keyword argument 'n_features'

## Using ForestCover dataset
   
Dataset Name : ForestCover
Instances : 286048
Attributes : 10
Anomaly Percentage : 0.96%

In [35]:
dataset_name = "ForestCover"
test_name = dataset_name+'_'+str(datetime.datetime.now())
drift_rate = 0.96
stream = func.get_dataset(dataset_name=dataset_name)
for window in window_sizes:
    for n_estimator in n_estimators:
        print("")
        print("******************************** Window = "+str(window)+" and n_estimator = "+str(n_estimator)+" ********************************")
        func.run_comparison(stream=stream, stream_n_features=stream.n_features, window = window, 
                             estimators = n_estimator, anomaly = anomaly_threshold, drift_rate = drift_rate, 
                             result_folder=test_name, max_sample=max_sample, n_wait=n_wait, metrics=metrics)


******************************** Window = 50 and n_estimator = 30 ********************************


TypeError: __init__() got an unexpected keyword argument 'n_features'

# Plot differents results

## Fusion of differents results file

## Model Size

In [16]:
df_shuttle.sort_values(by = ['window', 'estimators'], inplace= True)
ax1 =  df_shuttle.plot(x="Windows_Trees_set_up", y=['model_size_[HSTrees]'], kind="bar", figsize=(10, 5),
                     title= ' Model Size : HS-Trees (SHUTTLE) vs iForestASD (3 datasets)')
 
#ax1.plot(df_forest.Windows_Trees_set_up, df_forest['model_size_[iForestASD]'], color = 'yellow')
 
#ax1.legend('HSTrees_size',loc="upper left")
ax2 = ax1.twinx()

ax2.plot(df_shuttle.Windows_Trees_set_up, df_shuttle['model_size_[iForestASD]'], color = 'red', marker="o")
ax2.plot(df_shuttle.Windows_Trees_set_up, df_forest['model_size_[iForestASD]'], color = 'green', marker="x")
ax2.plot(df_shuttle.Windows_Trees_set_up, df_smtp['model_size_[iForestASD]'], color = 'orange', marker="o")

ax2.set_ylabel("model_size_[iForestASD]",color="red",fontsize=14)
ax1.set_xlabel("Parameters Set up - Windows__Trees",color="black",fontsize=14)
 
ax2.legend(['SHUTTLE_IFA', 'FOREST_IFA', 'SMTP_IFA'], loc = 'upper right')
ax1.legend(['HSTrees_Shuttle_Model_Size'])
 
#ax3 = ax1.twinx()
 
#ax2.legend([ax1.get_lines()[0], ax2.get_lines()[0]],\
#          ['model_size_[HSTrees]',"model_size_[iForestASD]"])

NameError: name 'df_shuttle' is not defined

## Processing Time

In [18]:
ax1 = df_forest.plot(x="Windows_Trees_set_up", y=[ 'training_time_[HSTrees]',  'training_time_[iForestASD]', 'testing_time_[HSTrees]',
     
                                         ], kind="bar", title='FOREST COVER - Running Time (IFA Testing in right axis)')
ax2 = ax1.twinx()
 
ax2.plot(df_forest.Windows_Trees_set_up, df_forest['testing_time_[iForestASD]'], color = 'red', marker="o")
ax2.legend(['IFA_Testing'],loc = "upper right")
ax1.set_xlabel("Parameters Set up - Windows__Trees",color="black",fontsize=14)

ax2.set_ylabel("testing_time_[iForestASD]",color="red",fontsize=14)
ax1.legend(['HST_Training', 'IFA_Training', 'HST_Testing'], loc = 'upper center')

NameError: name 'df_forest' is not defined