# Import Libraries

In [None]:
import sys
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD, Adam
from keras.regularizers import l2
import keras.backend as K
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.metrics import roc_curve, auc, fbeta_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import random_projection
from sklearn.decomposition import PCA

import xgboost as xgb
from itertools import product
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn
import time
import pickle
import json
# Let pandas render up to 999 columns and 999 rows when a DataFrame is
# displayed in the notebook.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

# Give the modeling toolbox directory top priority on the import path.
# NOTE(review): presumably this is where the `metric_processor` and
# `evaluation` modules imported below live -- confirm.
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

from metric_processor import MetricProcessor
import evaluation

%matplotlib inline

# Data Preparation

In [None]:
# Columns fed to the models: three plain columns followed by the euclidean
# variant of every temporal metric family. The other variants of each family
# (-manhattan, -max, -mean, -std) are intentionally left out.
_plain_columns = ['dimension', 'size', 'fps']
_temporal_families = [
    'temporal_difference',
    'temporal_dct',
    'temporal_gaussian',
    'temporal_histogram_distance',
]
features = _plain_columns + [family + '-euclidean' for family in _temporal_families]

# Source CSV for the whole notebook.
path = '../../machine_learning/cloud_functions/data-large.csv'

# Read and pre-process the data through the project's MetricProcessor.
metric_processor = MetricProcessor(features, 'SL', path, reduced=False)
df = metric_processor.read_and_process_data()

# Last expression of the cell: the notebook displays (rows, columns).
df.shape

In [None]:
# Preview the first 50 rows of the processed DataFrame.
df.head(50)

In [None]:
# Unpack the three (x, y) pairs returned by the splitter.
# NOTE(review): how the "_all" test pair differs from the plain test pair is
# decided inside MetricProcessor.split_test_and_train -- confirm there.
(x_test_all, y_test_all), (x_train, y_train), (x_test, y_test) = metric_processor.split_test_and_train(df)

In [None]:
# Show the distinct values that occur in y_test.
np.unique(y_test)

In [None]:
# Scaling the data
# Each scaler is fitted on the training split only; the already-fitted scaler
# is then applied to both test splits.
MinMax_scaler = MinMaxScaler()

X_train_scaled_MinMax = MinMax_scaler.fit_transform(x_train)
X_test_scaled_MinMax = MinMax_scaler.transform(x_test)
X_test_scaled_MinMax_all = MinMax_scaler.transform(x_test_all)

Standard_scaler = StandardScaler()
X_train_scaled_standard = Standard_scaler.fit_transform(x_train)
X_test_scaled_standard = Standard_scaler.transform(x_test)
X_test_scaled_standard_all = Standard_scaler.transform(x_test_all)

# Save the scalers for inference. The files are opened in `with` blocks so
# they are flushed and closed as soon as each dump has finished.
with open('../output/models/SL_MinMaxScaler.pickle.dat', 'wb') as fp:
    pickle.dump(MinMax_scaler, fp)
with open('../output/models/SL_StandardScaler.pickle.dat', 'wb') as fp:
    pickle.dump(Standard_scaler, fp)

# Random Forest

In [None]:
# Hand an empty results table to the evaluation helper and keep the table it
# returns.
rf_result_columns = [
    'n_components', 'TPR', 'TNR', 'model',
    'auc', 'f_beta', 'projection', 'estimators',
]
random_forest_results = pd.DataFrame(columns=rf_result_columns)
random_forest_results = evaluation.random_forest(
    X_train_scaled_MinMax, y_train,
    X_test_scaled_MinMax, y_test,
    random_forest_results,
)

In [None]:
# Show the five result rows with the highest f_beta.
random_forest_results.sort_values('f_beta', ascending=False).head()

In [None]:
# Save the best model
# The results row with the highest f_beta decides which projection and which
# hyper-parameters the final model is refitted with.
best_random_forest = random_forest_results.sort_values('f_beta', ascending=False).iloc[0]
projection = best_random_forest['projection']

if projection == 'PCA':
    reduction = PCA(n_components=best_random_forest['n_components'])
elif projection == 'RP':
    reduction = random_projection.SparseRandomProjection(n_components=best_random_forest['n_components'])
else:
    # Stop here: continuing would use an undefined `reduction`, or a stale
    # one left over from a previously executed cell.
    raise ValueError('Unknown projection type: {!r}'.format(projection))

# Fit the projection on the training data only, then apply it to the test data.
X_reduced = reduction.fit_transform(X_train_scaled_MinMax)
test_reduced = reduction.transform(X_test_scaled_MinMax)

# `with` guarantees every output file is flushed and closed after writing.
with open('../output/models/reduction_RF.pickle.dat', 'wb') as fp:
    pickle.dump(reduction, fp)

# Refit a fresh classifier with the selected number of estimators.
RF = RandomForestClassifier(n_estimators=int(best_random_forest['estimators']), n_jobs=7)

RF.fit(X_reduced, y_train)

with open('../output/models/RF.pickle.dat', 'wb') as fp:
    pickle.dump(RF, fp)

# Store the selected results row together with the feature list as JSON.
best_random_forest = best_random_forest.to_dict()
best_random_forest['features'] = features
with open('../output/models/param_RF.json', 'w') as fp:
    json.dump(best_random_forest, fp)

In [None]:
# Pass the refitted random forest and the projected test data to the project's
# ROC helper; 'RF ROC' is presumably used as the plot title -- confirm in
# evaluation.plot_roc_supervised.
evaluation.plot_roc_supervised(RF, test_reduced, y_test, 'RF ROC')

# AdaBoost

In [None]:
# Hand an empty results table to the evaluation helper and keep the table it
# returns.
ada_boost_result_columns = [
    'n_components', 'TPR', 'TNR', 'model',
    'auc', 'f_beta', 'projection', 'LR',
]
ada_boost_results = pd.DataFrame(columns=ada_boost_result_columns)
ada_boost_results = evaluation.ada_boost(
    X_train_scaled_MinMax, y_train,
    X_test_scaled_MinMax, y_test,
    ada_boost_results,
)

In [None]:
# Show the five result rows with the highest f_beta.
ada_boost_results.sort_values('f_beta', ascending=False).head()

In [None]:
# Save the best model
# The results row with the highest f_beta decides which projection and which
# learning rate the final model is refitted with.
best_ada_boost_results = ada_boost_results.sort_values('f_beta', ascending=False).iloc[0]
projection = best_ada_boost_results['projection']

if projection == 'PCA':
    reduction = PCA(n_components=best_ada_boost_results['n_components'])
elif projection == 'RP':
    reduction = random_projection.SparseRandomProjection(n_components=best_ada_boost_results['n_components'])
else:
    # Stop here: continuing would use an undefined `reduction`, or a stale
    # one left over from a previously executed cell.
    raise ValueError('Unknown projection type: {!r}'.format(projection))

# Fit the projection on the training data only, then apply it to the test data.
X_reduced = reduction.fit_transform(X_train_scaled_MinMax)
test_reduced = reduction.transform(X_test_scaled_MinMax)

# `with` guarantees every output file is flushed and closed after writing.
with open('../output/models/reduction_AdaBoost.pickle.dat', 'wb') as fp:
    pickle.dump(reduction, fp)

# Refit a fresh classifier with the selected learning rate.
adaBoost = AdaBoostClassifier(learning_rate=best_ada_boost_results['LR'])
adaBoost.fit(X_reduced, y_train)

with open('../output/models/AdaBoost.pickle.dat', 'wb') as fp:
    pickle.dump(adaBoost, fp)

# Store the selected results row together with the feature list as JSON.
best_ada_boost_results = best_ada_boost_results.to_dict()
best_ada_boost_results['features'] = features
with open('../output/models/param_AdaBoost.json', 'w') as fp:
    json.dump(best_ada_boost_results, fp)

In [None]:
# Pass the refitted AdaBoost model and the projected test data to the
# project's ROC helper; 'AdaBoost ROC' is presumably used as the plot title --
# confirm in evaluation.plot_roc_supervised.
evaluation.plot_roc_supervised(adaBoost, test_reduced, y_test, 'AdaBoost ROC')

# SVM

In [None]:
# Hand an empty results table to the evaluation helper and keep the table it
# returns.
svm_result_columns = [
    'n_components', 'TPR', 'TNR', 'model',
    'auc', 'f_beta', 'projection',
]
svm_results = pd.DataFrame(columns=svm_result_columns)
svm_results = evaluation.svm_classifier(
    X_train_scaled_MinMax, y_train,
    X_test_scaled_MinMax, y_test,
    svm_results,
)

In [None]:
# Show the five result rows with the highest f_beta.
svm_results.sort_values('f_beta', ascending=False).head()

In [None]:
# Save the best model
# The results row with the highest f_beta decides which projection the final
# model is refitted with.
best_svm_results = svm_results.sort_values('f_beta', ascending=False).iloc[0]
projection = best_svm_results['projection']

if projection == 'PCA':
    reduction = PCA(n_components=best_svm_results['n_components'])
elif projection == 'RP':
    reduction = random_projection.SparseRandomProjection(n_components=best_svm_results['n_components'])
else:
    # Stop here: continuing would use an undefined `reduction`, or a stale
    # one left over from a previously executed cell.
    raise ValueError('Unknown projection type: {!r}'.format(projection))

# Fit the projection on the training data only, then apply it to the test data.
X_reduced = reduction.fit_transform(X_train_scaled_MinMax)
test_reduced = reduction.transform(X_test_scaled_MinMax)

# `with` guarantees every output file is flushed and closed after writing.
with open('../output/models/reduction_SVM.pickle.dat', 'wb') as fp:
    pickle.dump(reduction, fp)

# Refit a fresh classifier; its settings are fixed here, not read from the
# results row.
svc = svm.SVC(gamma='auto', cache_size=7000)
svc.fit(X_reduced, y_train)

with open('../output/models/SVM.pickle.dat', 'wb') as fp:
    pickle.dump(svc, fp)

# Store the selected results row together with the feature list as JSON.
best_svm_results = best_svm_results.to_dict()
best_svm_results['features'] = features
with open('../output/models/param_SVM.json', 'w') as fp:
    json.dump(best_svm_results, fp)

In [None]:
# Pass the refitted SVM and the projected test data to the project's ROC
# helper; 'SVM ROC' is presumably used as the plot title -- confirm in
# evaluation.plot_roc_supervised.
evaluation.plot_roc_supervised(svc, test_reduced, y_test, 'SVM ROC')

# XGBoost

In [None]:
# Hand an empty results table to the evaluation helper and keep the table it
# returns.
xgboost_result_columns = [
    'n_components', 'TPR', 'TNR', 'model',
    'auc', 'f_beta', 'projection',
]
xgboost_results = pd.DataFrame(columns=xgboost_result_columns)
xgboost_results = evaluation.xg_boost(
    X_train_scaled_MinMax, y_train,
    X_test_scaled_MinMax, y_test,
    xgboost_results,
)

In [None]:
# Show the five result rows with the highest f_beta.
xgboost_results.sort_values('f_beta', ascending=False).head()

In [None]:
# Save the best model
# The results row with the highest f_beta decides which projection the final
# model is refitted with.
best_xgboost_results = xgboost_results.sort_values('f_beta', ascending=False).iloc[0]
projection = best_xgboost_results['projection']

if projection == 'PCA':
    reduction = PCA(n_components=best_xgboost_results['n_components'])
elif projection == 'RP':
    reduction = random_projection.SparseRandomProjection(n_components=best_xgboost_results['n_components'])
else:
    # Stop here: continuing would use an undefined `reduction`, or a stale
    # one left over from a previously executed cell.
    raise ValueError('Unknown projection type: {!r}'.format(projection))

# Fit the projection on the training data only, then apply it to the test data.
X_reduced = reduction.fit_transform(X_train_scaled_MinMax)
test_reduced = reduction.transform(X_test_scaled_MinMax)

# `with` guarantees every output file is flushed and closed after writing.
with open('../output/models/reduction_XGBoost.pickle.dat', 'wb') as fp:
    pickle.dump(reduction, fp)

# Refit a fresh classifier; max_depth is the only parameter set explicitly.
XGB = xgb.XGBClassifier()
grid = {'max_depth':10}
XGB.set_params(**grid)

XGB.fit(X_reduced, y_train)

with open('../output/models/XGBoost.pickle.dat', 'wb') as fp:
    pickle.dump(XGB, fp)

# Store the selected results row together with the feature list as JSON.
best_xgboost_results = best_xgboost_results.to_dict()
best_xgboost_results['features'] = features
with open('../output/models/param_XGBoost.json', 'w') as fp:
    json.dump(best_xgboost_results, fp)

In [None]:
# Pass the refitted XGBoost model and the projected test data to the project's
# ROC helper; 'XGB ROC' is presumably used as the plot title -- confirm in
# evaluation.plot_roc_supervised.
evaluation.plot_roc_supervised(XGB, test_reduced, y_test, 'XGB ROC')