# Import Libraries

In [None]:
import sys
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn import random_projection
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score, roc_curve, auc
from sklearn import svm
from sklearn.ensemble import IsolationForest

import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline

from keras.layers import Input, Dense
from keras.models import Model
from keras import regularizers
from keras.optimizers import Adam

from itertools import product
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import pickle
import json


# Notebook-wide configuration: plotting back-ends, pandas display options,
# and access to the project's local modelling modules.

# Initialise plotly's offline notebook mode (connected=False is passed so the
# notebook does not rely on a remote plotly.js — per plotly docs; TODO confirm
# for the installed version).
offline.init_notebook_mode(connected=False)

# Raise the pandas column display limit so wide frames are shown in full.
pd.options.display.max_columns = 999

# Put the modelling toolbox directory first on the import path; presumably this
# is where `metric_processor` and `evaluation` (imported below) live.
sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

# Local modules — these imports must come after the sys.path tweak above.
from metric_processor import MetricProcessor
import evaluation

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Data Preparation

In [None]:
# Columns handed to MetricProcessor as model inputs. The other temporal
# metrics are listed but currently commented out.
input_features = [
    'dimension',
    'size',
    'temporal_dct-mean',
    # 'temporal_gaussian_mse-mean',
    # 'temporal_gaussian_difference-mean',
    # 'temporal_threshold_gaussian_difference-mean',
    # 'temporal_match-mean',
]

# CSV file read by MetricProcessor.
path = '../../machine_learning/cloud_functions/data-large.csv'

metric_processor = MetricProcessor(
    input_features,
    'SL',
    path,
    bins=0,
    scale=True,
    reduced=False,
)
input_df = metric_processor.read_and_process_data(unique_ID=True)

# Rows with attack_ID below 10 are what this notebook treats as the
# "untampered" subset.
untampered_df = input_df.loc[input_df['attack_ID'] < 10]
untampered_df.shape

In [None]:
# Show the first 50 processed rows as a quick sanity check of the load step.
display(input_df.iloc[:50])

# QoE metrics-based classifier

In [None]:
# NOTE: `path` and `metric_processor` are rebound here, replacing the values
# assigned in the data-preparation cell.
path = '../../machine_learning/cloud_functions/data-qoe-large.csv'

# QoE metric columns requested from the QoE CSV.
features_qoe = [
    'dimension',
    'size',
    'temporal_ssim-mean',
    'temporal_psnr-mean',
    'temporal_ssim-euclidean',
    'temporal_psnr-euclidean',
]

metric_processor = MetricProcessor(
    features_qoe,
    'SL',
    path,
    scale=False,
    bins=0,
    reduced=False,
)
df_qoe = metric_processor.read_and_process_data(unique_ID=True)

# Same attack_ID < 10 filter that is applied to the training frame.
untampered_qoe_df = df_qoe.loc[df_qoe['attack_ID'] < 10]

Next, we merge the QoE dataframe with the training dataframe on `unique_ID`, so that each row carries both the training features and the QoE metrics.

In [None]:
# Join the training frame with the QoE frame on their shared `unique_ID`.
# The right-hand side is rebuilt from `df_qoe` (same attack_ID < 10 filter as
# in the QoE load cell) instead of reading `untampered_qoe_df`, so this cell no
# longer consumes its own output and can be re-run safely.
untampered_qoe_df = pd.merge(
    left=untampered_df,
    right=df_qoe[df_qoe['attack_ID'] < 10],
    on='unique_ID',
)

# Columns present in both frames come back suffixed `_x` (left) and `_y`
# (right); restore the plain names for the left-hand (training) copies.
untampered_qoe_df = untampered_qoe_df.rename(columns={
    'attack_ID_x': 'attack_ID',
    'title_x': 'title',
    'attack_x': 'attack',
    'dimension_x': 'dimension',
    'size_x': 'size',
})

# Plot colour per row: red for attack_ID >= 10, green otherwise.
# NOTE(review): both inputs were filtered to attack_ID < 10 above, so every row
# currently ends up 'green' — confirm this is intended.
untampered_qoe_df['color'] = np.where(untampered_qoe_df['attack_ID'] >= 10, 'red', 'green')

# Convert PSNR to a linear value so we can establish a threshold.
# PSNR = 10 * log10(255^2 / MSE)  =>  MSE = 10 ** ((10 * log10(255^2) - PSNR) / 10)
max_error = np.log10(255*255)
untampered_qoe_df['mse'] = 10 ** ((10 * max_error - untampered_qoe_df['temporal_psnr-mean']) / 10)

In [None]:
# Report the merged frame's (rows, columns), then preview its first 20 rows.
print(untampered_qoe_df.shape)
untampered_qoe_df.iloc[:20]

In [None]:
# Regression target column(s).
output_features = ['temporal_ssim-mean']

# Inputs followed by targets: every column the model touches.
model_features = [*input_features, *output_features]
display(untampered_qoe_df.loc[:, model_features].head())


# XGBoost for supervised learning

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Feature matrix and target frame taken from the merged untampered data.
X = untampered_qoe_df.loc[:, input_features]
y = untampered_qoe_df.loc[:, output_features]

# DMatrix over the full dataset, used by the native-API cross-validation.
data_dmatrix = xgb.DMatrix(data=X, label=y)

# 80/20 hold-out split with a fixed seed for the scikit-learn style regressor.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)


In [None]:
# Gradient-boosted regressor configuration.
# `reg_alpha` and `random_state` are the scikit-learn wrapper's own parameter
# names for the L1 penalty and the RNG seed. The previous `alpha` / `seed`
# spellings were forwarded as extra booster kwargs (alongside the wrapper's
# own `reg_alpha`), and `seed` is the deprecated name for `random_state`.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror',
                          n_estimators=1000,
                          learning_rate=0.08,
                          gamma=0,
                          reg_alpha=1,
                          subsample=0.75,
                          colsample_bytree=1,
                          max_depth=7,
                          random_state=42)

In [None]:
# Fit on the training split (fit() returns the fitted estimator, so the call
# can be chained) and predict the held-out split.
preds = xg_reg.fit(X_train, y_train).predict(X_test)

In [None]:
# Root-mean-squared error of the hold-out predictions.
mse_holdout = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse_holdout)
print(f"RMSE: {rmse:f}")

In [None]:
# Hyper-parameters mirroring the XGBRegressor configured earlier.
params = {"objective":"reg:squarederror",
          'colsample_bytree': 1,
          'n_estimators': 1000,
          'learning_rate': 0.08,
          'max_depth': 7,
          'gamma': 0,
          'alpha': 1}

# `n_estimators` is a scikit-learn wrapper argument; the native `xgb.cv` API
# does not use it and takes the number of boosting rounds from
# `num_boost_round`. Strip it from the booster params and pass it where it is
# honoured, so the declared 1000 rounds actually apply (early stopping keeps
# the run short when the test RMSE stops improving).
# NOTE(review): the regressor above also uses subsample=0.75, which is not part
# of these CV params — confirm whether the CV should match it.
cv_params = {name: value for name, value in params.items() if name != 'n_estimators'}

cv_results = xgb.cv(dtrain=data_dmatrix,
                    params=cv_params,
                    nfold=5,
                    num_boost_round=params['n_estimators'],
                    early_stopping_rounds=10,
                    metrics="rmse",
                    as_pandas=True,
                    seed=100)


In [None]:

# Mean test RMSE of the last boosting round kept in the CV results.
print(cv_results["test-rmse-mean"].iloc[-1:])

In [None]:
# Set the figure size BEFORE plotting: rcParams only affect figures created
# afterwards, so setting it after plot_importance() left the first run of this
# cell at the default size.
plt.rcParams['figure.figsize'] = [5, 5]

# Feature-importance bar chart for the fitted regressor.
xgb.plot_importance(xg_reg)
plt.show()