In [1]:
# let's put everything together in this code
import numpy as np
%matplotlib inline 
import pandas as pd
import tensorflow as tf
from IPython.display import display, Image
import cv2
from PIL import Image
from tqdm import tqdm
from keras.applications.vgg19 import VGG19
from keras.utils.vis_utils import plot_model
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
from yaml import ScalarEvent
import seaborn as sns
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error
from scipy.optimize import minimize
import statsmodels.tsa.api as smt
import statsmodels.api as sm
from tqdm import tqdm_notebook
from itertools import product
import ipywidgets as widgets
import warnings
from ipywidgets import fixed, Box, Layout
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from datetime import datetime
from datetime import timedelta
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from time import time
import seaborn as sns
sns.set(style="whitegrid")

# np.random.seed() returns None, so the seed value is stored first and then
# applied; RANDOM_SEED now really holds the seed that was used.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)


In [2]:
# let's define all the functions that I will be using

### STATISTICS FUNCTIONS ###


def build_dataset(dataset):
    """Load the CPU series of the selected dataset together with its plot labels.

    Parameters
    ----------
    dataset : str
        "Azure CPU Usage", "BitBrains" or "KSA". Any other value (for example
        the blank dropdown entry) selects nothing.

    Returns
    -------
    tuple
        ``(df_value, x_axis_name, y_axis_name, title)``. For a known dataset
        ``df_value`` is the CPU column read from the dataset file; for an
        unknown name the tuple is ``([], "", "", "")``.

    Side effects: reads the dataset file from a path relative to the working
    directory and prints the first rows of the trimmed frame.
    """
    if dataset == "Azure CPU Usage":
        df = pd.read_csv("./Azure_Dataset/azure.csv")
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.set_index('timestamp').drop(columns=['min cpu', 'max cpu'])
        print(df.head())
        df_value = df["avg cpu"]
        x_axis_name = "Timestamp"
        title = "Azure Dataset CPU Utilization"
        y_axis_name = "CPU Utilization [HZ]"
    elif dataset == "BitBrains":
        # `sep` must be passed by keyword: pandas 2.x made every read_csv
        # argument after the path keyword-only. A multi-character separator is
        # only supported by the python engine, so it is requested explicitly.
        df = pd.read_csv("./BitBrains_Dataset/1.csv", sep=";\t", engine="python")
        # NOTE(review): to_datetime reads plain integers as nanoseconds;
        # confirm the unit actually stored in 'Timestamp [ms]'.
        df['Timestamp [ms]'] = pd.to_datetime(df['Timestamp [ms]'])
        df = df.set_index('Timestamp [ms]').drop(columns=[
            'CPU cores',
            'CPU usage [%]',
            'CPU capacity provisioned [MHZ]',
            'Memory capacity provisioned [KB]',
            'Memory usage [KB]',
            'Disk read throughput [KB/s]',
            'Disk write throughput [KB/s]',
            'Network received throughput [KB/s]',
            'Network transmitted throughput [KB/s]',
        ])
        print(df.head())
        df_value = df["CPU usage [MHZ]"]
        x_axis_name = "Timestamp"
        title = "BitBrains Dataset CPU Utilization"
        y_axis_name = "CPU Utilization [MHZ]"
    elif dataset == "KSA":
        df = pd.read_excel("./KSA_Dataset/KSA_test.xlsx")
        df = df.drop(columns=['Response Time', 'Class Name'])
        print(df.head())
        df_value = df["CPU Utilization"]
        x_axis_name = "Timestamp"
        title = "KSA Dataset CPU Utilization"
        y_axis_name = "CPU Utilization [%]"
    else:
        # Unknown selection: callers test x_axis_name == "" to detect this.
        df_value = []
        x_axis_name = ""
        y_axis_name = ""
        title = ""

    return df_value, x_axis_name, y_axis_name, title

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences.

    Both inputs are converted to float arrays first, so plain lists are
    accepted and pandas objects are compared position by position rather than
    aligned on their index labels.

    The result is ``inf`` or ``nan`` when ``y_true`` contains zeros, because
    each error is divided by the corresponding true value.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

def eda(df_value, x_axis_name, y_axis_name, title):
    """Show a single 17x8 line plot of `df_value` with the given labels and no grid."""
    _, axis = plt.subplots(figsize=(17, 8))
    axis.plot(df_value)
    axis.set_title(title)
    axis.set_xlabel(x_axis_name)
    axis.set_ylabel(y_axis_name)
    axis.grid(False)
    plt.show()

def plot_moving_average(series, window):
    """Plot the rolling mean of `series` over `window` points against the data.

    The raw values are drawn from position `window` onwards, half transparent,
    under the red rolling-mean curve. The function always returns the
    constant 10.
    """
    smoothed = series.rolling(window=window).mean()
    observed_tail = series[window:]

    plt.figure(figsize=(17,8))
    plt.title('Moving average\n window size = {}'.format(window))
    plt.plot(smoothed, 'r', label='Rolling mean trend')
    plt.plot(observed_tail, alpha=0.5, label='Actual values')
    plt.legend(loc='best')
    plt.grid(True)
    return 10


def exponential_smoothing(series, alpha):
    """Return the exponentially smoothed values of `series` as a list.

    The first output equals the first observation; each later output is
    ``alpha * observation + (1 - alpha) * previous_output``.

    The input is read by position through a NumPy view, so a pandas Series
    works whatever its index is (timestamps, or integers not starting at 0).
    An empty input yields an empty list.
    """
    values = np.asarray(series)
    if values.size == 0:
        return []

    result = [values[0]]  # first value is same as series
    for observation in values[1:]:
        result.append(alpha * observation + (1 - alpha) * result[-1])
    return result
  
def plot_exponential_smoothing(series, alpha):
    """Plot the exponentially smoothed curve for `alpha` over the actual values (cyan)."""
    plt.figure(figsize=(17, 8))
    smoothed = exponential_smoothing(series, alpha)
    curve_label = "Alpha {}".format(alpha)
    plt.plot(smoothed, label=curve_label)
    plt.plot(series.values, "c", label = "Actual")
    plt.legend(loc="best")
    plt.axis('tight')
    plt.title("Exponential Smoothing")
    plt.grid(True)



def parser(s, frmat='%Y-%m-%d %H:%M:%S'):
    """Parse the timestamp string `s` into a datetime using the strptime pattern `frmat`."""
    parsed = datetime.strptime(s, frmat)
    return parsed


def plot_stl_decomposition(dataset, period=60*24):
    """Plot an additive seasonal decomposition of the selected dataset.

    Parameters
    ----------
    dataset : str
        "Azure CPU Usage" or "BitBrains". Any other value (including "KSA")
        is not handled and makes the function return without plotting.
    period : int, optional
        Number of observations per seasonal cycle passed to
        ``seasonal_decompose``. Defaults to ``60*24``, the value that was
        previously hard-coded.
        NOTE(review): the notebook text says the Azure data is sampled every
        5 minutes, in which case one day would be 288 points; confirm the
        intended season length.

    Returns
    -------
    None
    """
    if dataset == "Azure CPU Usage":
        # Parse the index after loading instead of through read_csv's
        # `date_parser`, which pandas deprecated in 2.0.
        df = pd.read_csv("./Azure_Dataset/azure.csv", index_col=0)
        df.index = pd.to_datetime(df.index, format='%Y-%m-%d %H:%M:%S')
        df = df.asfreq(pd.infer_freq(df.index))
        df = df.drop(columns=['min cpu', 'max cpu'])
    elif dataset == "BitBrains":
        # `sep` by keyword: positional use is rejected by pandas 2.x.
        df = pd.read_csv("./BitBrains_Dataset/1.csv", sep=";\t", engine="python")
        df['Timestamp [ms]'] = pd.to_datetime(df['Timestamp [ms]'])
        df = df.set_index('Timestamp [ms]').drop(columns=[
            'CPU cores',
            'CPU usage [%]',
            'CPU capacity provisioned [MHZ]',
            'Memory capacity provisioned [KB]',
            'Memory usage [KB]',
            'Disk read throughput [KB/s]',
            'Disk write throughput [KB/s]',
            'Network received throughput [KB/s]',
            'Network transmitted throughput [KB/s]',
        ])
    else:
        return None

    plt.rc('figure',figsize=(14,8))
    plt.rc('font',size=15)

    result = seasonal_decompose(df, model='additive', period=period)
    result.plot()
    

def tsplot(y, lags=None):
    """Draw the series with its ACF and PACF in a 2x2 grid ('bmh' style).

    The top panel spans both columns and carries the Dickey-Fuller p-value in
    its title; the bottom row holds the autocorrelation (left) and partial
    autocorrelation (right) plots for the requested `lags`. Non-Series input
    is wrapped in a pandas Series first.
    """
    series = y if isinstance(y, pd.Series) else pd.Series(y)

    with plt.style.context(style='bmh'):
        plt.figure(figsize=(12,7))
        grid_shape = (2,2)
        series_ax = plt.subplot2grid(grid_shape, (0,0), colspan=2)
        autocorr_ax = plt.subplot2grid(grid_shape, (1,0))
        partial_ax = plt.subplot2grid(grid_shape, (1,1))

        series.plot(ax=series_ax)
        adf_result = sm.tsa.stattools.adfuller(series)
        p_value = adf_result[1]
        series_ax.set_title('Time Series Analysis Plots\n Dickey-Fuller: p={0:.5f}'.format(p_value))
        smt.graphics.plot_acf(series, lags=lags, ax=autocorr_ax)
        smt.graphics.plot_pacf(series, lags=lags, ax=partial_ax)
        plt.tight_layout()

### SARIMA ###

# Train many SARIMA models to find the best set of parameters

def optimize_SARIMA(parameters_list, d, D, s, data):
    """
        Return dataframe with parameters and corresponding AIC

        parameters_list - list with (p, q, P, Q) tuples
        d - integration order
        D - seasonal integration order
        s - length of season
        data - series the SARIMAX models are fitted on

        The returned frame has the columns 'parameters' and 'aic', sorted by
        ascending AIC (lower is better). Parameter sets whose fit raises are
        skipped; if every fit fails the frame is empty but still has both
        columns.
    """
    results = []

    for param in tqdm_notebook(parameters_list):
        try:
            model = sm.tsa.statespace.SARIMAX(data, order=(param[0], d, param[1]),
                                              seasonal_order=(param[2], D, param[3], s)).fit(disp=-1)
        except Exception:
            # Some orders do not fit; skip them. Catching Exception rather
            # than using a bare except keeps Ctrl-C / kernel interrupt
            # working during a long grid search.
            continue

        results.append([param, model.aic])

    # Naming the columns at construction also works when `results` is empty.
    result_table = pd.DataFrame(results, columns=['parameters', 'aic'])
    # Sort in ascending order, lower AIC is better
    return result_table.sort_values(by='aic', ascending=True).reset_index(drop=True)


def train_SARIMA(d, D, s, data):
    """Grid-search p, q, P, Q in {0, 1}, refit the lowest-AIC SARIMAX and print its summary.

    d, D and s (integration order, seasonal integration order, season length)
    are passed through unchanged to every candidate model.
    """
    # Each of the four searched orders takes the values 0 and 1.
    candidate_orders = range(0, 2)
    parameters_list = list(
        product(candidate_orders, candidate_orders, candidate_orders, candidate_orders)
    )

    result_table = optimize_SARIMA(parameters_list, d, D, s, data)

    # First row holds the parameters with the lowest AIC (Akaike Information Criteria)
    p, q, P, Q = result_table.parameters[0]

    best_model = sm.tsa.statespace.SARIMAX(
        data,
        order=(p, d, q),
        seasonal_order=(P, D, Q, s),
    ).fit(disp=-1)

    print(best_model.summary())


def sarima(dataset):
    """Fit a fixed SARIMA(1,1,1)x(1,1,0,12) on the selected dataset and report its fit.

    Loads the series through ``build_dataset``, fits the model on the whole
    series, plots actual against in-sample predictions and prints RMSE, MAE
    and MAPE. Returns None, and does nothing for an unknown dataset name.

    Note: the MAPE is ``inf``/``nan`` when the series contains zeros, since
    each error is divided by the actual value.
    """
    df_value, x_axis_name, y_axis_name, title = build_dataset(dataset)
    if x_axis_name == "" :
        # build_dataset signals an unknown dataset with empty labels.
        return None

    sarima_model = SARIMAX(df_value,
                           order=(1,1,1),
                           seasonal_order=(1,1,0,12))
    print("training started ...")
    predictions = sarima_model.fit().predict()

    plt.figure(figsize=(16,4))
    plt.plot(df_value, label="Actual")
    plt.plot(predictions, label="Predicted")
    plt.title('SARIMA prediction on selected dataset', fontsize=20)
    plt.ylabel('CPU Usage', fontsize=16)
    plt.legend()
    plt.axis('tight')
    plt.grid(True)

    testScore_1 = math.sqrt(mean_squared_error(df_value, predictions))
    print('Test Score: %.2f RMSE' % (testScore_1))

    # MAE is reported as-is; taking its square root (as was done before)
    # produced a number that is not the mean absolute error.
    testScore_2 = mean_absolute_error(df_value, predictions)
    print('Test Score: %f MAE' % (testScore_2))

    testScore_3 = np.mean(np.abs(predictions - df_value)/np.abs(df_value)*100)
    print('Test Score: %f MAPE' % (testScore_3))

### SARIMAOVER ###

def plot_statistics(dataset):
    """Show the overview plot, the STL decomposition and three interactive panels.

    Returns the loaded series, or None when `dataset` is not a known name.
    Relies on the module-level sliders `window_slider`, `alpha_slider` and
    `lags_slider` existing when it is called.
    """
    df_value, x_axis_name, y_axis_name, title = build_dataset(dataset)
    if x_axis_name == "" :
        return None

    eda(df_value, x_axis_name, y_axis_name, title)

    moving_average_panel = widgets.interactive(
        plot_moving_average, series=fixed(df_value), window=window_slider)
    smoothing_panel = widgets.interactive(
        plot_exponential_smoothing, series=fixed(df_value), alpha=alpha_slider)
    tsplot_panel = widgets.interactive(
        tsplot, y=fixed(df_value), lags=lags_slider)

    plot_stl_decomposition(dataset)

    # NOTE(review): `titles` is forwarded to VBox exactly as before; confirm
    # the installed ipywidgets version accepts it on a VBox.
    panels = widgets.VBox(
        children=[moving_average_panel, smoothing_panel, tsplot_panel],
        titles=('Moving Average', 'Exponential Smoothing', 'TSPLOT'),
    )
    display(panels)
    return df_value
    
def f(a, b, c):
    """Placeholder that ignores its three arguments and returns None."""
    return None

### STATISTICS FUNCTIONS OVER ###

### ML FUNCTIONS ###

def train_generator(dataset, n_lags=1):
    """Turn the first column of `dataset` into sliding-window samples.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Only its first column is used.
    n_lags : int
        Number of consecutive values per input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X[i]`` holds values ``i .. i+n_lags-1`` and ``y[i]``
        is value ``i+n_lags``. ``X`` has shape ``(n_samples, n_lags)`` and
        ``y`` shape ``(n_samples,)`` with
        ``n_samples = max(len(dataset) - n_lags - 1, 0)``.

    When the frame is too short for a single window, empty arrays of the
    right shape are returned instead of raising.
    """
    values = dataset.iloc[:, 0].to_numpy(dtype=float)

    # Same sample count as before (the last possible window is not used), so
    # array shapes seen by existing callers are unchanged.
    n_samples = max(len(values) - n_lags - 1, 0)

    # Row i of the index matrix is [i, i+1, ..., i+n_lags-1]; fancy indexing
    # gathers all windows at once instead of copying element by element.
    window_index = np.arange(n_samples)[:, None] + np.arange(n_lags)[None, :]
    new_dataX = values[window_index]
    new_dataY = values[n_lags:n_lags + n_samples].copy()

    return (new_dataX, new_dataY)


def pre_process(dataset_name, timesteps, train_test_split) :
    """Load a dataset and build the windowed train/test arrays.

    Parameters
    ----------
    dataset_name : str
        "Azure CPU Usage", "BitBrains" or "KSA".
    timesteps : int
        Window length handed to ``train_generator`` as ``n_lags``.
    train_test_split : float
        Fraction of rows (from the start) used for training.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_test_scaled, y_test_scaled, scaler)``.
        ``X_train``/``y_train`` and the ``*_scaled`` test arrays are
        standardised with the training mean/std and then min-max scaled;
        ``X_test``/``y_test`` are standardised only. ``scaler`` is the
        MinMaxScaler fitted on the training split.

    Raises
    ------
    ValueError
        If `dataset_name` is not one of the supported names.
    """
    # let's load the datasets
    if dataset_name == "Azure CPU Usage" :
        df = pd.read_csv("./Azure_Dataset/azure.csv")
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.set_index('timestamp').drop(columns=['min cpu', 'max cpu'])

    elif dataset_name == "BitBrains" :
        # `sep` by keyword: positional use is rejected by pandas 2.x.
        df = pd.read_csv("./BitBrains_Dataset/1.csv", sep=";\t", engine="python")
        df['Timestamp [ms]'] = pd.to_datetime(df['Timestamp [ms]'])
        df = df.set_index('Timestamp [ms]').drop(columns=[
            'CPU cores',
            'CPU usage [%]',
            'CPU capacity provisioned [MHZ]',
            'Memory capacity provisioned [KB]',
            'Memory usage [KB]',
            'Disk read throughput [KB/s]',
            'Disk write throughput [KB/s]',
            'Network received throughput [KB/s]',
            'Network transmitted throughput [KB/s]',
        ])

    elif dataset_name ==  "KSA" :
        # NOTE(review): train_generator only reads the first remaining column;
        # confirm that is the CPU column for this file.
        df = pd.read_excel("./KSA_Dataset/KSA_test.xlsx")
        df = df.drop(columns=['Response Time', 'Class Name'])

    else :
        # Previously this only printed a message and then crashed on the
        # undefined `df`; fail immediately with a clear error instead.
        raise ValueError(
            "Wrong dataset name {!r}, please choose from : "
            "Azure CPU Usage, BitBrains and KSA".format(dataset_name)
        )

    # create train test split (chronological, no shuffling)
    train_length = round(len(df)*train_test_split)
    train = df.iloc[0:train_length]
    test = df.iloc[train_length:]

    # standardise both splits with statistics of the training split only
    train_mean = train.mean()
    train_std = train.std()

    train = (train - train_mean) / train_std
    test = (test - train_mean) / train_std

    # Scale to [0, 1]. The scaler is fitted on the training split and merely
    # applied to the test split: re-fitting on test data would leak test
    # statistics and make the returned scaler inconsistent with the scale the
    # model was trained on.
    scaler = MinMaxScaler(feature_range = (0,1))
    train_scaled = pd.DataFrame(scaler.fit_transform(train), columns=df.columns)
    test_scaled = pd.DataFrame(scaler.transform(test), columns=df.columns)

    X_train, y_train = train_generator(train_scaled, n_lags = timesteps)
    X_test_scaled, y_test_scaled = train_generator(test_scaled, n_lags=timesteps)
    X_test, y_test = train_generator(test, n_lags=timesteps)

    return X_train, y_train, X_test, y_test, X_test_scaled, y_test_scaled, scaler



# Names accepted by train_models.
_SUPPORTED_MODEL_NAMES = ("NN", "GRU", "LSTM")


def _build_forecaster(model_name, n_lags):
    """Return the untrained Keras network for `model_name` over windows of `n_lags` values."""
    if model_name == "NN":
        network = tf.keras.Sequential()
        network.add(tf.keras.layers.Dense(32, input_dim=n_lags, activation='relu'))
        network.add(tf.keras.layers.Dense(32, activation='relu'))
        network.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        return network

    # GRU and LSTM share the same two-layer recurrent architecture.
    recurrent_layer = tf.keras.layers.GRU if model_name == "GRU" else tf.keras.layers.LSTM
    network = tf.keras.models.Sequential()
    network.add(recurrent_layer(512, input_shape=(n_lags, 1), return_sequences=True))
    network.add(recurrent_layer(512, return_sequences=False))
    network.add(tf.keras.layers.Dense(1))
    return network


def _plot_training_history(history):
    """Plot training/validation loss and, when it was recorded, the learning rate."""
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    # The second curve is the validation split, not a test set.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

    # The history key for the learning rate differs between Keras versions.
    learning_rates = history.history.get('lr', history.history.get('learning_rate'))
    if learning_rates is not None:
        plt.plot(learning_rates)
        plt.title('Learning Rate')
        plt.ylabel('LR')
        plt.xlabel('epoch')
        plt.show()


def train_models(list_of_models=[], print_summary=False, X_train=[],
                    y_train=[], loss_function="mean_absolute_error",
                    optimizer=None, epochs=200, validation_split=0.25,
                    batch_size=256, verbose=1, save__model_path="") :
    """Build, train and return a forecasting network.

    This function takes as input :
    - the models that we want to train (3 options: "NN", "GRU", "LSTM")
    - the parameters of the training
    - outputs training results and graphs

    Only the first entry of `list_of_models` is trained: the function returns
    as soon as one model has been fitted.

    Parameters
    ----------
    list_of_models : list of str
        Model names; the first one is used.
    print_summary : bool
        Print the Keras model summary before training.
    X_train, y_train : numpy.ndarray
        Training windows and targets; ``X_train.shape[1]`` is the window length.
    loss_function : str
        Keras loss identifier.
    optimizer : keras optimizer or None
        Optimizer to compile with. ``None`` (default) creates a fresh
        ``Adam()`` per call; an instance built once in the signature would be
        shared by every model trained in the session.
    epochs, validation_split, batch_size, verbose :
        Forwarded to ``model.fit`` (data is not shuffled).
    save__model_path : str
        When non-empty, the trained model is saved there.

    Returns
    -------
    The trained Keras model, or None when the list is empty or its first name
    is not supported (a message is printed in the latter case).
    """
    for model in list_of_models :
        if model not in _SUPPORTED_MODEL_NAMES :
            print("Model should be between the following : NN, GRU, LSTM")
            return None

        network = _build_forecaster(model, X_train.shape[1])

        # summary() prints by itself; honour the flag for every model type.
        if print_summary :
            network.summary()

        if optimizer is None :
            optimizer = tf.keras.optimizers.Adam()
        network.compile(loss=loss_function, optimizer=optimizer)

        es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=8, verbose=1, restore_best_weights=True)
        lr_red = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, min_lr=0.0000001)

        history = network.fit(X_train, y_train, epochs=epochs, validation_split=validation_split,
                              batch_size=batch_size, verbose=verbose,
                              shuffle=False,
                              callbacks=[es, lr_red])

        if save__model_path != "" :
            network.save(save__model_path)

        # let's print some graphs
        _plot_training_history(history)

        return network

    return None


def test_models(model, y_test, X_test_scaled, scaler):
    """Predict on the test windows, plot against the targets and print the scores.

    This function takes as input :
    - the model
    - the test set
    - outputs the test scores and the graphs we want

    Parameters
    ----------
    model : object with ``predict``
        Trained model.
    y_test : array-like
        Targets the predictions are compared with.
    X_test_scaled : array-like
        Scaled test windows fed to the model.
    scaler : object with ``inverse_transform``
        Used to map the predictions back before comparison.

    Prints RMSE, MAE and MAPE and returns None. The MAPE is ``inf``/``nan``
    if `y_test` contains zeros.
    """
    preds = scaler.inverse_transform(model.predict(X_test_scaled))

    # Flatten both sides to 1-D. A column of predictions subtracted from a
    # flat target vector broadcasts to an n x n matrix, which made the MAPE
    # meaningless.
    preds = np.asarray(preds).reshape(-1)
    y_true = np.asarray(y_test).reshape(-1)

    plt.rcParams["figure.figsize"] = (32,12)
    TestY = pd.DataFrame(y_true, columns=['avg_cpu'])
    PredY = pd.DataFrame(preds, columns=['avg_cpu'])

    plt.figure(1)
    plt.plot(TestY['avg_cpu'], label='Actual')
    plt.plot(PredY['avg_cpu'], label='Predicted')
    plt.legend()
    plt.show()

    testScore_1 = math.sqrt(mean_squared_error(y_true, preds))
    print('Test Score: %.2f RMSE' % (testScore_1))

    # MAE is reported directly; it was previously square-rooted by mistake.
    testScore_2 = mean_absolute_error(y_true, preds)
    print('Test Score: %f MAE' % (testScore_2))

    testScore_3 = np.mean(np.abs(preds - y_true)/np.abs(y_true)*100)
    print('Test Score: %f MAPE' % (testScore_3))


def plot_ml(dataset, n_lags, model_name, print_summary, loss_function, epochs):
    """Run the full ML pipeline: pre-process, train one model, save it, evaluate it.

    Uses an 80/20 chronological split, a fresh Adam optimizer, a 0.25
    validation split and a batch size of 256. The trained model is written to
    "./model_<model_name>".
    """
    (X_train, y_train, X_test, y_test,
     X_test_scaled, y_test_scaled, scaler) = pre_process(dataset, n_lags, 0.8)

    # let's train the models
    model = train_models(
        list_of_models=[model_name],
        print_summary=print_summary,
        X_train=X_train,
        y_train=y_train,
        loss_function=loss_function,
        optimizer=tf.keras.optimizers.Adam(),
        epochs=epochs,
        validation_split=0.25,
        batch_size=256,
        verbose=1,
        save__model_path="",
    )
    model.save("./model_"+model_name)

    test_models(model, y_test, X_test_scaled, scaler)


### ML FUNCTIONS OVER ###

def choose_vars(dataset, n_lags, model_name, print_summary, loss_function, epochs):
    """No-op target for the ML option widgets; accepts the six settings and returns None."""
    return None


In [3]:
# let's create all the widgets here :

# Dataset pickers. The statistics and SARIMA pickers start on a blank entry so
# nothing is loaded until the user chooses; the ML picker starts on 'KSA'.
dataset_selection = widgets.Dropdown(
    description='Choose the dataset from the following list :',
    options=[' ', 'Azure CPU Usage', 'BitBrains', 'KSA'],
    value=' ',
    style={'description_width': 'initial'},
    disabled=False,
)

dataset_selection_ml = widgets.Dropdown(
    description='Choose the dataset from the following list :',
    options=['Azure CPU Usage', 'BitBrains', 'KSA'],
    value='KSA',
    style={'description_width': 'initial'},
    disabled=False,
)

dataset_selection_sarima = widgets.Dropdown(
    description='Choose the dataset from the following list :',
    options=[' ', 'Azure CPU Usage', 'BitBrains', 'KSA'],
    value=' ',
    style={'description_width': 'initial'},
    disabled=False,
)

# Sliders driving the interactive statistics panels. Both only fire when the
# handle is released (continuous_update=False).
window_slider = widgets.IntSlider(
    description='Window size for moving average computation',
    value=60,
    min=0,
    max=200,
    step=20,
    readout=True,
    readout_format='d',
    orientation='horizontal',
    continuous_update=False,
    disabled=False,
    style={'description_width': 'initial'},
    layout=Layout(width='60%', height='100px'),
)

lags_slider = widgets.IntSlider(
    description='Lags',
    value=30,
    min=0,
    max=100,
    step=10,
    readout=True,
    readout_format='d',
    orientation='horizontal',
    continuous_update=False,
    disabled=False,
    style={'description_width': 'initial'},
    layout=Layout(width='60%', height='100px'),
)

# Smoothing factor for the exponential-smoothing panel, 0.0 to 1.0 in 0.1 steps.
alpha_slider = widgets.FloatSlider(
    value=0.5,
    min=0,
    max=1.0,
    step=0.1,
    description='Alpha parameter for exponential smoothing',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    style={'description_width': 'initial'},
    # One decimal to match the 0.1 step; the integer format 'd' used before
    # hid the fractional part of alpha in the readout.
    readout_format='.1f',
    layout=Layout(width='60%', height='100px')
)

# Pickers for the SARIMA d, D and s orders (string values '1'..'5').
sarima_d_drpdwn = widgets.Dropdown(
    description='d value :',
    options=['1', '2', '3', '4', '5'],
    value='2',
    disabled=False,
    style={'description_width': 'initial'},
)

sarima_D_drpdwn = widgets.Dropdown(
    description='D value :',
    options=['1', '2', '3', '4', '5'],
    value='2',
    disabled=False,
    style={'description_width': 'initial'},
)

sarima_s_drpdwn = widgets.Dropdown(
    description='s value :',
    options=['1', '2', '3', '4', '5'],
    value='2',
    disabled=False,
    style={'description_width': 'initial'},
)

# Option widgets for the machine-learning section.
model_drpdown = widgets.Dropdown(
    description='Choose model :',
    options=['NN', 'GRU', 'LSTM'],
    value='NN',
    disabled=False,
    style={'description_width': 'initial'},
)

print_summary_checkbox = widgets.Checkbox(
    description='Print Summary',
    value=False,
    indent=False,
    disabled=False,
    style={'description_width': 'initial'},
)

loss_function_drpdown = widgets.Dropdown(
    description='Choose loss function :',
    options=['mean_absolute_error'],
    value='mean_absolute_error',
    disabled=False,
    style={'description_width': 'initial'},
)

# Free-text field: its value is a string and is converted with int() by the
# cell that launches training.
epochs_number = widgets.Text(
    description='Epochs :',
    value='200',
    placeholder='Type something',
    disabled=False,
    style={'description_width': 'initial'},
)

# Window length (number of past values per training sample).
n_lags_slider = widgets.IntSlider(
    value=10,
    # Start at 10 rather than 0: a zero-length window produces inputs with no
    # features, which none of the models can be built for.
    min=10,
    max=100,
    step=10,
    description='Window size for dataset building',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    style={'description_width': 'initial'},
    readout_format='d'
)

# GTM FY22 : IT Capacity Planning


## Introduction 


The goal of this project is to complete a study that will showcase the use of analytics and AI in order to forecast IT servers usage, as well as potentially predict capacity needs.

Our study was done on three different datasets from three different sources : 

1. Microsoft Azure : this first dataset contains the CPU Usage data of machines running Microsoft Azure services, sampled every 5 minutes. The data has three attributes : 
    - Max CPU Utilization
    - Average CPU Utilization
    - Minimum CPU Utilization
    
    In this study, we only focused on the average CPU utilization, but of course this could be extended to any kind of usage metric. A further study could also use all three values together to predict a single future value.

<img src="Images/azure.png" alt="Azure Dataset" style="height: 100px; width:180px;"/>

2. GWA-T-12 Bitbrains : the dataset contains the performance metrics of 1,750 VMs from a distributed datacenter from Bitbrains, which is a service provider that specializes in managed hosting and business computation for enterprises. Customers include many major banks (ING), credit card operators (ICS), insurers (Aegon), etc. Bitbrains hosts applications used in the solvency domain; examples of application vendors are Towers Watson and Algorithmics. These applications are typically used for financial reporting, which is used predominately at the end of financial quarters.

    In this study, we only focus on the results of single machines. Typically, a prediction model is built for one single machine, since different machines could be running different programs with different configurations, and these are not mentioned in the dataset. The following was also tested: training a model on a specific machine and predicting the workload of another one. This also worked, although not with the best results, and it could definitely be an interesting field for future work.

<img src="Images/TUDLogo.png" alt="TU Delft" style="height: 100px; width:200px;"/>

- Here are the metrics measured in the dataset (in this study we only use and focus on the CPU usage in terms of MHZ) : 
    - Timestamp: number of milliseconds since 1970-01-01.
    - CPU cores: number of virtual CPU cores provisioned.
    - CPU capacity provisioned (CPU requested): the capacity of the CPUs in terms of MHZ, it equals to number of cores x speed per core.
    - CPU usage: in terms of MHZ.
    - CPU usage: in terms of percentage
    - Memory provisioned (memory requested): the capacity of the memory of the VM in terms of KB.
    - Memory usage: the memory that is actively used in terms of KB.
    - Disk read throughput: in terms of KB/s
    - Disk write throughput: in terms of KB/s
    - Network received throughput: in terms of KB/s
    - Network transmitted throughput: in terms of KB/s

3. KSA Ministry of Finance : this dataset was collected from the KSA Ministry of Finance and contains 28,147 instances from 13 cloud nodes. It was recorded during the period from March 1, 2016, to February 20, 2017, in continuous time slots. The data represent the performance of the servers implemented in the institution. The following metrics were measured : 
    - Number of Jobs in a minute/5min/15min
    - Memory Capacity
    - Disk Capacity
    - Number of CPU Cores
    - CPU speed per Core
    - Average receive for network bandwidth in Kbps
    - Average transmit for network bandwidth in Kbps
    - Memory utilization in percent
    - CPU utilization in percent
    - Response Time in milliseconds

    It is clear that many of these metrics could be useful for this study. Here, we only focused on the CPU, as it is the metric we decided to build our study around. This could definitely be complemented by an inside-out prediction: for example, using the external parameters (number of jobs, memory capacity, network bandwidth ...) to predict the inner values (CPU and memory utilization).



## Statistics

In this part, you can choose a dataset from the dropdown below and different graphs will appear to showcase data and metadata about the dataset.

Before jumping into these results, let us go over the definitions of the graphs and metrics shown below : 

1. An overview of the data (couple of rows and a graph representation)

2. Moving Average : In statistics, a moving average is a calculation used to analyze data points by creating a series of averages of different subsets of the full data set (of a fixed window size)

3. Exponential Smoothing : Exponential smoothing is a rule of thumb technique for smoothing time series data using the exponential window function. Whereas in the simple moving average the past observations are weighted equally, exponential functions are used to assign exponentially decreasing weights over time

4. Dickey-Fuller test : Named for American statisticians David Dickey and Wayne Fuller, who developed the test in 1979, the Dickey-Fuller test is used to determine whether a unit root (a feature that can cause issues in statistical inference) is present in an autoregressive model.


In [4]:
# Re-run plot_statistics every time a dataset is picked in the dropdown.
plot_stats = widgets.interactive(
    plot_statistics,
    dataset=dataset_selection,
)
display(plot_stats)

interactive(children=(Dropdown(description='Choose the dataset from the following list :', options=(' ', 'Azur…

## SARIMA

In this section, the SARIMA prediction algorithm is implemented.

Let's go over the theoretical part of ARIMA and SARIMA : 

- ARIMA : the ARIMA model is a class of linear models that uses historical values to forecast future values. ARIMA stands for Autoregressive Integrated Moving Average; each of these three techniques contributes to the final forecast. Let's go through them one by one.
    - Autoregressive : we forecast the variable of interest using a linear combination of past values of that variable
    - Integrated : represents any differencing that has to be applied in order to make the data stationary
    - Moving Average : Moving average models uses past forecast errors rather than past values in a regression-like model to forecast future values
 

- SARIMA : SARIMA stands for Seasonal-ARIMA and it includes seasonality contribution to the forecast. The importance of seasonality is quite evident and ARIMA fails to encapsulate that information implicitly. The Autoregressive (AR), Integrated (I), and Moving Average (MA) parts of the model remain as that of ARIMA. The addition of Seasonality adds robustness to the SARIMA model.

In [5]:
# Fit and plot the SARIMA model every time a dataset is picked in the dropdown.
plot_sarima = widgets.interactive(
    sarima,
    dataset=dataset_selection_sarima,
)
display(plot_sarima)

interactive(children=(Dropdown(description='Choose the dataset from the following list :', options=(' ', 'Azur…

## Machine Learning

In this section, we implemented everything related to ML in this study. You can first choose a dataset below, the model that you would like to train on this dataset as well as different parameters.

The training and testing results/metrics will be printed out then!

For results metric, we use : 
- RMSE : Root Mean Squared Error is the square root of the Mean Squared Error (MSE). The MSE is the average of the squared differences between the forecasted and the actual values; squaring removes the sign of each error, since errors can be positive or negative. Taking the square root brings the metric back to the same unit as the data.


- MAPE : Mean Absolute Percentage Error measures how accurate a forecast system is, expressed as a percentage. It is the average, over all time periods, of the absolute difference between the forecast and the actual value, divided by the actual value.


- MAE : The MAE measures the average magnitude of the errors in a set of forecasts, without considering their direction. It measures accuracy for continuous variables. The equation is given in the library references. Expressed in words, the MAE is the average over the verification sample of the absolute values of the differences between forecast and the corresponding observation. The MAE is a linear score which means that all the individual differences are weighted equally in the average


In [6]:
# Group the ML option widgets in one interactive container.
# NOTE: this rebinds the name `choose_vars` from the function to the widget,
# so re-running this cell requires re-running the function-definition cell first.
choose_vars = widgets.interactive(
    choose_vars,
    dataset=dataset_selection_ml,
    model_name=model_drpdown,
    n_lags=n_lags_slider,
    print_summary=print_summary_checkbox,
    loss_function=loss_function_drpdown,
    epochs=epochs_number,
)
display(choose_vars)

interactive(children=(Dropdown(description='Choose the dataset from the following list :', index=2, options=('…

In [7]:
button = widgets.Button(description="Click here to launch ML")
output = widgets.Output()

display(button, output)

def launch_ML(b):
    """Button callback: train and evaluate with the settings currently shown in the widgets.

    The widget values are read here, at click time. Reading them once when the
    cell runs (as before) froze the settings, so later changes in the UI were
    silently ignored.
    """
    with output:
        try:
            epochs = int(epochs_number.value)
        except ValueError:
            # The epochs field is free text; report bad input instead of raising.
            print("Epochs must be a whole number, got {!r}".format(epochs_number.value))
            return

        plot_ml(
            dataset_selection_ml.value,
            n_lags_slider.value,
            model_drpdown.value,
            print_summary_checkbox.value,
            loss_function_drpdown.value,
            epochs,
        )

button.on_click(launch_ML)

Button(description='Click here to launch ML', style=ButtonStyle())

Output()