In [1]:
from _params_models import rf_model_params
from _ml_models import *
from _plots import *
from _util import *

import io
import sys
import time
import pickle
import sympy as sp
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from typing import Tuple
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

In [2]:
def get_datasets(dataset_name="sinusoid_8h"):
    """Load the INT log and the DASH log that belong to one dataset.

    Args:
        dataset_name: File-name stem of the dataset, or one of the short
            aliases ``"mix"``, ``"flashcrowd"``, ``"sinusoid"``. Any other
            value is used as the stem unchanged.

    Returns:
        Tuple ``(data_log, data_dash)`` of DataFrames read from
        ``assets/data/log_INT_<stem>.txt`` and ``assets/data/dash_<stem>.log``.
        Spaces are stripped from the column names of ``data_log`` only.
    """
    # Short alias -> file-name stem of the corresponding capture.
    stem_by_alias = {
        "mix": "mix_5h",
        "flashcrowd": "flashcrowd_6h",
        "sinusoid": "sinusoid_8h",
    }
    source_data = stem_by_alias.get(dataset_name, dataset_name)

    data_log = pd.read_csv(f"assets/data/log_INT_{source_data}.txt", delimiter=",")
    data_dash = pd.read_csv(f"assets/data/dash_{source_data}.log", sep=",")

    # Remove every space character from the log's header names.
    data_log.columns = data_log.columns.str.replace(" ", "")

    return data_log, data_dash

def remove_useless_attribute(dataset):
    """Drop every column that holds exactly one distinct value.

    The frame is modified in place and the same object is returned.
    Distinct values are counted with ``DataFrame.nunique()`` defaults,
    so missing values are not counted.
    """
    distinct_counts = dataset.nunique()
    constant_columns = dataset.columns[distinct_counts == 1]
    dataset.drop(columns=constant_columns, inplace=True)
    return dataset

def remove_outlier_IQR(df):
    """Filter out values that lie outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed by
    ``df.quantile``. The result is ``df`` indexed by the boolean
    "not an outlier" mask; for a DataFrame that keeps the shape and turns
    the outlying cells into NaN instead of dropping rows.
    """
    first_quartile = df.quantile(0.25)
    third_quartile = df.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    too_low = df < lower_fence
    too_high = df > upper_fence
    keep = ~(too_low | too_high)
    return df[keep]

def change_NaN_to_mean(dataset):
    """Return a copy of ``dataset`` with each NaN replaced by its column mean.

    The means come from ``dataset.mean()``; the input object itself is
    not modified.
    """
    column_means = dataset.mean()
    return dataset.fillna(column_means)


def normalization(X):
    """Standardise ``X`` with a ``StandardScaler`` fitted on ``X`` itself.

    A fresh scaler is created on every call, so the fitted statistics are
    not kept for later use.
    """
    return StandardScaler().fit_transform(X)

def merge_datasert(data_log, data_dash):
    """Join the INT log with the DASH log and build model inputs.

    Pipeline:
        1. Constant columns are dropped from ``data_log``.
        2. ``data_dash['timestamp']`` is cut to its first 10 characters and
           cast to int.
        3. DASH rows are left-joined onto the log rows by ``timestamp``.
        4. Values outside the 1.5 * IQR fences are masked and every NaN
           (masked outliers and unmatched join rows alike) is replaced by
           its column mean.
        5. The feature columns are standardised.

    Side effects:
        Both arguments are modified in place: ``data_log`` loses its
        constant columns and ``data_dash['timestamp']`` is overwritten.

    Args:
        data_log: INT log frame; must contain a ``timestamp`` column.
        data_dash: DASH log frame; must contain ``timestamp`` and
            ``framesDisplayedCalc`` columns.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays. ``features`` holds the
        standardised merged columns at positions ``1 .. len(data_log.columns) - 1``;
        ``labels`` holds the ``framesDisplayedCalc`` column.
    """
    data_log = remove_useless_attribute(data_log)

    # Keep only the first 10 characters of each DASH timestamp.
    # NOTE(review): presumably this reduces millisecond epochs to seconds so
    # they line up with the log's timestamps — confirm against the raw data.
    data_dash['timestamp'] = data_dash['timestamp'].astype(str).str[:10].astype(int)

    # Join on the single shared key (the key used to be listed twice).
    merged = data_log.merge(data_dash, on='timestamp', how='left')

    merged = remove_outlier_IQR(merged)
    merged = change_NaN_to_mean(merged)

    # Left-frame columns come first in the merge result; position 0 is
    # skipped — presumably the join key, verify against the log's layout.
    n_log_columns = len(data_log.columns)
    features = merged.iloc[:, 1:n_log_columns].values
    labels = merged['framesDisplayedCalc'].values

    features = normalization(features)

    return features, labels

def visualize_results(feature_importances, feature_names):
    """Plot feature importances as a bar chart, largest value first.

    Args:
        feature_importances: Importance value of each feature.
        feature_names: Name of each feature, in the same order as
            ``feature_importances``.

    Importance is drawn on the x axis and the feature names on the y axis;
    the figure is shown with ``plt.show()`` and nothing is returned.
    """
    ranking = pd.DataFrame({
        'Features': feature_names,
        'Importância': feature_importances,
    })
    ranking = ranking.sort_values(by='Importância', ascending=False)

    _, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x='Importância', y='Features', data=ranking, ax=ax)
    ax.set_title('Importância das Features')
    ax.set_xlabel('Importância')
    ax.set_ylabel('Features')
    plt.show()

In [5]:
# Load the INT log and the DASH log of each named dataset.
data_log_sinusoid, data_dash_sinusoid = get_datasets(dataset_name='sinusoid')
data_log_flashcrowd, data_dash_flashcrowd = get_datasets(dataset_name='flashcrowd')
data_log_mix, data_dash_mix = get_datasets(dataset_name='mix')

# Stack the three datasets row-wise. The row labels of each frame are kept
# as they are (no ignore_index), so labels may repeat in the result.
data_log = pd.concat(
    [data_log_sinusoid, data_log_flashcrowd, data_log_mix]
)
data_dash = pd.concat(
    [data_dash_sinusoid, data_dash_flashcrowd, data_dash_mix]
)

In [7]:
# Preview the first five rows of the sinusoid INT log.
data_log_sinusoid.head(n=5)

Unnamed: 0,timestamp,switchID_t3,ingress_port3,egress_port3,egress_spec3,ingress_global_timestamp3,egress_global_timestamp3,enq_timestamp3,enq_qdepth3,deq_timedelta3,...,switchID_t1,ingress_port1,egress_port1,egress_spec1,ingress_global_timestamp1,egress_global_timestamp1,enq_timestamp1,enq_qdepth1,deq_timedelta1,deq_qdepth1
0,1621899282,3,0,2,0,39094865473,39094865660,440159961,0,39,...,1,0,1,0,39245046271,39245046453,590340761,0,30,0
1,1621899282,3,0,2,0,39094947171,39094947281,440241591,0,29,...,1,0,1,0,39245127761,39245127953,590422260,0,32,0
2,1621899282,3,0,2,0,39095064755,39095064937,440359239,0,36,...,1,0,1,0,39245245663,39245245851,590540158,0,31,0
3,1621899282,3,0,2,0,39095165462,39095165651,440459953,0,37,...,1,0,1,0,39245346337,39245346527,590640834,0,31,0
4,1621899282,3,0,2,0,39095283945,39095284124,440578433,0,29,...,1,0,1,0,39245464819,39245465008,590759307,0,38,0
