In [1]:
import pandas as pd
from typing import Tuple
from sklearn.preprocessing import StandardScaler
import io
import sys
import sympy as sp
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from typing import Tuple
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
)
# import boilerplate.util as util
class rf_model_params:
    '''
    Plain container for the settings used to split the data, build the
    Random Forest model and configure its cross-validation.
    '''

    def __init__(self, 
                 test_size: float,
                 random_state: int = None,
                 n_estimators: int = 100,
                 max_features: str = 'sqrt',
                 max_depth: int = None,
                 min_samples_leaf: int = 1,
                 min_samples_split: int = 2,
                 bootstrap: bool = True,
                 verbose: int = 0,
                 n_jobs: int = 2,
                 n_splits: int = 5,
                 shuffle: bool = False):
        '''
        Initialize the parameters for the Random Forest model.

        Parameters:
        - test_size (float): Fraction of the dataset to be used as the test split.
        - random_state (int): Seed for the random number generator (None by default).
        - n_estimators (int): Number of trees in the forest.
        - max_features (str): Number of features considered when looking for the best split.
        - max_depth (int): Maximum depth of a tree. If None, nodes are expanded until all leaves are pure.
        - min_samples_leaf (int): Minimum number of samples that must be present in a leaf node.
        - min_samples_split (int): Minimum number of samples required to split a node.
        - bootstrap (bool): If True, samples are drawn with replacement.
        - verbose (int): Verbosity control. The value 0 means no output.
        - n_jobs (int): Number of jobs to run in parallel (-1 means use all processors).
        - n_splits (int): Number of splits for cross-validation.
        - shuffle (bool): If True, shuffle the data before splitting it.
        '''
        self.test_size = test_size
        self.random_state = random_state
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.bootstrap = bootstrap
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.n_splits = n_splits
        self.shuffle = shuffle
    
    def __str__(self):
        '''Return a constructor-like, single-line summary of every stored parameter.'''
        # `!r` quotes max_features only when it really is a string, so values
        # such as None or 3 are no longer shown as the strings 'None' / '3'.
        return (f"rf_model_params("
                f"test_size={self.test_size}, "
                f"random_state={self.random_state}, "
                f"n_estimators={self.n_estimators}, "
                f"max_features={self.max_features!r}, "
                f"max_depth={self.max_depth}, "
                f"min_samples_leaf={self.min_samples_leaf}, "
                f"min_samples_split={self.min_samples_split}, "
                f"bootstrap={self.bootstrap}, "
                f"verbose={self.verbose}, "
                f"n_jobs={self.n_jobs}, "
                f"n_splits={self.n_splits}, "
                f"shuffle={self.shuffle})")

    # Show the same summary when the object is the last expression of a cell.
    __repr__ = __str__

In [2]:
def nmae(y_true, y_pred):
    '''
    Normalized mean absolute error.

    Returns the mean absolute error between `y_true` and `y_pred`
    divided by the mean of `y_true`.
    '''
    return mean_absolute_error(y_true, y_pred) / np.mean(y_true)

def nrmse(y_true, y_pred):
    '''
    Normalized root mean squared error.

    Parameters:
    - y_true: Array-like of observed values.
    - y_pred: Array-like of predicted values, same shape as `y_true`.

    Returns:
    - The root mean squared error between `y_true` and `y_pred` divided by
      the mean of `y_true`.

    Raises:
    - ValueError: If the two inputs do not have the same shape.
    '''
    # The RMSE is computed with numpy: `mean_squared_error` is not among the
    # names imported by this notebook, so calling it raised a NameError.
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)
    if true_values.shape != predicted_values.shape:
        # Fail loudly instead of letting numpy broadcast mismatched inputs.
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_values.shape} and {predicted_values.shape}"
        )

    rmse = np.sqrt(np.mean((true_values - predicted_values) ** 2))
    return rmse / np.mean(true_values)

def get_datasets(dataset_name="sinusoid") -> Tuple[pd.DataFrame, pd.DataFrame]:
    '''
    Load the two CSV files that belong to one capture.

    Parameters:
    - dataset_name: "mix" or "flashcrowd" select those captures; any other
      value (including the default "sinusoid") selects the sinusoid capture.

    Returns:
    - (data_log, data_dash): the frame read from `log_INT_<capture>.txt`, with
      spaces stripped from its column names, followed by the frame read from
      `dash_<capture>.log`. Both paths are relative to `assets/data/`.
    '''
    capture_by_name = {"mix": "mix_5h", "flashcrowd": "flashcrowd_6h"}
    # Unknown names silently fall back to the sinusoid capture.
    source_data = capture_by_name.get(dataset_name, "sinusoid_8h")

    log_frame = pd.read_csv(f"assets/data/log_INT_{source_data}.txt", delimiter=",")
    # Only the log header is cleaned up: every space in a column name is removed.
    log_frame.columns = log_frame.columns.str.replace(" ", "")

    dash_frame = pd.read_csv(f"assets/data/dash_{source_data}.log", sep=",")

    return log_frame, dash_frame

def remove_useless_attribute(dataset):
    '''
    Drop every column that holds exactly one distinct value.

    The frame is modified in place and the same object is returned.
    '''
    single_valued = dataset.nunique() == 1
    constant_columns = dataset.columns[single_valued]
    dataset.drop(columns=constant_columns, inplace=True)
    return dataset

def remove_outlier_IQR(df):
    '''
    Filter out values lying more than 1.5 * IQR outside the quartiles.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, computed from the
    0.25 and 0.75 quantiles of `df`. The result is `df` indexed with the
    negated outlier mask.
    '''
    first_quartile = df.quantile(0.25)
    third_quartile = df.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    is_outlier = (df < lower_fence) | (df > upper_fence)
    return df[~is_outlier]

def change_NaN_to_mean(dataset):
    '''
    Replace missing values with the mean of their column.

    Parameters:
    - dataset: A DataFrame (or a Series) possibly containing NaN values.

    Returns:
    - A new object with the NaN values filled; the input is not modified.
      For a DataFrame only numeric columns are imputed, other columns are
      returned unchanged.
    '''
    if isinstance(dataset, pd.DataFrame):
        # Without numeric_only=True, recent pandas versions raise a TypeError
        # as soon as the frame contains a non-numeric (e.g. string) column.
        fill_values = dataset.mean(numeric_only=True)
    else:
        fill_values = dataset.mean()
    return dataset.fillna(fill_values)

def merge_dataset(data_log, data_dash):
    '''
    Join the log and DASH frames on `timestamp` and build the model inputs.

    Parameters:
    - data_log: Frame whose columns (except the first one) become the features.
    - data_dash: Frame providing the `framesDisplayedCalc` label column.

    Returns:
    - (features, labels): the feature matrix after `normalization()` and the
      label values, both as arrays aligned row by row.
    '''
    # Left join keeps every log row; rows left with any missing value after
    # the join are then discarded. The key is given once: listing
    # 'timestamp' twice only made pandas join on the same column twice.
    merged = data_log.merge(data_dash, on='timestamp', how='left').dropna()

    # Positional slice: columns 1 .. len(data_log.columns) - 1 of the merged
    # frame, i.e. the data_log columns without the first one.
    # NOTE(review): this presumes `timestamp` is the first column of data_log - confirm.
    features = merged.iloc[:, 1:len(data_log.columns)].values

    labels = merged['framesDisplayedCalc'].values

    # NOTE(review): the scaling is fitted on all rows here, i.e. before any
    # train/test split performed by the caller.
    return normalization(features), labels

def normalization(X):
    '''
    Standardize `X` with a StandardScaler fitted on `X` itself.

    Returns the transformed data produced by the scaler.
    '''
    return StandardScaler().fit(X).transform(X)

In [3]:
def default_random_forest_model(
    features: pd.DataFrame, labels: pd.Series, model_params
):
    '''
    Train a RandomForestRegressor and evaluate it on a held-out split.

    Parameters:
    - features: Feature matrix passed to `train_test_split`.
    - labels: Target values aligned with `features`.
    - model_params: Object exposing the attributes read below (test_size,
      random_state, shuffle, n_estimators, max_depth, min_samples_split,
      min_samples_leaf, bootstrap, verbose, max_features, n_jobs, n_splits),
      e.g. an `rf_model_params` instance.

    Returns:
    - (mae_rf, nmae_rf, rf_model): the MAE and the NMAE measured on the
      held-out split, and the model fitted on the training split.
    '''
    X_train, X_validation, y_train, y_validation = train_test_split(
        features,
        labels,
        test_size=model_params.test_size,
        random_state=model_params.random_state,
        shuffle=model_params.shuffle,
    )

    rf_model = RandomForestRegressor(
        n_estimators=model_params.n_estimators,
        max_depth=model_params.max_depth,
        min_samples_split=model_params.min_samples_split,
        min_samples_leaf=model_params.min_samples_leaf,
        bootstrap=model_params.bootstrap,
        verbose=model_params.verbose,
        max_features=model_params.max_features,
        n_jobs=model_params.n_jobs,
        random_state=model_params.random_state,
    )

    # KFold rejects a random_state when shuffling is disabled, so the seed is
    # only forwarded when the folds are actually shuffled.
    kf = KFold(
        n_splits=model_params.n_splits,
        shuffle=model_params.shuffle,
        random_state=model_params.random_state if model_params.shuffle else None,
    )

    nmae_scorer = make_scorer(nmae, greater_is_better=False)
    cross_val_scores = cross_val_score(
        rf_model, X_train, y_train, cv=kf, scoring=nmae_scorer
    )

    # greater_is_better=False makes the scorer return negated NMAE values;
    # flip the sign back before reporting. The value was previously computed
    # and thrown away, so it is surfaced here when verbosity is enabled.
    avg_cross_val_nmae = -np.mean(cross_val_scores)
    if model_params.verbose:
        print(
            f"Cross-validated NMAE over {model_params.n_splits} folds: "
            f"{avg_cross_val_nmae:.4f}"
        )

    rf_model.fit(X_train, y_train)

    predictions = rf_model.predict(X_validation)
    mae_rf = mean_absolute_error(y_validation, predictions)
    nmae_rf = nmae(y_validation, predictions)

    return mae_rf, nmae_rf, rf_model

In [4]:
# NOTE(review): get_datasets() returns (data_log, data_dash), so the two names
# below are swapped relative to their contents: `sinusoid_dash` holds the log
# frame and `sinusoid_log` holds the DASH frame. merge_dataset() still receives
# them in the (log, dash) order it expects. The names are kept as they are in
# case later cells rely on them.
sinusoid_dash, sinusoid_log = (pd.DataFrame(frame) for frame in get_datasets())

# Clean the first frame: drop single-valued columns, filter IQR outliers,
# then fill the remaining gaps with the column means.
sinusoid_dash = change_NaN_to_mean(
    remove_outlier_IQR(remove_useless_attribute(sinusoid_dash))
)

features, labels = merge_dataset(sinusoid_dash, sinusoid_log)

In [None]:
# Settings for this run, listed in the same order as the parameters of
# rf_model_params.__init__.
best_params = rf_model_params(
    test_size=0.2,
    random_state=42,
    n_estimators=90,
    max_features='sqrt',
    max_depth=44,
    min_samples_leaf=1,
    min_samples_split=5,
    bootstrap=True,
    verbose=2,
    n_jobs=-1,
    n_splits=5,
    shuffle=False,
)

# Prints the (MAE, NMAE, fitted model) tuple returned by the training helper.
print(default_random_forest_model(features, labels, best_params))
