## Imports


In [1]:
from _params_models import rf_model_params
from _ml_models import *
from _plots import *
from _util import *

import io
import sys
import time
import pickle
import sympy as sp
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from typing import Tuple
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

## Utility
Helper functions that make the code easier to use and more modular.

In [2]:
def get_datasets(dataset_name="sinusoid_8h"):
    """Load the log and dash CSV files for one dataset.

    Args:
        dataset_name: Either a full file suffix (e.g. ``"sinusoid_8h"``) or one
            of the short aliases ``"mix"``, ``"flashcrowd"``, ``"sinusoid"``.
            Any other value is used as the file suffix unchanged.

    Returns:
        Tuple ``(data_log, data_dash)`` of DataFrames read from
        ``assets/data/log_INT_<suffix>.txt`` and ``assets/data/dash_<suffix>.log``.
        Spaces are stripped from the column names of ``data_log`` only.
    """
    # Short aliases and the file suffix each one stands for.
    aliases = {
        "mix": "mix_5h",
        "flashcrowd": "flashcrowd_6h",
        "sinusoid": "sinusoid_8h",
    }
    source_data = aliases.get(dataset_name, dataset_name)

    log_frame = pd.read_csv(f"assets/data/log_INT_{source_data}.txt", delimiter=",")
    log_frame.columns = log_frame.columns.str.replace(" ", "")

    dash_frame = pd.read_csv(f"assets/data/dash_{source_data}.log", sep=",")
    return log_frame, dash_frame

def remove_useless_attribute(dataset):
    """Drop every column that holds exactly one distinct value.

    The drop happens in place: the DataFrame passed in is modified and the
    same object is returned.
    """
    constant_columns = [
        name for name, distinct in dataset.nunique().items() if distinct == 1
    ]
    dataset.drop(columns=constant_columns, inplace=True)
    return dataset

def remove_outlier_IQR(df):
    """Mask values lying more than 1.5 * IQR outside the quartiles.

    Quartiles are taken from ``df.quantile``. The boolean mask is applied with
    ``df[...]``, so for a DataFrame the flagged cells become NaN and the shape
    is preserved.
    """
    lower_quartile = df.quantile(0.25)
    upper_quartile = df.quantile(0.75)
    spread = upper_quartile - lower_quartile

    too_low = df < (lower_quartile - 1.5 * spread)
    too_high = df > (upper_quartile + 1.5 * spread)
    is_outlier = too_low | too_high

    return df[~is_outlier]

def change_NaN_to_mean(dataset):
    """Return a copy of ``dataset`` with NaNs replaced by ``dataset.mean()``.

    The input object itself is left untouched.
    """
    column_means = dataset.mean()
    return dataset.fillna(column_means)


def normalization(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``X`` itself and then discarded; only the
    transformed data is returned.
    """
    return StandardScaler().fit_transform(X)

def merge_dataset(data_log, data_dash):
    """Join the log and dash frames on ``timestamp`` and build model inputs.

    Steps, in order:
      1. Drop constant columns from ``data_log`` (done in place by
         ``remove_useless_attribute``).
      2. Truncate ``data_dash['timestamp']`` to its first 10 characters and
         cast to int. This overwrites the column on the caller's frame.
         NOTE(review): presumably this reduces the timestamps to epoch
         seconds so they match ``data_log`` -- confirm against the data.
      3. Left-join ``data_dash`` onto ``data_log`` by ``timestamp``.
      4. Mask IQR outliers and fill every NaN with the column mean. This is
         applied to the whole merged frame, label column included.
      5. Take the ``data_log`` columns after the first one as features and
         ``framesDisplayedCalc`` as labels, then standardize the features.

    NOTE(review): the scaler is fitted on all rows here, i.e. before
    ``default_random_forest_model`` makes its train/validation split.

    Args:
        data_log: DataFrame with a ``timestamp`` column; modified in place.
        data_dash: DataFrame with ``timestamp`` and ``framesDisplayedCalc``
            columns; its ``timestamp`` column is modified in place.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays.
    """
    data_log = remove_useless_attribute(data_log)

    data_dash['timestamp'] = data_dash['timestamp'].astype(str).str[:10].astype(int)
    # A single join key is enough; the original listed 'timestamp' twice.
    total = data_log.merge(data_dash, on='timestamp', how='left')

    total = remove_outlier_IQR(total)
    total = change_NaN_to_mean(total)

    # Column 0 is skipped; the slice ends where the data_log columns end.
    features = total.iloc[:, 1:len(data_log.columns)].values
    labels = total['framesDisplayedCalc'].values

    features = normalization(features)

    return features, labels


# Backward-compatible alias: the function was originally defined under this
# misspelled name, while the rest of the notebook calls ``merge_dataset``.
merge_datasert = merge_dataset

def visualize_results(feature_importances, feature_names):
    """Show a horizontal bar chart of feature importances, largest first.

    Args:
        feature_importances: Importance value for each feature.
        feature_names: Feature names, aligned with ``feature_importances``.
    """
    importance_col = 'Importância'

    ranking = pd.DataFrame({
        'Features': feature_names,
        importance_col: feature_importances
    })
    ranking = ranking.sort_values(by=importance_col, ascending=False)

    plt.figure(figsize=(10, 5))
    sns.barplot(data=ranking, x=importance_col, y='Features')
    plt.title('Importância das Features')
    plt.xlabel(importance_col)
    plt.ylabel('Features')
    plt.show()

## Models for training

In [3]:
def nmae(y_true, y_pred):
    """Return the mean absolute error divided by the mean of ``y_true``.

    The result is a fraction; multiply by 100 to express it as a percentage.
    """
    return mean_absolute_error(y_true, y_pred) / np.mean(y_true)

def default_random_forest_model(
    features: pd.DataFrame, labels: pd.Series, model_params: rf_model_params
):
    """Train a ``RandomForestRegressor`` and score it on a hold-out split.

    The data is split once with ``train_test_split``. The training part is
    cross-validated with ``KFold`` using NMAE as the metric, then the model is
    fitted on the whole training part and evaluated on the validation part.

    Args:
        features: Feature matrix (the notebook passes NumPy arrays).
        labels: Target values aligned with ``features``.
        model_params: Settings object providing ``test_size``,
            ``random_state``, ``shuffle``, ``n_splits``, ``verbose`` and the
            Random Forest hyper-parameters read below.

    Returns:
        Tuple ``(mae_rf, nmae_rf, rf_model)``: validation MAE, validation NMAE
        (as a fraction) and the fitted model.
    """
    X_train, X_validation, y_train, y_validation = train_test_split(
        features,
        labels,
        test_size=model_params.test_size,
        random_state=model_params.random_state,
        shuffle=model_params.shuffle
    )

    rf_model = RandomForestRegressor(
        n_estimators=model_params.n_estimators,
        max_depth=model_params.max_depth,
        min_samples_split=model_params.min_samples_split,
        min_samples_leaf=model_params.min_samples_leaf,
        bootstrap=model_params.bootstrap,
        verbose=model_params.verbose,
        max_features=model_params.max_features,
        n_jobs=model_params.n_jobs,
        random_state=model_params.random_state,
    )

    # greater_is_better=False makes the scorer return the negated NMAE.
    nmae_scorer = make_scorer(nmae, greater_is_better=False)

    kf = KFold(
        n_splits=model_params.n_splits,
        shuffle=model_params.shuffle,
        # Seed the folds so shuffled runs are reproducible. KFold rejects a
        # random_state when shuffle is False, hence the conditional.
        random_state=model_params.random_state if model_params.shuffle else None,
    )

    cross_val_scores = cross_val_score(
        rf_model, X_train, y_train, cv=kf, scoring=nmae_scorer
    )

    # The cross-validation result is not part of the return value; report it
    # only when verbose output was requested. Scores are negated NMAE values.
    if model_params.verbose:
        print(f"NMAE médio na validação cruzada: {-np.mean(cross_val_scores):.4f}")

    rf_model.fit(X_train, y_train)

    predictions = rf_model.predict(X_validation)
    mae_rf = mean_absolute_error(y_validation, predictions)
    nmae_rf = nmae(y_validation, predictions)

    return mae_rf, nmae_rf, rf_model

In [4]:
# Load the log/dash frame pair for each traffic scenario.
data_log_sinusoid, data_dash_sinusoid = get_datasets('sinusoid')
data_log_flashcrowd, data_dash_flashcrowd = get_datasets('flashcrowd')
data_log_mix, data_dash_mix = get_datasets('mix')
# Stack the three scenarios row-wise into combined frames. The original row
# indices are kept (no ignore_index), so index labels repeat across scenarios.
data_log = pd.concat([data_log_sinusoid, data_log_flashcrowd, data_log_mix])
data_dash = pd.concat([data_dash_sinusoid, data_dash_flashcrowd, data_dash_mix])

In [None]:
# Inspect the combined log frame once its constant columns are removed.
# remove_useless_attribute drops in place and returns its argument, so
# `data_log` itself is modified and `aux` refers to the same object.
aux = remove_useless_attribute(data_log)
aux 

In [5]:
# Build the (features, labels) arrays for each scenario and for the combined data.
features_sinusoid, labels_sinusoid = merge_dataset(data_log_sinusoid, data_dash_sinusoid)
features_flashcrowd, labels_flashcrowd = merge_dataset(data_log_flashcrowd, data_dash_flashcrowd)
features_mix, labels_mix = merge_dataset(data_log_mix, data_dash_mix)
features_total, labels_total = merge_dataset(data_log, data_dash)

In [None]:
pd.DataFrame(features_sinusoid)

In [None]:
def evaluate_and_print_model(features, labels, best_params, label):
    """Train via ``default_random_forest_model``, time it and print a summary.

    Args:
        features: Feature matrix forwarded to the training function.
        labels: Target values forwarded to the training function.
        best_params: Parameter object forwarded to the training function.
        label: Name shown at the start of the printed line.

    Returns:
        Tuple ``(mae, nmae, model)`` exactly as returned by
        ``default_random_forest_model``.
    """
    started_at = time.time()
    mae_value, nmae_value, fitted_model = default_random_forest_model(
        features, labels, best_params
    )
    elapsed_time = time.time() - started_at

    # Split the elapsed wall-clock time into whole minutes and whole seconds.
    minutes, seconds = (int(part) for part in divmod(elapsed_time, 60))

    print(f"{label} -> MAE = {mae_value}, NMAE = {nmae_value * 100:.2f}%, Tempo de treino: {minutes} minutos e {seconds} segundos")
    return mae_value, nmae_value, fitted_model

# Random Forest settings shared by every training run below.
# NOTE(review): the name suggests these came from a hyper-parameter search;
# that search is not part of this section.
best_params = rf_model_params(
    n_estimators=90,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=44,
    bootstrap=True,
    n_splits=5,
    random_state=42,
    shuffle=False,
    test_size=0.2,
    verbose = 0,
    n_jobs=2
)

# One model per scenario, plus one on the combined data.
mae_sinusoid, nmae_sinusoid, model_sinusoid = evaluate_and_print_model(features_sinusoid, labels_sinusoid, best_params, "Sinusoid")
mae_flashcrowd, nmae_flashcrowd, model_flashcrowd = evaluate_and_print_model(features_flashcrowd, labels_flashcrowd, best_params, "Flashcrowd")
mae_mix, nmae_mix, model_mix = evaluate_and_print_model(features_mix, labels_mix, best_params, "Mix")
# The combined arrays are named features_total / labels_total; bare
# `features` / `labels` are not defined at this point in the notebook.
mae_total, nmae_total, model_total = evaluate_and_print_model(features_total, labels_total, best_params, "Total")

In [13]:
def plot_predictions_comparison(start_graph, end_graph, labels_list, predictions_list, model_name):
    """Plot labels against predictions for the four datasets, one panel each.

    Args:
        start_graph: First sample index to plot (inclusive).
        end_graph: Last sample index to plot (exclusive). Series shorter than
            this are plotted up to their own end.
        labels_list: Four label sequences, ordered Sinusoid, Flashcrowd, Mix,
            Total.
        predictions_list: Four prediction sequences in the same order.
        model_name: Name of the model that produced the predictions; shown as
            the figure title.

    Raises:
        IndexError: If either list holds fewer than four sequences.
    """
    # (dataset name, labels colour, predictions colour), in list order.
    panels = [
        ('Sinusoid', 'blue', 'orange'),
        ('Flashcrowd', 'green', 'red'),
        ('Mix', 'purple', 'green'),
        ('Total', 'blue', 'yellow'),
    ]

    fig, axes = plt.subplots(len(panels), 1, figsize=(16, 16), constrained_layout=True)
    fig.suptitle(f'Modelo treinado com {model_name}')

    for index, (ax, (name, label_color, prediction_color)) in enumerate(zip(axes, panels)):
        label_window = labels_list[index][start_graph:end_graph]
        prediction_window = predictions_list[index][start_graph:end_graph]

        # Size each x-axis from the slice it accompanies. A single shared
        # np.arange(start_graph, end_graph) does not match any series that is
        # shorter than end_graph, and matplotlib then refuses to plot.
        label_x = np.arange(start_graph, start_graph + len(label_window))
        prediction_x = np.arange(start_graph, start_graph + len(prediction_window))

        ax.plot(label_x, label_window, label=f'{name} Labels', color=label_color)
        ax.plot(prediction_x, prediction_window, label=f'{name} Predictions', color=prediction_color)
        ax.set_title(f'{name}: Labels vs Predictions')
        ax.set_xlabel('Índice')
        ax.set_ylabel('Valor')
        ax.grid(True)
        ax.legend()

    plt.show()

In [None]:
# Evaluate every trained model on every dataset.
datasets = ['Sinusoid', 'Flashcrowd', 'Mix', 'Total']
model_names = ['Sinusoid', 'Flashcrowd', 'Mix', 'Total']
models = [model_sinusoid, model_flashcrowd, model_mix, model_total]

features_list = [features_sinusoid, features_flashcrowd, features_mix, features_total]
labels_list = [labels_sinusoid, labels_flashcrowd, labels_mix, labels_total]

for model, model_name in zip(models, model_names):
    # NOTE(review): each model is scored on the full arrays, which include the
    # rows of its own training split.
    predictions = [model.predict(features) for features in features_list]
    nmae_values = [nmae(labels, pred) for labels, pred in zip(labels_list, predictions)]

    plt.figure(figsize=(8, 5))
    bars = plt.bar(datasets, nmae_values, color=['skyblue', 'lightgreen', 'salmon', 'yellow'])
    plt.title(f'NMAE para cada Dataset usando o modelo treinado com {model_name}')
    plt.xlabel('Dataset')
    plt.ylabel('NMAE')
    plt.ylim(0, max(nmae_values) * 1.2)

    for bar, value in zip(bars, nmae_values):
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height(),
            # nmae() returns a fraction; scale by 100 before adding '%'.
            f'{value * 100:.2f}%',
            ha='center', va='bottom'
        )
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.show()

    plot_predictions_comparison(
        start_graph=100,
        end_graph=20200,
        labels_list=labels_list,
        predictions_list=predictions,
        model_name=model_name
    )