In [25]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [26]:
# Load the metrics table; the path is relative to the notebook's working directory.
metrics_path = '../../../metrics.parquet'
df = pd.read_parquet(metrics_path)

In [27]:
# Down-sample to 100,000 rows, stratified on the target so its value
# distribution is preserved; the unused remainder is bound to `_`.
stratify_labels = df['time_to_stop_activity']
df, _ = train_test_split(
    df,
    train_size=100000,
    stratify=stratify_labels,
    random_state=11,
)

In [28]:
# The betweenness-centrality columns follow a regular
# `betweenness_centrality_<window>_<stat>` pattern, so build their names
# instead of listing all twelve by hand.
betweenness_columns = [
    f'betweenness_centrality_{window}_{stat}'
    for window in (12, 3, 6, 9)
    for stat in ('intercept', 'slope', 'std_dev')
]

# Remove the columns that are not used by the rest of the notebook.
df = df.drop(
    columns=['user', 'project', 'current_month', 'turnover_num', 'turnover']
    + betweenness_columns
)

In [29]:
# Work in float64 throughout so +/-inf and NaN are representable in every column.
df = df.astype(np.float64)

# +inf -> NaN, then fill each column's NaNs with that column's largest
# remaining value. Note this also fills any NaNs that were already present.
df = df.replace(np.inf, np.nan)
df = df.fillna(df.max(skipna=True))

# -inf -> NaN, then fill each column's NaNs with that column's smallest
# remaining value.
df = df.replace(-np.inf, np.nan)
df = df.fillna(df.min(skipna=True))

In [30]:
# Fill any NaNs in each column with that column's maximum (NaNs are skipped
# when computing the maximum).
df = df.fillna(df.max(skipna=True))

In [31]:
def train_evaluate_rf(n_estimators, max_depth, min_samples_split, X_train, Y_train, X_test, Y_test, random_state=11):
    """Fit one RandomForestRegressor configuration and score it on the test split.

    Parameters
    ----------
    n_estimators, max_depth, min_samples_split
        Hyper-parameters forwarded unchanged to ``RandomForestRegressor``.
    X_train, Y_train
        Features and targets used to fit the model.
    X_test, Y_test
        Features and targets used to compute the metrics.
    random_state : int or None, default 11
        Seed forwarded to the forest so repeated runs give the same scores.
        Pass ``None`` to get an unseeded forest.

    Returns
    -------
    dict or None
        A record with the three hyper-parameters plus ``mean_squared_error``,
        ``mean_absolute_error`` and ``r2_score``; ``None`` when fitting or
        scoring raised. Failures are reported on stdout with the exception
        type and message, so a failed configuration can be diagnosed without
        aborting the rest of the grid.
    """
    run_label = (
        f'RF - Estimators: {n_estimators}, Max Depth: {max_depth}, '
        f'Min Samples Split: {min_samples_split}'
    )

    try:
        rf = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=random_state,
        )
        rf.fit(X_train, Y_train)
        y_pred = rf.predict(X_test)

        mse = mean_squared_error(Y_test, y_pred)
        mae = mean_absolute_error(Y_test, y_pred)
        r2 = r2_score(Y_test, y_pred)
    except Exception as e:
        # Best-effort by design: one bad configuration must not kill the
        # whole benchmark, but the cause is surfaced instead of swallowed.
        print(f'{run_label} Error - {type(e).__name__}: {e}')
        return None

    print(f'{run_label} Finalized - {mse}, {mae}, {r2}')

    return {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2,
    }


In [32]:
# Separate the regression target from the feature matrix (both as NumPy arrays).
target_column = 'time_to_stop_activity'
y = df[target_column].values
x = df.drop(columns=[target_column]).values

In [33]:
# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.3, random_state=11)
X_train, X_test, Y_train, Y_test = split_parts

In [34]:
# Hyper-parameter grids; their Cartesian product (27 tuples) is the benchmark.
estimator_grid = [50, 100, 200]
depth_grid = [10, 20, 30]
split_grid = [2, 5, 10]

# Tuples are ordered (n_estimators, max_depth, min_samples_split), with the
# last element varying fastest.
param_combinations = [
    (n_trees, tree_depth, split_size)
    for n_trees in estimator_grid
    for tree_depth in depth_grid
    for split_size in split_grid
]

In [35]:
# One delayed call per hyper-parameter tuple, executed across all available
# cores; results come back in the same order as `param_combinations`.
rf_jobs = [
    delayed(train_evaluate_rf)(n_trees, depth, min_split, X_train, Y_train, X_test, Y_test)
    for n_trees, depth, min_split in param_combinations
]
benchmark = Parallel(n_jobs=-1)(rf_jobs)

RF - Estimators: 50, Max Depth: 10, Min Samples Split: 2 Finalized - 5.74725060391653, 1.0900894808766532, 0.6683264290136646
RF - Estimators: 50, Max Depth: 10, Min Samples Split: 5 Finalized - 5.719923871773111, 1.090670906961916, 0.6699034534829331
RF - Estimators: 50, Max Depth: 10, Min Samples Split: 10 Finalized - 5.741516821674083, 1.0927724619412837, 0.668657325326123
RF - Estimators: 50, Max Depth: 20, Min Samples Split: 10 Finalized - 5.591471854325733, 1.0802381951497944, 0.6773164135682541
RF - Estimators: 50, Max Depth: 20, Min Samples Split: 5 Finalized - 5.61072037575584, 1.0797658809468877, 0.6762055822718809
RF - Estimators: 50, Max Depth: 20, Min Samples Split: 2 Finalized - 5.590355477975892, 1.072842441005553, 0.6773808395966344
RF - Estimators: 50, Max Depth: 30, Min Samples Split: 10 Finalized - 5.62264217672564, 1.122089041538, 0.6755175756800773
RF - Estimators: 50, Max Depth: 30, Min Samples Split: 5 Finalized - 5.653652883169954, 1.1252165594870442, 0.67372794

In [36]:
# train_evaluate_rf returns None for a configuration that failed; a list that
# mixes dicts and None cannot be turned into a records DataFrame, so keep only
# the successful runs.
benchmark_df = pd.DataFrame([run for run in benchmark if run is not None])

In [37]:
# Persist the benchmark table next to the notebook (the index is written too).
benchmark_path = 'RandomForestRegressorBenchmark.xlsx'
benchmark_df.to_excel(benchmark_path)