In [1]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm

import os
import glob

In [2]:
folder_path = "../../Data/"
# glob yields paths in arbitrary (filesystem-dependent) order; sorting keeps the
# concatenated row order stable so the seeded sampling/splitting that follows
# selects the same rows on every machine.
file_list = sorted(glob.glob(os.path.join(folder_path, "*.parquet")))

# Fail with an explicit message instead of pd.concat's "No objects to concatenate".
if not file_list:
    raise FileNotFoundError(f"No .parquet files found in {folder_path!r}")

# Read every parquet file and stack them into one frame with a fresh 0..n-1 index.
frames = [pd.read_parquet(file) for file in file_list]
df = pd.concat(frames, ignore_index=True)

In [3]:
# Keep a 200,000-row sample, stratified on the target column; the remaining
# rows returned by the split are discarded.
df, _ = train_test_split(
    df,
    train_size=200_000,
    stratify=df['time_to_stop_activity'],
    random_state=11,
)

In [4]:
# Remove the columns that are not used as model inputs. The twelve
# betweenness_centrality_<window>_<stat> names are generated instead of being
# listed one by one. (`axis` is omitted: pandas ignores it when `columns=` is given.)
df = df.drop(
    columns=[
        "user",
        'project',
        'current_month',
        'turnover_num',
        'turnover',
        *(
            f'betweenness_centrality_{window}_{stat}'
            for window in (12, 3, 6, 9)
            for stat in ('intercept', 'slope', 'std_dev')
        ),
    ]
)

In [5]:
# Work in float64 throughout.
df = df.astype(np.float64)

# Step 1: turn +inf into NaN, then fill every NaN in a column (the former +inf
# cells and any NaN that was already there) with that column's largest
# remaining value.
df = df.replace([np.inf], np.nan)
df = df.fillna(df.max(skipna=True))

# Step 2: turn -inf into NaN, then fill those cells with the column's smallest
# remaining value.
df = df.replace([-np.inf], np.nan)
df = df.fillna(df.min(skipna=True))

In [6]:
# Fill any NaN still present with its column's maximum (NaNs are skipped when
# computing the maximum).
# NOTE(review): after the fills in the previous cell this looks like it has
# nothing left to fill unless a column is entirely NaN (whose max is NaN as
# well) — confirm whether this cell is still needed.
df = df.fillna(df.max(skipna=True))

In [7]:
def train_evaluate_adaboost(n_estimators, learning_rate, max_depth, X_train, Y_train, X_test, Y_test):
    """Fit one AdaBoost configuration and score it on the test split.

    An ``AdaBoostRegressor`` (``random_state=11``) is built on top of a
    ``DecisionTreeRegressor`` limited to ``max_depth``, fitted on
    ``X_train``/``Y_train`` and evaluated on ``X_test``/``Y_test``.

    Any exception raised while building, fitting or scoring is caught so that
    one failing configuration does not abort a whole benchmark run. Because
    this function is typically executed in a worker process, where printed
    output may not reach the notebook, the failure reason is also stored in
    the returned record.

    Parameters
    ----------
    n_estimators : passed to ``AdaBoostRegressor(n_estimators=...)``.
    learning_rate : passed to ``AdaBoostRegressor(learning_rate=...)``.
    max_depth : passed to ``DecisionTreeRegressor(max_depth=...)``.
    X_train, Y_train : data handed to ``fit``.
    X_test, Y_test : data used for ``predict`` and for the metrics.

    Returns
    -------
    dict
        ``n_estimators``, ``learning_rate``, ``max_depth`` (echoed inputs),
        ``mean_squared_error``, ``mean_absolute_error``, ``r2_score``
        (``None`` for any metric not reached because of a failure), and
        ``error`` (``None`` on success, otherwise ``"<ExceptionType>: <message>"``).
    """
    tag = f'AB - Estimators: {n_estimators}, Learning Rate: {learning_rate}, Max Depth: {max_depth}'
    record = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'mean_squared_error': None,
        'mean_absolute_error': None,
        'r2_score': None,
        'error': None,
    }

    try:
        base_estimator = DecisionTreeRegressor(max_depth=max_depth)
        adaboost = AdaBoostRegressor(
            estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=11
        )

        adaboost.fit(X_train, Y_train)
        y_pred = adaboost.predict(X_test)

        # Metrics are stored one at a time so that anything computed before a
        # failure is still reported.
        record['mean_squared_error'] = mean_squared_error(Y_test, y_pred)
        record['mean_absolute_error'] = mean_absolute_error(Y_test, y_pred)
        record['r2_score'] = r2_score(Y_test, y_pred)

        print(
            f"{tag} Finalized - {record['mean_squared_error']}, "
            f"{record['mean_absolute_error']}, {record['r2_score']}"
        )

    except Exception as e:
        # Best-effort by design: keep the run going, but persist the reason so
        # rows with empty metrics in the benchmark table are explained.
        record['error'] = f'{type(e).__name__}: {e}'
        print(f'{tag} Error: {e}')

    return record

In [8]:
# Full grid of (n_estimators, learning_rate, max_depth) triples: 3 x 3 x 3 = 27
# configurations, with max_depth varying fastest and n_estimators slowest.
param_combinations = [
    (trees, rate, depth)
    for trees in (50, 100, 200)
    for rate in (0.01, 0.1, 1.0)
    for depth in (3, 5, 10)
]


In [9]:
# Target vector: the 'time_to_stop_activity' column as a NumPy array.
y = df['time_to_stop_activity'].to_numpy()
# Feature matrix: every other column, as a NumPy array.
x = df.drop(columns='time_to_stop_activity').to_numpy()

In [10]:
# Hold out 30% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=11)

In [11]:
# Evaluate every hyper-parameter triple through joblib (n_jobs=-1).
# Each `combo` is an (n_estimators, learning_rate, max_depth) tuple, unpacked
# positionally into train_evaluate_adaboost; results come back in grid order.
benchmark = Parallel(n_jobs=-1)(
    delayed(train_evaluate_adaboost)(*combo, X_train, Y_train, X_test, Y_test)
    for combo in param_combinations
)

AB - Estimators: 50, Learning Rate: 1.0, Max Depth: 3 Finalized - 11.402019709207929, 2.6406969122041435, 0.08848978311682376
AB - Estimators: 100, Learning Rate: 1.0, Max Depth: 3 Finalized - 11.402019709207929, 2.6406969122041435, 0.08848978311682376
AB - Estimators: 50, Learning Rate: 1.0, Max Depth: 5 Finalized - 11.441213512745785, 2.638265343091528, 0.08535651784677545
AB - Estimators: 100, Learning Rate: 1.0, Max Depth: 5 Finalized - 11.441213512745785, 2.638265343091528, 0.08535651784677545
AB - Estimators: 50, Learning Rate: 0.01, Max Depth: 3 Finalized - 10.086779767483458, 1.9821630519665137, 0.1936338431263439
AB - Estimators: 50, Learning Rate: 0.1, Max Depth: 3 Finalized - 10.789857869265997, 2.507317193046481, 0.13742775952134245
AB - Estimators: 100, Learning Rate: 0.1, Max Depth: 3 Finalized - 10.964312309986203, 2.5634262749328123, 0.12348137027166495
AB - Estimators: 50, Learning Rate: 0.01, Max Depth: 5 Finalized - 9.635115734468156, 1.876121013669253, 0.22974116368

In [12]:
# One row per evaluated configuration; columns come from the result dict keys.
benchmark_df = pd.DataFrame(data=benchmark)

In [13]:
# Persist the benchmark table to an Excel workbook in the current working directory.
benchmark_df.to_excel(excel_writer='AdaBoostRegressorBenchmark.xlsx')