In [7]:
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from joblib import Parallel, delayed

import os
import glob

In [8]:
# Collect every parquet file in the data folder and stack them into one DataFrame.
folder_path = "../../Data/"

# glob returns matches in a filesystem-dependent order. Sorting keeps the row order
# stable across machines, which the seeded sampling and splitting below depend on.
file_list = sorted(glob.glob(os.path.join(folder_path, "*.parquet")))

# Fail early with a clear message instead of pandas' generic
# "No objects to concatenate" error.
if not file_list:
    raise FileNotFoundError(f"No .parquet files found in {folder_path!r}")

df = pd.concat(
    [pd.read_parquet(file) for file in file_list],
    ignore_index=True,  # discard per-file indices and build a fresh 0..n-1 index
)

In [9]:
# Keep a 50,000-row subsample, stratified on 'time_to_stop_activity'; the rest of
# the split is discarded. The fixed random_state makes the draw repeatable.
df, _ = train_test_split(
    df,
    train_size=50000,
    stratify=df['time_to_stop_activity'],
    random_state=11,
)

In [10]:
# Drop 'user', 'project', 'current_month', 'turnover_num', 'turnover' and the twelve
# betweenness_centrality_{12,3,6,9}_{intercept,slope,std_dev} columns.
df = df.drop(
    columns=[
        'user',
        'project',
        'current_month',
        'turnover_num',
        'turnover',
        *(
            f'betweenness_centrality_{number}_{suffix}'
            for number in (12, 3, 6, 9)
            for suffix in ('intercept', 'slope', 'std_dev')
        ),
    ]
)

In [11]:
# Work in float64 throughout so inf / NaN handling is uniform.
df = df.astype(np.float64)

# Step 1: turn +inf into NaN, then fill every NaN with the column's largest
# remaining value. NaNs that were already present are filled by this step too.
df = df.replace(np.inf, np.nan)
df = df.fillna(df.max())

# Step 2: turn -inf into NaN, then fill with the column's smallest remaining value.
df = df.replace(-np.inf, np.nan)
df = df.fillna(df.min())

In [12]:
# Fill any NaN still present with the maximum of its column.
# NOTE(review): the preceding cell already fills NaNs with each column's max, so this
# pass looks redundant; confirm before removing.
df = df.fillna(df.max(skipna=True))

In [13]:
def train_evaluate_bayesian(alpha_1, alpha_2, lambda_1, lambda_2, X_train, Y_train, X_test, Y_test):
    """Fit one BayesianRidge configuration and score it on the held-out data.

    Parameters
    ----------
    alpha_1, alpha_2, lambda_1, lambda_2
        Values forwarded to the ``BayesianRidge`` keyword arguments of the same name.
    X_train, Y_train
        Data passed to ``fit``.
    X_test, Y_test
        Data passed to ``predict`` and the metric functions.

    Returns
    -------
    dict
        The four hyperparameters plus ``mean_squared_error``,
        ``mean_absolute_error`` and ``r2_score``. A metric is ``None`` when an
        exception was raised before it could be computed; the exception is
        printed, not re-raised, so a single failing configuration does not
        abort the caller.
    """
    scores = {
        'mean_squared_error': None,
        'mean_absolute_error': None,
        'r2_score': None,
    }

    try:
        model = BayesianRidge(
            alpha_1=alpha_1,
            alpha_2=alpha_2,
            lambda_1=lambda_1,
            lambda_2=lambda_2,
        )
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)

        # Stored one at a time so metrics computed before a failure are kept.
        scores['mean_squared_error'] = mean_squared_error(Y_test, predictions)
        scores['mean_absolute_error'] = mean_absolute_error(Y_test, predictions)
        scores['r2_score'] = r2_score(Y_test, predictions)

        print(
            f'Bayesian Ridge - Alpha1: {alpha_1}, Alpha2: {alpha_2}, Lambda1: {lambda_1}, Lambda2: {lambda_2} '
            f"Finalized - {scores['mean_squared_error']}, {scores['mean_absolute_error']}, {scores['r2_score']}"
        )
    except Exception as error:
        # Best effort: report the failure and fall through to the common return.
        print(f'Bayesian Ridge - Alpha1: {alpha_1}, Alpha2: {alpha_2}, Lambda1: {lambda_1}, Lambda2: {lambda_2} Error: {error}')

    return {
        'alpha_1': alpha_1,
        'alpha_2': alpha_2,
        'lambda_1': lambda_1,
        'lambda_2': lambda_2,
        **scores,
    }

In [14]:
# Candidate values tried for each of alpha_1, alpha_2, lambda_1 and lambda_2.
hyperparameter_grid = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

# Full Cartesian product (6**4 = 1296 tuples); alpha_1 varies slowest, lambda_2 fastest.
param_combinations = [
    (a1, a2, l1, l2)
    for a1 in hyperparameter_grid
    for a2 in hyperparameter_grid
    for l1 in hyperparameter_grid
    for l2 in hyperparameter_grid
]

In [15]:
# Target vector: the 'time_to_stop_activity' column as a NumPy array.
y = df['time_to_stop_activity'].to_numpy()
# Feature matrix: every remaining column as a NumPy array.
x = df.drop(columns='time_to_stop_activity').to_numpy()

In [16]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=11)

In [17]:
# Evaluate every hyperparameter tuple with 4 joblib workers; each task gets the same
# train/test arrays and yields one result dict.
benchmark = Parallel(n_jobs=4)(
    delayed(train_evaluate_bayesian)(*params, X_train, Y_train, X_test, Y_test)
    for params in param_combinations
)

Bayesian Ridge - Alpha1: 1e-06, Alpha2: 1e-06, Lambda1: 1e-06, Lambda2: 0.0001 Finalized - 10.073832670756309, 1.8436569513153636, 0.17788243787921665
Bayesian Ridge - Alpha1: 1e-06, Alpha2: 1e-06, Lambda1: 1e-06, Lambda2: 1e-06 Finalized - 10.073832670981817, 1.8436569514293482, 0.17788243786081304
Bayesian Ridge - Alpha1: 1e-06, Alpha2: 1e-06, Lambda1: 1e-06, Lambda2: 0.001 Finalized - 10.073832668706261, 1.8436569502791378, 0.1778824380465195
Bayesian Ridge - Alpha1: 1e-06, Alpha2: 1e-06, Lambda1: 1e-06, Lambda2: 1e-05 Finalized - 10.073832670961314, 1.8436569514189851, 0.17788243786248625
Bayesian Ridge - Alpha1: 1e-06, Alpha2: 1e-06, Lambda1: 1e-05, Lambda2: 1e-06 Finalized - 10.073832674011772, 1.8436569529608833, 0.17788243761354083
Bayesian Ridge - Alpha1: 1e-06, Alpha2: 1e-06, Lambda1: 1e-06, Lambda2: 0.01 Finalized - 10.073832648205927, 1.8436569399169278, 0.17788243971953543
Bayesian Ridge - Alpha1: 1e-06, Alpha2: 1e-06, Lambda1: 1e-06, Lambda2: 0.1 Finalized - 10.0738324432

In [18]:
# One row per evaluated configuration; columns come from the result dict keys.
benchmark_df = pd.DataFrame(data=benchmark)

In [20]:
# Persist the benchmark table to the working directory. index=True is the pandas
# default, so the row index is written as the first spreadsheet column.
benchmark_df.to_excel(
    excel_writer='BayesianRegressorBenchmark.xlsx',
    index=True,
)