In [1]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
# Location of the input table, relative to the notebook's working directory.
METRICS_PATH = '../../../metrics.parquet'
df = pd.read_parquet(METRICS_PATH)

In [3]:
# Keep a 5000-row subsample, stratified on the target column, with a fixed seed.
# Only the first element (the sampled part) of the split is kept: binding the
# remainder to `_` would shadow IPython's last-output variable and keep the
# unused rows alive in the kernel for the rest of the session.
df = train_test_split(
    df,
    train_size=5000,
    stratify=df['time_to_stop_activity'],
    random_state=11,
)[0]

In [4]:
# Columns removed before modelling; everything that remains (apart from the
# target) is later used as a feature.
columns_to_drop = [
    "user",
    'project',
    'current_month',
    'turnover_num',
    'turnover',
    'betweenness_centrality_12_intercept',
    'betweenness_centrality_12_slope',
    'betweenness_centrality_12_std_dev',
    'betweenness_centrality_3_intercept',
    'betweenness_centrality_3_slope',
    'betweenness_centrality_3_std_dev',
    'betweenness_centrality_6_intercept',
    'betweenness_centrality_6_slope',
    'betweenness_centrality_6_std_dev',
    'betweenness_centrality_9_intercept',
    'betweenness_centrality_9_slope',
    'betweenness_centrality_9_std_dev',
]

# `columns=` already selects the column axis, so no separate `axis` argument is needed.
df = df.drop(columns=columns_to_drop)

In [5]:
# Cast every column to float64 before the inf/NaN handling below.
df = df.astype(np.float64)

# Step 1: turn +inf into NaN, then fill every NaN with its column's maximum.
# Values that were already missing are filled with the column maximum as well.
df = df.replace([np.inf], np.nan)
df = df.fillna(df.max())

# Step 2: turn -inf into NaN, then fill those with the column's minimum.
df = df.replace([-np.inf], np.nan)
df = df.fillna(df.min())

In [6]:
# Fill any NaN that is still present with its column's maximum.
# NOTE(review): the preceding cell already fills NaNs with column maxima/minima,
# so this is expected to be a no-op; a column that is entirely NaN stays NaN.
df = df.fillna(df.max())

In [7]:
def train_evaluate_gpr(kernel_length_scale, kernel_constant_value, X_train, Y_train, X_test, Y_test,
                       optimizer="fmin_l_bfgs_b"):
    """Fit a Gaussian-process regressor and score it on a held-out set.

    The kernel is ``ConstantKernel(kernel_constant_value) * RBF(kernel_length_scale)``
    with bounds ``(1e-3, 1e3)`` and ``(1e-2, 1e2)`` respectively.

    Note: with the default ``optimizer`` the two kernel values are only the
    *starting point* of scikit-learn's hyperparameter optimisation (run with 10
    restarts), so different starting values can converge to the same fitted
    kernel. The output recorded in this notebook, where all nine combinations
    report identical metrics, is consistent with that. Pass ``optimizer=None``
    to keep the kernel values fixed and compare them directly.

    Parameters
    ----------
    kernel_length_scale : float
        Initial (or, with ``optimizer=None``, fixed) RBF length scale.
    kernel_constant_value : float
        Initial (or, with ``optimizer=None``, fixed) constant-kernel value.
    X_train, Y_train : array-like
        Training features and targets, passed to ``GaussianProcessRegressor.fit``.
    X_test, Y_test : array-like
        Evaluation features and targets.
    optimizer : str, callable or None, default "fmin_l_bfgs_b"
        Forwarded to ``GaussianProcessRegressor``; the default matches
        scikit-learn's own default, so existing callers are unaffected.

    Returns
    -------
    dict
        Keys ``kernel_length_scale``, ``kernel_constant_value``,
        ``mean_squared_error``, ``mean_absolute_error`` and ``r2_score``.
        A metric is ``None`` if an error occurred before it was computed;
        errors are printed rather than raised so that one failing combination
        does not abort a parallel benchmark run.
    """
    # Single result record; metrics are filled in as they become available so
    # the failure path returns whatever was computed before the error.
    result = {
        'kernel_length_scale': kernel_length_scale,
        'kernel_constant_value': kernel_constant_value,
        'mean_squared_error': None,
        'mean_absolute_error': None,
        'r2_score': None
    }

    try:
        kernel = C(kernel_constant_value, (1e-3, 1e3)) * RBF(kernel_length_scale, (1e-2, 1e2))

        gpr = GaussianProcessRegressor(
            kernel=kernel,
            optimizer=optimizer,
            n_restarts_optimizer=10,
            random_state=11
        )

        gpr.fit(X_train, Y_train)

        # Only the predictive mean is scored, so the (costly) predictive
        # standard deviation is not requested.
        y_pred = gpr.predict(X_test)

        mse = result['mean_squared_error'] = mean_squared_error(Y_test, y_pred)
        mae = result['mean_absolute_error'] = mean_absolute_error(Y_test, y_pred)
        r2 = result['r2_score'] = r2_score(Y_test, y_pred)

        print(f'GPR - Length Scale: {kernel_length_scale}, Constant Value: {kernel_constant_value} | MSE: {mse}, MAE: {mae}, R2: {r2}')

    except Exception as e:
        # Deliberate best-effort: report the failure and still return a row.
        print(f'Error with kernel_length_scale: {kernel_length_scale}, kernel_constant_value: {kernel_constant_value} | {e}')

    return result

In [8]:
# Candidate kernel settings; every length scale is paired with every constant value.
length_scale_grid = [0.5, 1.0, 1.5]
constant_value_grid = [1.0, 10.0, 100.0]

param_combinations = [
    (length_scale, constant_value)
    for length_scale in length_scale_grid
    for constant_value in constant_value_grid
]

In [9]:
# Target array, and the feature matrix built from every other remaining column.
y = df['time_to_stop_activity'].values
x = df.drop(columns='time_to_stop_activity').values

In [10]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=11)

In [11]:
# Build one fit-and-score job per kernel setting, then run them in parallel
# (n_jobs=-1); `benchmark` collects the returned result dicts.
gpr_jobs = [
    delayed(train_evaluate_gpr)(length_scale, constant_value, X_train, Y_train, X_test, Y_test)
    for length_scale, constant_value in param_combinations
]
benchmark = Parallel(n_jobs=-1)(gpr_jobs)



GPR - Length Scale: 1.5, Constant Value: 100.0 | MSE: 18471.42403432114, MAE: 8.233028791461056, R2: -1065.7196717747513




GPR - Length Scale: 1.5, Constant Value: 1.0 | MSE: 18471.42403432114, MAE: 8.233028791461056, R2: -1065.7196717747513




GPR - Length Scale: 1.5, Constant Value: 10.0 | MSE: 18471.42403432114, MAE: 8.233028791461056, R2: -1065.7196717747513




GPR - Length Scale: 1.0, Constant Value: 100.0 | MSE: 18471.42403432114, MAE: 8.233028791461056, R2: -1065.7196717747513
GPR - Length Scale: 1.0, Constant Value: 1.0 | MSE: 18471.42403432114, MAE: 8.233028791461056, R2: -1065.7196717747513
GPR - Length Scale: 0.5, Constant Value: 1.0 | MSE: 18471.42403432114, MAE: 8.233028791461056, R2: -1065.7196717747513
GPR - Length Scale: 1.0, Constant Value: 10.0 | MSE: 18471.42403432114, MAE: 8.233028791461056, R2: -1065.7196717747513
GPR - Length Scale: 0.5, Constant Value: 100.0 | MSE: 18471.42403432114, MAE: 8.233028791461056, R2: -1065.7196717747513
GPR - Length Scale: 0.5, Constant Value: 10.0 | MSE: 18471.42403432114, MAE: 8.233028791461056, R2: -1065.7196717747513


In [12]:
# One row per evaluated kernel setting, one column per key of the result dicts.
benchmark_df = pd.DataFrame(data=benchmark)

In [13]:
# Write the benchmark table to an Excel workbook in the current working directory.
BENCHMARK_XLSX = 'GaussianProcessRegressorBenchmark.xlsx'
benchmark_df.to_excel(BENCHMARK_XLSX)