In [1]:
import pandas as pd
import numpy as np

from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_parquet('../../../metrics.parquet')

In [3]:
df, _ = train_test_split(df, train_size=20000, stratify=df['time_to_stop_activity'], random_state=11)

In [4]:
df = df.drop(
    columns = [
        "user", 
        'project', 
        'current_month', 
        'turnover_num',
        'turnover',
        'betweenness_centrality_12_intercept',
        'betweenness_centrality_12_slope', 
        'betweenness_centrality_12_std_dev',
        'betweenness_centrality_3_intercept', 
        'betweenness_centrality_3_slope',
        'betweenness_centrality_3_std_dev',
        'betweenness_centrality_6_intercept', 
        'betweenness_centrality_6_slope',
        'betweenness_centrality_6_std_dev',
        'betweenness_centrality_9_intercept', 
        'betweenness_centrality_9_slope',
        'betweenness_centrality_9_std_dev'
    ], axis = 1
)

In [5]:
df = df.astype(np.float64)
df = df.replace([np.inf], np.nan)

for column in df.columns:
    max_value = df[column].max(skipna=True) 
    df[column] = df[column].fillna(max_value)

df = df.replace([-np.inf], np.nan)

for column in df.columns:
    max_value = df[column].min(skipna=True) 
    df[column] = df[column].fillna(max_value)

In [6]:
for column in df.columns:
    max_value = df[column].max(skipna=True) 
    df[column] = df[column].fillna(max_value)

In [7]:
def train_evaluate_krr(alpha, kernel, gamma, degree, coef0, X_train, Y_train, X_test, Y_test):
    mse = None
    mae = None
    r2 = None

    try:
        krr = KernelRidge(alpha=alpha, kernel=kernel, gamma=gamma, degree=degree, coef0=coef0)
        krr.fit(X_train, Y_train)
        y_pred = krr.predict(X_test)
        
        mse = mean_squared_error(Y_test, y_pred)
        mae = mean_absolute_error(Y_test, y_pred)
        r2 = r2_score(Y_test, y_pred)
        
        print(f'KRR - Alpha: {alpha}, Kernel: {kernel}, Gamma: {gamma}, Degree: {degree}, Coef0: {coef0} Finalized - MSE: {mse}, MAE: {mae}, R2: {r2}')

        return {
            'alpha': alpha,
            'kernel': kernel,
            'gamma': gamma,
            'degree': degree,
            'coef0': coef0,
            'mean_squared_error': mse,
            'mean_absolute_error': mae,
            'r2_score': r2
        }
    
    except Exception as e:
        print(f'KRR - Alpha: {alpha}, Kernel: {kernel}, Gamma: {gamma}, Degree: {degree}, Coef0: {coef0} Error: {e}')
    
        return {
            'alpha': alpha,
            'kernel': kernel,
            'gamma': gamma,
            'degree': degree,
            'coef0': coef0,
            'mean_squared_error': mse,
            'mean_absolute_error': mae,
            'r2_score': r2
        }

In [8]:
param_combinations = [
    (alpha, kernel, gamma, degree, coef0)
    for alpha in [0.1, 1.0, 10.0]
    for kernel in ['linear', 'poly', 'rbf']
    for gamma in [None, 0.1, 1.0]
    for degree in [3, 4]
    for coef0 in [0.0, 1.0]
]

In [9]:
x = df.drop(columns = ['time_to_stop_activity'], axis = 1).values
y = df['time_to_stop_activity'].values

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, 
    test_size = 0.3, 
    random_state=11
)

In [11]:
results = Parallel(n_jobs=4)(
    delayed(train_evaluate_krr)(alpha, kernel, gamma, degree, coef0, X_train, Y_train, X_test, Y_test)
    for alpha, kernel, gamma, degree, coef0 in param_combinations
)

KRR - Alpha: 0.1, Kernel: linear, Gamma: None, Degree: 3, Coef0: 1.0 Finalized - MSE: 15.677854154541732, MAE: 2.260151804815975, R2: 0.09189754119810234
KRR - Alpha: 0.1, Kernel: linear, Gamma: None, Degree: 4, Coef0: 1.0 Finalized - MSE: 15.677854154541732, MAE: 2.260151804815975, R2: 0.09189754119810234
KRR - Alpha: 0.1, Kernel: linear, Gamma: None, Degree: 3, Coef0: 0.0 Finalized - MSE: 15.677854154541732, MAE: 2.260151804815975, R2: 0.09189754119810234
KRR - Alpha: 0.1, Kernel: linear, Gamma: None, Degree: 4, Coef0: 0.0 Finalized - MSE: 15.677854154541732, MAE: 2.260151804815975, R2: 0.09189754119810234
KRR - Alpha: 0.1, Kernel: linear, Gamma: 0.1, Degree: 3, Coef0: 1.0 Finalized - MSE: 15.677854154541732, MAE: 2.260151804815975, R2: 0.09189754119810234
KRR - Alpha: 0.1, Kernel: linear, Gamma: 0.1, Degree: 3, Coef0: 0.0 Finalized - MSE: 15.677854154541732, MAE: 2.260151804815975, R2: 0.09189754119810234
KRR - Alpha: 0.1, Kernel: linear, Gamma: 0.1, Degree: 4, Coef0: 0.0 Finalized 

  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


KRR - Alpha: 0.1, Kernel: poly, Gamma: None, Degree: 3, Coef0: 0.0 Finalized - MSE: 336788.237438509, MAE: 24.382124785500658, R2: -19506.658605490265
KRR - Alpha: 0.1, Kernel: poly, Gamma: None, Degree: 3, Coef0: 1.0 Finalized - MSE: 422511.6045896028, MAE: 19.23693438793082, R2: -24471.98100990458
KRR - Alpha: 0.1, Kernel: poly, Gamma: None, Degree: 4, Coef0: 0.0 Finalized - MSE: 306168525.4668431, MAE: 383.6607770713029, R2: -17734083.527355306
KRR - Alpha: 0.1, Kernel: poly, Gamma: None, Degree: 4, Coef0: 1.0 Finalized - MSE: 178324381.33130744, MAE: 337.18655832838544, R2: -10329015.174983097


  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


KRR - Alpha: 0.1, Kernel: poly, Gamma: 0.1, Degree: 3, Coef0: 1.0 Finalized - MSE: 15408313.554628303, MAE: 115.72356202633648, R2: -892488.9598517517
KRR - Alpha: 0.1, Kernel: poly, Gamma: 0.1, Degree: 3, Coef0: 0.0 Finalized - MSE: 66341357.41303501, MAE: 241.0086969674647, R2: -3842664.532743219




KeyboardInterrupt: 

In [None]:
benchmark_df = pd.DataFrame(results)

In [None]:
benchmark_df.to_excel('KernelRidgeRegressorBenchmark.xlsx')