In [12]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm

In [13]:
# Load the metrics table from a parquet file addressed by a relative path
# (three directory levels above the notebook's working directory).
df = pd.read_parquet('../../../metrics.parquet')

In [14]:
# Keep a 50,000-row subsample, stratified on the target column, and discard
# the remainder. The fixed random_state keeps the draw repeatable.
# NOTE(review): this overwrites `df`, so re-running the cell without
# reloading the data operates on the already-reduced frame.
df, _ = train_test_split(
    df,
    train_size=50000,
    stratify=df['time_to_stop_activity'],
    random_state=11,
)

In [15]:
# Columns excluded from the modelling frame, grouped for readability.
general_columns = [
    'user',
    'project',
    'current_month',
    'turnover_num',
    'turnover',
]

# Intercept / slope / std-dev columns for each betweenness-centrality window.
betweenness_columns = [
    'betweenness_centrality_12_intercept',
    'betweenness_centrality_12_slope',
    'betweenness_centrality_12_std_dev',
    'betweenness_centrality_3_intercept',
    'betweenness_centrality_3_slope',
    'betweenness_centrality_3_std_dev',
    'betweenness_centrality_6_intercept',
    'betweenness_centrality_6_slope',
    'betweenness_centrality_6_std_dev',
    'betweenness_centrality_9_intercept',
    'betweenness_centrality_9_slope',
    'betweenness_centrality_9_std_dev',
]

# `columns=` already selects the column axis, so no `axis` argument is needed.
df = df.drop(columns=general_columns + betweenness_columns)

In [16]:
# Cast everything to float64 so NaN and +/-inf are representable in every column.
df = df.astype(np.float64)

# Pass 1: +inf becomes NaN, then every NaN in a column (pre-existing ones
# included) is replaced by that column's maximum. -inf values are still
# present at this point.
df = df.replace([np.inf], np.nan)
column_maxima = df.max(skipna=True)
df = df.fillna(column_maxima)

# Pass 2: -inf becomes NaN and is replaced by the column's minimum, computed
# after pass 1 has already filled the other gaps.
df = df.replace([-np.inf], np.nan)
column_minima = df.min(skipna=True)
df = df.fillna(column_minima)

In [17]:
# Fill any NaN left in a column with that column's maximum (NaNs are skipped
# when computing the maximum). A column that is entirely NaN is left as is,
# because its maximum is NaN as well.
df = df.fillna(df.max(skipna=True))

In [18]:
def train_evaluate_knn(n_neighbors, weights, algorithm, X_train, Y_train, X_test, Y_test):
    """Fit one KNeighborsRegressor configuration and score it on the test data.

    Parameters
    ----------
    n_neighbors, weights, algorithm
        Forwarded unchanged to ``KNeighborsRegressor``.
    X_train, Y_train
        Data passed to ``fit``.
    X_test, Y_test
        ``X_test`` is passed to ``predict``; ``Y_test`` is the reference the
        predictions are scored against.

    Returns
    -------
    dict
        Keys ``n_neighbors``, ``weights``, ``algorithm``,
        ``mean_squared_error``, ``mean_absolute_error`` and ``r2_score``
        (in that order). The three metric values are ``None`` if any step of
        fitting, predicting, scoring or reporting raised an ``Exception``.

    Notes
    -----
    Failures are deliberately not propagated: the error is printed together
    with the configuration so one bad combination does not abort a whole
    batch of runs.
    """
    settings = {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'algorithm': algorithm,
    }

    try:
        model = KNeighborsRegressor(**settings)
        model.fit(X_train, Y_train)
        predicted = model.predict(X_test)

        mse = mean_squared_error(Y_test, predicted)
        mae = mean_absolute_error(Y_test, predicted)
        r2 = r2_score(Y_test, predicted)

        # Progress line; kept inside the try so a formatting failure is
        # reported like any other error.
        print(
            f"KNN - Neighbors: {n_neighbors}, Weights: {weights}, Algorithm: {algorithm} "
            f"Finalized - MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}"
        )

        metrics = {
            'mean_squared_error': mse,
            'mean_absolute_error': mae,
            'r2_score': r2,
        }
    except Exception as e:
        print(
            f"KNN - Neighbors: {n_neighbors}, Weights: {weights}, Algorithm: {algorithm} Error: {str(e)}"
        )

        # Placeholder row so the failed configuration still appears in the results.
        metrics = {
            'mean_squared_error': None,
            'mean_absolute_error': None,
            'r2_score': None,
        }

    return {**settings, **metrics}


In [19]:
# Hyper-parameter grid: every (n_neighbors, weights, algorithm) triple, with
# n_neighbors varying slowest and algorithm fastest.
neighbor_options = [3, 5, 10, 20, 50]
weight_options = ['uniform', 'distance']
algorithm_options = ['auto', 'ball_tree', 'kd_tree']

param_combinations = [
    (k, weighting, search)
    for k in neighbor_options
    for weighting in weight_options
    for search in algorithm_options
]

In [20]:
# Split the frame into the target vector and the feature matrix (every
# remaining column), both as NumPy arrays.
y = df['time_to_stop_activity'].values
x = df.drop(columns=['time_to_stop_activity']).values

In [21]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=11)

In [22]:
# Evaluate every grid entry in parallel (n_jobs=-1). Each job returns one
# result dict, collected in the same order as `param_combinations`.
results = Parallel(n_jobs=-1)(
    delayed(train_evaluate_knn)(*combo, X_train, Y_train, X_test, Y_test)
    for combo in param_combinations
)

KNN - Neighbors: 3, Weights: uniform, Algorithm: auto Finalized - MSE: 7.7558, MAE: 1.3404, R²: 0.5550
KNN - Neighbors: 3, Weights: distance, Algorithm: auto Finalized - MSE: 8.0444, MAE: 1.3264, R²: 0.5384
KNN - Neighbors: 5, Weights: distance, Algorithm: auto Finalized - MSE: 7.4375, MAE: 1.2968, R²: 0.5732
KNN - Neighbors: 10, Weights: distance, Algorithm: auto Finalized - MSE: 6.9924, MAE: 1.2601, R²: 0.5988
KNN - Neighbors: 5, Weights: uniform, Algorithm: auto Finalized - MSE: 7.1506, MAE: 1.3127, R²: 0.5897
KNN - Neighbors: 10, Weights: uniform, Algorithm: auto Finalized - MSE: 6.7273, MAE: 1.2725, R²: 0.6140
KNN - Neighbors: 3, Weights: uniform, Algorithm: kd_tree Finalized - MSE: 7.7597, MAE: 1.3414, R²: 0.5547
KNN - Neighbors: 3, Weights: distance, Algorithm: kd_tree Finalized - MSE: 8.0472, MAE: 1.3271, R²: 0.5382
KNN - Neighbors: 5, Weights: distance, Algorithm: kd_tree Finalized - MSE: 7.4378, MAE: 1.2968, R²: 0.5732
KNN - Neighbors: 5, Weights: uniform, Algorithm: kd_tree 

In [23]:
# Turn the list of per-configuration result dicts into a table, one row per
# evaluated parameter combination.
benchmark_df = pd.DataFrame(results)

In [24]:
# Persist the benchmark table as an Excel workbook; the relative file name
# means it is written to the notebook's working directory.
benchmark_df.to_excel('KNNRegressorBenchmark.xlsx')