In [27]:
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm

In [28]:
# Load the metrics table from the parquet file three directories above the notebook.
df = pd.read_parquet(path='../../../metrics.parquet')

In [29]:
# Keep a 50,000-row subsample, stratified on 'time_to_stop_activity' (the column
# later used as the regression target); the remaining rows are discarded.
df, _ = train_test_split(
    df,
    train_size=50000,
    stratify=df['time_to_stop_activity'],
    random_state=11,
)

In [30]:
# Remove the columns that are not used as model inputs: five explicitly named
# columns plus the intercept / slope / std_dev betweenness-centrality columns
# for the 12, 3, 6 and 9 windows (names generated instead of listed by hand).
df = df.drop(
    columns=[
        'user',
        'project',
        'current_month',
        'turnover_num',
        'turnover',
        *(
            f'betweenness_centrality_{window}_{stat}'
            for window in (12, 3, 6, 9)
            for stat in ('intercept', 'slope', 'std_dev')
        ),
    ]
)

In [31]:
# Cast everything to float64 before cleaning.
df = df.astype(np.float64)

# Pass 1: +inf becomes NaN, then every NaN in a column (former +inf as well as
# values that were already missing) is filled with that column's maximum.
df = df.replace([np.inf], np.nan)
df = df.fillna(df.max(skipna=True))

# Pass 2: -inf becomes NaN and is filled with that column's minimum.
df = df.replace([-np.inf], np.nan)
df = df.fillna(df.min(skipna=True))

In [32]:
# Fill any NaN still present with the maximum of its column (same rule as the
# NaN -> column-max fill applied in the cleaning cell above).
df = df.fillna(df.max(skipna=True))

In [33]:
def train_evaluate_radius_neighbors(radius, weights, metric, X_train, Y_train, X_test, Y_test):
    """Fit a RadiusNeighborsRegressor for one parameter combination and score it.

    Test samples that have no training neighbour within ``radius`` are
    predicted as the mean of ``Y_train``; all other samples are predicted by
    the fitted regressor.

    Parameters
    ----------
    radius, weights, metric
        Forwarded unchanged to ``RadiusNeighborsRegressor``.
    X_train, Y_train
        Training features and targets passed to ``fit``. ``Y_train`` must
        provide ``.mean()`` (e.g. a NumPy array).
    X_test, Y_test
        Held-out features and targets. ``X_test`` must support boolean-mask
        indexing (e.g. a NumPy array).

    Returns
    -------
    dict
        Keys ``radius``, ``weights``, ``metric``, ``mean_squared_error``,
        ``mean_absolute_error`` and ``r2_score``. Any metric that could not be
        computed because an exception was raised is ``None``; the exception is
        printed, not re-raised, so one failing combination does not abort a
        whole grid run.
    """
    mse = None
    mae = None
    r2 = None

    try:
        rnr = RadiusNeighborsRegressor(radius=radius, weights=weights, metric=metric)
        rnr.fit(X_train, Y_train)

        # One neighbour query for the whole test set tells us which rows the
        # regressor can actually predict.
        neighborhoods = rnr.radius_neighbors(X_test, return_distance=False)
        has_neighbors = np.fromiter(
            (len(neighbors) > 0 for neighbors in neighborhoods),
            dtype=bool,
            count=len(neighborhoods),
        )

        # Start from the fallback value everywhere, then overwrite the rows
        # that have neighbours with a single batched predict() call instead of
        # one predict() call per row.
        y_pred = np.full(len(neighborhoods), Y_train.mean(), dtype=float)
        if has_neighbors.any():  # predict() must not be given zero rows
            y_pred[has_neighbors] = rnr.predict(X_test[has_neighbors])

        mse = mean_squared_error(Y_test, y_pred)
        mae = mean_absolute_error(Y_test, y_pred)
        r2 = r2_score(Y_test, y_pred)

        print(f'RNR - Radius: {radius}, Weights: {weights}, Metric: {metric} Finalized - MSE: {mse}, MAE: {mae}, R2: {r2}')

    except Exception as e:
        # Deliberately best-effort: report the failure and fall through so the
        # caller still gets a row for this combination.
        print(f'RNR - Radius: {radius}, Weights: {weights}, Metric: {metric} Error: {e}')

    return {
        'radius': radius,
        'weights': weights,
        'metric': metric,
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2
    }

In [34]:
# Full grid of (radius, weights, metric) triples: 7 radii x 2 weightings x 2
# distance metrics, with the metric varying fastest.
param_combinations = [
    (r, w, m)
    for r in (1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0)
    for w in ('uniform', 'distance')
    for m in ('euclidean', 'manhattan')
]

In [35]:
# Target vector, then the feature matrix made of every other column, both as
# NumPy arrays.
y = df['time_to_stop_activity'].to_numpy()
x = df.drop(columns='time_to_stop_activity').to_numpy()

In [36]:
# Hold out 30% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=11)

In [37]:
# Run the evaluation for every parameter combination on 4 joblib workers;
# each call returns one result dict.
results = Parallel(n_jobs=4)(
    delayed(train_evaluate_radius_neighbors)(*combo, X_train, Y_train, X_test, Y_test)
    for combo in param_combinations
)

RNR - Radius: 1.0, Weights: uniform, Metric: manhattan Finalized - MSE: 8.91739469840397, MAE: 1.8279264983061443, R2: 0.4882982549950846
RNR - Radius: 1.0, Weights: distance, Metric: manhattan Finalized - MSE: 7.636572119928413, MAE: 1.4606795739565488, R2: 0.5617949623422361
RNR - Radius: 1.0, Weights: uniform, Metric: euclidean Finalized - MSE: 11.602712085716359, MAE: 2.277469166326181, R2: 0.3342082276437415
RNR - Radius: 1.0, Weights: distance, Metric: euclidean Finalized - MSE: 7.615786731356675, MAE: 1.5323858513816602, R2: 0.5629876783722518
RNR - Radius: 2.0, Weights: uniform, Metric: manhattan Finalized - MSE: 11.643434410800685, MAE: 2.2593376345994263, R2: 0.33187148182155224
RNR - Radius: 2.0, Weights: uniform, Metric: euclidean Finalized - MSE: 13.081964293981233, MAE: 2.6792719943933343, R2: 0.24932514666864602
RNR - Radius: 2.0, Weights: distance, Metric: manhattan Finalized - MSE: 7.660986407037408, MAE: 1.5040653575227947, R2: 0.5603940113089754




RNR - Radius: 2.0, Weights: distance, Metric: euclidean Finalized - MSE: 7.997311934523677, MAE: 1.7507122336630347, R2: 0.5410948364798938
RNR - Radius: 3.0, Weights: uniform, Metric: manhattan Finalized - MSE: 12.338496527970582, MAE: 2.369251394607176, R2: 0.291987131036193
RNR - Radius: 3.0, Weights: uniform, Metric: euclidean Finalized - MSE: 14.226344255683758, MAE: 2.9485204477591087, R2: 0.18365784773696958
RNR - Radius: 3.0, Weights: distance, Metric: manhattan Finalized - MSE: 7.574057809780151, MAE: 1.5319579545798188, R2: 0.5653821851435741
RNR - Radius: 3.0, Weights: distance, Metric: euclidean Finalized - MSE: 8.565382570652796, MAE: 1.9425116639914284, R2: 0.5084975650094106
RNR - Radius: 4.0, Weights: uniform, Metric: euclidean Finalized - MSE: 15.004724203145503, MAE: 3.1103164921975814, R2: 0.13899251768666943
RNR - Radius: 4.0, Weights: uniform, Metric: manhattan Finalized - MSE: 12.422968271094549, MAE: 2.4126773765691283, R2: 0.2871399374530854
RNR - Radius: 4.0, W

In [38]:
# One row per evaluated parameter combination, one column per key of the result dicts.
benchmark_df = pd.DataFrame(data=results)

In [39]:
# Persist the benchmark table as an Excel workbook in the current working directory.
benchmark_df.to_excel(excel_writer='RadiusRegressorBenchmark.xlsx')