In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from multiprocessing import Value
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from joblib import Parallel, delayed

In [15]:
df = pd.read_parquet('../../../metrics.parquet')

In [16]:
df = df.sample(n=50000)

In [17]:
df = df.drop(
    columns = [
        "user", 
        'project', 
        'current_month', 
        'turnover_num',
        'turnover',
        'betweenness_centrality_12_intercept',
        'betweenness_centrality_12_slope', 
        'betweenness_centrality_12_std_dev',
        'betweenness_centrality_3_intercept', 
        'betweenness_centrality_3_slope',
        'betweenness_centrality_3_std_dev',
        'betweenness_centrality_6_intercept', 
        'betweenness_centrality_6_slope',
        'betweenness_centrality_6_std_dev',
        'betweenness_centrality_9_intercept', 
        'betweenness_centrality_9_slope',
        'betweenness_centrality_9_std_dev'
    ], axis = 1
)

In [18]:
df = df.astype(np.float64)
df = df.replace([np.inf], np.nan)

for column in df.columns:
    max_value = df[column].max(skipna=True) 
    df[column] = df[column].fillna(max_value)

df = df.replace([-np.inf], np.nan)

for column in df.columns:
    max_value = df[column].min(skipna=True) 
    df[column] = df[column].fillna(max_value)

In [19]:
for column in df.columns:
    max_value = df[column].max(skipna=True) 
    df[column] = df[column].fillna(max_value)

In [20]:
imputer = SimpleImputer(strategy='mean') 
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [21]:
x = df.drop(columns = ['time_to_stop_activity'], axis = 1).values
y = df['time_to_stop_activity'].values

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, 
    test_size = 0.3, 
    random_state=11
)

In [23]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
def train_evaluate_svr(regularization, tolerance, X_train, Y_train, X_test, Y_test):
    svr = SVR(kernel='rbf', C=regularization, epsilon=tolerance)
    svr.fit(X_train, Y_train)
    y_pred = svr.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    mae = mean_absolute_error(Y_test, y_pred)
    r2 = r2_score(Y_test, y_pred)
    
    print(f'{regularization} - {tolerance} finished')
    
    return {
        'regularization': regularization, 
        'tolerance': tolerance, 
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2
    }

In [25]:
param_combinations = [
    (regularization, tolerance) 
    for regularization in [0.1, 1, 10, 100, 1000, 10000]
    for tolerance in [0.01, 0.05, 0.1, 0.2, 0.5, 1]
]

In [26]:
benchmark = Parallel(n_jobs=6) (
    delayed(train_evaluate_svr) (reg, tol, X_train, Y_train, X_test, Y_test)
    for reg, tol in param_combinations
)

benchmark_df = pd.DataFrame(benchmark)

0.1 - 0.05 finished
0.1 - 0.01 finished
0.1 - 1 finished
0.1 - 0.5 finished
0.1 - 0.2 finished
0.1 - 0.1 finished
1 - 0.1 finished
1 - 0.05 finished
1 - 0.01 finished
1 - 0.2 finished
1 - 1 finished
1 - 0.5 finished
10 - 0.5 finished
10 - 0.1 finished
10 - 0.2 finished
10 - 0.05 finished
10 - 1 finished
10 - 0.01 finished
100 - 1 finished
100 - 0.5 finished
100 - 0.2 finished
100 - 0.1 finished
100 - 0.05 finished
100 - 0.01 finished
1000 - 1 finished
1000 - 0.5 finished
1000 - 0.2 finished
1000 - 0.1 finished
1000 - 0.05 finished
1000 - 0.01 finished
10000 - 1 finished
10000 - 0.5 finished
10000 - 0.2 finished
10000 - 0.1 finished
10000 - 0.05 finished
10000 - 0.01 finished


In [27]:
benchmark_df\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,13.146058,13.147012,13.149073,13.145213,13.148376,13.203404
1.0,11.871114,11.87506,11.877083,11.89613,11.954764,11.895269
10.0,10.711272,10.723229,10.744769,10.800378,10.832431,10.746254
100.0,9.743899,9.704495,9.685001,9.665846,9.6498,9.606197
1000.0,9.453008,9.454553,9.452678,9.420532,9.349417,9.323776
10000.0,10.639336,10.600685,10.601712,10.586184,10.550642,10.647141


In [28]:
benchmark_df\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_absolute_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,2.629566,2.63014,2.630937,2.634791,2.651108,2.682112
1.0,2.203854,2.204781,2.205773,2.209263,2.234525,2.288574
10.0,1.893863,1.89495,1.897894,1.906385,1.941074,2.00844
100.0,1.700724,1.701086,1.703072,1.713889,1.760512,1.856325
1000.0,1.597055,1.598573,1.602489,1.612591,1.669982,1.776182
10000.0,1.593399,1.592153,1.592505,1.595415,1.610674,1.685293


In [29]:
benchmark_df\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='r2_score'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,0.236826,0.23677,0.236651,0.236875,0.236691,0.233497
1.0,0.310841,0.310612,0.310494,0.309388,0.305985,0.309438
10.0,0.378174,0.377479,0.376229,0.373001,0.37114,0.376143
100.0,0.434333,0.43662,0.437752,0.438864,0.439796,0.442327
1000.0,0.45122,0.45113,0.451239,0.453106,0.457234,0.458723
10000.0,0.38235,0.384594,0.384534,0.385435,0.387499,0.381897


## Model Performance Improvement
Once the model has not a good performance, we are looking methods and techniques to improve the performance

##### **Kernel Trick**
SVM can model non-linear relationships between features by using the kernel trick. The default kernel is rbf (Radial Basis Function), but you can experiment with other kernels like:

- Linear Kernel: If your data is linearly separable, the linear kernel might be the best choice.
- Polynomial Kernel: Captures polynomial relationships between data points. You can control the degree of the polynomial to fit higher-order relationships.
- Sigmoid Kernel: Similar to a neural network activation function, this kernel maps data into a hyperbolic tangent space.

Action: Try different kernels and see which works best for your data. For non-linear data, rbf is usually a good default, but for linear data, a linear kernel might perform better.

In [31]:
def train_evaluate_svr(regularization, tolerance, kernel, X_train, Y_train, X_test, Y_test):
    
    svr = SVR(kernel=kernel, C=regularization, epsilon=tolerance)
    svr.fit(X_train, Y_train)
    y_pred = svr.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    mae = mean_absolute_error(Y_test, y_pred)
    r2 = r2_score(Y_test, y_pred)
    
    return {
        'kernel': kernel,
        'regularization': regularization, 
        'tolerance': tolerance, 
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2
    }

param_combinations = [
    (regularization, tolerance, kernel) 
    for regularization in [0.1, 1, 10, 100, 1000, 10000]
    for tolerance in [0.01, 0.05, 0.1, 0.2, 0.5, 1]
    for kernel in ['rbf', 'linear', 'poly']
]

total_tasks = len(param_combinations)

benchmark = Parallel(n_jobs=6) (
    delayed(train_evaluate_svr) (reg, tol, kernel, X_train, Y_train, X_test, Y_test)
    for reg, tol, kernel in param_combinations
)

benchmark_df = pd.DataFrame(benchmark)



KeyboardInterrupt: 

In [34]:
benchmark_df\
    [benchmark_df.kernel == 'rbf']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

AttributeError: 'DataFrame' object has no attribute 'kernel'

In [52]:
benchmark_df\
    [benchmark_df.kernel == 'linear']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,3.076964e+168,3.076964e+168,3.076964e+168,3.076964e+168,3.076964e+168,3.076964e+168
1.0,3.076964e+170,3.076964e+170,3.076964e+170,3.076964e+170,3.076964e+170,3.076964e+170
10.0,3.0769639999999997e+172,3.0769639999999997e+172,3.0769639999999997e+172,3.0769639999999997e+172,3.0769639999999997e+172,3.0769639999999997e+172
100.0,3.076964e+174,3.076964e+174,3.076964e+174,3.076964e+174,3.076964e+174,3.076964e+174
1000.0,3.076964e+176,3.076964e+176,3.076964e+176,3.076964e+176,3.076964e+176,3.076964e+176
10000.0,3.0769640000000002e+178,3.0769640000000002e+178,3.0769640000000002e+178,3.0769640000000002e+178,3.0769640000000002e+178,3.0769640000000002e+178


In [50]:
benchmark_df\
    [benchmark_df.kernel == 'poly']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062
1.0,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062
10.0,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062
100.0,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062
1000.0,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062
10000.0,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062


##### **Feature Scaling**
SVM is sensitive to the scale of the features. Features with larger numerical ranges dominate the decision boundary, so proper scaling is essential. You are already using StandardScaler, but you can also try:

- MinMaxScaler: Rescales features into a range, typically [0, 1]. This might work better if your features have different scales.
- RobustScaler: This is more robust to outliers because it scales the data based on the median and the interquartile range instead of the mean and standard deviation.

Action: Experiment with different scalers and check how each affects the performance.

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, 
    test_size = 0.3, 
    random_state = 11
)

In [27]:
def train_evaluate_svr(regularization, tolerance, scaler, X_train, Y_train, X_test, Y_test):
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    svr = SVR(kernel='rbf', C=regularization, epsilon=tolerance)
    svr.fit(X_train, Y_train)

    y_pred = svr.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    mae = mean_absolute_error(Y_test, y_pred)
    r2 = r2_score(Y_test, y_pred)
    
    return {
        'scaler': type(scaler).__name__,
        'regularization': regularization, 
        'tolerance': tolerance, 
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2
    }

In [28]:
param_combinations = [
    (regularization, tolerance, scaler) 
    for regularization in [0.1, 1, 10, 100, 1000, 10000]
    for tolerance in [0.01, 0.05, 0.1, 0.2, 0.5, 1]
    for scaler in [StandardScaler(), MinMaxScaler(), RobustScaler()]
]

In [29]:
benchmark = Parallel(n_jobs=6) (
    delayed(train_evaluate_svr) (reg, tol, scaler, X_train, Y_train, X_test, Y_test)
    for reg, tol, scaler in param_combinations
)

benchmark_df = pd.DataFrame(benchmark)

In [41]:
benchmark_df\
    [benchmark_df.scaler == 'StandardScaler']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,17.287797,17.229818,17.081601,16.74889,15.828858,14.757449
1.0,13.299391,13.277055,13.21541,13.061348,12.545966,11.853835
10.0,11.422297,11.408934,11.365349,11.282742,10.934229,10.423376
100.0,10.605205,10.584964,10.534264,10.399913,9.978358,9.554725
1000.0,11.53428,11.503967,11.465096,11.320334,11.025576,10.754369
10000.0,23.07656,22.945781,22.933513,22.556385,21.746142,20.738345


In [45]:
benchmark_df\
    [benchmark_df.scaler == 'RobustScaler']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062
1.0,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062
10.0,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062
100.0,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062
1000.0,23.941913,23.7301,23.469835,22.964305,21.567714,19.640062
10000.0,23.941913,23.7301,23.469835,22.964305,21.567713,19.640061


In [44]:
benchmark_df\
    [benchmark_df.scaler == 'MinMaxScaler']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,21.400989,21.338895,21.122145,20.737728,19.447737,18.384828
1.0,18.307879,18.204993,18.043665,17.644855,16.546957,15.276378
10.0,15.023713,14.981504,14.890117,14.67856,14.022465,13.148012
100.0,13.054013,13.038923,13.013252,12.883137,12.379654,11.777031
1000.0,11.923947,11.939167,11.909038,11.810606,11.467938,11.034528
10000.0,12.381233,12.365877,12.316832,12.22416,11.801225,11.451055


#### Handling Outliers
SVM is highly sensitive to outliers, as they can significantly influence the decision boundary and margins. Outliers can distort the hyperplane, leading to poor generalization.

- Outlier Detection: Before training the model, perform outlier detection (e.g., using Z-scores, IQR, or visualizations like box plots) and remove or adjust them.
- Adjust Epsilon: If your model has many outliers, you may want to increase epsilon to create a wider margin and minimize the effect of outliers.

Action: Identify and handle outliers by removing or transforming them to see if performance improves.