In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from multiprocessing import Value
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from joblib import Parallel, delayed

In [2]:
df = pd.read_parquet('../../../metrics.parquet')

In [3]:
df, _ = train_test_split(df, train_size=50000, stratify=df['time_to_stop_activity'], random_state=11)

In [4]:
df = df.drop(
    columns = [
        "user", 
        'project', 
        'current_month', 
        'turnover_num',
        'turnover',
        'betweenness_centrality_12_intercept',
        'betweenness_centrality_12_slope', 
        'betweenness_centrality_12_std_dev',
        'betweenness_centrality_3_intercept', 
        'betweenness_centrality_3_slope',
        'betweenness_centrality_3_std_dev',
        'betweenness_centrality_6_intercept', 
        'betweenness_centrality_6_slope',
        'betweenness_centrality_6_std_dev',
        'betweenness_centrality_9_intercept', 
        'betweenness_centrality_9_slope',
        'betweenness_centrality_9_std_dev'
    ], axis = 1
)

In [5]:
df = df.astype(np.float64)
df = df.replace([np.inf], np.nan)

for column in df.columns:
    max_value = df[column].max(skipna=True) 
    df[column] = df[column].fillna(max_value)

df = df.replace([-np.inf], np.nan)

for column in df.columns:
    max_value = df[column].min(skipna=True) 
    df[column] = df[column].fillna(max_value)

In [6]:
for column in df.columns:
    max_value = df[column].max(skipna=True) 
    df[column] = df[column].fillna(max_value)

In [7]:
imputer = SimpleImputer(strategy='mean') 
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [8]:
x = df.drop(columns = ['time_to_stop_activity'], axis = 1).values
y = df['time_to_stop_activity'].values

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, 
    test_size = 0.3, 
    random_state=11
)

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
def train_evaluate_svr(regularization, tolerance, X_train, Y_train, X_test, Y_test):
    svr = SVR(kernel='rbf', C=regularization, epsilon=tolerance)
    svr.fit(X_train, Y_train)
    y_pred = svr.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    mae = mean_absolute_error(Y_test, y_pred)
    r2 = r2_score(Y_test, y_pred)
    
    print(f'{regularization} - {tolerance} finished')
    
    return {
        'regularization': regularization, 
        'tolerance': tolerance, 
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2
    }

In [15]:
param_combinations = [
    (regularization, tolerance) 
    for regularization in [0.1, 1, 10, 100, 1000, 10000]
    for tolerance in [0.01, 0.05, 0.1, 0.2, 0.5, 1]
]

In [16]:
benchmark = Parallel(n_jobs=6) (
    delayed(train_evaluate_svr) (reg, tol, X_train, Y_train, X_test, Y_test)
    for reg, tol in param_combinations
)

benchmark_df = pd.DataFrame(benchmark)

In [17]:
benchmark_df\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,13.146077,13.147209,13.14206,13.142094,13.154305,13.206518
1.0,11.807991,11.813036,11.814492,11.833609,11.93717,11.915788
10.0,10.688782,10.705611,10.730526,10.795428,10.859262,10.772401
100.0,9.771922,9.721574,9.694777,9.671482,9.630051,9.535971
1000.0,9.320553,9.313347,9.291506,9.248775,9.108106,8.918613
10000.0,10.529562,10.511055,10.499839,10.495498,10.647729,10.762351


In [18]:
benchmark_df\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_absolute_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,2.623654,2.624505,2.624955,2.627982,2.647314,2.679873
1.0,2.18998,2.189699,2.191166,2.195685,2.221544,2.282168
10.0,1.894554,1.895815,1.898407,1.907182,1.94491,2.015112
100.0,1.714912,1.714667,1.716755,1.725192,1.773805,1.866259
1000.0,1.611588,1.612362,1.615325,1.624107,1.667121,1.729393
10000.0,1.578459,1.577971,1.577915,1.578215,1.590021,1.662353


In [19]:
benchmark_df\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='r2_score'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,0.245646,0.245581,0.245877,0.245875,0.245174,0.242178
1.0,0.322429,0.322139,0.322056,0.320959,0.315016,0.316243
10.0,0.386652,0.385686,0.384256,0.380532,0.376869,0.381853
100.0,0.439263,0.442153,0.44369,0.445027,0.447404,0.452803
1000.0,0.465164,0.465578,0.466831,0.469283,0.477355,0.488228
10000.0,0.395788,0.39685,0.397494,0.397743,0.389007,0.38243


In [20]:
benchmark_df.to_excel('SVMRegressorBenchmark.xlsx')

## Model Performance Improvement
Once the model has not a good performance, we are looking methods and techniques to improve the performance

##### **Kernel Trick**
SVM can model non-linear relationships between features by using the kernel trick. The default kernel is rbf (Radial Basis Function), but you can experiment with other kernels like:

- Linear Kernel: If your data is linearly separable, the linear kernel might be the best choice.
- Polynomial Kernel: Captures polynomial relationships between data points. You can control the degree of the polynomial to fit higher-order relationships.
- Sigmoid Kernel: Similar to a neural network activation function, this kernel maps data into a hyperbolic tangent space.

Action: Try different kernels and see which works best for your data. For non-linear data, rbf is usually a good default, but for linear data, a linear kernel might perform better.

In [None]:
def train_evaluate_svr(regularization, tolerance, kernel, X_train, Y_train, X_test, Y_test):
    
    svr = SVR(kernel=kernel, C=regularization, epsilon=tolerance)
    svr.fit(X_train, Y_train)
    y_pred = svr.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    mae = mean_absolute_error(Y_test, y_pred)
    r2 = r2_score(Y_test, y_pred)
    
    return {
        'kernel': kernel,
        'regularization': regularization, 
        'tolerance': tolerance, 
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2
    }

param_combinations = [
    (regularization, tolerance, kernel) 
    for regularization in [0.1, 1, 10, 100, 1000, 10000]
    for tolerance in [0.01, 0.05, 0.1, 0.2, 0.5, 1]
    for kernel in ['rbf', 'linear', 'poly']
]

total_tasks = len(param_combinations)

benchmark = Parallel(n_jobs=6) (
    delayed(train_evaluate_svr) (reg, tol, kernel, X_train, Y_train, X_test, Y_test)
    for reg, tol, kernel in param_combinations
)

benchmark_df = pd.DataFrame(benchmark)

In [None]:
benchmark_df.to_excel('SVMRegressor_KernelTrick_Benchmark.xlsx')

In [None]:
benchmark_df\
    [benchmark_df.kernel == 'rbf']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

In [None]:
benchmark_df\
    [benchmark_df.kernel == 'linear']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

In [None]:
benchmark_df\
    [benchmark_df.kernel == 'poly']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

##### **Feature Scaling**
SVM is sensitive to the scale of the features. Features with larger numerical ranges dominate the decision boundary, so proper scaling is essential. You are already using StandardScaler, but you can also try:

- MinMaxScaler: Rescales features into a range, typically [0, 1]. This might work better if your features have different scales.
- RobustScaler: This is more robust to outliers because it scales the data based on the median and the interquartile range instead of the mean and standard deviation.

Action: Experiment with different scalers and check how each affects the performance.

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, 
    test_size = 0.3, 
    random_state = 11
)

In [None]:
def train_evaluate_svr(regularization, tolerance, scaler, X_train, Y_train, X_test, Y_test):
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    svr = SVR(kernel='rbf', C=regularization, epsilon=tolerance)
    svr.fit(X_train, Y_train)

    y_pred = svr.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    mae = mean_absolute_error(Y_test, y_pred)
    r2 = r2_score(Y_test, y_pred)
    
    return {
        'scaler': type(scaler).__name__,
        'regularization': regularization, 
        'tolerance': tolerance, 
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2
    }

In [None]:
param_combinations = [
    (regularization, tolerance, scaler) 
    for regularization in [0.1, 1, 10, 100, 1000, 10000]
    for tolerance in [0.01, 0.05, 0.1, 0.2, 0.5, 1]
    for scaler in [StandardScaler(), MinMaxScaler(), RobustScaler()]
]

In [None]:
benchmark = Parallel(n_jobs=6) (
    delayed(train_evaluate_svr) (reg, tol, scaler, X_train, Y_train, X_test, Y_test)
    for reg, tol, scaler in param_combinations
)

benchmark_df = pd.DataFrame(benchmark)

In [None]:
benchmark_df.to_excel('SVMRegressor_FeatureScaler_Benchmark.xlsx')

In [None]:
benchmark_df\
    [benchmark_df.scaler == 'StandardScaler']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

In [None]:
benchmark_df\
    [benchmark_df.scaler == 'RobustScaler']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

In [None]:
benchmark_df\
    [benchmark_df.scaler == 'MinMaxScaler']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

#### Handling Outliers
SVM is highly sensitive to outliers, as they can significantly influence the decision boundary and margins. Outliers can distort the hyperplane, leading to poor generalization.

- Outlier Detection: Before training the model, perform outlier detection (e.g., using Z-scores, IQR, or visualizations like box plots) and remove or adjust them.
- Adjust Epsilon: If your model has many outliers, you may want to increase epsilon to create a wider margin and minimize the effect of outliers.

Action: Identify and handle outliers by removing or transforming them to see if performance improves.