In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from multiprocessing import Value
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from joblib import Parallel, delayed
import os
import glob

In [3]:
folder_path = "../../Data/"

file_list = glob.glob(os.path.join(folder_path, "*.parquet"))

df = [pd.read_parquet(file) for file in file_list]
df = pd.concat(df, ignore_index=True)

In [4]:
df, _ = train_test_split(df, train_size=50000, stratify=df['time_to_stop_activity'], random_state=11)

In [5]:
df = df.drop(
    columns = [
        "user", 
        'project', 
        'current_month', 
        'turnover_num',
        'turnover',
        'betweenness_centrality_12_intercept',
        'betweenness_centrality_12_slope', 
        'betweenness_centrality_12_std_dev',
        'betweenness_centrality_3_intercept', 
        'betweenness_centrality_3_slope',
        'betweenness_centrality_3_std_dev',
        'betweenness_centrality_6_intercept', 
        'betweenness_centrality_6_slope',
        'betweenness_centrality_6_std_dev',
        'betweenness_centrality_9_intercept', 
        'betweenness_centrality_9_slope',
        'betweenness_centrality_9_std_dev'
    ], axis = 1
)

In [6]:
df = df.astype(np.float64)
df = df.replace([np.inf], np.nan)

for column in df.columns:
    max_value = df[column].max(skipna=True) 
    df[column] = df[column].fillna(max_value)

df = df.replace([-np.inf], np.nan)

for column in df.columns:
    max_value = df[column].min(skipna=True) 
    df[column] = df[column].fillna(max_value)

In [7]:
for column in df.columns:
    max_value = df[column].max(skipna=True) 
    df[column] = df[column].fillna(max_value)

In [8]:
imputer = SimpleImputer(strategy='mean') 
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [9]:
x = df.drop(columns = ['time_to_stop_activity'], axis = 1).values
y = df['time_to_stop_activity'].values

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, 
    test_size = 0.3, 
    random_state=11
)

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
def train_evaluate_svr(regularization, tolerance, X_train, Y_train, X_test, Y_test):
    svr = SVR(kernel='rbf', C=regularization, epsilon=tolerance)
    svr.fit(X_train, Y_train)
    y_pred = svr.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    mae = mean_absolute_error(Y_test, y_pred)
    r2 = r2_score(Y_test, y_pred)
    
    print(f'{regularization} - {tolerance} finished')
    
    return {
        'regularization': regularization, 
        'tolerance': tolerance, 
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2
    }

In [13]:
param_combinations = [
    (regularization, tolerance) 
    for regularization in [0.1, 1, 10, 100, 1000, 10000]
    for tolerance in [0.01, 0.05, 0.1, 0.2, 0.5, 1]
]

In [14]:
benchmark = Parallel(n_jobs=6) (
    delayed(train_evaluate_svr) (reg, tol, X_train, Y_train, X_test, Y_test)
    for reg, tol in param_combinations
)

benchmark_df = pd.DataFrame(benchmark)

In [15]:
benchmark_df\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,13.390178,13.280265,13.160565,12.939294,12.402443,11.890757
1.0,12.479996,12.37104,12.262814,12.070211,11.604842,11.19972
10.0,12.075603,11.978575,11.881581,11.702694,11.286046,10.960991
100.0,11.928595,11.842577,11.745101,11.57824,11.186954,10.889238
1000.0,12.387036,12.311828,12.221274,12.026256,11.558197,11.201904
10000.0,15.856629,15.563041,15.47254,14.54193,13.941658,12.875893


In [16]:
benchmark_df\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_absolute_error'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,1.302288,1.326845,1.361401,1.430852,1.640659,1.989418
1.0,1.263937,1.286779,1.320262,1.389101,1.598426,1.949192
10.0,1.244891,1.26664,1.29925,1.369076,1.579936,1.933765
100.0,1.244098,1.267537,1.30001,1.367794,1.577872,1.928193
1000.0,1.294939,1.288773,1.322311,1.390511,1.596273,1.941795
10000.0,1.671229,1.69022,1.676505,1.41846,1.588372,1.930504


In [17]:
benchmark_df\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='r2_score'
    )

tolerance,0.01,0.05,0.10,0.20,0.50,1.00
regularization,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.1,-0.092762,-0.083792,-0.074023,-0.055966,-0.012154,0.029605
1.0,-0.018483,-0.009591,-0.000759,0.01496,0.052938,0.086
10.0,0.01452,0.022438,0.030354,0.044952,0.078955,0.105482
100.0,0.026517,0.033537,0.041492,0.055109,0.087041,0.111338
1000.0,-0.010896,-0.004759,0.002631,0.018547,0.056745,0.085821
10000.0,-0.294047,-0.270088,-0.262702,-0.186755,-0.137768,-0.050792


In [18]:
benchmark_df.to_excel('SVMRegressorBenchmark.xlsx')

In [19]:
a

NameError: name 'a' is not defined

## Model Performance Improvement
Once the model has not a good performance, we are looking methods and techniques to improve the performance

##### **Kernel Trick**
SVM can model non-linear relationships between features by using the kernel trick. The default kernel is rbf (Radial Basis Function), but you can experiment with other kernels like:

- Linear Kernel: If your data is linearly separable, the linear kernel might be the best choice.
- Polynomial Kernel: Captures polynomial relationships between data points. You can control the degree of the polynomial to fit higher-order relationships.
- Sigmoid Kernel: Similar to a neural network activation function, this kernel maps data into a hyperbolic tangent space.

Action: Try different kernels and see which works best for your data. For non-linear data, rbf is usually a good default, but for linear data, a linear kernel might perform better.

In [None]:
def train_evaluate_svr(regularization, tolerance, kernel, X_train, Y_train, X_test, Y_test):
    
    svr = SVR(kernel=kernel, C=regularization, epsilon=tolerance)
    svr.fit(X_train, Y_train)
    y_pred = svr.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    mae = mean_absolute_error(Y_test, y_pred)
    r2 = r2_score(Y_test, y_pred)
    
    return {
        'kernel': kernel,
        'regularization': regularization, 
        'tolerance': tolerance, 
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2
    }

param_combinations = [
    (regularization, tolerance, kernel) 
    for regularization in [0.1, 1, 10, 100, 1000, 10000]
    for tolerance in [0.01, 0.05, 0.1, 0.2, 0.5, 1]
    for kernel in ['rbf', 'linear', 'poly']
]

total_tasks = len(param_combinations)

benchmark = Parallel(n_jobs=6) (
    delayed(train_evaluate_svr) (reg, tol, kernel, X_train, Y_train, X_test, Y_test)
    for reg, tol, kernel in param_combinations
)

benchmark_df = pd.DataFrame(benchmark)

In [None]:
benchmark_df.to_excel('SVMRegressor_KernelTrick_Benchmark.xlsx')

In [None]:
benchmark_df\
    [benchmark_df.kernel == 'rbf']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

In [None]:
benchmark_df\
    [benchmark_df.kernel == 'linear']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

In [None]:
benchmark_df\
    [benchmark_df.kernel == 'poly']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

##### **Feature Scaling**
SVM is sensitive to the scale of the features. Features with larger numerical ranges dominate the decision boundary, so proper scaling is essential. You are already using StandardScaler, but you can also try:

- MinMaxScaler: Rescales features into a range, typically [0, 1]. This might work better if your features have different scales.
- RobustScaler: This is more robust to outliers because it scales the data based on the median and the interquartile range instead of the mean and standard deviation.

Action: Experiment with different scalers and check how each affects the performance.

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, 
    test_size = 0.3, 
    random_state = 11
)

In [None]:
def train_evaluate_svr(regularization, tolerance, scaler, X_train, Y_train, X_test, Y_test):
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    svr = SVR(kernel='rbf', C=regularization, epsilon=tolerance)
    svr.fit(X_train, Y_train)

    y_pred = svr.predict(X_test)
    mse = mean_squared_error(Y_test, y_pred)
    mae = mean_absolute_error(Y_test, y_pred)
    r2 = r2_score(Y_test, y_pred)
    
    return {
        'scaler': type(scaler).__name__,
        'regularization': regularization, 
        'tolerance': tolerance, 
        'mean_squared_error': mse,
        'mean_absolute_error': mae,
        'r2_score': r2
    }

In [None]:
param_combinations = [
    (regularization, tolerance, scaler) 
    for regularization in [0.1, 1, 10, 100, 1000, 10000]
    for tolerance in [0.01, 0.05, 0.1, 0.2, 0.5, 1]
    for scaler in [StandardScaler(), MinMaxScaler(), RobustScaler()]
]

In [None]:
benchmark = Parallel(n_jobs=6) (
    delayed(train_evaluate_svr) (reg, tol, scaler, X_train, Y_train, X_test, Y_test)
    for reg, tol, scaler in param_combinations
)

benchmark_df = pd.DataFrame(benchmark)

In [None]:
benchmark_df.to_excel('SVMRegressor_FeatureScaler_Benchmark.xlsx')

In [None]:
benchmark_df\
    [benchmark_df.scaler == 'StandardScaler']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

In [None]:
benchmark_df\
    [benchmark_df.scaler == 'RobustScaler']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

In [None]:
benchmark_df\
    [benchmark_df.scaler == 'MinMaxScaler']\
    .pivot(
        index='regularization', 
        columns='tolerance', 
        values='mean_squared_error'
    )

#### Handling Outliers
SVM is highly sensitive to outliers, as they can significantly influence the decision boundary and margins. Outliers can distort the hyperplane, leading to poor generalization.

- Outlier Detection: Before training the model, perform outlier detection (e.g., using Z-scores, IQR, or visualizations like box plots) and remove or adjust them.
- Adjust Epsilon: If your model has many outliers, you may want to increase epsilon to create a wider margin and minimize the effect of outliers.

Action: Identify and handle outliers by removing or transforming them to see if performance improves.