# Support Vector Regressor MODEL


In [6]:
import pandas as pd
import numpy as np

# For imports
from notebooks import utility
import importlib

# For optimization: enable the Intel(R) Extension for Scikit-learn.
# The patch is applied here, ahead of the `sklearn` imports done further down in
# the notebook — NOTE(review): the sklearnex docs ask for patching before
# scikit-learn is imported, so keep this call in the first cell.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Data import
Let's import the data that was previously cleaned

In [7]:
# Feature matrices are kept as DataFrames.
X_train = pd.read_csv("../DWMProjectData/formodel/X_train.csv")
X_valid = pd.read_csv("../DWMProjectData/formodel/X_valid.csv")
X_test = pd.read_csv("../DWMProjectData/formodel/X_test.csv")

# Targets are flattened to 1-dimensional arrays as soon as they are read;
# this avoids a warning during model building.
y_train = np.ravel(pd.read_csv("../DWMProjectData/formodel/y_train.csv"))
y_valid = np.ravel(pd.read_csv("../DWMProjectData/formodel/y_valid.csv"))
y_test = np.ravel(pd.read_csv("../DWMProjectData/formodel/y_test.csv"))

## Scale data
For SVR, scaling the data leads to much better results, although it is not strictly required. The reasons for this can be found [here](https://www.baeldung.com/cs/svm-feature-scaling) and [here](https://scikit-learn.org/stable/modules/svm.html) (they refer to SVM, but the reasons are the same for regression).

In [8]:
# Reload the helper module first so that edits made to utility.py are picked up,
# then call the function through the module object. A name bound with
# `from ... import` before the reload would keep pointing at the old function.
importlib.reload(utility)
# NOTE: this rebinds X_train / X_valid / X_test to their scaled versions, so
# re-running the cell would pass already-scaled data to `scale` again; re-run
# the data-import cell first if this cell has to be executed a second time.
X_train, X_valid, X_test = utility.scale(X_train, X_valid, X_test)

## Score function

I defined the score functions used for the regression. To keep the notebook clearer, I wrote the function `print_metrics` in the file `utility.py`. In particular, it prints the following values to compare models:
- mean absolute error
- mean squared error
- $r^2$, where the best score is 1, good is above 0.7
- explained variance score, where the best score is 1

In [9]:
# Reload before binding, so that `print_metrics` refers to the current version
# of the function in utility.py (reloading after a `from ... import` leaves the
# old function object bound to the name).
importlib.reload(utility)
# Ending the cell on an assignment also stops it from echoing the module repr,
# which contains the absolute local path of utility.py.
print_metrics = utility.print_metrics

<module 'notebooks.utility' from 'C:\\Users\\marco\\Documents\\UNI\\Y3\\DataWebMining\\project\\DWMProject\\notebooks\\utility.py'>

## Model building

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVR

# Search space: every kernel is combined with every value of C.
param_grid = dict(
    kernel=["linear", "poly", "rbf", "sigmoid"],
    C=[0.1, 1, 10, 100],
)
model_base = SVR(verbose=True)

# Exhaustive search over the grid on a single worker, logging each fold.
# (RandomizedSearchCV(model_base, param_grid, n_jobs=1) is the alternative
# that was tried here.)
model_fitted = GridSearchCV(
    estimator=model_base,
    param_grid=param_grid,
    n_jobs=1,
    verbose=4,
)
model_fitted.fit(X_train, y_train)
print(f"Best params are {model_fitted.best_params_}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ..............C=0.1, kernel=linear;, score=0.001 total time=   6.5s
[CV 2/5] END .............C=0.1, kernel=linear;, score=-0.001 total time=   5.6s
[CV 3/5] END ..............C=0.1, kernel=linear;, score=0.002 total time=   5.5s
[CV 4/5] END ..............C=0.1, kernel=linear;, score=0.000 total time=   5.0s
[CV 5/5] END ..............C=0.1, kernel=linear;, score=0.002 total time=   5.3s
[CV 1/5] END ...............C=0.1, kernel=poly;, score=-0.256 total time=   4.3s
[CV 2/5] END ...............C=0.1, kernel=poly;, score=-0.324 total time=   5.3s
[CV 3/5] END ...............C=0.1, kernel=poly;, score=-3.852 total time=   5.3s
[CV 4/5] END ............C=0.1, kernel=poly;, score=-6900.686 total time=   4.2s
[CV 5/5] END ................C=0.1, kernel=poly;, score=0.009 total time=   4.9s
[CV 1/5] END .................C=0.1, kernel=rbf;, score=0.000 total time=   2.5s
[CV 2/5] END .................C=0.1, kernel=rbf;

## Model re-building with best parameters + Metrics

In [None]:
# Rebuild the SVR with the hyper-parameters selected by the grid search
# (the original placeholder text here was not valid Python).
model_final = SVR(**model_fitted.best_params_)

# Stack training and validation data row-wise for the final fit.
# np.concatenate is used instead of pd.concat: the targets are 1-D NumPy arrays
# (np.ravel output), which pd.concat does not accept, and pd.concat also needs
# its inputs wrapped in a list rather than passed as separate arguments.
X_train_n = np.concatenate([X_train, X_valid], axis=0)
y_train_n = np.concatenate([y_train, y_valid])
model_final.fit(X_train_n, y_train_n)

# Predict on a plain array too, so fit and predict receive the same input type.
print_metrics(y_test, model_final.predict(np.asarray(X_test)))