In [1]:

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.svm import SVR

# Random seed constant for reproducibility; pass it as `random_state=RSEED` to stochastic steps.
RSEED = 42


Introduction:

Support Vector Machines (SVM) can be used for regression tasks as well as classification tasks. When used for regression, it's often referred to as Support Vector Regression (SVR).

The primary goal of SVR is to find a function that approximates the mapping from the input variables to the continuous output variable such that as many predictions as possible deviate from the true values by no more than a fixed tolerance (epsilon, ε), while keeping the function as flat (simple) as possible. Deviations smaller than ε are ignored; only data points lying outside this margin contribute to the loss.

Key components of SVR for regression:

    Kernel Trick: SVR, like SVM for classification, can use various kernel functions (e.g., linear, polynomial, radial basis function) to map the input data into a higher-dimensional feature space. The choice of kernel can have a significant impact on the model's performance.

    Margin: SVR fits a tube (the epsilon-tube) around the regression function. The width of this tube is controlled by a hyperparameter, ε (epsilon). Data points falling within this tube are considered to have zero error; only points outside it are penalized.

    Loss Function: SVR uses a loss function that penalizes data points based on their deviation from the regression line while allowing for a margin of error (ε). Common loss functions include the epsilon-insensitive loss and the squared ε-insensitive loss.

    Regularization: Like SVM, SVR includes a regularization parameter, typically denoted as C, which controls the trade-off between keeping the regression function flat (simple) and penalizing deviations larger than ε. A smaller C tolerates more points outside the tube in favour of a simpler function, while a larger C penalizes those errors more heavily and increases the risk of overfitting.

    Hyperparameter Tuning: The kernel, its parameters, ε, and C are all hyperparameters that need to be tuned to optimize the SVR model for a specific dataset.

In [5]:
# Load the wrangled dataset from its project-relative CSV path.
wrangled_csv = "data/wrangled_data.csv"
final_data = pd.read_csv(wrangled_csv)

In [6]:
# Remove the `date_caught` column. A non-mutating `drop` with errors='ignore'
# keeps this cell re-runnable: `del final_data['date_caught']` raises KeyError
# when the cell is executed a second time against the same kernel state.
final_data = final_data.drop(columns=['date_caught'], errors='ignore')

In [20]:
# Select X and y features: every column except the target is used as a predictor.
target_col = 'capture_number'
X = final_data.drop(columns=[target_col])
y = final_data[target_col]

# Splitting the dataset (70 % train / 30 % test).
# The notebook-wide RSEED constant replaces the hard-coded 42 so the seed is
# configured in exactly one place.
# NOTE(review): stratify=y treats every distinct target value as a class, which
# is unusual for a regression target and fails if a value occurs only once —
# confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RSEED,
    shuffle=True,
    stratify=y,
)

# Check the shape of the data sets
for label, part in (
    ("X_train:", X_train),
    ("y_train:", y_train),
    ("X_test:", X_test),
    ("y_test:", y_test),
):
    print(label, part.shape)

X_train: (12643, 40)
y_train: (12643,)
X_test: (5419, 40)
y_test: (5419,)


In [7]:
# Select X and y features: restrict the predictors to the week identifier,
# the capture site and the capture-site category columns.
# NOTE(review): this cell rebinds X, y and the train/test splits created by any
# earlier split cell, and the saved execution counts are out of order — run
# Restart & Run All to confirm which feature set the scores below belong to.
feature_cols = [
    'year_woy',
    'capture_site',
    'cs_category_0',
    'cs_category_1',
    'cs_category_2',
    'cs_category_3',
    'cs_category_4',
]
X = final_data[feature_cols]
y = final_data['capture_number']

# Splitting the dataset (70 % train / 30 % test).
# The notebook-wide RSEED constant replaces the hard-coded 42 so the seed is
# configured in exactly one place.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RSEED,
    shuffle=True,
    stratify=y,
)

# Check the shape of the data sets
for label, part in (
    ("X_train:", X_train),
    ("y_train:", y_train),
    ("X_test:", X_test),
    ("y_test:", y_test),
):
    print(label, part.shape)

X_train: (12643, 7)
y_train: (12643,)
X_test: (5419, 7)
y_test: (5419,)


In [12]:
def evaluate_rmse(y_true, y_pred, ndigits=3):
    """Print and return the RMSE (root mean squared error) of y_pred in relation to y_true.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; its length is reported as the number of predictions.
    ndigits : int, default 3
        Number of decimals used when printing the RMSE. The returned value is
        not rounded.

    Returns
    -------
    float
        The unrounded RMSE.
    """
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6. Taking the square root of the MSE gives the same
    # result for single-output targets and works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    print("Number of predictions: ", len(y_pred))
    print("RMSE: ", round(rmse, ndigits))
    return rmse

In [14]:
# Build the support vector regressor: polynomial kernel, C = 1.0, epsilon = 0.2.
# NOTE(review): no feature-scaling step is visible in this notebook; SVR is
# usually sensitive to feature scale — confirm the data is already scaled.
svr = SVR(
    kernel='poly',
    C=1.0,
    epsilon=0.2,
)

In [21]:
# Train the regressor on the training split.
svr.fit(X=X_train, y=y_train)

In [22]:
# Predict the target for the held-out test split.
y_pred = svr.predict(X=X_test)

In [23]:
# Score the test-set predictions; prints the prediction count and RMSE.
error = evaluate_rmse(y_true=y_test, y_pred=y_pred)

Number of predictions:  5419
RMSE:  4.106


The RMSE of this untuned SVR doesn't look that good, so it makes sense to try out other models.