# Chapter 8 Support vector machines
## 8.3 Practice

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [2]:
# Load data
DATA_PATH = "../data/raw/data_ml.csv"
data_ml = pd.read_csv(DATA_PATH)
data_ml['date'] = pd.to_datetime(data_ml['date'])
data_ml['R1M_Usd_C'] = (data_ml['R1M_Usd'] > 0).astype(int) # can be useful for classification tasks
data_ml.head()

Unnamed: 0,stock_id,date,Advt_12M_Usd,Advt_3M_Usd,Advt_6M_Usd,Asset_Turnover,Bb_Yld,Bv,Capex_Ps_Cf,Capex_Sales,...,Total_Debt,Total_Debt_Capital,Total_Liabilities_Total_Assets,Vol1Y_Usd,Vol3Y_Usd,R1M_Usd,R3M_Usd,R6M_Usd,R12M_Usd,R1M_Usd_C
0,13,2006-12-31,0.25,0.33,0.27,0.22,0.33,0.01,0.13,0.84,...,0.27,1.0,0.92,0.84,0.88,0.089,0.104,0.344,-0.041,1
1,13,2007-01-31,0.25,0.32,0.28,0.22,0.4,0.01,0.13,0.84,...,0.27,1.0,0.92,0.84,0.88,0.039,0.091,-0.011,-0.253,1
2,13,2007-02-28,0.26,0.3,0.3,0.22,0.15,0.01,0.13,0.84,...,0.27,1.0,0.92,0.87,0.88,-0.012,0.101,0.118,-0.366,0
3,17,2015-03-31,0.73,0.64,0.7,0.4,0.47,0.01,0.7,0.74,...,0.75,1.0,1.0,1.0,0.99,0.174,-0.144,-0.518,-0.376,1
4,17,2015-04-30,0.72,0.62,0.66,0.4,0.46,0.01,0.7,0.74,...,0.75,1.0,1.0,1.0,0.99,-0.106,-0.576,-0.535,-0.113,0


In [3]:
# Recreate variables done in notebook_5_tree_based_methods.ipynb

X = data_ml.iloc[:,3:95] # recall features/predictors, full sample
y = data_ml['R1M_Usd'] # recall label/Dependent variable, full sample

features = X.columns.values.tolist()
separation_date = pd.to_datetime('2013-01-01')
training_sample = data_ml.loc[data_ml['date'] < separation_date]
testing_sample = data_ml.loc[data_ml['date'] > separation_date]

# Classification labels: up (1) if return > 0 else 0
y_c_train = training_sample['R1M_Usd_C'].values # classification target
y_c_test = testing_sample['R1M_Usd_C'].values

# Regression labels
y_train = training_sample['R1M_Usd'].values # regression target
X_train = training_sample[features].values
X_test = testing_sample[features].values
y_test = testing_sample['R1M_Usd'].values

# Define features and separation mask
features_short = ["Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd"]
separation_mask = data_ml['date'] < separation_date

data_ml['R1M_Usd_quantile'] = data_ml.groupby('date')['R1M_Usd'].transform(         # creating quantile... 
        lambda x: pd.qcut(x, 100, labels=False, duplicates=('drop'), precision=50)) # ...for selecting extreme values

boolean_quantile=(data_ml.loc[separation_mask]['R1M_Usd_quantile'].        # boolean array for selecting rows
                  values<=0.2) | (data_ml.loc[separation_mask]['R1M_Usd_quantile'].values>=0.8) # selecting extreme values

train_features_xgb=training_sample.loc[boolean_quantile,features_short] # Independent variables
train_label_xgb=training_sample.loc[boolean_quantile,'R1M_Usd'] # Dependent variable
train_label_xgb_C = training_sample.loc[boolean_quantile, 'R1M_Usd_C'] # Dependent variable

In [4]:
# recall of some variables 
y = train_label_xgb.iloc[0:1000]                        # Train label
x = train_features_xgb.iloc[0:1000,]                    # Training features
test_feat_short=testing_sample[features_short]
y_c=train_label_xgb_C.iloc[0:1000]                      # Dependent variable

model_svm=svm.SVR(
    kernel='rbf',                                       # SVM kernel (or: linear, polynomial, sigmoid)
    C=0.1,                                              # Slack variable penalisation
    epsilon=0.1,                                        # Width of strip for errors
    gamma=0.5                                           # Constant in the radial kernel
    )
fit_svm=model_svm.fit(x, y)                             # Fitting the model

mse = np.mean((fit_svm.predict(test_feat_short) - y_test)**2)
print(f'MSE: {mse}')

hitratio = np.mean(fit_svm.predict(test_feat_short) * y_test > 0)
print(f'Hit Ratio: {hitratio}')

MSE: 0.03446691557031802
Hit Ratio: 0.5060900917558416


In [5]:
model_svm_c=svm.SVC(
    kernel='sigmoid',
    C=0.2,                          # Slack variable penalisation
    gamma=0.5,                      # Parameter in the sigmoid kernel
    coef0=0.3                       # Parameter in the sigmoid kernel
    )

fit_svm_c=model_svm_c.fit(x,y_c)    # Fitting the model

hitratio = np.mean(fit_svm_c.predict(test_feat_short) == y_c_test)
print(f'Hit Ratio: {hitratio}')

Hit Ratio: 0.558728361106617


## 8.4 Coding exercices
#### 1. From the simple example shown above, extend SVM models to other kernels and discuss the impact on the fit.

In [6]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
results = []

for kernel in kernels:
    model_svm_c = svm.SVC(
        kernel=kernel
    )

    fit_svm_c = model_svm_c.fit(x, y_c)    # Fitting the model
    predictions = fit_svm_c.predict(test_feat_short)
    
    hitratio = np.mean(predictions == y_c_test)
    results.append({'kernel': kernel, 'hitratio': hitratio})
    print(f"Hit Ratio for kernel '{kernel}': {hitratio:.4f}")

Hit Ratio for kernel 'linear': 0.5587
Hit Ratio for kernel 'poly': 0.5451
Hit Ratio for kernel 'rbf': 0.5542
Hit Ratio for kernel 'sigmoid': 0.5260


We can observe that the simplest model (linear) is the one holding the best with a quite good hit ratio, followed by rbf (most complex one) and poly. The sigmoid did not perform well.

#### 2. Train a vanilla SVM model with labels being the 12-month forward (i.e., future) return and evaluate it on the testing sample. Do the same with a simple random forest. Compare

In [9]:
# SVM Classifier
model_svm_cls = svm.SVC(
    kernel='rbf',
    C=0.1,
    gamma=0.5
)
fit_svm_cls = model_svm_cls.fit(x, y_c)

svm_pred = fit_svm_cls.predict(test_feat_short)
svm_hitratio = np.mean(svm_pred == y_c_test)

print(f'SVM Hit Ratio: {svm_hitratio:.4f}')

# Random Forest Classifier
model_rf_cls = RandomForestClassifier(
    n_estimators=200,
    criterion='gini',
    max_samples=1000,
    random_state=42
)
fit_rf_cls = model_rf_cls.fit(x, y_c)

rf_pred = fit_rf_cls.predict(test_feat_short)
rf_hitratio = np.mean(rf_pred == y_c_test)

print(f'Random Forest Hit Ratio: {rf_hitratio:.4f}')

SVM Hit Ratio: 0.5587
Random Forest Hit Ratio: 0.5093
