# 8. Support vector machines

The first step is to import the packages and load the data.

In [1]:

import pyreadr
import pandas as pd
import numpy as np
from datetime import datetime as dt
from sklearn import svm
from sklearn.metrics import mean_squared_error

In [3]:
# Load the dataset first (replace the placeholder with the path to data_ml.RData):
# data_ml = pyreadr.read_r(<path to data_ml.RData>)["data_ml"]

# Parse the dates, keep the observations dated strictly after 1999-12-31 and
# strictly before 2019-01-01, then order the rows by stock and date.
# The steps are chained so that each one returns a new frame: nothing is sorted
# in place on a filtered selection, and re-running the cell gives the same result.
data_ml = (
    data_ml
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))  # existing column keeps its position
    .loc[lambda frame: (frame["date"] > "1999-12-31") & (frame["date"] < "2019-01-01")]
    .sort_values(by=["stock_id", "date"])
)

We also copy/paste some chunks from Chapter 2.


In [4]:
# --- Stocks with the largest number of observations ---
stock_ids = data_ml.stock_id.unique()
# Number of rows per stock, ordered by stock id.
stock_days = data_ml.stock_id.value_counts().sort_index()
# Pick the ids from the counts' own index: ids and counts are then aligned by
# construction, instead of relying on data_ml being sorted by stock_id so that
# unique() and sort_index() happen to share the same order.
stock_ids_short = stock_days.index[stock_days == stock_days.max()].to_numpy()

# --- Wide matrix of 1-month returns (one row per date, one column per stock) ---
returns_short = data_ml.loc[
    data_ml["stock_id"].isin(stock_ids_short), ["date", "stock_id", "R1M_Usd"]
]
returns = returns_short.pivot_table(index=["date"], columns="stock_id", values="R1M_Usd")

# --- Feature names ---
# Columns at positions 2..94 of data_ml (presumably the predictors; confirm
# against the dataset layout). Computed before the label columns below are added.
features = list(data_ml.columns[2:95])
features_short = ["Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd",
                  "Ocf", "Pb", "Vol1Y_Usd"]

# --- Categorical labels: True when the return exceeds the same-date median ---
data_ml["R1M_Usd_C"] = data_ml["R1M_Usd"] > data_ml.groupby("date")["R1M_Usd"].transform("median")
data_ml["R12M_Usd_C"] = data_ml["R12M_Usd"] > data_ml.groupby("date")["R12M_Usd"].transform("median")

# --- Chronological train/test split (labels are created first so both samples carry them) ---
separation_date = dt.strptime("2014-01-15", "%Y-%m-%d")
training_sample = data_ml[data_ml["date"] < separation_date]
testing_sample = data_ml[data_ml["date"] >= separation_date]

And also from Chapter 6 (for data formats).


In [5]:
# Keep only the extreme observations of the training sample: rows whose 1-month
# return is strictly above the 80% quantile or strictly below the 20% quantile.
top20 = training_sample.R1M_Usd.quantile(0.8)
bottom20 = training_sample.R1M_Usd.quantile(0.2)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_features = pd.concat([
    training_sample[training_sample.R1M_Usd > top20],
    training_sample[training_sample.R1M_Usd < bottom20],
])
# Restore the stock/date ordering after stacking the two tails.
train_features = train_features.sort_values(by=['stock_id', 'date'])
# Predictors (short feature list) and numerical label used to fit the models.
train_features_xgb = train_features[features_short]
train_label_xgb = train_features.R1M_Usd

The order of the above operations matters: the categorical variables such as R1M_Usd_C must be created before the data is split, so that they are present in both the training and testing samples.

In [6]:
# Support vector regression on the short feature list.
fit_svm = svm.SVR(
    kernel="rbf",   # Kernel: one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed', or a callable
    epsilon=0.1,    # Epsilon in the epsilon-SVR model
    gamma=0.5,      # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    C=0.1,          # Regularization parameter; regularization strength is inversely proportional to C
)
# Only the first 1000 rows of the filtered training data are used for fitting.
fit_svm.fit(train_features_xgb[:1000], train_label_xgb[:1000])

test_feat_short = testing_sample[features_short]
# Predict once and reuse the result for both metrics, instead of running the
# prediction over the whole testing sample twice.
svm_test_pred = fit_svm.predict(test_feat_short)
# mean_squared_error expects (y_true, y_pred); the value is the same either way.
print("MSE:", mean_squared_error(testing_sample.R1M_Usd, svm_test_pred))
# Hit ratio: share of test rows where prediction and realized return have the same sign.
print("Hit ratio:", (svm_test_pred * testing_sample.R1M_Usd > 0).mean())

MSE: 0.03720290963990272
Hit ratio: 0.5270339562443026


The results are slightly better than those of the boosted trees. All parameters are completely arbitrary, especially the choice of the kernel. We finally turn to a classification example.


In [7]:
# Support vector classification of the R1M_Usd_C flag, using the full feature list.
svc_settings = dict(
    kernel="sigmoid",  # Kernel: one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed', or a callable
    gamma=0.5,         # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    coef0=0.3,         # Independent term of the kernel; only significant for 'poly' and 'sigmoid'
    C=0.2,             # Regularization parameter; regularization strength is inversely proportional to C
)
fit_svm_C = svm.SVC(**svc_settings)

# Fit on the first 1000 rows of the training sample.
svc_train_X = training_sample[features][:1000]
svc_train_y = training_sample.R1M_Usd_C[:1000]
fit_svm_C.fit(svc_train_X, svc_train_y)

# Accuracy: share of test rows whose predicted class matches the realized one.
svc_test_pred = fit_svm_C.predict(testing_sample[features])
print("Accuracy:", (svc_test_pred == testing_sample.R1M_Usd_C).mean())

Accuracy: 0.49628247493163175


Both the small training sample and the arbitrariness in our choice of the parameters may explain why the predictive accuracy is so poor. 
