In [1]:
import pyreadr 
import pandas as pd
from datetime import datetime as dt
from sklearn import svm
import os

In [2]:
# --- Load the panel and restrict it to the study window ----------------------
data_path = os.path.join(os.path.expanduser('~'), 'Downloads', 'data_ml.RData')
data_ml = pyreadr.read_r(data_path)["data_ml"]
data_ml['date'] = pd.to_datetime(data_ml['date'])

in_window = (data_ml['date'] > "1999-12-31") & (data_ml['date'] < "2019-01-01")
# Non-inplace sort_values returns a new DataFrame, so the column assignment
# below writes to a frame we own rather than to a filtered slice (the original
# inplace sort + assignment on the `.loc` result is the pattern that raises
# SettingWithCopyWarning).
data_ml = data_ml.loc[in_window].sort_values(by=['stock_id', 'date'])

# Binary label: True when the 1-month return is above that date's
# cross-sectional median.
median_by_date = data_ml.groupby('date')['R1M_Usd'].transform('median')
data_ml['R1M_Usd_C'] = data_ml['R1M_Usd'] > median_by_date

# --- Feature lists -------------------------------------------------------------
# NOTE(review): positional slice; assumes columns 2..94 of the RData frame are
# the predictors -- confirm against the data set's column layout.
features = list(data_ml.columns[2:95])
features_short = ["Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd",
                  "Ocf", "Pb", "Vol1Y_Usd"]

# --- Stocks with a complete history and their return matrix --------------------
stock_ids = data_ml['stock_id'].unique()
stock_days = data_ml['stock_id'].value_counts().sort_index()  # rows per stock
# Pick the ids straight from the counts' own index so the selection does not
# depend on `stock_ids` happening to be in the same (sorted) order.
stock_ids_short = stock_days.index[stock_days == stock_days.max()].to_numpy()
returns_short = data_ml.loc[data_ml['stock_id'].isin(stock_ids_short),
                            ['date', 'stock_id', 'R1M_Usd']]
returns = returns_short.pivot_table(index='date', columns='stock_id', values='R1M_Usd')

# --- Chronological train / test split --------------------------------------------
separation_date = dt.strptime("2014-01-15", "%Y-%m-%d")
training_sample = data_ml[data_ml['date'] < separation_date]
testing_sample = data_ml[data_ml['date'] >= separation_date]

# --- Training set restricted to extreme returns ------------------------------------
# Keep only training rows whose 1-month return lies strictly above the 80%
# quantile or strictly below the 20% quantile of the training returns.
top20 = training_sample['R1M_Usd'].quantile(0.8)
bottom20 = training_sample['R1M_Usd'].quantile(0.2)
extreme_returns = (training_sample['R1M_Usd'] > top20) | (training_sample['R1M_Usd'] < bottom20)
train_features = training_sample[extreme_returns]
# The "_xgb" suffix is kept because later cells refer to these names; in this
# section the frames are fed to support-vector models.
train_features_xgb = train_features[features_short]
train_label_xgb = train_features['R1M_Usd']

In [3]:
# Support-vector regression on the short feature list.
#   kernel  : radial basis function
#   C       : slack variable penalisation
#   epsilon : width of the strip inside which errors are not penalised
#   gamma   : constant in the radial kernel
model_svm = svm.SVR(kernel='rbf', C=0.1, epsilon=0.1, gamma=0.5)

# Fit on the first 1000 training rows only.
# NOTE(review): presumably a subsample to keep the fit fast; the rows are
# ordered by stock_id then date, so this is not a random subsample -- confirm
# that is intended.
fit_svm = model_svm.fit(train_features_xgb.head(1000), train_label_xgb.head(1000))

# Mean squared error of the predictions on the test sample.
test_feat_short = testing_sample[features_short]
testing_sample['R1M_Usd'].sub(fit_svm.predict(test_feat_short)).pow(2).mean()


0.03720290963990272

In [4]:
# Hit ratio: share of test rows where the predicted and the realised return
# have a strictly positive product, i.e. the same (non-zero) sign.
print(
    "Hit ratio:",
    (testing_sample['R1M_Usd'] * fit_svm.predict(test_feat_short)).gt(0).mean(),
)

Hit ratio: 0.5270339562443026


In [5]:
# Support-vector classification of the above-median-return label, using the
# full feature list.
#   kernel : sigmoid
#   C      : slack variable penalisation
#   coef0  : parameter in the sigmoid kernel
#   gamma  : constant in the sigmoid kernel
model_svm_C = svm.SVC(kernel='sigmoid', C=0.2, coef0=0.3, gamma=0.5)

# Fit on the first 1000 training rows only.
# NOTE(review): rows are ordered by stock_id then date, so this is not a
# random subsample -- confirm that is intended.
fit_svm_C = model_svm_C.fit(
    training_sample[features].head(1000),
    training_sample['R1M_Usd_C'].head(1000),
)

# Accuracy on the test sample: share of rows where the predicted class equals
# the realised label.
testing_sample['R1M_Usd_C'].eq(fit_svm_C.predict(testing_sample[features])).mean()


0.49628247493163175