In [86]:
import pandas as pd
from functools import reduce

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, SVR

In [87]:
# Directory that holds the feature CSVs. File names are appended to it with
# plain string concatenation when the tables are loaded, so the trailing
# slash is required.
feature_dir = 'data/features/'

In [88]:
def run_experiment(model, X, y, n_splits=5):
    """Cross-validate ``model`` with K-fold splits and print the fold scores.

    For every fold the model is fit on the training rows and evaluated with
    ``model.score`` on the held-out rows. The list of fold scores and their
    arithmetic mean are printed; nothing is returned.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``. The same
        instance is fit again on every fold.
    X : pandas.DataFrame
        Feature rows; indexed positionally through ``.iloc``.
    y : pandas.Series
        Targets, paired with ``X`` by row position (also via ``.iloc``).
    n_splits : int, default 5
        Number of folds handed to ``KFold``. All other ``KFold`` options are
        left at their defaults (no shuffling).
    """
    scores = []
    kf = KFold(n_splits=n_splits)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
    print('Scores', scores)
    print('Average score', sum(scores) / len(scores))

In [89]:
# Resolve the three input files under feature_dir, then read each into its
# own DataFrame: Twitter features, news features, and the stock table.
twitter_csv = feature_dir + 'twitter_features.csv'
news_csv = feature_dir + 'news_features.csv'
stock_csv = feature_dir + 'stock_features.csv'

df_twitter = pd.read_csv(twitter_csv)
df_news = pd.read_csv(news_csv)
df_out = pd.read_csv(stock_csv)

In [90]:
# Baseline support-vector classifier with C=1.0; this is the model handed to
# the first run_experiment call.
clf = SVC(C=1.0)
# Support-vector regressor with the same C.
# NOTE(review): clf2 is not referenced by any other cell shown here — confirm
# it is still needed.
clf2 = SVR(C=1.0)

In [91]:
# Target frame: the stock table (the label column is selected from it later).
df_y = df_out

# Attach the Twitter and news features to the target's dates, one table at a
# time. Left-joining onto df_y's 'Date' column yields one feature row per
# target row, in the target's row order, so features and labels stay lined up
# when they are later paired by position. An outer join of the two feature
# tables alone would not be tied to df_y's dates: it can add, omit or reorder
# dates relative to the target.
df_X = reduce(
    lambda merged, features: pd.merge(merged, features, on='Date', how='left'),
    [df_twitter, df_news],
    df_y[['Date']],
)

# A date that occurs more than once in a feature table would multiply rows in
# the join and silently break the row-for-row pairing with df_y.
assert len(df_X) == len(df_y), (
    'duplicate dates in a feature table: df_X no longer lines up with df_y'
)

In [92]:
# Display the target frame (the stock table) for a visual sanity check.
df_y

Unnamed: 0,Date,Change VIX,Lag 2 Change VIX,Lag 2 Significant VIX,Change SPY,Lag 2 Change SPY,Lag 2 Significant SPY
0,2015-01-02,0.116279,0.144707,1,-0.007741,-0.020787,-1
1,2015-01-05,0.080518,0.050026,1,-0.010708,-0.013469,-1
2,2015-01-06,0.059406,-0.118052,-1,-0.010188,0.009501,0
3,2015-01-07,-0.008854,-0.184119,-1,-0.003315,0.024724,1
4,2015-01-08,-0.110174,0.005020,0,0.012859,0.001961,0
...,...,...,...,...,...,...,...
499,2016-12-23,0.005300,0.044815,1,-0.000754,0.005057,0
500,2016-12-27,0.077329,0.072594,1,0.002617,-0.006814,0
501,2016-12-28,-0.030179,0.110177,1,0.002433,-0.008121,0
502,2016-12-29,0.105971,0.069962,1,-0.009225,0.002495,0


In [93]:
# Build the model inputs. Features and target are paired by row position, so
# both start from a fresh 0..n-1 index.
X = df_X.reset_index(drop=True).drop(columns=['Date'])
y = df_y.reset_index(drop=True)['Lag 2 Significant VIX']

# Missing feature values are replaced with the column mean.
# NOTE(review): the mean is taken over every row, including rows that later
# end up in a held-out fold — consider imputing inside each fold instead.
X = X.fillna(X.mean())

# The target is a class label (-1 / 0 / 1), so filling gaps with its mean
# would invent a fractional pseudo-class. Rows without a label are dropped
# from both X and y instead; the mask is applied positionally.
has_label = y.notna().to_numpy()
X = X.loc[has_label].reset_index(drop=True)
y = y.loc[has_label].reset_index(drop=True)

In [94]:
# Display the prepared feature matrix ('Date' dropped, gaps mean-filled).
X

Unnamed: 0,BarackObama_sentiment_score,cnnbrk_sentiment_score,KimKardashian_sentiment_score,New York Times_sentiment_score,CNN_sentiment_score,Washington Post_sentiment_score
0,0.816094,-2.54383,1.504453,-1.230945,-4.2102,-1.884661
1,1.220500,-2.54383,1.504453,-1.230945,-1.4093,-1.884661
2,0.643600,-2.54383,4.065300,-1.230945,1.0081,-1.884661
3,5.633600,-2.54383,0.915400,-1.230945,-0.9171,-1.884661
4,1.112100,-2.54383,1.715000,-1.230945,-2.5206,-1.884661
...,...,...,...,...,...,...
499,0.816094,-3.50000,1.504453,0.366700,-3.5821,-2.920500
500,0.816094,-1.11410,1.504453,-0.991300,-3.0340,-0.117200
501,0.816094,-1.76330,1.504453,2.958000,-1.1407,-1.727400
502,0.816094,-0.24280,1.504453,0.990100,-0.4297,-0.282200


In [95]:
# Class balance of the target: number of rows per label value.
y.value_counts()

-1    238
 1    219
 0     47
Name: Lag 2 Significant VIX, dtype: int64

In [106]:
# Majority-class baseline: the accuracy obtained by always predicting the
# most frequent label, i.e. the largest class count over the number of
# non-missing labels.
class_counts = y.value_counts()
succ = class_counts.max() / y.count()

In [107]:
# Show the majority-class baseline accuracy for comparison with the models.
succ

0.4722222222222222

In [96]:
# Cross-validate the baseline classifier (clf, an SVC with C=1.0) on X and y.
run_experiment(clf, X, y)

Scores [0.46534653465346537, 0.5247524752475248, 0.5148514851485149, 0.504950495049505, 0.51]
Average score 0.503980198019802


In [97]:
def rfc_param_selection(X, y):
    """Grid-search random-forest hyper-parameters with 5-fold CV.

    Searches ``n_estimators`` in {200, 700} and ``max_features`` in
    {'sqrt', 'log2'} for a ``RandomForestClassifier``, prints the best
    estimator and its cross-validated score, and returns the winning
    parameter combination.

    Parameters
    ----------
    X : feature matrix passed straight to ``GridSearchCV.fit``.
    y : target labels passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.

    Notes
    -----
    The forest is created without a ``random_state``, so the selected
    parameters and score may vary between runs.
    """
    n_estims = [200, 700]
    # 'auto' is intentionally not searched: for RandomForestClassifier it was
    # only an alias of 'sqrt' (a duplicate candidate), and scikit-learn
    # deprecated it in 1.1 and rejects it from 1.3 onwards.
    max_feats = ['sqrt', 'log2']
    param_grid = {'n_estimators': n_estims, 'max_features': max_feats}
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
    grid_search.fit(X, y)
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
    return grid_search.best_params_

In [98]:
def svc_param_selection(X, y):
    """Grid-search SVC hyper-parameters with 5-fold cross-validation.

    Every combination of kernel ('rbf', 'linear'), C (0.001 .. 10) and gamma
    (0.001 .. 1) is evaluated. The best estimator and its cross-validated
    score are printed, and the best parameter combination is returned.

    Parameters
    ----------
    X : feature matrix passed straight to ``GridSearchCV.fit``.
    y : target labels passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    search_space = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'gamma': [0.001, 0.01, 0.1, 1],
        'kernel': ['rbf', 'linear'],
    }
    searcher = GridSearchCV(SVC(), search_space, cv=5)
    searcher.fit(X, y)

    # Report the winning estimator first, then its mean CV score.
    for summary in (searcher.best_estimator_, searcher.best_score_):
        print(summary)
    return searcher.best_params_

In [99]:
# Run the SVC grid search on the full data set. The function prints the best
# estimator and its CV score; the returned best-parameter dict is displayed
# as the cell result.
svc_param_selection(X,y)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.505960396039604


{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [100]:
# Run the random-forest grid search on the full data set. The function prints
# the best estimator and its CV score; the returned best-parameter dict is
# displayed as the cell result.
rfc_param_selection(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.501980198019802


{'max_features': 'log2', 'n_estimators': 200}

In [112]:
# Cross-validate a random forest configured with the parameters the
# random-forest grid search reported as best (max_features='log2',
# n_estimators=200).
# NOTE(review): the values are copied by hand from that cell's output; they
# go stale if the search is re-run and selects a different combination.
model = RandomForestClassifier(max_features='log2',n_estimators=200)
run_experiment(model, X, y)

Scores [0.48514851485148514, 0.5148514851485149, 0.5346534653465347, 0.5148514851485149, 0.51]
Average score 0.5119009900990099


In [115]:
# Cross-validate an SVC configured with the parameters the SVC grid search
# reported as best (C=1, gamma=0.1, kernel='rbf'). This rebinds `model`.
# NOTE(review): the values are copied by hand from that cell's output; they
# go stale if the search is re-run and selects a different combination.
model = SVC(C=1, gamma=.1, kernel='rbf')
run_experiment(model, X, y)

Scores [0.504950495049505, 0.5544554455445545, 0.49504950495049505, 0.5148514851485149, 0.47]
Average score 0.5078613861386139
