In [3]:
import pandas as pd
from functools import reduce

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, SVR

In [7]:
def run_experiment(model, X, y):
    scores = []
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        X_train, X_test, y_train, y_test = X.iloc[train_index],X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
    print('Scores',scores)
    print('Average score', sum(scores)/len(scores))

In [8]:
feature_files = ['BarackObama', 'CNN', 'New York Times']
predict_files = ['VIX_Proc']

In [9]:
first_date = pd.to_datetime('1/1/2015')
last_date = pd.to_datetime('1/1/2017')

In [10]:
dfs = []
for feature in feature_files:
    df = pd.read_csv(feature+'.csv', parse_dates=['Date'])
    df['Date']= pd.to_datetime(df['Date'].dt.normalize())
    df[feature+'_sentiment_score'] = df['sentiment_score']
    df = df[['Date', feature+'_sentiment_score']]
    df = df[df['Date']<last_date]
    df = df[df['Date']>first_date]
    df = df.groupby('Date').sum()
    dfs.append(df)
    

In [11]:
dfs_out = []
for output in predict_files:
    df = pd.read_csv(output+'.csv', parse_dates=['Date'])
    df['Date']= pd.to_datetime(df['Date'].dt.normalize())
    df = df[df['Date']<last_date]
    df = df[df['Date']>first_date]
    dfs_out.append(df)

In [12]:
df_y = reduce(lambda  left,right: pd.merge(left,right,on=['Date'],
                                            how='outer'), dfs_out)

In [13]:
df_X = reduce(lambda  left,right: pd.merge(left,right,on=['Date'],
                                            how='outer'), dfs)

In [14]:
predict_dates = df_y['Date']

In [15]:
#FOR NOW ONLY DELETE THE WEEKENDS
df_X = pd.merge(predict_dates, df_X, on=["Date"], how='left')

In [16]:
clf = SVC(C=1.0)
clf2 = SVR(C=1.0)

In [17]:
df_y

Unnamed: 0,Date,Price,Change,Lag 2 Price,2 Day Change,2 Day Direction
3773,2015-01-02,17.76,0.116279,20.33,0.144707,1
3774,2015-01-05,19.19,0.080518,20.15,0.050026,1
3775,2015-01-06,20.33,0.059406,17.93,-0.118052,-1
3776,2015-01-07,20.15,-0.008854,16.44,-0.184119,-1
3777,2015-01-08,17.93,-0.110174,18.02,0.005020,1
...,...,...,...,...,...,...
4272,2016-12-23,11.38,0.005300,11.89,0.044815,1
4273,2016-12-27,12.26,0.077329,13.15,0.072594,1
4274,2016-12-28,11.89,-0.030179,13.20,0.110177,1
4275,2016-12-29,13.15,0.105971,14.07,0.069962,1


In [18]:
X = df_X.reset_index(drop=True)
y = df_y.reset_index(drop=True)
X = X.drop(columns=['Date'])
X.fillna(X.mean(), inplace=True)
y = y['2 Day Direction']
y.fillna(y.mean(), inplace=True)

In [19]:
X

Unnamed: 0,BarackObama_sentiment_score,CNN_sentiment_score,New York Times_sentiment_score
0,0.816094,-4.2102,-1.230945
1,1.220500,-1.4093,-1.230945
2,0.643600,1.0081,-1.230945
3,5.633600,-0.9171,-1.230945
4,1.112100,-2.5206,-1.230945
...,...,...,...
499,0.816094,-3.5821,0.366700
500,0.816094,-3.0340,-0.991300
501,0.816094,-1.1407,2.958000
502,0.816094,-0.4297,0.990100


In [20]:
y

0      1
1      1
2     -1
3     -1
4      1
      ..
499    1
500    1
501    1
502    1
503   -1
Name: 2 Day Direction, Length: 504, dtype: int64

In [30]:
run_experiment(clf, X, y)

Scores [0.48514851485148514, 0.5841584158415841, 0.5445544554455446, 0.5445544554455446, 0.49]
Average score 0.5296831683168317


In [44]:
rfc = RandomForestClassifier()
param_grid = { 
'n_estimators': [200, 700],
'max_features': ['auto', 'sqrt', 'log2']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, verbose=1)
CV_rfc.fit(X,y)
print(CV_rfc.best_estimator_)
print(CV_rfc.best_score_)



Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   18.3s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=700,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.527960396039604


In [66]:
def rfc_param_selection(X, y):
    n_estims= [200, 700]
    max_feats= ['auto', 'sqrt', 'log2']
    param_grid = {'n_estimators': n_estims, 'max_features' : max_feats}
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
    grid_search.fit(X, y)
    grid_search.best_params_
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
    return grid_search.best_params_

In [59]:
def svc_param_selection(X, y):
    kernels = ['rbf','linear']
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas, 'kernel':kernels}
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X, y)
    grid_search.best_params_
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
    return grid_search.best_params_

In [67]:
svc_param_selection(X,y)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
0.5555247524752476


{'C': 10, 'gamma': 1, 'kernel': 'rbf'}

In [61]:
rfc_param_selection(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.527920792079208


{'max_features': 'sqrt', 'n_estimators': 200}

In [62]:
model = RandomForestClassifier(max_features='sqrt',n_estimators=700)
run_experiment(model, X, y)

Scores [0.5544554455445545, 0.5148514851485149, 0.4752475247524752, 0.48514851485148514, 0.62]
Average score 0.529940594059406


In [65]:
model = SVC(C=10, gamma=1, kernel='rbf')
run_experiment(model, X, y)

Scores [0.5346534653465347, 0.5742574257425742, 0.48514851485148514, 0.46534653465346537, 0.53]
Average score 0.5178811881188119
