In [None]:
import pandas as pd
from functools import reduce

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier

In [None]:
# Directory (relative path) that the feature CSVs are read from in the loading cell.
feature_dir = 'data/features/'

In [None]:
def run_experiment(model, X, y, n_splits=5):
    """Fit and score ``model`` on every fold of a K-fold split and print the results.

    Prints the list of per-fold scores followed by their arithmetic mean.
    Nothing is returned.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``
        Refit in place on each fold, so after the call it holds the fit
        from the last training split.
    X : object supporting ``.iloc`` positional indexing (e.g. a DataFrame)
        Feature rows.
    y : object supporting ``.iloc`` positional indexing (e.g. a Series)
        Targets, paired with ``X`` by row position.
    n_splits : int, default 5
        Number of folds handed to ``KFold``.
    """
    # KFold is built without shuffling, so folds are taken in the row order of X.
    kf = KFold(n_splits=n_splits)
    scores = []
    for train_index, test_index in kf.split(X):
        model.fit(X.iloc[train_index], y.iloc[train_index])
        scores.append(model.score(X.iloc[test_index], y.iloc[test_index]))
    print('Scores',scores)
    print('Average score', sum(scores)/len(scores))

In [None]:
# Read the three feature tables from feature_dir in one pass:
# Twitter features, news features, and the index features.
df_twitter, df_news, df_out = (
    pd.read_csv(feature_dir + csv_name)
    for csv_name in ('twitter_features.csv', 'news_features.csv', 'index_features.csv')
)

In [None]:
# Baseline support-vector classifier with C=1.0; passed to run_experiment further down.
clf = SVC(C=1.0)
# Support-vector regressor with the same C.
# NOTE(review): clf2 is not referenced by any other cell visible in this notebook.
clf2 = SVR(C=1.0)

In [None]:
# Target frame: an alias of the index-features frame (same object, not a copy).
df_y = df_out
# Feature frame: outer-join the Twitter and news features on 'Date', so a date present in
# only one of the two sources is kept, with NaN in the other source's columns.
# NOTE(review): df_out is not joined on 'Date' here; the later cells pair X and y purely by
# row position — confirm that df_out's rows line up one-to-one with this merged frame.
df_X = pd.merge(df_twitter, df_news, on='Date', how='outer')

In [None]:
# Display the target frame for a visual check.
df_y

In [None]:
# Features: fresh positional index, 'Date' column removed, missing values replaced by 0.
X = df_X.reset_index(drop=True).drop(columns=['Date']).fillna(0)
# Target: the 'Lag 2 Significant VIX' column of the index features, on a fresh positional index.
# Missing target values are left untouched (a mean-fill was tried here and then disabled).
y = df_y.reset_index(drop=True)['Lag 2 Significant VIX']

In [None]:
# Display the prepared feature matrix for a visual check.
X

In [None]:
# Count how often each distinct target value occurs.
y.value_counts()

In [None]:
# Baseline accuracy: the share of non-missing targets that fall in the most frequent
# class, i.e. the score obtained by always predicting that class.
succ = y.value_counts(normalize=True).max()

In [None]:
# Display the majority-class baseline computed in the previous cell.
succ

In [None]:
# Cross-validate the baseline SVC (clf) on the prepared features and target.
run_experiment(clf, X, y)

In [None]:
def rfc_param_selection(X, y):
    """Grid-search ``n_estimators`` and ``max_features`` for a random forest.

    Runs a 5-fold ``GridSearchCV`` over a default-constructed
    ``RandomForestClassifier``, prints the best estimator and its best
    cross-validation score, and returns the winning parameters.

    Parameters
    ----------
    X : feature matrix accepted by ``GridSearchCV.fit``
    y : target values accepted by ``GridSearchCV.fit``

    Returns
    -------
    dict
        ``grid_search.best_params_`` — the best value found for each searched key.
    """
    # 'auto' is deliberately not searched: for RandomForestClassifier it was an alias
    # of 'sqrt' (so it only duplicated fits), and scikit-learn 1.3+ rejects it.
    param_grid = {
        'n_estimators': [200, 700],
        'max_features': ['sqrt', 'log2'],
    }
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, verbose=1)
    grid_search.fit(X, y)
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
    return grid_search.best_params_

In [None]:
def svc_param_selection(X, y):
    """Grid-search ``kernel``, ``C`` and ``gamma`` for an ``SVC`` with 5-fold CV.

    Prints the best estimator and its best cross-validation score, and
    returns the winning parameters.

    Parameters
    ----------
    X : feature matrix accepted by ``GridSearchCV.fit``
    y : target values accepted by ``GridSearchCV.fit``

    Returns
    -------
    dict
        ``grid_search.best_params_``. It contains ``'gamma'`` only when the
        RBF kernel wins, because ``gamma`` is not searched for the linear kernel.
    """
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    # Two sub-grids instead of one full cross product: the linear kernel ignores
    # gamma, so pairing it with every gamma value would refit the same model
    # four times for each C.
    param_grid = [
        {'kernel': ['rbf'], 'C': Cs, 'gamma': gammas},
        {'kernel': ['linear'], 'C': Cs},
    ]
    grid_search = GridSearchCV(SVC(), param_grid, cv=5, verbose=5)
    grid_search.fit(X, y)
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
    return grid_search.best_params_

In [None]:
def MLP_param_selection(X, y, random_state=0):
    """Grid-search solver, iteration cap, L2 penalty and layer width for an MLP.

    Runs a 5-fold ``GridSearchCV`` over ``MLPClassifier``, prints the best
    estimator and its best cross-validation score, and returns the winning
    parameters.

    Parameters
    ----------
    X : feature matrix accepted by ``GridSearchCV.fit``
    y : target values accepted by ``GridSearchCV.fit``
    random_state : int, default 0
        Seed given to every ``MLPClassifier`` candidate. It is held fixed
        rather than searched: the seed is not a hyperparameter, and picking
        the "best" one selects on noise while multiplying the number of fits.

    Returns
    -------
    dict
        ``grid_search.best_params_`` for the keys ``'solver'``, ``'max_iter'``,
        ``'alpha'`` and ``'hidden_layer_sizes'``.
    """
    param_grid = {
        'solver': ['lbfgs', 'sgd'],
        'max_iter': [1000, 1500, 2000],
        'alpha': [10, 100, 1000, 10000],
        'hidden_layer_sizes': [10, 12, 15],
    }
    grid_search = GridSearchCV(
        MLPClassifier(random_state=random_state), param_grid, cv=5, verbose=5
    )
    grid_search.fit(X, y)
    print(grid_search.best_estimator_)
    print(grid_search.best_score_)
    return grid_search.best_params_


In [None]:
# Run the SVC grid search; the returned best-parameter dict is shown as the cell output.
svc_param_selection(X,y)

In [None]:
# Run the random-forest grid search; the returned best-parameter dict is shown as the cell output.
rfc_param_selection(X,y)

In [None]:
# Cross-validate a random forest with hand-written hyperparameters.
# NOTE(review): presumably copied from the grid-search output above — verify they still match.
model = RandomForestClassifier(max_features='log2',n_estimators=200)
run_experiment(model, X, y)

In [None]:
# Cross-validate an RBF-kernel SVC with hand-written hyperparameters.
# NOTE(review): presumably copied from the grid-search output above — verify they still match.
model = SVC(C=1, gamma=.1, kernel='rbf')
run_experiment(model, X, y)

In [None]:
# Run the MLP grid search; the returned best-parameter dict is shown as the cell output.
MLP_param_selection(X,y)

In [None]:
# Cross-validate an MLP with hand-written hyperparameters.
# NOTE(review): presumably copied from the grid-search output above — verify they still match.
# No random_state is set here, so scores can differ between runs.
model = MLPClassifier(solver='lbfgs', alpha=10, max_iter=1500, hidden_layer_sizes=10)
run_experiment(model, X, y)

In [None]:
# NOTE(review): prints a constant and feeds nothing downstream — looks like a leftover
# smoke-test cell; candidate for removal.
print(3)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

In [None]:
# Series whose autocorrelation is plotted below: the 'Change SPY' column of the index features.
auto_y = df_out['Change SPY']

In [None]:
# Display the series for a visual check.
auto_y

In [None]:
# Autocorrelation of 'Change SPY' for lags 0 through 50.
# plot_acf returns the Figure it draws on; the trailing semicolon keeps that return value
# from becoming the cell output, which would render a second copy of the same plot.
plot_acf(auto_y, lags = 50);