In [1]:
from datetime import datetime
import pandas as pd
from pandas import Series
import numpy as np
import matplotlib.pyplot as plt
from iexfinance.stocks import get_historical_data
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score, confusion_matrix, r2_score, mean_squared_error
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import itertools
from ipywidgets import interact
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
etf_list = ['SPY','IVV','VTI','VOO','QQQ','VEA','EFA','IEFA','VWO','AGG','IJH','IEMG','IWM','IJR','VTV','IWF','IWD','VUG','BND','LQD','XLF','VNQ','VIG','EEM','GLD','VB','BSV','VO','TIP','VEU','IVW','DIA','XLK','VYM','VGT','VCSH','MDY','IWB','VCIT','XLV','IWR','XLE','DVY','USMV','EWJ','VGK','PFF','SCHF','SDY','RSP','XLY','ITOT','IVE','SCHX','HYG','SHV','VBR','EMB','SHY','VV','SCHB','XLI','BIV','VT','MBB','BNDX','IWS','VXUS','FLOT','IWO','IXUS','MINT','SCZ','PYZ','MXI','IYM','IXP','RXI','VCR','RHS','VDC','PXI','PXE','IEO','RYF','IYG','KIE','FBT','PTH','IHI','ITA','VIS','ICF','REZ','RWR','PSJ','IGV','RYU','IDU','VPU']
np.random.seed(7)

In [2]:
def get_iex_data(stock_list, start=datetime(2015,1,1), end=datetime(2019,12,31)):
    return_list = []
    for i in stock_list:
        df = pd.DataFrame(get_historical_data(i, start, end, output_format='pandas', token='pk_d28c0190de7a4d6da30b3bd2b08487c8')).interpolate()
        df['ticker'] = i
        return_list.append(df)
#     return return_list
# etf_list = ['SPY','IVV','VTI','VOO','QQQ','VEA','EFA','IEFA','VWO','AGG','IJH','IEMG','IWM','IJR','VTV','IWF','IWD','VUG','BND','LQD','XLF','VNQ','VIG','EEM','GLD','VB','BSV','VO','TIP','VEU','IVW','DIA','XLK','VYM','VGT','VCSH','MDY','IWB','VCIT','XLV','IWR','XLE','DVY','USMV','EWJ','VGK','PFF','SCHF','SDY','RSP','XLY','ITOT','IVE','SCHX','HYG','SHV','VBR','EMB','SHY','VV','SCHB','XLI','BIV','VT','MBB','BNDX','IWS','VXUS','FLOT','IWO','IXUS','MINT','SCZ','PYZ','MXI','IYM','IXP','RXI','VCR','RHS','VDC','PXI','PXE','IEO','RYF','IYG','KIE','FBT','PTH','IHI','ITA','VIS','ICF','REZ','RWR','PSJ','IGV','RYU','IDU','VPU']
etf_list = ['SPY']

In [3]:
def lstm_clean_data(data):
    for i in range(len(data)):
        data[i] = data[i].reset_index().dropna()
        data[i]['date'] = pd.to_datetime(data[i]['date'])
#         data[i]['friday'] = ((((data[i]['date'].diff(-1).astype('int'))/86400000000000)+1)/(-2)).astype('int')
        data[i]['Reg_Target'] = data[i]['close'].shift(-1)
#         data[i] = pd.merge(data[i], prices, on='date')
    return data
def clean_data(data):
    for i in range(len(data)):
        data[i] = data[i].reset_index().dropna()
        data[i]['date'] = pd.to_datetime(data[i]['date'])
        data[i] = data[i].set_index('date')
        data[i]['Clf_Target'] = (np.sign(-data[i]['close'].diff(periods=-1))+1)/2
    return data

def add_lags(etf_list, window):
    for i in range(len(etf_list)):
        for n in range(1,window+1):
            etf_list[i]['{}day_change'.format(n)] = -etf_list[i]['close'].diff(periods=n)
    return etf_list

def RSI(series, period):
    delta = series.diff().dropna()
    u = delta * 0
    d = u.copy()
    u[delta > 0] = delta[delta > 0]
    d[delta < 0] = -delta[delta < 0]
    u[u.index[period-1]] = np.mean( u[:period] )
    u = u.drop(u.index[:(period-1)])
    d[d.index[period-1]] = np.mean( d[:period] )
    d = d.drop(d.index[:(period-1)])
    rs = Series.ewm(u, com=period-1, adjust=False).mean() / \
    Series.ewm(d, com=period-1, adjust=False).mean()
    return 100 - 100 / (1 + rs)

def add_indicators(etf_list):
    for i in range(len(etf_list)):
        etf_list[i]['ewma7'] = etf_list[i]['close'].ewm(span=7,min_periods=0,adjust=True,ignore_na=False).mean()
        etf_list[i]['ewma50'] = etf_list[i]['close'].ewm(span=50,min_periods=0,adjust=True,ignore_na=False).mean()
        etf_list[i]['ewma200'] = etf_list[i]['close'].ewm(span=200,min_periods=0,adjust=True,ignore_na=False).mean()
        etf_list[i]['MACD'] = (etf_list[i]['close'].ewm(span=12,min_periods=0,adjust=True,ignore_na=False).mean() - \
                               etf_list[i]['close'].ewm(span=26,min_periods=0,adjust=True,ignore_na=False).mean())
        etf_list[i]['RSI'] = RSI(etf_list[i]['close'], 14)
        etf_list[i] = etf_list[i][15:-1]
    return etf_list

In [4]:
data = clean_data(get_iex_data(etf_list))
data = add_lags(data, 5)
data = add_indicators(data)


invalid value encountered in sign



In [5]:
data

[              open    high     low   close     volume ticker  Clf_Target  \
 date                                                                       
 2015-01-29  200.38  202.30  198.68  201.99  173585424    SPY         0.0   
 2015-01-30  200.57  202.17  199.13  199.45  197729724    SPY         1.0   
 2015-02-02  200.05  202.03  197.86  201.92  163106969    SPY         1.0   
 2015-02-03  203.00  204.85  202.55  204.84  124212881    SPY         0.0   
 2015-02-04  203.92  205.38  203.51  204.06  134306728    SPY         1.0   
 ...            ...     ...     ...     ...        ...    ...         ...   
 2019-12-23  321.59  321.65  321.06  321.22   53015641    SPY         1.0   
 2019-12-24  321.47  321.52  320.90  321.23   20270007    SPY         1.0   
 2019-12-26  321.65  322.95  321.64  322.94   31024188    SPY         0.0   
 2019-12-27  323.74  323.80  322.28  322.86   42554820    SPY         0.0   
 2019-12-30  322.95  323.10  320.55  321.08   49782730    SPY         1.0   

In [6]:
def save_metrics(y_true, y_pred, ticker):
    accuracy = accuracy_score(y_true, y_pred)
#     precision = precision_score(y_true, y_pred)
#     f1 = f1_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    conf = confusion_matrix(y_true, y_pred)
    return {'Ticker':ticker,'Accuracy':accuracy,'ROC AUC':roc,'R2':r2,'mse':mse,'conf':conf}

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

def produce_confs(results):
    plt.figure()    
    for i in results:
        plot_confusion_matrix(i['conf'], classes =['down', 'up'], title=i['Ticker']+' Predicted Movement')
        plt.show()

In [9]:
def time_test_split(X, y, date):
    scaler = StandardScaler()
    X_train = X[X['date'] < date].drop(columns='date')
    scaler.fit(X_train)
    X_test = X[X['date'] >= date].drop(columns='date')
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    y_train = np.array(y[y['date'] < date].drop(columns='date')).ravel().astype('int')
    y_test = np.array(y[y['date'] >= date].drop(columns='date')).ravel().astype('int')
    return X_train, X_test, y_train, y_test

def plant_forests(df_list, split_date):
    results = []
    importances = []
    for i in range(len(df_list)):
        ticker = df_list[i]['ticker'].iloc[0]
        X = df_list[i].reset_index().drop(columns=['ticker','open', 'Clf_Target'])
        feat_labels = X.drop(columns='date').columns
        y = df_list[i]['Clf_Target'].reset_index()
        X_train, X_test, y_train, y_test = time_test_split(X, y, split_date)
        clf = RandomForestClassifier(n_estimators=100)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results.append(save_metrics(y_test, y_pred, ticker))
        importances.append((ticker, {i:j for i, j in zip(feat_labels, clf.feature_importances_)}))
    return results, importances

def construct_SVMS(df_list, split_date):
    results = []
    for i in range(len(df_list)):
        ticker = df_list[i]['ticker'].iloc[0]
        X = df_list[i].reset_index().drop(columns=['ticker','open', 'Clf_Target'])
        y = df_list[i]['Clf_Target'].reset_index()
        X_train, X_test, y_train, y_test = time_test_split(X, y, split_date)
        clf = SVC()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results.append(save_metrics(y_test, y_pred, ticker))
    return results

def logistic_regression(df_list, split_date):
    results = []
    for i in range(len(df_list)):
        ticker = df_list[i]['ticker'].iloc[0]
        X = df_list[i].reset_index().drop(columns=['ticker','open', 'Clf_Target'])
        y = df_list[i]['Clf_Target'].reset_index()
        X_train, X_test, y_train, y_test = time_test_split(X, y, split_date)
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results.append(save_metrics(y_test, y_pred, ticker))
    return results

def XG_Boost(df_list, split_date):
    results = []
    parameters = []
    for i in range(len(df_list)):
        ticker = df_list[i]['ticker'].iloc[0]
        X = df_list[i].reset_index().drop(columns=['ticker','open', 'Clf_Target'])
        y = df_list[i]['Clf_Target'].reset_index()
        X_train, X_test, y_train, y_test = time_test_split(X, y, split_date)
        clf = XGBClassifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results.append(save_metrics(y_test, y_pred, ticker))
        param_grid = {"learning_rate": [.05, .01, .05],
            'max_depth': [10, 15, 20],
            'min_child_weight': [10],
            'subsample': [.7],
            'n_estimators': [50, 100, 250, 350]}
        grid_clf = GridSearchCV(clf, param_grid)
        grid_clf.fit(X_train, y_train)
        val_preds = grid_clf.predict(X_test)
        results.append(save_metrics(y_test, val_preds, ticker))
        best_parameters = grid_clf.best_params_
        parameters.append({ticker:best_parameters})
    return results, parameters

In [10]:
split = '2019-11'
def run_models(window):
    data = clean_data(get_iex_data(etf_list))
    data = add_lags(data, window)
    data = add_indicators(data)
    rf_results, rf_importances = plant_forests(data, split)
    svm_results = construct_SVMS(data, split)
    lr_results = logistic_regression(data, split)
    xg_results, parameters = XG_Boost(data,split)
    return rf_results, rf_importances, svm_results, xg_results, parameters

In [11]:
rf_results, rf_importances, svm, xg, parameters = run_models(5)



























































































































































































































































































































































































































In [12]:
rf_results_df = round(pd.DataFrame(rf_results).drop(columns = ['conf']),4)
svm_results_df = round(pd.DataFrame(svm).drop(columns = ['conf']),4)
xg_results_df = round(pd.DataFrame(xg).drop(columns = ['conf']),4)

In [13]:
rf_confs = {i['Ticker']: i['conf'] for i in rf_results}
xg_confs = {i['Ticker']: i['conf'] for i in xg}
svm_confs = {i['Ticker']: i['conf'] for i in svm}

## Predicting direction with RF, SVM, XGBoost

In [19]:
def classification_metrics(ETFs):
    trace = go.Table(header=dict(values=['','Random Forest', 'SVM', 'XGBoost']),
        cells=dict(values=[['Accuracy', 'R-Squared', 'ROC-AUC'], [rf_results_df[rf_results_df['Ticker']==ETFs]['Accuracy'], rf_results_df[rf_results_df['Ticker']==ETFs]['R2'], [rf_results_df[rf_results_df['Ticker']==ETFs]['ROC AUC']]],
                          [svm_results_df[svm_results_df['Ticker']==ETFs]['Accuracy'], svm_results_df[svm_results_df['Ticker']==ETFs]['R2'], [svm_results_df[svm_results_df['Ticker']==ETFs]['ROC AUC']]],
                          [xg_results_df[xg_results_df['Ticker']==ETFs]['Accuracy'], xg_results_df[xg_results_df['Ticker']==ETFs]['R2'], [xg_results_df[xg_results_df['Ticker']==ETFs]['ROC AUC']]]
                          ]))
    layout = {'height':270}
#     trace2 = go.Table(header=dict(values=['','Random Forest Conf', 'SVM Conf', 'XGBoost Conf']),
#         cells=dict(values=[forest, vector, boost]))
    data = [trace] 
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
    plot_confusion_matrix(rf_confs[ETFs], classes=['down', 'up'], title = 'Random Forest '+ETFs+' Price Classification')
    plt.savefig('rf_cm_spy.png')
    plt.show()
    plot_confusion_matrix(svm_confs[ETFs], classes=['down', 'up'], title = 'SVM '+ETFs+' Price Classification')
    plt.savefig('svm_cm_spy.png')
    plt.show()
    plot_confusion_matrix(xg_confs[ETFs], classes=['down', 'up'], title = 'XGBoost '+ETFs+' Price Classification')
    plt.savefig('xg_cm_spy.png')
    plt.show()
interact(classification_metrics, ETFs=etf_list)

interactive(children=(Dropdown(description='ETFs', options=('SPY', 'IVV', 'VTI', 'VOO', 'QQQ', 'VEA', 'EFA', '…

<function __main__.classification_metrics(ETFs)>

In [15]:
data[0]

Unnamed: 0_level_0,open,high,low,close,volume,ticker,Clf_Target,1day_change,2day_change,3day_change,4day_change,5day_change,ewma7,ewma50,ewma200,MACD,RSI
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2015-01-29,200.38,202.30,198.68,201.99,173585424,SPY,0.0,-1.85,0.75,3.46,2.98,4.11,202.607510,202.813380,202.823244,-0.051351,49.664254
2015-01-30,200.57,202.17,199.13,199.45,197729724,SPY,1.0,2.54,0.69,3.29,6.00,5.52,201.812154,202.546072,202.608549,-0.250332,44.689405
2015-02-02,200.05,202.03,197.86,201.92,163106969,SPY,1.0,-2.47,0.07,-1.78,0.82,3.53,201.839268,202.498240,202.566959,-0.243913,49.940725
2015-02-03,203.00,204.85,202.55,204.84,124212881,SPY,0.0,-2.92,-5.39,-2.85,-4.70,-2.10,202.592637,202.670737,202.697663,-0.048698,55.339046
2015-02-04,203.92,205.38,203.51,204.06,134306728,SPY,1.0,0.78,-2.14,-4.61,-2.07,-3.92,202.960644,202.769663,202.772444,0.051507,53.673969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-23,321.59,321.65,321.06,321.22,53015641,SPY,1.0,-0.49,-0.32,-1.63,-1.65,-1.72,319.588712,310.365505,296.481980,3.306089,74.296484
2019-12-24,321.47,321.52,320.90,321.23,20270007,SPY,1.0,-0.01,-0.50,-0.33,-1.64,-1.66,319.999034,310.791563,296.728230,3.310214,74.315306
2019-12-26,321.65,322.95,321.64,322.94,31024188,SPY,0.0,-1.71,-1.72,-2.21,-2.04,-3.35,320.734276,311.267973,296.989045,3.412134,77.367412
2019-12-27,323.74,323.80,322.28,322.86,42554820,SPY,0.0,0.08,-1.63,-1.64,-2.13,-1.96,321.265707,311.722562,297.246468,3.446719,76.906976


In [16]:
data

[              open    high     low   close     volume ticker  Clf_Target  \
 date                                                                       
 2015-01-29  200.38  202.30  198.68  201.99  173585424    SPY         0.0   
 2015-01-30  200.57  202.17  199.13  199.45  197729724    SPY         1.0   
 2015-02-02  200.05  202.03  197.86  201.92  163106969    SPY         1.0   
 2015-02-03  203.00  204.85  202.55  204.84  124212881    SPY         0.0   
 2015-02-04  203.92  205.38  203.51  204.06  134306728    SPY         1.0   
 ...            ...     ...     ...     ...        ...    ...         ...   
 2019-12-23  321.59  321.65  321.06  321.22   53015641    SPY         1.0   
 2019-12-24  321.47  321.52  320.90  321.23   20270007    SPY         1.0   
 2019-12-26  321.65  322.95  321.64  322.94   31024188    SPY         0.0   
 2019-12-27  323.74  323.80  322.28  322.86   42554820    SPY         0.0   
 2019-12-30  322.95  323.10  320.55  321.08   49782730    SPY         1.0   

In [17]:
comb = data[0]
x=1
while x != 100:
    comb = comb.append(data[x])
    x += 1
    print (x)
comb

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


Unnamed: 0_level_0,open,high,low,close,volume,ticker,Clf_Target,1day_change,2day_change,3day_change,4day_change,5day_change,ewma7,ewma50,ewma200,MACD,RSI
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2015-01-29,200.38,202.30,198.68,201.99,173585424,SPY,0.0,-1.85,0.75,3.46,2.98,4.11,202.607510,202.813380,202.823244,-0.051351,49.664254
2015-01-30,200.57,202.17,199.13,199.45,197729724,SPY,1.0,2.54,0.69,3.29,6.00,5.52,201.812154,202.546072,202.608549,-0.250332,44.689405
2015-02-02,200.05,202.03,197.86,201.92,163106969,SPY,1.0,-2.47,0.07,-1.78,0.82,3.53,201.839268,202.498240,202.566959,-0.243913,49.940725
2015-02-03,203.00,204.85,202.55,204.84,124212881,SPY,0.0,-2.92,-5.39,-2.85,-4.70,-2.10,202.592637,202.670737,202.697663,-0.048698,55.339046
2015-02-04,203.92,205.38,203.51,204.06,134306728,SPY,1.0,0.78,-2.14,-4.61,-2.07,-3.92,202.960644,202.769663,202.772444,0.051507,53.673969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-23,143.28,143.29,141.21,141.55,173189,VPU,1.0,1.49,0.41,0.53,-0.23,-0.62,141.546517,140.143844,135.478711,0.616715,56.661665
2019-12-24,141.59,141.78,141.04,141.62,52726,VPU,1.0,-0.07,1.42,0.34,0.46,-0.30,141.564887,140.201733,135.539819,0.617189,57.012288
2019-12-26,141.85,142.05,141.37,141.86,68987,VPU,1.0,-0.24,-0.31,1.18,0.10,0.22,141.638666,140.266763,135.602706,0.629673,58.259176
2019-12-27,142.01,142.27,141.71,142.25,107721,VPU,0.5,-0.39,-0.63,-0.70,0.79,-0.29,141.791499,140.344537,135.668849,0.663388,60.275586


In [18]:
comb.to_csv('stats_100_etfs.csv', encoding='utf-8')