In [10]:
import pandas as pd
import numpy as np
import os
import sklearn
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
import xgboost
from xgboost import XGBClassifier
%matplotlib notebook

In [11]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
pd.options.mode.chained_assignment = None  # default='warn'

In [12]:
data_path = '/home/mvkrein/etf_model/data'
etf_data_file = os.path.join(data_path,'etf_new_var_20180910.csv')
etf_data = pd.read_csv(etf_data_file,index_col=0)

In [13]:
etf_data_20140106 = etf_data.loc[etf_data['Date']=='2014-01-06',['Date','sym','rank_p_L-21','delta_p_L-21','ivv_delta_p_L-21','target']]

In [14]:
etf_data_20140106.sort_values(['delta_p_L-21'],ascending=False,inplace=True)

In [15]:
etf_data_20140106.tail(5)

Unnamed: 0,Date,sym,rank_p_L-21,delta_p_L-21,ivv_delta_p_L-21,target
77209,2014-01-06,ILF,0.03268,-0.081157,-0.039743,0
77166,2014-01-06,EWZ,0.026144,-0.089469,-0.039743,0
77260,2014-01-06,TUR,0.019608,-0.102258,-0.039743,0
77167,2014-01-06,EWZS,0.013072,-0.104508,-0.039743,0
77126,2014-01-06,ECH,0.006536,-0.120288,-0.039743,0


In [16]:
etf_study_file = os.path.join(data_path,'ETF_list_min_6yr_history.csv')
etf_list = pd.read_csv(etf_study_file,index_col=0)
etf_data.sort_values(['Date','sym'],ascending=True,inplace=True)
etf_data.reset_index(drop=True,inplace=True)

In [17]:
drop_columns = ['Date','sym','p', 'v', 'p_L05', 'v_L05', 'p_L10', 'v_L10', 'p_L21', 'v_L21', 'p_L42', 'v_L42', 'p_L63', \
                'v_L63', 'p_L84', 'v_L84', 'p_L126', 'v_L126', 'p_L189', 'v_L189', 'p_L252', 'v_L252',\
                'p_L-21', 'v_L-21', 'delta_p_L05', 'delta_p_L10', 'delta_p_L21', 'delta_p_L42', 'delta_p_L63', \
                'delta_p_L84', 'delta_p_L126', 'delta_p_L189', 'delta_p_L252', 'delta_p_L-21', 'delta_v_L05',\
                'delta_v_L10', 'delta_v_L21', 'delta_v_L42', 'delta_v_L63', 'delta_v_L84', 'delta_v_L126', \
                'delta_v_L189', 'delta_v_L252','rank_p_L-21', 'ivv_delta_p_L-21', 'target']

rank_col = ['rank_p_L05','rank_p_L10','rank_p_L21','rank_p_L42',\
            'rank_p_L63','rank_p_L84','rank_p_L126','rank_p_L189','rank_p_L252']

In [18]:
all_columns = list(etf_data.columns)

In [19]:
include_columns = [x for x in all_columns if x not in drop_columns]
# include_columns = ['rank_p_L05','rank_p_L10','rank_p_L21','rank_p_L42',\
#             'rank_p_L63','rank_p_L84','rank_p_L126','rank_p_L189','rank_p_L252']
# include_columns = ['rank_p_L05','rank_p_L21','rank_p_L42','rank_p_L252']

In [20]:
len(include_columns)

360

In [21]:
dates = list(etf_data['Date'].unique())
# dates[1671]

In [22]:
for i in range(0,1671):
    if dates[i] == '2018-07-09':
        print (i,dates[i])

1638 2018-07-09


In [23]:
symbols = etf_list['Symbol'].unique()

In [24]:
len(symbols)

153

In [25]:
#Each year has 252 trading dates.  Need two years to fully develop variables.
dt1 = 504 #This is the first day that all variables are developed
# dt1 = 504 + 21 + 273 #This is the first day that all variables are developed 2016-04-07
# make all dates relative to dt1
# for one year training - add 252
# to evaluate for one month outside the training window - add 273
# to predict for the first day outside of the evaluation window (have to lag 21) - add 294
dt_end = (len(dates) - 273 - 21) - 1
# dt2 = 1638 - 273 #restart for '2018-05-08'
dt_end = dt1 + 1 #For testing on the first day only
etf_predict_file = os.path.join(data_path,'etf_pred_random_forest_20180927b.csv')

In [None]:
#Run grid search with cross validation to select best parameters
i = dt1
x_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),include_columns] #train with 12 mos
y_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),['target']] #train with 12 mos
x_train_nmpy = x_train.as_matrix()
y_train_nmpy = np.ravel(y_train.as_matrix())

param_grid = { 
'n_estimators': [100,300,500,700,1000],
'max_features': ['auto','log2',None],
'max_depth' : [4,5,6,7,8,None],
'criterion' :['gini','entropy']}

rfc=RandomForestClassifier(n_jobs=32,random_state=54321)
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, n_jobs=32, verbose=1)
CV_rfc.fit(x_train_nmpy, y_train_nmpy)

#print(CV_rfc.best_params_)
#{'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'n_estimators': 700}


In [26]:
# for i in range(dt2,dt_end,21): #for restart
for i in range(dt1,dt_end,21):
    print("Building model to predict for ",dates[i+273])
    x_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),include_columns] #train with 12 mos
    y_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),['target']] #train with 12 mos

#predict one day
    x_test = etf_data.loc[(etf_data['Date']==dates[i+273]),include_columns] #predict one day-must be 21 days removed from training
    y_test = etf_data.loc[(etf_data['Date']==dates[i+273]),['target']] #predict if etf >= market
    returns = etf_data.loc[(etf_data['Date']==dates[i+273]),['delta_p_L-21']]
    mkt_return = etf_data.loc[(etf_data['Date']==dates[i+273]),['ivv_delta_p_L-21']]
    x_train_nmpy = x_train.as_matrix()
    y_train_nmpy = np.ravel(y_train.as_matrix())
#     x_eval_nmpy = x_eval.as_matrix()
#     y_eval_nmpy = y_eval.as_matrix()
    x_test_nmpy = x_test.as_matrix()
    y_test_nmpy = np.ravel(y_test.as_matrix())
    returns_nmpy = returns.as_matrix()
    mkt_return_nmpy = mkt_return.as_matrix()
    
#     rf = RandomForestClassifier(**CV_rfc.best_params_,n_jobs=32,random_state=54321,verbose=1)

    rf = RandomForestClassifier(n_estimators=700, n_jobs=32, max_depth=5, criterion='gini',
                                verbose=0,max_features=None,random_state=54321)
    rf.fit(x_train_nmpy, y_train_nmpy)
    
    y_pred_model = rf.predict_proba(x_test_nmpy)[:,1]
    y_train_model = rf.predict(x_train_nmpy)
    y_test_model = rf.predict(x_test_nmpy)
    
    feat_import = pd.DataFrame({'Feature':include_columns,'Importance': rf.feature_importances_})
    feat_import.sort_values(by='Importance',inplace=True,ascending=False) 
    feat_import.reset_index(drop=True,inplace=True)

    y_check = np.column_stack((y_test_nmpy, y_pred_model,mkt_return_nmpy,returns_nmpy))
    y_check_df = pd.DataFrame(y_check,columns=['true_value','pred_value','mkt_return','returns'])
    y_check_df.sort_values('pred_value',inplace=True,ascending=False)
    y_check_df.reset_index(inplace=True,drop=True)
    t_hold = y_check_df['pred_value'][14]
    
    print('Model Variable Importance Top 10:')
    print(feat_import.iloc[0:10,])
    print('*****************************')
    print('roc_auc_score on training: {:.4f}'.format(roc_auc_score(y_train_nmpy,y_train_model)))
    print('roc_auc_score on testing: {:.4f}'.format(roc_auc_score(y_test_nmpy,y_test_model)))
    print('*****************************')
    print('Confusion Matrix Threshold: {:.6f}'.format(t_hold))
    print('*****************************')
    print('Model Eval Results:')
    print('*****************************')
    confusion = confusion_matrix(y_test_nmpy, (y_pred_model>=t_hold).astype(int))
#     fpr, tpr, thresholds = roc_curve(y_test_nmpy, (y_model_array>0.5).astype(int), pos_label=1)
    precision = confusion[1,1]/(confusion[1,1]+confusion[0,1])
    print('Confusion Matrix:\n',confusion)
    print('Precision:',round(precision,4))

    print("Model Top Five Picks")
    print(" Avg rtn top 5:", round(y_check_df['returns'][0:5].mean(),4))
    print(y_check_df.head(5))
    print("Model 2nd Five Picks")
    print(" Avg rtn 2nd 5:", round(y_check_df['returns'][5:10].mean(),4))
    print(y_check_df.iloc[5:10,])
    print("Model 3rd Five Picks")
    print(" Avg rtn 3rd 5:", round(y_check_df['returns'][10:15].mean(),4))
    print(y_check_df.iloc[10:15,])    
    print("Model Bottom Five Picks")
    print(" Avg rtn bottom 5:", round(y_check_df['returns'][-5:].mean(),4))
    print(y_check_df.tail(5))
    print('********************************************************')
    if i == dt1:
        predict_data = etf_data.loc[(etf_data['Date']==dates[i+273]),['Date','sym','rank_p_L-21']]
        predict_data['predict'] = y_pred_model
        predict_data.to_csv(etf_predict_file)
    else:
        predict_data = etf_data.loc[(etf_data['Date']==dates[i+273]),['Date','sym','rank_p_L-21']]
        predict_data['predict'] = y_pred_model
        predict_data.to_csv(etf_predict_file,header=False,mode='a')

Building model to predict for  2015-02-05
Model Variable Importance Top 10:
                   Feature  Importance
0  ivv_w252_rank_p_L84_avg    0.119896
1  ivv_w252_rank_v_L84_avg    0.112688
2  ivv_w84_rank_p_L252_avg    0.043596
3  ivv_w84_rank_v_L252_avg    0.042354
4      w252_rank_v_L10_avg    0.035036
5      w252_rank_p_L10_avg    0.034992
6      w252_rank_p_L84_avg    0.026371
7      w252_rank_v_L84_avg    0.022731
8     w252_rank_p_L252_avg    0.022171
9     w252_rank_v_L252_avg    0.021894
*****************************
roc_auc_score on training: 0.7013
roc_auc_score on testing: 0.5922
*****************************
Confusion Matrix Threshold: 0.498736
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[103   3]
 [ 35  12]]
Precision: 0.8
Model Top Five Picks
 Avg rtn top 5: 0.034
   true_value  pred_value  mkt_return   returns
0         1.0    0.571996    0.010741  0.038518
1         1.0    0.565181    0.010741  0.040056
2      