In [1]:
import pandas as pd
import numpy as np
import os
import sklearn
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
import xgboost
from xgboost import XGBClassifier
%matplotlib notebook

In [2]:
# Notebook-wide pandas settings: show up to 100 rows / 500 columns and allow
# 500 characters of width before output is wrapped.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 500
pd.options.display.width = 500
# Turn off the chained-assignment check entirely (pandas' default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [3]:
# Directory that holds the input CSV files. The original absolute location is
# kept as the default; set the ETF_DATA_PATH environment variable to run the
# notebook against a different directory without editing this cell.
data_path = os.environ.get('ETF_DATA_PATH', '/home/mvkrein/etf_model/data')
etf_data_file = os.path.join(data_path,'etf_new_var_20180910.csv')
# The first CSV column is used as the row index.
etf_data = pd.read_csv(etf_data_file,index_col=0)

In [4]:
# ETF list file; its first CSV column becomes the row index.
etf_study_file = os.path.join(data_path, 'ETF_list_min_6yr_history.csv')
etf_list = pd.read_csv(etf_study_file, index_col=0)

# Order the feature table by date, then symbol, and renumber the rows 0..n-1 so
# the old index values are discarded.
etf_data = (
    etf_data
    .sort_values(by=['Date', 'sym'], ascending=True)
    .reset_index(drop=True)
)

In [5]:
# Lag suffixes shared by the column families below.
_LAGS = ['05', '10', '21', '42', '63', '84', '126', '189', '252']

# Columns that must NOT be used as model features (identifiers, the p/v columns
# and their deltas, and the forward-looking 'L-21' columns including the target).
drop_columns = (
    ['Date', 'sym', 'p', 'v']
    # p_Lxx / v_Lxx pairs, including the 'L-21' pair
    + [name for lag in _LAGS + ['-21'] for name in (f'p_L{lag}', f'v_L{lag}')]
    # delta_p_Lxx, including 'L-21'
    + [f'delta_p_L{lag}' for lag in _LAGS + ['-21']]
    # delta_v_Lxx (there is no 'L-21' entry in this family)
    + [f'delta_v_L{lag}' for lag in _LAGS]
    + ['rank_p_L-21', 'ivv_delta_p_L-21', 'target']
)

# The rank_p_Lxx columns; only referenced by commented-out alternatives in the
# visible cells.
rank_col = [f'rank_p_L{lag}' for lag in _LAGS]

In [6]:
# Every column name present in the loaded feature table, in file order.
all_columns = etf_data.columns.tolist()

In [7]:
# Model features = every column of the table that is not on the exclusion list
# (original column order is preserved).
_excluded = set(drop_columns)
include_columns = [col for col in all_columns if col not in _excluded]
# Narrower feature sets that were tried instead (kept for reference):
# include_columns = ['rank_p_L05','rank_p_L10','rank_p_L21','rank_p_L42',\
#             'rank_p_L63','rank_p_L84','rank_p_L126','rank_p_L189','rank_p_L252']
# include_columns = ['rank_p_L05','rank_p_L21','rank_p_L42','rank_p_L252']

In [8]:
# Display how many feature columns remain after the exclusions.
len(include_columns)

360

In [9]:
# Distinct values of the 'Date' column in order of first appearance, as a plain
# list so that a window can be addressed by integer position (dates[i]).
dates = list(pd.unique(etf_data['Date']))
# Spot check of a single position, left disabled: dates[1671]

In [10]:
# Distinct entries of the ETF list's 'Symbol' column.
symbols = pd.unique(etf_list['Symbol'])

In [11]:
# Display the number of distinct symbols in the ETF list.
len(symbols)

153

In [12]:
# Window layout, counted in trading days (252 per year).
# Two years of history are needed before every variable is fully developed, so
# index 504 is the first date on which all variables are available.
# Offsets are all expressed relative to dt1:
#   +252  -> one year of training data
#   +273  -> one month past the training window (evaluation)
#   +294  -> first day past the evaluation window (prediction has to lag 21)
dt1 = 504
# Alternative start that was tried (2016-04-07): dt1 = 504 + 21 + 273

# Exclusive upper bound for the window start index.
dt_end = len(dates) - (273 + 21) - 1
# Single-window debugging alternative: dt_end = dt1 + 1

# CSV file that receives the per-date predictions.
etf_predict_file = os.path.join(data_path, 'etf_pred_random_forest_regress_20181008.csv')

In [16]:
# Sanity check of the first window: first training date, last training date
# (the training range is dates[i] <= Date < dates[i+252]) and the prediction date.
# Indexed from dt1 rather than a literal so the check follows any change to dt1.
print(dates[dt1], dates[dt1+251], dates[dt1+273])

2014-01-06 2015-01-05 2015-02-05


In [17]:
# Same sanity check for the second window, which starts 21 trading days later
# (the training loop advances in steps of 21). Indexed from dt1 rather than a
# literal so the check follows any change to dt1.
print(dates[dt1+21], dates[dt1+251+21], dates[dt1+273+21])

2014-02-05 2015-02-04 2015-03-09


In [13]:
# Walk-forward run: every 21 trading days, fit a random-forest regressor on the
# 252 dates starting at dates[i], score every row of the single date dates[i+273],
# print a short report and append that date's predictions to etf_predict_file.
#
# History: a RandomForestClassifier tuned with GridSearchCV (n_estimators,
# max_features, max_depth, criterion) was explored earlier; one recorded best
# setting was {'criterion': 'gini', 'max_depth': 5, 'max_features': None,
# 'n_estimators': 700}. A random train/eval split and a separate one-month
# evaluation window (dates[i+252] .. dates[i+273]) were also tried.
for i in range(dt1, dt_end, 21):
    print("Building model to predict for ",dates[i+273])

    # Row selectors, computed once per window instead of once per selection.
    in_train_window = (etf_data['Date'] >= dates[i]) & (etf_data['Date'] < dates[i+252])  # train with 12 mos
    on_predict_date = etf_data['Date'] == dates[i+273]  # one day; must be 21 days removed from training

    x_train = etf_data.loc[in_train_window, include_columns]
    y_train = etf_data.loc[in_train_window, ['rank_p_L-21']]
    x_test = etf_data.loc[on_predict_date, include_columns]
    y_test = etf_data.loc[on_predict_date, ['rank_p_L-21']]  # regression target for the predicted date
    returns = etf_data.loc[on_predict_date, ['delta_p_L-21']]
    mkt_return = etf_data.loc[on_predict_date, ['ivv_delta_p_L-21']]

    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .values yields the same ndarray on every pandas version.
    x_train_nmpy = x_train.values
    y_train_nmpy = np.ravel(y_train.values)  # flatten the (n, 1) target to 1-D for fit()
    x_test_nmpy = x_test.values
    y_test_nmpy = np.ravel(y_test.values)
    returns_nmpy = returns.values
    mkt_return_nmpy = mkt_return.values

    # criterion is left at its default (mean squared error: spelled 'mse' in older
    # scikit-learn, 'squared_error' in newer releases) and max_features=None uses
    # all features, which is what 'auto' meant for the regressor. This keeps the
    # model identical while running on both old and current scikit-learn.
    rf = RandomForestRegressor(n_estimators=500, n_jobs=32, max_depth=8,
                               max_features=None, random_state=54321)
    rf.fit(x_train_nmpy, y_train_nmpy)

    y_pred_model = rf.predict(x_test_nmpy)

    # Side-by-side table of target, prediction and realised returns, best
    # predicted first.
    y_check = np.column_stack((y_test_nmpy, y_pred_model, mkt_return_nmpy, returns_nmpy))
    y_check_df = pd.DataFrame(y_check, columns=['true_value','pred_value','mkt_return','returns'])
    y_check_df.sort_values('pred_value', inplace=True, ascending=False)

    feat_import = pd.DataFrame({'Feature':include_columns,'Importance': rf.feature_importances_})
    feat_import.sort_values(by='Importance', inplace=True, ascending=False)
    feat_import.reset_index(drop=True, inplace=True)

    train_error = mean_squared_error(y_train_nmpy, rf.predict(x_train_nmpy))
    # Computed on the held-out prediction date (reported as "evaluation error").
    eval_error = mean_squared_error(y_test_nmpy, y_pred_model)

    print('Model Eval Results:')
    print('*****************************')

    print(f'training error: {train_error:.3}')
    print(f'evaluation error: {eval_error:.3}')
    print('Model Variable Importance Top 10:')
    print(feat_import.head(10))
    print('*****************************')
    print("Model Top Five Picks")
    print(" Avg rtn top 5:", round(y_check_df['returns'].iloc[:5].mean(),4))
    print(y_check_df.head(5))
    print("Model Bottom Five Picks")
    print(" Avg rtn bottom 5:", round(y_check_df['returns'].iloc[-5:].mean(),4))
    print(y_check_df.tail(5))
    print('********************************************************')

    # The first window (re)creates the output file with a header row; every
    # later window appends its rows without one.
    is_first_window = (i == dt1)
    predict_data = etf_data.loc[on_predict_date, ['Date','sym','rank_p_L-21']].assign(predict=y_pred_model)
    predict_data.to_csv(etf_predict_file,
                        header=is_first_window,
                        mode='w' if is_first_window else 'a')

Building model to predict for  2015-02-05
Model Eval Results:
*****************************
training error: 0.039
evaluation error: 0.0905
Model Variable Importance Top 10:
                  Feature  Importance
0  ivv_w42_rank_v_L21_avg    0.032759
1  ivv_w42_rank_p_L21_avg    0.031055
2      w42_rank_v_L21_avg    0.025841
3      w42_rank_p_L21_avg    0.025527
4    w252_rank_v_L252_avg    0.021734
5    w252_rank_p_L252_avg    0.020819
6      w21_rank_p_L42_avg    0.012042
7      w21_rank_v_L42_avg    0.011510
8      w84_rank_p_L21_avg    0.009998
9      w10_rank_v_L84_avg    0.009857
*****************************
Model Top Five Picks
 Avg rtn top 5: -0.0455
    true_value  pred_value  mkt_return   returns
32    0.993464    0.790951    0.010741  0.091887
97    0.026144    0.739632    0.010741 -0.089870
54    0.013072    0.734960    0.010741 -0.133239
44    0.104575    0.733633    0.010741 -0.047549
81    0.098039    0.729124    0.010741 -0.048962
Model Bottom Five Picks
 Avg rtn bottom 

Model Eval Results:
*****************************
training error: 0.0356
evaluation error: 0.088
Model Variable Importance Top 10:
                    Feature  Importance
0  ivv_w189_rank_v_L126_avg    0.031798
1  ivv_w189_rank_p_L126_avg    0.031224
2      w252_rank_v_L252_avg    0.028235
3      w252_rank_p_L252_avg    0.026110
4        w21_rank_p_L42_avg    0.025407
5        w21_rank_v_L42_avg    0.025014
6        w21_rank_p_L21_avg    0.023646
7        w21_rank_v_L21_avg    0.023580
8        w10_rank_v_L21_avg    0.014938
9        w10_rank_p_L21_avg    0.014195
*****************************
Model Top Five Picks
 Avg rtn top 5: -0.0753
     true_value  pred_value  mkt_return   returns
90     0.346405    0.672112   -0.076446 -0.083280
87     0.359477    0.659506   -0.076446 -0.081408
135    0.431373    0.650628   -0.076446 -0.078000
106    0.581699    0.648015   -0.076446 -0.069621
121    0.660131    0.644903   -0.076446 -0.064390
Model Bottom Five Picks
 Avg rtn bottom 5: -0.1102
   

Model Eval Results:
*****************************
training error: 0.043
evaluation error: 0.0932
Model Variable Importance Top 10:
                    Feature  Importance
0      w252_rank_p_L252_avg    0.035602
1      w252_rank_v_L252_avg    0.030602
2        w21_rank_p_L21_avg    0.022431
3        w21_rank_v_L21_avg    0.021779
4      w189_rank_v_L252_avg    0.020566
5      w189_rank_p_L252_avg    0.018233
6  ivv_w252_rank_p_L252_avg    0.012715
7  ivv_w252_rank_v_L252_avg    0.011848
8       w63_rank_v_L126_avg    0.011728
9        w10_rank_p_L42_avg    0.011427
*****************************
Model Top Five Picks
 Avg rtn top 5: 0.0678
     true_value  pred_value  mkt_return   returns
110    0.581699    0.623244    0.055758  0.055450
83     0.803922    0.615856    0.055758  0.077209
87     0.921569    0.614514    0.055758  0.096087
150    0.509804    0.603525    0.055758  0.049184
106    0.692810    0.592967    0.055758  0.060871
Model Bottom Five Picks
 Avg rtn bottom 5: 0.0959
    t

Model Eval Results:
*****************************
training error: 0.0406
evaluation error: 0.091
Model Variable Importance Top 10:
                   Feature  Importance
0  ivv_w10_rank_v_L189_avg    0.036611
1  ivv_w10_rank_p_L189_avg    0.036500
2       w21_rank_v_L42_avg    0.032270
3       w21_rank_p_L42_avg    0.031244
4       w63_rank_p_L84_avg    0.018120
5       w84_rank_p_L63_avg    0.016503
6       w63_rank_v_L84_avg    0.015492
7       w84_rank_v_L63_avg    0.015374
8       w84_rank_v_L84_avg    0.014962
9       w84_rank_p_L84_avg    0.013798
*****************************
Model Top Five Picks
 Avg rtn top 5: 0.0014
     true_value  pred_value  mkt_return   returns
140    0.124183    0.576887    0.004421  0.000006
125    0.222222    0.572651    0.004421  0.003602
86     0.150327    0.570291    0.004421  0.000728
5      0.196078    0.569595    0.004421  0.002742
137    0.130719    0.567254    0.004421  0.000119
Model Bottom Five Picks
 Avg rtn bottom 5: 0.0538
     true_value 

Model Eval Results:
*****************************
training error: 0.0366
evaluation error: 0.0949
Model Variable Importance Top 10:
                 Feature  Importance
0   w252_rank_p_L126_avg    0.023956
1   w252_rank_v_L126_avg    0.023326
2    w126_rank_v_L05_avg    0.017281
3    w126_rank_p_L05_avg    0.017237
4  ivv_w5_rank_v_L42_avg    0.016182
5  ivv_w5_rank_p_L42_avg    0.014439
6   w189_rank_v_L189_avg    0.012742
7     w21_rank_p_L42_avg    0.012641
8     w42_rank_v_L42_avg    0.012610
9   w189_rank_p_L189_avg    0.012445
*****************************
Model Top Five Picks
 Avg rtn top 5: 0.0118
    true_value  pred_value  mkt_return   returns
72    0.849673    0.714583    0.033467  0.031213
47    0.810458    0.693497    0.033467  0.029747
51    0.392157    0.682616    0.033467  0.002550
79    0.346405    0.673631    0.033467 -0.000303
61    0.228758    0.653927    0.033467 -0.004268
Model Bottom Five Picks
 Avg rtn bottom 5: -0.0237
    true_value  pred_value  mkt_return   r

Model Eval Results:
*****************************
training error: 0.0408
evaluation error: 0.101
Model Variable Importance Top 10:
                   Feature  Importance
0      w189_rank_p_L05_avg    0.024894
1      w189_rank_v_L05_avg    0.024866
2     w252_rank_v_L252_avg    0.020466
3     w252_rank_p_L252_avg    0.019810
4      w84_rank_v_L189_avg    0.017362
5  ivv_w252_rank_v_L05_avg    0.016457
6  ivv_w252_rank_p_L05_avg    0.016064
7      w84_rank_p_L189_avg    0.014713
8  ivv_w126_rank_v_L05_avg    0.014395
9  ivv_w126_rank_p_L05_avg    0.013868
*****************************
Model Top Five Picks
 Avg rtn top 5: 0.0045
     true_value  pred_value  mkt_return   returns
25     0.398693    0.705955   -0.003647 -0.001242
133    0.843137    0.701253   -0.003647  0.019033
53     0.045752    0.698513   -0.003647 -0.035303
139    0.738562    0.692165   -0.003647  0.008196
87     0.921569    0.680102   -0.003647  0.031861
Model Bottom Five Picks
 Avg rtn bottom 5: 0.0044
    true_value  

Model Eval Results:
*****************************
training error: 0.0362
evaluation error: 0.0837
Model Variable Importance Top 10:
                    Feature  Importance
0       w21_rank_v_L252_avg    0.059641
1       w21_rank_p_L252_avg    0.058053
2   ivv_w63_rank_v_L189_avg    0.020758
3   ivv_w63_rank_p_L189_avg    0.020529
4      w252_rank_p_L252_avg    0.015873
5       w84_rank_v_L126_avg    0.015346
6       w84_rank_p_L126_avg    0.015236
7      w252_rank_v_L252_avg    0.015228
8  ivv_w126_rank_v_L189_avg    0.015038
9  ivv_w126_rank_p_L189_avg    0.013987
*****************************
Model Top Five Picks
 Avg rtn top 5: -0.0093
     true_value  pred_value  mkt_return   returns
133    0.745098    0.731948    0.017953  0.018685
24     0.555556    0.655470    0.017953  0.004421
79     0.666667    0.653721    0.017953  0.013029
58     0.124183    0.653591    0.017953 -0.015653
78     0.006536    0.653083    0.017953 -0.066834
Model Bottom Five Picks
 Avg rtn bottom 5: -0.0189
  