In [1]:
import pandas as pd
import numpy as np
import os
import sklearn
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
import xgboost
from xgboost import XGBClassifier
%matplotlib notebook

In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
data_path = '/home/mvkrein/etf_model/data'
etf_data_file = os.path.join(data_path,'etf_new_var_20180910.csv')
etf_data = pd.read_csv(etf_data_file,index_col=0)

In [4]:
etf_study_file = os.path.join(data_path,'ETF_list_min_6yr_history.csv')
etf_list = pd.read_csv(etf_study_file,index_col=0)
etf_data.sort_values(['Date','sym'],ascending=True,inplace=True)
etf_data.reset_index(drop=True,inplace=True)

In [5]:
drop_columns = ['Date','sym','p', 'v', 'p_L05', 'v_L05', 'p_L10', 'v_L10', 'p_L21', 'v_L21', 'p_L42', 'v_L42', 'p_L63', \
                'v_L63', 'p_L84', 'v_L84', 'p_L126', 'v_L126', 'p_L189', 'v_L189', 'p_L252', 'v_L252',\
                'p_L-21', 'v_L-21', 'delta_p_L05', 'delta_p_L10', 'delta_p_L21', 'delta_p_L42', 'delta_p_L63', \
                'delta_p_L84', 'delta_p_L126', 'delta_p_L189', 'delta_p_L252', 'delta_p_L-21', 'delta_v_L05',\
                'delta_v_L10', 'delta_v_L21', 'delta_v_L42', 'delta_v_L63', 'delta_v_L84', 'delta_v_L126', \
                'delta_v_L189', 'delta_v_L252','rank_p_L-21', 'ivv_delta_p_L-21', 'target']

rank_col = ['rank_p_L05','rank_p_L10','rank_p_L21','rank_p_L42',\
            'rank_p_L63','rank_p_L84','rank_p_L126','rank_p_L189','rank_p_L252']

In [6]:
all_columns = list(etf_data.columns)

In [7]:
include_columns = [x for x in all_columns if x not in drop_columns]
# include_columns = ['rank_p_L05','rank_p_L10','rank_p_L21','rank_p_L42',\
#             'rank_p_L63','rank_p_L84','rank_p_L126','rank_p_L189','rank_p_L252']
# include_columns = ['rank_p_L05','rank_p_L21','rank_p_L42','rank_p_L252']

In [8]:
len(include_columns)

360

In [9]:
dates = list(etf_data['Date'].unique())
# dates[1671]

In [10]:
for i in range(0,1671):
    if dates[i] == '2016-04-07':
        print (i,dates[i])

1071 2016-04-07


In [11]:
symbols = etf_list['Symbol'].unique()

In [12]:
len(symbols)

153

In [13]:
#Each year has 252 trading dates.  Need two years to fully develop variables.
dt1 = 504 #This is the first day that all variables are developed
# dt1 = 504 + 21 + 273 #This is the first day that all variables are developed 2016-04-07
# make all dates relative to dt1
# for one year training - add 252
# to evaluate for one month outside the training window - add 273
# to predict for the first day outside of the evaluation window (have to lag 21) - add 294
dt_end = (len(dates) - 273 - 21) - 1
dt2 = 1071 - 273 #restart for '2018-05-08'
# dt_end = dt1 + 1 #For testing on the first day only
etf_predict_file = os.path.join(data_path,'etf_pred_random_forest_20180927_depth8.csv')

In [14]:
#Run grid search with cross validation to select best parameters
# i = dt1
# x_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),include_columns] #train with 12 mos
# y_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),['target']] #train with 12 mos
# x_train_nmpy = x_train.as_matrix()
# y_train_nmpy = np.ravel(y_train.as_matrix())

# param_grid = { 
# 'n_estimators': [100,300,500,700,1000],
# 'max_features': ['auto','log2',None],
# 'max_depth' : [4,5,6,7,8,None],
# 'criterion' :['gini','entropy']}

# rfc=RandomForestClassifier(n_jobs=32,random_state=54321)
# CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, n_jobs=32, verbose=1)
# CV_rfc.fit(x_train_nmpy, y_train_nmpy)

#print(CV_rfc.best_params_)
#{'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'n_estimators': 700}


In [14]:
#for i in range(dt2,dt_end,21): #for restart
for i in range(dt2,dt_end,21):
    print("Building model to predict for ",dates[i+273])
    x_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),include_columns] #train with 12 mos
    y_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),['target']] #train with 12 mos

#predict one day
    x_test = etf_data.loc[(etf_data['Date']==dates[i+273]),include_columns] #predict one day-must be 21 days removed from training
    y_test = etf_data.loc[(etf_data['Date']==dates[i+273]),['target']] #predict if etf >= market
    returns = etf_data.loc[(etf_data['Date']==dates[i+273]),['delta_p_L-21']]
    mkt_return = etf_data.loc[(etf_data['Date']==dates[i+273]),['ivv_delta_p_L-21']]
    x_train_nmpy = x_train.as_matrix()
    y_train_nmpy = np.ravel(y_train.as_matrix())
#     x_eval_nmpy = x_eval.as_matrix()
#     y_eval_nmpy = y_eval.as_matrix()
    x_test_nmpy = x_test.as_matrix()
    y_test_nmpy = np.ravel(y_test.as_matrix())
    returns_nmpy = returns.as_matrix()
    mkt_return_nmpy = mkt_return.as_matrix()
    
#     rf = RandomForestClassifier(**CV_rfc.best_params_,n_jobs=32,random_state=54321,verbose=1)

    rf = RandomForestClassifier(n_estimators=700, n_jobs=32, max_depth=8, criterion='gini',
                                verbose=0,max_features=None,random_state=54321)
    rf.fit(x_train_nmpy, y_train_nmpy)
    
    y_pred_model = rf.predict_proba(x_test_nmpy)[:,1]
    y_train_model = rf.predict(x_train_nmpy)
    y_test_model = rf.predict(x_test_nmpy)
    
    feat_import = pd.DataFrame({'Feature':include_columns,'Importance': rf.feature_importances_})
    feat_import.sort_values(by='Importance',inplace=True,ascending=False) 
    feat_import.reset_index(drop=True,inplace=True)

    y_check = np.column_stack((y_test_nmpy, y_pred_model,mkt_return_nmpy,returns_nmpy))
    y_check_df = pd.DataFrame(y_check,columns=['true_value','pred_value','mkt_return','returns'])
    y_check_df.sort_values('pred_value',inplace=True,ascending=False)
    y_check_df.reset_index(inplace=True,drop=True)
    t_hold = y_check_df['pred_value'][14]
    
    print('Model Variable Importance Top 10:')
    print(feat_import.iloc[0:10,])
    print('*****************************')
    print('roc_auc_score on training: {:.4f}'.format(roc_auc_score(y_train_nmpy,y_train_model)))
    print('roc_auc_score on testing: {:.4f}'.format(roc_auc_score(y_test_nmpy,y_test_model)))
    print('*****************************')
    print('Confusion Matrix Threshold: {:.6f}'.format(t_hold))
    print('*****************************')
    print('Model Eval Results:')
    print('*****************************')
    confusion = confusion_matrix(y_test_nmpy, (y_pred_model>=t_hold).astype(int))
#     fpr, tpr, thresholds = roc_curve(y_test_nmpy, (y_model_array>0.5).astype(int), pos_label=1)
    precision = confusion[1,1]/(confusion[1,1]+confusion[0,1])
    print('Confusion Matrix:\n',confusion)
    print('Precision:',round(precision,4))
    print("Model Top Five Picks")
    print(" Avg rtn top 5:", round(y_check_df['returns'][0:5].mean(),4))
    print(y_check_df.head(5))
    print("Model 2nd Five Picks")
    print(" Avg rtn 2nd 5:", round(y_check_df['returns'][5:10].mean(),4))
    print(y_check_df.iloc[5:10,])
    print("Model 3rd Five Picks")
    print(" Avg rtn 3rd 5:", round(y_check_df['returns'][10:15].mean(),4))
    print(y_check_df.iloc[10:15,])    
    print("Model Bottom Five Picks")
    print(" Avg rtn bottom 5:", round(y_check_df['returns'][-5:].mean(),4))
    print(y_check_df.tail(5))
    print('********************************************************')
    if i == dt1:
        predict_data = etf_data.loc[(etf_data['Date']==dates[i+273]),['Date','sym','rank_p_L-21']]
        predict_data['predict'] = y_pred_model
        predict_data.to_csv(etf_predict_file)
    else:
        predict_data = etf_data.loc[(etf_data['Date']==dates[i+273]),['Date','sym','rank_p_L-21']]
        predict_data['predict'] = y_pred_model
        predict_data.to_csv(etf_predict_file,header=False,mode='a')

Building model to predict for  2016-04-07
Model Variable Importance Top 10:
                   Feature  Importance
0     w252_rank_v_L252_avg    0.023874
1     w252_rank_p_L252_avg    0.022771
2       w21_rank_v_L42_avg    0.018086
3       w21_rank_p_L42_avg    0.017960
4  ivv_w42_rank_v_L189_avg    0.017607
5  ivv_w42_rank_p_L189_avg    0.017142
6       w84_rank_p_L21_avg    0.015891
7       w84_rank_v_L21_avg    0.015470
8       w84_rank_p_L63_avg    0.014362
9   ivv_w21_rank_p_L84_avg    0.013907
*****************************
roc_auc_score on training: 0.8308
roc_auc_score on testing: 0.4675
*****************************
Confusion Matrix Threshold: 0.502893
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[55  8]
 [83  7]]
Precision: 0.4667
Model Top Five Picks
 Avg rtn top 5: 0.0066
   true_value  pred_value  mkt_return   returns
0         1.0    0.617652    0.008977  0.031162
1         0.0    0.598084    0.008977 -0.006562
2      

Model Variable Importance Top 10:
                   Feature  Importance
0  ivv_w42_rank_p_L189_avg    0.029163
1  ivv_w42_rank_v_L189_avg    0.027441
2  ivv_w189_rank_v_L42_avg    0.022062
3  ivv_w189_rank_p_L42_avg    0.020409
4  ivv_w10_rank_p_L189_avg    0.015766
5       w21_rank_p_L42_avg    0.015167
6  ivv_w10_rank_v_L189_avg    0.015107
7       w21_rank_v_L42_avg    0.014162
8     w252_rank_v_L252_avg    0.014073
9     w252_rank_p_L252_avg    0.013833
*****************************
roc_auc_score on training: 0.8245
roc_auc_score on testing: 0.6242
*****************************
Confusion Matrix Threshold: 0.563144
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[ 37   2]
 [101  13]]
Precision: 0.8667
Model Top Five Picks
 Avg rtn top 5: 0.0406
   true_value  pred_value  mkt_return   returns
0         1.0    0.592686    0.004421  0.024905
1         1.0    0.589547    0.004421  0.097877
2         1.0    0.574165    0.004421  0.0160

Model Variable Importance Top 10:
                   Feature  Importance
0  ivv_w63_rank_p_L126_avg    0.022934
1  ivv_w63_rank_v_L126_avg    0.021285
2       w21_rank_p_L42_avg    0.020197
3       w42_rank_v_L84_avg    0.018919
4       w21_rank_v_L42_avg    0.018363
5       w42_rank_p_L84_avg    0.017146
6  ivv_w42_rank_p_L189_avg    0.013306
7  ivv_w42_rank_v_L189_avg    0.013240
8     w126_rank_v_L126_avg    0.012745
9     w126_rank_p_L126_avg    0.012558
*****************************
roc_auc_score on training: 0.8213
roc_auc_score on testing: 0.4818
*****************************
Confusion Matrix Threshold: 0.655357
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[71 12]
 [67  3]]
Precision: 0.2
Model Top Five Picks
 Avg rtn top 5: 0.029
   true_value  pred_value  mkt_return   returns
0         0.0    0.716000    0.030848  0.029975
1         0.0    0.707344    0.030848  0.029843
2         1.0    0.700542    0.030848  0.030848
3    

Model Variable Importance Top 10:
                   Feature  Importance
0  ivv_w252_rank_v_L05_avg    0.053961
1  ivv_w252_rank_p_L05_avg    0.053224
2      w21_rank_p_L252_avg    0.030099
3      w21_rank_v_L252_avg    0.029466
4   ivv_w42_rank_p_L42_avg    0.029156
5   ivv_w42_rank_v_L42_avg    0.028964
6               rank_v_L10    0.015072
7               rank_p_L10    0.014812
8      w126_rank_p_L05_avg    0.010401
9      w63_rank_v_L252_avg    0.010081
*****************************
roc_auc_score on training: 0.8440
roc_auc_score on testing: 0.5745
*****************************
Confusion Matrix Threshold: 0.611691
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[76  8]
 [62  7]]
Precision: 0.4667
Model Top Five Picks
 Avg rtn top 5: 0.0116
   true_value  pred_value  mkt_return   returns
0         1.0    0.721616    0.018024  0.038997
1         0.0    0.703516    0.018024  0.004267
2         0.0    0.695073    0.018024 -0.015099
3

Model Variable Importance Top 10:
                   Feature  Importance
0  ivv_w252_rank_p_L05_avg    0.035024
1  ivv_w252_rank_v_L05_avg    0.033908
2  ivv_w63_rank_p_L189_avg    0.025083
3  ivv_w63_rank_v_L189_avg    0.024534
4      w63_rank_p_L252_avg    0.020094
5     w252_rank_p_L252_avg    0.019518
6      w63_rank_v_L252_avg    0.018748
7     w252_rank_v_L252_avg    0.018578
8  ivv_w10_rank_v_L126_avg    0.016274
9  ivv_w10_rank_p_L126_avg    0.014901
*****************************
roc_auc_score on training: 0.8707
roc_auc_score on testing: 0.3725
*****************************
Confusion Matrix Threshold: 0.573746
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[46  5]
 [92 10]]
Precision: 0.6667
Model Top Five Picks
 Avg rtn top 5: -0.0028
   true_value  pred_value  mkt_return   returns
0         0.0    0.657866   -0.003647 -0.035303
1         1.0    0.648236   -0.003647 -0.001206
2         1.0    0.644790   -0.003647  0.010811


Model Variable Importance Top 10:
                   Feature  Importance
0  ivv_w42_rank_v_L189_avg    0.054450
1  ivv_w42_rank_p_L189_avg    0.051230
2   ivv_w63_rank_v_L42_avg    0.029369
3   ivv_w63_rank_p_L42_avg    0.028745
4   ivv_w63_rank_v_L21_avg    0.019558
5   ivv_w63_rank_p_L21_avg    0.019123
6        w5_rank_v_L84_avg    0.017634
7       w84_rank_v_L21_avg    0.016609
8        w5_rank_p_L84_avg    0.016237
9       w84_rank_p_L21_avg    0.015853
*****************************
roc_auc_score on training: 0.8542
roc_auc_score on testing: 0.5427
*****************************
Confusion Matrix Threshold: 0.504921
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[74  5]
 [64 10]]
Precision: 0.6667
Model Top Five Picks
 Avg rtn top 5: 0.0587
   true_value  pred_value  mkt_return   returns
0         1.0    0.643982    0.043916  0.137926
1         1.0    0.589873    0.043916  0.053191
2         1.0    0.558382    0.043916  0.048639
3

Model Variable Importance Top 10:
                   Feature  Importance
0  ivv_w42_rank_p_L189_avg    0.046900
1  ivv_w42_rank_v_L189_avg    0.045668
2  ivv_w84_rank_p_L126_avg    0.035581
3  ivv_w84_rank_v_L126_avg    0.031360
4       w21_rank_v_L05_avg    0.019456
5       w21_rank_p_L05_avg    0.018685
6     w189_rank_p_L252_avg    0.016059
7     w189_rank_v_L252_avg    0.014988
8     w252_rank_v_L252_avg    0.014294
9     w252_rank_p_L252_avg    0.013821
*****************************
roc_auc_score on training: 0.8583
roc_auc_score on testing: 0.4825
*****************************
Confusion Matrix Threshold: 0.857671
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[97 15]
 [41  0]]
Precision: 0.0
Model Top Five Picks
 Avg rtn top 5: -0.0098
   true_value  pred_value  mkt_return   returns
0         0.0    0.895836    0.022764 -0.032006
1         0.0    0.891691    0.022764  0.000923
2         0.0    0.890619    0.022764 -0.003213
3  