In [1]:
import os
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import xgboost
from xgboost import XGBClassifier
from xgboostextension import XGBRanker
#%matplotlib notebook

In [2]:
# Widen pandas' console output so large frames print without truncation.
for _opt_name, _opt_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 2000),
    ('display.width', 2000),
):
    pd.set_option(_opt_name, _opt_value)

# Turn off the chained-assignment check (pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [3]:
# Directory that holds the ETF input/output CSV files.
# Set the ETF_DATA_PATH environment variable to run on another machine;
# when it is unset the original hard-coded location is used.
data_path = os.environ.get('ETF_DATA_PATH', '/home/mvkrein/etf_model/data')
etf_data_file = os.path.join(data_path,'etf_new_var_20180910.csv')
# The first CSV column is used as the row index.
etf_data = pd.read_csv(etf_data_file,index_col=0)

In [4]:
# Load the ETF study list (first CSV column becomes the index).
etf_study_file = os.path.join(data_path, 'ETF_list_min_6yr_history.csv')
etf_list = pd.read_csv(etf_study_file, index_col=0)

# Order the observations by date and then symbol, and renumber the rows 0..n-1
# so positional order matches chronological order.
etf_data = (
    etf_data
    .sort_values(['Date', 'sym'], ascending=True)
    .reset_index(drop=True)
)

In [5]:
# Lag suffixes shared by the price/volume/rank column families.
_lags = ['05', '10', '21', '42', '63', '84', '126', '189', '252']

# Columns excluded from the feature set: identifiers, raw and lagged
# price/volume, their deltas, and every '-21' (forward-looking) column
# together with the target itself.
drop_columns = (
    ['Date', 'sym', 'p', 'v']
    + ['{}_L{}'.format(kind, lag) for lag in _lags for kind in ('p', 'v')]
    + ['p_L-21', 'v_L-21']
    + ['delta_p_L{}'.format(lag) for lag in _lags]
    + ['delta_p_L-21']
    + ['delta_v_L{}'.format(lag) for lag in _lags]
    + ['rank_p_L-21', 'ivv_delta_p_L-21', 'target']
)

# Price-rank columns, one per lag.
rank_col = ['rank_p_L{}'.format(lag) for lag in _lags]

In [6]:
# Every column name present in the loaded data set, in file order.
all_columns = etf_data.columns.tolist()

In [7]:
# Model features: every data column that is not listed in drop_columns,
# kept in the original column order.
_excluded = set(drop_columns)
include_columns = [col for col in all_columns if col not in _excluded]
# Narrower feature sets that can be swapped in instead:
# include_columns = ['rank_p_L05','rank_p_L10','rank_p_L21','rank_p_L42',\
#             'rank_p_L63','rank_p_L84','rank_p_L126','rank_p_L189','rank_p_L252']
# include_columns = ['rank_p_L05','rank_p_L21','rank_p_L42','rank_p_L252']

In [8]:
# Distinct dates in order of first appearance; the frame was sorted by Date
# earlier, so list position doubles as a trading-day index.
dates = list(pd.unique(etf_data['Date']))
# dates[1671]

In [9]:
#Each year has 252 trading dates.  Need two years to fully develop variables.
dt1 = 504 #This is the first day that all variables are developed
# make all dates relative to dt1
# for one year training - add 252
# to evaluate for one month outside the training window - add 273
# to predict for the first day outside of the evaluation window (have to lag 21) - add 294
dt_end = (len(dates) - 273 - 21) - 1
# dt_end = 505
rpeat = len(etf_data['sym'].unique())
etf_predict_file = os.path.join(data_path,'etf_pred_xgboost_binary_reg_20180928.csv')

In [11]:
# Walk-forward back-test.
# Every 21 dates a new binary XGBoost model is fitted on a 231-date window,
# early-stopped on the 21 dates that follow, and used to score every ETF on the
# single date 273 positions after the window start. Per-date diagnostics are
# printed and the predictions are appended to `etf_predict_file`.

# Window geometry (positions in `dates`, relative to the window start i).
train_days = 231       # rows with dates[i] <= Date < dates[i+231] are used for fitting
eval_days = 21         # the following 21 dates drive early stopping
predict_offset = 273   # scored date; 21 dates after the end of the eval window (i+252)

# Booster parameters are identical for every window, so build them once.
# eta is left at 0.3; the remaining values add regularisation.
param = {'eta':0.3,'seed':54321,'objective':'binary:logistic','max_depth':4, 'subsample': 0.8,
         'scale_pos_weight':1,'colsample_bytree':0.8,'min_child_weight':1,'gamma':10,
         'nthread':32,            # change this to the number of cores on your machine
         'eval_metric':'auc','lambda':5,'alpha':2,
         'random_state':54321,    # random number to start
         'silent':1}              # set to 1 if you don't want all the output
num_round = 500 #max number of rounds -- shouldn't ever get there with early stopping

date_col = etf_data['Date']

for i in range(dt1,dt_end,21):
    print("Building model to predict for ",dates[i+predict_offset])

    # Row selectors for the three slices of this window.
    train_rows = (date_col>=dates[i]) & (date_col<dates[i+train_days])
    eval_rows = (date_col>=dates[i+train_days]) & (date_col<dates[i+train_days+eval_days])
    test_rows = (date_col==dates[i+predict_offset])

    x_train = etf_data.loc[train_rows,include_columns] #train with 231 trading days
    y_train = etf_data.loc[train_rows,['target']]
    x_eval = etf_data.loc[eval_rows,include_columns] #eval with 1 mos
    # NOTE(review): the eval label is 'rank_p_L-21' while training uses the
    # binary 'target'; with objective 'binary:logistic' and metric 'auc' the
    # eval label would normally be 'target' too -- confirm this is intended.
    y_eval = etf_data.loc[eval_rows,['rank_p_L-21']]
    #predict one day
    x_test = etf_data.loc[test_rows,include_columns] #predict one day-must be 21 days removed from training
    y_test = etf_data.loc[test_rows,['target']] #predict if etf >= market
    returns = etf_data.loc[test_rows,['delta_p_L-21']]
    mkt_return = etf_data.loc[test_rows,['ivv_delta_p_L-21']]

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    x_train_nmpy = x_train.values
    y_train_nmpy = y_train.values
    x_eval_nmpy = x_eval.values
    y_eval_nmpy = y_eval.values
    x_test_nmpy = x_test.values
    y_test_nmpy = y_test.values
    returns_nmpy = returns.values
    mkt_return_nmpy = mkt_return.values

    # One group per date; assumes every date has exactly `rpeat` rows.
    x_train_groups = np.repeat(rpeat,train_days)
    x_eval_groups = np.repeat(rpeat,eval_days)
    x_test_groups = np.repeat(rpeat,1)

    train_xgb = xgboost.DMatrix(x_train_nmpy,label=y_train_nmpy,feature_names=include_columns)
    train_xgb.set_group(x_train_groups)
    eval_xgb = xgboost.DMatrix(x_eval_nmpy,label=y_eval_nmpy,feature_names=include_columns)
    eval_xgb.set_group(x_eval_groups)
    pred_xgb = xgboost.DMatrix(x_test_nmpy,feature_names=include_columns)
    pred_xgb.set_group(x_test_groups)

    # The last entry of evallist ('eval') is the one early stopping monitors.
    evallist = [(train_xgb,'train'),(eval_xgb,'eval')]
    evals_result = {}
    bst = xgboost.train(param,train_xgb,num_round,evals=evallist,early_stopping_rounds=30,
                        verbose_eval=False,evals_result=evals_result)

    # NOTE(review): training continues up to 30 rounds past the best iteration;
    # whether predict() uses the best iteration or all rounds depends on the
    # installed xgboost version -- confirm.
    y_model = bst.predict(pred_xgb)
    y_model_array = y_model.reshape((y_test_nmpy.shape))

    # Side-by-side table of truth, score and realised returns, best score first.
    y_check = np.column_stack((y_test_nmpy, y_model_array,mkt_return_nmpy,returns_nmpy))
    y_check_df = pd.DataFrame(y_check,columns=['true_value','model_value','mkt_return','returns'])
    y_check_df.sort_values('model_value',inplace=True,ascending=False)

    var_fscore = pd.DataFrame.from_dict(bst.get_fscore(),orient='index')
    var_fscore.columns = ['fscore']
    var_fscore.sort_values(by='fscore',ascending=False,inplace=True)

    # Total rounds actually run, and the metrics at the best iteration.
    number_of_rounds = len(evals_result['train']['auc'])
    train_error = evals_result['train']['auc'][bst.best_iteration]
    eval_error = evals_result['eval']['auc'][bst.best_iteration]
    print('Model Eval Results:')
    print('Training auc: ',train_error)
    print('Evaluation auc: ',eval_error)
    print('Number of rounds: ',number_of_rounds)
    print('Model Variable Importance:')
    print(var_fscore.iloc[0:10])
    print('*****************************')

    # labels=[0, 1] forces a 2x2 matrix even when a class is absent on this date.
    confusion = confusion_matrix(y_test_nmpy.ravel(), (y_model>=0.8).astype(int), labels=[0, 1])
    # Precision is undefined when nothing clears the 0.8 threshold; report nan
    # explicitly instead of dividing 0 by 0.
    predicted_positive = confusion[1,1]+confusion[0,1]
    precision = confusion[1,1]/predicted_positive if predicted_positive > 0 else float('nan')
    print('xgboost prediction - confusion matrix\n', confusion,'\nprecision: ',round(precision,3))
    print('*****************************')
    print("Model Top 5 Picks")
    print(" Avg rtn top 5:", round(y_check_df['returns'][0:5].mean(),4))
    print(y_check_df.head(5))
    print("Model Bottom 5 Picks")
    print(" Avg rtn bottom 5:", round(y_check_df['returns'][-5:].mean(),4))
    print(y_check_df.tail(5))
    print('********************************************************')

    # First window creates the file with a header; later windows append rows.
    first_window = (i == dt1)
    predict_data = etf_data.loc[test_rows,['Date','sym','target']].copy()
    predict_data['predict'] = y_model
    predict_data.to_csv(etf_predict_file,header=first_window,mode='w' if first_window else 'a')

Building model to predict for  2015-02-05
Model Eval Results:
Training auc:  0.976447
Evaluation auc:  0.560165
Number of rounds:  224
Model Variable Importance:
                      fscore
w252_rank_p_L252_avg      59
w189_rank_p_L252_avg      29
w189_rank_p_L42_avg       29
w42_rank_p_L21_avg        26
w63_rank_p_L63_avg        25
w252_rank_p_L126_avg      22
w252_rank_p_L21_avg       22
w189_rank_p_L21_avg       21
w84_rank_p_L252_avg       20
w252_rank_p_L189_avg      20
*****************************
xgboost prediction - confusion matrix
 [[106   0]
 [ 41   6]] 
precision:  1.0
*****************************
Model Top 5 Picks
 Avg rtn top 5: 0.0411
     true_value  model_value  mkt_return   returns
70          1.0     0.963484    0.010741  0.071770
83          1.0     0.950708    0.010741  0.038518
88          1.0     0.907849    0.010741  0.069374
129         1.0     0.866472    0.010741  0.013101
113         1.0     0.810586    0.010741  0.012604
Model Bottom 5 Picks
 Avg rtn bot

Model Eval Results:
Training auc:  0.949889
Evaluation auc:  0.582422
Number of rounds:  102
Model Variable Importance:
                      fscore
w252_rank_p_L252_avg      43
w21_rank_p_L21_avg        21
w252_rank_p_L63_avg       20
w252_rank_p_L126_avg      19
w252_rank_p_L84_avg       16
w42_rank_p_L42_avg        16
w84_rank_p_L84_avg        15
w252_rank_p_L05_avg       15
w42_rank_p_L21_avg        15
w252_rank_p_L189_avg      15
*****************************
xgboost prediction - confusion matrix
 [[69  1]
 [82  1]] 
precision:  0.5
*****************************
Model Top 5 Picks
 Avg rtn top 5: -0.073
     true_value  model_value  mkt_return   returns
55          0.0     0.811831   -0.076446 -0.131111
121         1.0     0.800127   -0.076446 -0.064390
132         1.0     0.782817   -0.076446 -0.033645
105         1.0     0.733822   -0.076446 -0.075197
122         1.0     0.733147   -0.076446 -0.060650
Model Bottom 5 Picks
 Avg rtn bottom 5: -0.1249
     true_value  model_value  m



Model Eval Results:
Training auc:  0.876377
Evaluation auc:  0.597048
Number of rounds:  50
Model Variable Importance:
                          fscore
w252_rank_p_L252_avg          19
w252_rank_p_L126_avg          16
w189_rank_p_L252_avg          11
ivv_w189_rank_p_L126_avg      11
rank_p_L05                    10
w21_rank_p_L21_avg             9
w42_rank_p_L05_avg             9
w126_rank_p_L05_avg            9
w84_rank_p_L63_avg             8
w42_rank_v_L42_avg             8
*****************************
xgboost prediction - confusion matrix
 [[60  0]
 [93  0]] 
precision:  nan
*****************************
Model Top 5 Picks
 Avg rtn top 5: -0.0601
     true_value  model_value  mkt_return   returns
93          0.0     0.799279   -0.054663 -0.071105
90          0.0     0.752710   -0.054663 -0.065998
97          1.0     0.745717   -0.054663  0.001966
104         0.0     0.741275   -0.054663 -0.107185
78          0.0     0.711851   -0.054663 -0.058267
Model Bottom 5 Picks
 Avg rtn botto

Model Eval Results:
Training auc:  0.962091
Evaluation auc:  0.569914
Number of rounds:  135
Model Variable Importance:
                      fscore
w252_rank_p_L252_avg      43
w252_rank_p_L189_avg      28
w126_rank_p_L63_avg       20
w189_rank_p_L252_avg      20
w252_rank_p_L42_avg       20
w84_rank_p_L05_avg        18
w126_rank_p_L05_avg       18
w252_rank_p_L126_avg      17
w126_rank_p_L42_avg       17
w84_rank_p_L21_avg        17
*****************************
xgboost prediction - confusion matrix
 [[57  4]
 [87  5]] 
precision:  0.556
*****************************
Model Top 5 Picks
 Avg rtn top 5: 0.0716
    true_value  model_value  mkt_return   returns
83         1.0     0.894033    0.041489  0.085523
15         0.0     0.875649    0.041489  0.040554
25         1.0     0.862342    0.041489  0.052247
38         1.0     0.856404    0.041489  0.102897
45         1.0     0.856001    0.041489  0.076716
Model Bottom 5 Picks
 Avg rtn bottom 5: 0.0048
     true_value  model_value  mkt_re

Model Eval Results:
Training auc:  0.843735
Evaluation auc:  0.617806
Number of rounds:  47
Model Variable Importance:
                      fscore
w84_rank_p_L42_avg        12
w252_rank_p_L252_avg      12
w63_rank_p_L63_avg         9
w126_rank_p_L252_avg       9
w189_rank_p_L84_avg        9
w252_rank_p_L189_avg       9
w126_rank_p_L189_avg       9
w189_rank_p_L42_avg        9
w63_rank_p_L126_avg        8
w189_rank_p_L05_avg        8
*****************************
xgboost prediction - confusion matrix
 [[82  0]
 [71  0]] 
precision:  nan
*****************************
Model Top 5 Picks
 Avg rtn top 5: 0.0228
     true_value  model_value  mkt_return   returns
84          1.0     0.794751    0.011592  0.017468
66          1.0     0.764820    0.011592  0.025671
117         0.0     0.733234    0.011592  0.010482
135         1.0     0.726794    0.011592  0.018192
83          1.0     0.714155    0.011592  0.042040
Model Bottom 5 Picks
 Avg rtn bottom 5: 0.0083
     true_value  model_value  mkt

Model Eval Results:
Training auc:  0.954535
Evaluation auc:  0.587712
Number of rounds:  114
Model Variable Importance:
                      fscore
w252_rank_p_L252_avg      32
w189_rank_p_L189_avg      23
w189_rank_p_L252_avg      22
w21_rank_p_L21_avg        18
w63_rank_p_L126_avg       17
w126_rank_p_L126_avg      17
w252_rank_p_L42_avg       16
w252_rank_p_L21_avg       16
w63_rank_p_L63_avg        16
w126_rank_p_L252_avg      15
*****************************
xgboost prediction - confusion matrix
 [[76  1]
 [75  1]] 
precision:  0.5
*****************************
Model Top 5 Picks
 Avg rtn top 5: 0.0465
     true_value  model_value  mkt_return   returns
134         1.0     0.838012     0.02384  0.057068
145         0.0     0.835141     0.02384  0.014380
80          1.0     0.763814     0.02384  0.049929
46          1.0     0.752847     0.02384  0.090909
142         0.0     0.752639     0.02384  0.020180
Model Bottom 5 Picks
 Avg rtn bottom 5: 0.0136
    true_value  model_value  mkt

Model Eval Results:
Training auc:  0.871452
Evaluation auc:  0.612633
Number of rounds:  50
Model Variable Importance:
                      fscore
w252_rank_p_L252_avg      22
w126_rank_p_L252_avg      11
w252_rank_p_L21_avg       11
w126_rank_p_L126_avg      10
w252_rank_p_L84_avg       10
w252_rank_p_L10_avg        9
w21_rank_p_L21_avg         8
w10_rank_p_L42_avg         8
w189_rank_p_L126_avg       8
w10_rank_p_L10_avg         8
*****************************
xgboost prediction - confusion matrix
 [[77  4]
 [65  7]] 
precision:  0.636
*****************************
Model Top 5 Picks
 Avg rtn top 5: -0.0139
     true_value  model_value  mkt_return   returns
138         0.0     0.875298   -0.014782 -0.035142
137         1.0     0.868354   -0.014782 -0.001240
34          0.0     0.849306   -0.014782 -0.021213
74          1.0     0.840786   -0.014782 -0.010541
86          1.0     0.840782   -0.014782 -0.001512
Model Bottom 5 Picks
 Avg rtn bottom 5: -0.0171
     true_value  model_value 

Model Eval Results:
Training auc:  0.844852
Evaluation auc:  0.473767
Number of rounds:  43
Model Variable Importance:
                         fscore
w252_rank_p_L252_avg         16
w252_rank_p_L63_avg          11
w189_rank_p_L252_avg         11
w126_rank_p_L42_avg          11
ivv_w21_rank_p_L252_avg      10
w252_rank_p_L10_avg           9
w63_rank_p_L05_avg            8
w84_rank_p_L42_avg            8
w189_rank_p_L189_avg          7
w126_rank_p_L126_avg          7
*****************************
xgboost prediction - confusion matrix
 [[83 41]
 [19 10]] 
precision:  0.196
*****************************
Model Top 5 Picks
 Avg rtn top 5: 0.007
    true_value  model_value  mkt_return   returns
11         0.0     0.973264    0.027376 -0.014706
16         0.0     0.964194    0.027376  0.002942
46         1.0     0.961844    0.027376  0.035387
17         0.0     0.960229    0.027376  0.012281
6          0.0     0.938943    0.027376 -0.001107
Model Bottom 5 Picks
 Avg rtn bottom 5: 0.0194
     