In [1]:
import pandas as pd
import numpy as np
import os
import sklearn
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
import xgboost
from xgboost import XGBClassifier
%matplotlib notebook

In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
data_path = '/home/mvkrein/etf_model/data'
etf_data_file = os.path.join(data_path,'etf_new_var_20180910.csv')
etf_data = pd.read_csv(etf_data_file,index_col=0)

In [4]:
etf_study_file = os.path.join(data_path,'ETF_list_min_6yr_history.csv')
etf_list = pd.read_csv(etf_study_file,index_col=0)
etf_data.sort_values(['Date','sym'],ascending=True,inplace=True)
etf_data.reset_index(drop=True,inplace=True)

In [5]:
drop_columns = ['Date','sym','p', 'v', 'p_L05', 'v_L05', 'p_L10', 'v_L10', 'p_L21', 'v_L21', 'p_L42', 'v_L42', 'p_L63', \
                'v_L63', 'p_L84', 'v_L84', 'p_L126', 'v_L126', 'p_L189', 'v_L189', 'p_L252', 'v_L252',\
                'p_L-21', 'v_L-21', 'delta_p_L05', 'delta_p_L10', 'delta_p_L21', 'delta_p_L42', 'delta_p_L63', \
                'delta_p_L84', 'delta_p_L126', 'delta_p_L189', 'delta_p_L252', 'delta_p_L-21', 'delta_v_L05',\
                'delta_v_L10', 'delta_v_L21', 'delta_v_L42', 'delta_v_L63', 'delta_v_L84', 'delta_v_L126', \
                'delta_v_L189', 'delta_v_L252','rank_p_L-21', 'ivv_delta_p_L-21', 'target']

rank_col = ['rank_p_L05','rank_p_L10','rank_p_L21','rank_p_L42',\
            'rank_p_L63','rank_p_L84','rank_p_L126','rank_p_L189','rank_p_L252']

In [6]:
all_columns = list(etf_data.columns)

In [7]:
include_columns = [x for x in all_columns if x not in drop_columns]
# include_columns = ['rank_p_L05','rank_p_L10','rank_p_L21','rank_p_L42',\
#             'rank_p_L63','rank_p_L84','rank_p_L126','rank_p_L189','rank_p_L252']
# include_columns = ['rank_p_L05','rank_p_L21','rank_p_L42','rank_p_L252']

In [8]:
len(include_columns)

360

In [9]:
dates = list(etf_data['Date'].unique())
# dates[1671]

In [10]:
for i in range(0,1671):
    if dates[i] == '2018-07-09':
        print (i,dates[i])

1638 2018-07-09


In [11]:
symbols = etf_list['Symbol'].unique()

In [12]:
len(symbols)

153

In [22]:
#Each year has 252 trading dates.  Need two years to fully develop variables.
dt1 = 504 #This is the first day that all variables are developed
# dt1 = 504 + 21 + 273 #This is the first day that all variables are developed 2016-04-07
# make all dates relative to dt1
# for one year training - add 252
# to evaluate for one month outside the training window - add 273
# to predict for the first day outside of the evaluation window (have to lag 21) - add 294
dt_end = (len(dates) - 273 - 21) - 1
# dt2 = 1638 - 273 #restart for '2018-05-08'
# dt_end = dt1 + 1 #For testing on the first day only
etf_predict_file = os.path.join(data_path,'etf_pred_logistic_reg_L2_20180928.csv')

In [14]:
#Run grid search with cross validation to select best parameters
# i = dt1
# x_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),include_columns] #train with 12 mos
# y_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),['target']] #train with 12 mos
# x_train_nmpy = x_train.as_matrix()
# y_train_nmpy = np.ravel(y_train.as_matrix())

# param_grid = { 
# 'n_estimators': [100,300,500,700,1000],
# 'max_features': ['auto','log2',None],
# 'max_depth' : [4,5,6,7,8,None],
# 'criterion' :['gini','entropy']}

# rfc=RandomForestClassifier(n_jobs=32,random_state=54321)
# CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, n_jobs=32, verbose=1)
# CV_rfc.fit(x_train_nmpy, y_train_nmpy)

#print(CV_rfc.best_params_)
#{'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'n_estimators': 700}


In [27]:
# for i in range(dt2,dt_end,21): #for restart
for i in range(dt1,dt_end,21):
    print("Building model to predict for ",dates[i+273])
    x_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),include_columns] #train with 12 mos
    y_train = etf_data.loc[((etf_data['Date']>=dates[i]) & (etf_data['Date']<dates[i+252])),['target']] #train with 12 mos

#predict one day
    x_test = etf_data.loc[(etf_data['Date']==dates[i+273]),include_columns] #predict one day-must be 21 days removed from training
    y_test = etf_data.loc[(etf_data['Date']==dates[i+273]),['target']] #predict if etf >= market
    returns = etf_data.loc[(etf_data['Date']==dates[i+273]),['delta_p_L-21']]
    mkt_return = etf_data.loc[(etf_data['Date']==dates[i+273]),['ivv_delta_p_L-21']]
    x_train_nmpy = x_train.as_matrix()
    y_train_nmpy = np.ravel(y_train.as_matrix())
#     x_eval_nmpy = x_eval.as_matrix()
#     y_eval_nmpy = y_eval.as_matrix()
    x_test_nmpy = x_test.as_matrix()
    y_test_nmpy = np.ravel(y_test.as_matrix())
    returns_nmpy = returns.as_matrix()
    mkt_return_nmpy = mkt_return.as_matrix()
    
    lr = LogisticRegression(penalty='l2',C=0.8,random_state=54321)

    lr.fit(x_train_nmpy, y_train_nmpy)
    
    y_pred_model = lr.predict_proba(x_test_nmpy)[:,1]
    y_train_model = lr.predict(x_train_nmpy)
    y_test_model = lr.predict(x_test_nmpy)

    y_check = np.column_stack((y_test_nmpy, y_pred_model,mkt_return_nmpy,returns_nmpy))
    y_check_df = pd.DataFrame(y_check,columns=['true_value','pred_value','mkt_return','returns'])
    y_check_df.sort_values('pred_value',inplace=True,ascending=False)
    y_check_df.reset_index(inplace=True,drop=True)
    t_hold = y_check_df['pred_value'][14]
    
    model_coef = pd.DataFrame({'Feature':include_columns,'Coefficient': np.ravel(lr.coef_)})
    model_coef.sort_values(by='Coefficient',inplace=True,ascending=False) 
    model_coef.reset_index(drop=True,inplace=True)
    print('*****************************')
    print('Variables with largest coefficients:')
    print(model_coef.head(5))
    print(model_coef.tail(5))
    print('*****************************')
    print('roc_auc_score on training: {:.4f}'.format(roc_auc_score(y_train_nmpy,y_train_model)))
    print('roc_auc_score on testing: {:.4f}'.format(roc_auc_score(y_test_nmpy,y_test_model)))
    print('*****************************')
    print('Confusion Matrix Threshold: {:.6f}'.format(t_hold))
    print('*****************************')
    print('Model Eval Results:')
    print('*****************************')
    confusion = confusion_matrix(y_test_nmpy, (y_pred_model>=t_hold).astype(int))
#     fpr, tpr, thresholds = roc_curve(y_test_nmpy, (y_model_array>0.5).astype(int), pos_label=1)
    precision = confusion[1,1]/(confusion[1,1]+confusion[0,1])
    print('Confusion Matrix:\n',confusion)
    print('Precision:',round(precision,4))

    print("Model Top Five Picks")
    print(" Avg rtn top 5:", round(y_check_df['returns'][0:5].mean(),4))
    print(y_check_df.head(5))
    print("Model 2nd Five Picks")
    print(" Avg rtn 2nd 5:", round(y_check_df['returns'][5:10].mean(),4))
    print(y_check_df.iloc[5:10,])
    print("Model 3rd Five Picks")
    print(" Avg rtn 3rd 5:", round(y_check_df['returns'][10:15].mean(),4))
    print(y_check_df.iloc[10:15,])    
    print("Model Bottom Five Picks")
    print(" Avg rtn bottom 5:", round(y_check_df['returns'][-5:].mean(),4))
    print(y_check_df.tail(5))
    print('********************************************************')
    if i == dt1:
        predict_data = etf_data.loc[(etf_data['Date']==dates[i+273]),['Date','sym','rank_p_L-21']]
        predict_data['predict'] = y_pred_model
        predict_data.to_csv(etf_predict_file)
    else:
        predict_data = etf_data.loc[(etf_data['Date']==dates[i+273]),['Date','sym','rank_p_L-21']]
        predict_data['predict'] = y_pred_model
        predict_data.to_csv(etf_predict_file,header=False,mode='a')

Building model to predict for  2015-02-05
*****************************
Variables with largest coefficients:
   Coefficient                 Feature
0     3.725513      w63_rank_p_L42_avg
1     3.725513      w63_rank_v_L42_avg
2     3.472036  ivv_w42_rank_p_L84_avg
3     3.472036  ivv_w42_rank_v_L84_avg
4     2.521587     w42_rank_v_L252_avg
     Coefficient                  Feature
355    -3.168311  ivv_w21_rank_p_L189_avg
356    -3.365727      w63_rank_p_L252_avg
357    -3.365727      w63_rank_v_L252_avg
358    -3.388579       w84_rank_v_L84_avg
359    -3.388579       w84_rank_p_L84_avg
*****************************
roc_auc_score on training: 0.6664
roc_auc_score on testing: 0.5000
*****************************
Confusion Matrix Threshold: 0.314949
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[103   3]
 [ 35  12]]
Precision: 0.8
Model Top Five Picks
 Avg rtn top 5: 0.0483
   true_value  pred_value  mkt_return   returns
0         1.

*****************************
Variables with largest coefficients:
   Coefficient                 Feature
0     4.163242  ivv_w42_rank_p_L84_avg
1     4.163242  ivv_w42_rank_v_L84_avg
2     3.092213      w63_rank_v_L42_avg
3     3.092213      w63_rank_p_L42_avg
4     2.981030     w42_rank_v_L252_avg
     Coefficient                 Feature
355    -2.949286  ivv_w63_rank_v_L21_avg
356    -3.637342      w42_rank_p_L63_avg
357    -3.637342      w42_rank_v_L63_avg
358    -4.230600     w63_rank_v_L252_avg
359    -4.230600     w63_rank_p_L252_avg
*****************************
roc_auc_score on training: 0.7098
roc_auc_score on testing: 0.4946
*****************************
Confusion Matrix Threshold: 0.261727
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[81 11]
 [57  4]]
Precision: 0.2667
Model Top Five Picks
 Avg rtn top 5: -0.1152
   true_value  pred_value  mkt_return   returns
0         0.0    0.548020    -0.01377 -0.367158
1         1.

*****************************
Variables with largest coefficients:
   Coefficient                 Feature
0     4.624650    w189_rank_v_L252_avg
1     4.624650    w189_rank_p_L252_avg
2     3.358237    w189_rank_v_L126_avg
3     3.358237    w189_rank_p_L126_avg
4     3.297175  ivv_w42_rank_p_L84_avg
     Coefficient              Feature
355    -3.608783  w252_rank_p_L63_avg
356    -4.122415   w42_rank_v_L63_avg
357    -4.122415   w42_rank_p_L63_avg
358    -4.346905  w126_rank_v_L42_avg
359    -4.346905  w126_rank_p_L42_avg
*****************************
roc_auc_score on training: 0.7257
roc_auc_score on testing: 0.5271
*****************************
Confusion Matrix Threshold: 0.877118
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[121  10]
 [ 17   5]]
Precision: 0.3333
Model Top Five Picks
 Avg rtn top 5: 0.0551
   true_value  pred_value  mkt_return   returns
0         0.0    0.956159    0.063911 -0.008892
1         1.0    0.936116  

*****************************
Variables with largest coefficients:
   Coefficient                  Feature
0     4.591959     w189_rank_p_L252_avg
1     4.591959     w189_rank_v_L252_avg
2     3.839508      w84_rank_v_L126_avg
3     3.839508      w84_rank_p_L126_avg
4     3.276969  ivv_w42_rank_v_L252_avg
     Coefficient               Feature
355    -3.382601  w126_rank_v_L126_avg
356    -3.675246   w63_rank_p_L252_avg
357    -3.675246   w63_rank_v_L252_avg
358    -4.343501   w126_rank_v_L42_avg
359    -4.343501   w126_rank_p_L42_avg
*****************************
roc_auc_score on training: 0.7147
roc_auc_score on testing: 0.4778
*****************************
Confusion Matrix Threshold: 0.609933
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[81  9]
 [57  6]]
Precision: 0.4
Model Top Five Picks
 Avg rtn top 5: 0.0456
   true_value  pred_value  mkt_return   returns
0         1.0    0.733237    0.055758  0.097434
1         0.0    0.724

*****************************
Variables with largest coefficients:
   Coefficient               Feature
0     4.377565   w126_rank_v_L21_avg
1     4.377565   w126_rank_p_L21_avg
2     4.341260   w252_rank_v_L42_avg
3     4.341260   w252_rank_p_L42_avg
4     4.218077  w189_rank_p_L252_avg
     Coefficient               Feature
355    -2.940986  w126_rank_p_L126_avg
356    -3.091004  w252_rank_p_L126_avg
357    -3.091004  w252_rank_v_L126_avg
358    -4.738212   w63_rank_v_L252_avg
359    -4.738212   w63_rank_p_L252_avg
*****************************
roc_auc_score on training: 0.6958
roc_auc_score on testing: 0.5603
*****************************
Confusion Matrix Threshold: 0.395024
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[93  2]
 [45 13]]
Precision: 0.8667
Model Top Five Picks
 Avg rtn top 5: 0.0551
   true_value  pred_value  mkt_return   returns
0         1.0    0.761727   -0.005191  0.077340
1         1.0    0.652789   -0.005191

*****************************
Variables with largest coefficients:
   Coefficient              Feature
0     4.636475  w84_rank_v_L126_avg
1     4.636475  w84_rank_p_L126_avg
2     3.370754  w252_rank_p_L42_avg
3     3.370754  w252_rank_v_L42_avg
4     2.960524   w63_rank_p_L63_avg
     Coefficient               Feature
355    -3.225815  w189_rank_v_L126_avg
356    -3.556778   w63_rank_p_L252_avg
357    -3.556778   w63_rank_v_L252_avg
358    -4.874071    w84_rank_p_L84_avg
359    -4.874071    w84_rank_v_L84_avg
*****************************
roc_auc_score on training: 0.7305
roc_auc_score on testing: 0.4729
*****************************
Confusion Matrix Threshold: 0.558762
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[64  9]
 [74  6]]
Precision: 0.4
Model Top Five Picks
 Avg rtn top 5: -0.0855
   true_value  pred_value  mkt_return   returns
0         0.0    0.857284   -0.031675 -0.147890
1         0.0    0.723553   -0.031675 -0.0476

*****************************
Variables with largest coefficients:
   Coefficient                  Feature
0     4.684491       w63_rank_v_L63_avg
1     4.684491       w63_rank_p_L63_avg
2     2.911769      w84_rank_p_L126_avg
3     2.911769      w84_rank_v_L126_avg
4     2.689002  ivv_w21_rank_v_L252_avg
     Coefficient               Feature
355    -2.949448  w189_rank_p_L126_avg
356    -3.620954   w63_rank_v_L189_avg
357    -3.620954   w63_rank_p_L189_avg
358    -4.327448    w84_rank_v_L84_avg
359    -4.327448    w84_rank_p_L84_avg
*****************************
roc_auc_score on training: 0.7063
roc_auc_score on testing: 0.5013
*****************************
Confusion Matrix Threshold: 0.923195
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[118  15]
 [ 20   0]]
Precision: 0.0
Model Top Five Picks
 Avg rtn top 5: -0.0022
   true_value  pred_value  mkt_return   returns
0         0.0    0.942640    0.033467 -0.009289
1         0.0    

*****************************
Variables with largest coefficients:
   Coefficient                  Feature
0     3.251380       w63_rank_v_L63_avg
1     3.251380       w63_rank_p_L63_avg
2     2.967044  ivv_w63_rank_p_L189_avg
3     2.967044  ivv_w63_rank_v_L189_avg
4     2.715345     w252_rank_v_L189_avg
     Coefficient               Feature
355    -2.616643  w126_rank_v_L252_avg
356    -3.883909  w189_rank_p_L126_avg
357    -3.883909  w189_rank_v_L126_avg
358    -3.894756    w42_rank_p_L63_avg
359    -3.894756    w42_rank_v_L63_avg
*****************************
roc_auc_score on training: 0.7122
roc_auc_score on testing: 0.3476
*****************************
Confusion Matrix Threshold: 0.704951
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[84  9]
 [54  6]]
Precision: 0.4
Model Top Five Picks
 Avg rtn top 5: -0.0029
   true_value  pred_value  mkt_return   returns
0         0.0    0.806393   -0.001747 -0.010426
1         0.0    0.79

*****************************
Variables with largest coefficients:
   Coefficient                  Feature
0     3.538151  ivv_w63_rank_p_L189_avg
1     3.538151  ivv_w63_rank_v_L189_avg
2     3.238233  ivv_w84_rank_v_L189_avg
3     3.238233  ivv_w84_rank_p_L189_avg
4     3.111061   ivv_w42_rank_p_L42_avg
     Coefficient                 Feature
355    -2.681798  ivv_w84_rank_v_L21_avg
356    -3.217718      w42_rank_v_L63_avg
357    -3.217718      w42_rank_p_L63_avg
358    -3.427003    w189_rank_v_L126_avg
359    -3.427003    w189_rank_p_L126_avg
*****************************
roc_auc_score on training: 0.6905
roc_auc_score on testing: 0.5663
*****************************
Confusion Matrix Threshold: 0.625989
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[103   8]
 [ 35   7]]
Precision: 0.4667
Model Top Five Picks
 Avg rtn top 5: -0.0208
   true_value  pred_value  mkt_return   returns
0         0.0    0.806883    0.014822 -0.018113
1 

*****************************
Variables with largest coefficients:
   Coefficient                  Feature
0     4.185671       w84_rank_p_L84_avg
1     4.185671       w84_rank_v_L84_avg
2     2.988259  ivv_w63_rank_p_L189_avg
3     2.988259  ivv_w63_rank_v_L189_avg
4     2.662234      w42_rank_v_L126_avg
     Coefficient                  Feature
355    -2.088279  ivv_w42_rank_v_L252_avg
356    -2.695566      w84_rank_p_L126_avg
357    -2.695566      w84_rank_v_L126_avg
358    -2.777379     w126_rank_v_L252_avg
359    -2.777379     w126_rank_p_L252_avg
*****************************
roc_auc_score on training: 0.6976
roc_auc_score on testing: 0.4781
*****************************
Confusion Matrix Threshold: 0.422355
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[101  13]
 [ 37   2]]
Precision: 0.1333
Model Top Five Picks
 Avg rtn top 5: -0.0026
   true_value  pred_value  mkt_return   returns
0         0.0    0.628140    0.017953 -0.037

*****************************
Variables with largest coefficients:
   Coefficient               Feature
0     3.456494  w252_rank_v_L189_avg
1     3.456494  w252_rank_p_L189_avg
2     2.850022   w42_rank_p_L126_avg
3     2.850022   w42_rank_v_L126_avg
4     2.463286    w84_rank_v_L05_avg
     Coefficient              Feature
355    -2.510651  w252_rank_p_L84_avg
356    -2.699566  w252_rank_v_L63_avg
357    -2.699566  w252_rank_p_L63_avg
358    -3.625628  w84_rank_v_L126_avg
359    -3.625628  w84_rank_p_L126_avg
*****************************
roc_auc_score on training: 0.7134
roc_auc_score on testing: 0.4732
*****************************
Confusion Matrix Threshold: 0.911744
*****************************
Model Eval Results:
*****************************
Confusion Matrix:
 [[87 10]
 [51  5]]
Precision: 0.3333
Model Top Five Picks
 Avg rtn top 5: -0.0098
   true_value  pred_value  mkt_return   returns
0         0.0    0.981246    0.006674 -0.025759
1         0.0    0.980675    0.006674 -0.0

In [20]:
np.ravel(lr.coef_).shape

(360,)