# Example Notebook #3 - ML Portfolio

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle
import talib
import seaborn as sns
from portfolio_swissknife import portfolio as ps
from portfolio_swissknife import models as mod

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

In [2]:
#load data
def _load_price_field(field):
    """Load one SPX price-field CSV from ``../ext_data`` and tidy it.

    Parameters
    ----------
    field : str
        Suffix of the file name, e.g. ``'OPEN'`` for
        ``00_db_SPX__PX_OPEN.csv``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the parsed dates of the first CSV column, with
        the row order reversed, column labels cut down to their first
        space-separated token, and only the first occurrence of each
        resulting label kept.
    """
    frame = pd.read_csv(f'../ext_data/00_db_SPX__PX_{field}.csv',
                        index_col=0, parse_dates=True)
    # keep only the first space-separated token of every header
    frame.columns = frame.columns.map(lambda x: x.split(' ')[0])
    frame = frame[::-1] #ascending dates
    # shortening the headers can create duplicates; keep the first of each
    return frame.loc[:, ~frame.columns.duplicated()]


opn = _load_price_field('OPEN')
high = _load_price_field('HIGH')
low = _load_price_field('LOW')
close = _load_price_field('LAST')
volume = _load_price_field('VOLUME')


macro = pd.read_excel('../ext_data/macro.xlsx',sheet_name = 'data', index_col = 0, parse_dates = True)
# macro.index = macro.index.to_period('M')

# NOTE: unpickling can execute arbitrary code; only load this file when it
# comes from a trusted source.
with open('../ext_data/features.pkl', 'rb') as handle:
    fund = pickle.load(handle)

In [3]:
#test case
# Single-ticker sample for experimenting with indicators. All five series
# are taken from the same ticker so that close/open/high/low/volume
# describe one instrument (the close was previously read from 'MSFT' while
# the other four series came from 'AAPL').
test_ticker = 'AAPL'
tc = pd.DataFrame(close[test_ticker])
to = opn[test_ticker]
tl = low[test_ticker]
th = high[test_ticker]
tv = volume[test_ticker]

In [4]:
# tc['SMA200'] = talib.SMA(tc['AAPL'], timeperiod = 200)
# tc['SMA50'] = talib.SMA(tc['AAPL'], timeperiod = 50)
# tc['MOM3M'] = talib.MOM(tc['AAPL'], timeperiod = 63)
# tc['RSI14d'] = talib.RSI(tc['AAPL'])
# tc['ADX14'] = talib.ADX(th, tl, tc['AAPL'])
# tc['ATR14'] = talib.ATR(th, tl, tc['AAPL'])
# # tc['CDL3OUT'] = talib.CDL3OUTSIDE(to,th,tl,tc['AAPL'])
# tc['DCPERIOD'] = talib.HT_DCPERIOD(tc['AAPL'])
# # tc['WILLR'] = talib.WILLR(th, tl, tc['AAPL'])
# _, tc['MACD'], _ = talib.MACD(tc['AAPL'])
# tc['BBANDUP'], tc['BBANDMID'], tc['BBANDDOWN'] = talib.BBANDS(tc['AAPL'])

In [5]:
# tc = tc.fillna(method = 'ffill').loc[tc.notna().all(1)]

In [6]:
# Trim the macro table to the label range spanned by the first and last
# entries of the price sample's index.
macro = macro.loc[slice(tc.index[0], tc.index[-1])]

In [7]:
import functools
class DataHandler:
    """Decorator that aligns a target and a feature table before a call.

    The positional and keyword arguments of the decorated call are scanned
    for pandas objects. A one-dimensional object (a Series, or a DataFrame
    with a single row or a single column) is taken as the target ``y``;
    any other DataFrame is taken as the feature table ``X``. Both are
    joined on their indices, keeping the rows of ``X``, forward-filled,
    stripped of rows that still contain missing values, and handed to the
    wrapped function as ``func(y=..., X=...)``.
    """
    #Rewrite so that it can be applicable at __init__ of more complex portfolios
    def __init__(self, func):
        functools.update_wrapper(self, func)
        self.func = func

    @staticmethod
    def _is_target(arg):
        """Return True for a Series or a single-row / single-column frame."""
        # A Series has a one-element shape, so ndim must be checked before
        # shape[1] is touched.
        return arg.ndim == 1 or arg.shape[0] == 1 or arg.shape[1] == 1

    def __call__(self, *args, **kwargs):
        """Align the pandas arguments and return ``func(y=..., X=...)``.

        Raises
        ------
        TypeError
            If the arguments do not contain both a target and a feature
            table.
        """
        #identify args
        y_old = x_old = None
        for arg in list(args) + list(kwargs.values()):
            if isinstance(arg, (pd.Series, pd.DataFrame)):
                if self._is_target(arg):
                    y_old = arg
                else:
                    x_old = arg

        if y_old is None or x_old is None:
            raise TypeError(
                f'{self.func.__name__} needs a one-dimensional target and a '
                'multi-column feature DataFrame among its arguments')

        # pd.merge refuses a Series without a name
        if isinstance(y_old, pd.Series) and y_old.name is None:
            y_old = y_old.rename('y')

        #merge and align data
        # Joining on the index arrays puts the join key in a 'key_0' column,
        # which is moved back into the index below.
        m1 = pd.merge(y_old, x_old, left_on = y_old.index,
                                    right_on = x_old.index,
                                    how = 'right')
        m1.index = m1['key_0']
        m1 = m1.drop('key_0',axis=1)
        m1 = m1.ffill().dropna()

        # the target occupies the first column, the features the remainder
        y_new = m1.iloc[:,0]
        x_new = m1.iloc[:,1:]
        new_args = {'y': y_new,
                    'X': x_new}

        # hand the wrapped function's result back to the caller
        return self.func(**new_args)

In [8]:
# @DataHandler
# def test(y,X):
#     mod = LinearRegression().fit(X = X, y = y)
#     print(mod.coef_)
# test(tc, macro)

In [9]:
#monthly data

# Right-join the close prices onto the macro dates (one row per macro
# observation). The left key is the index of `close` itself, so the join
# does not rely on another frame happening to share that index.
merge1 = pd.merge(close, macro, left_on = close.index,
                    right_on = macro.index,
                    how = 'right')
# joining on index arrays leaves the key in a 'key_0' column
merge1.index = merge1['key_0']
merge1 = merge1.drop('key_0',axis=1)
# .ffill() replaces the deprecated fillna(method='ffill')
merge1 = merge1.ffill().to_period('M')
fundm = fund['AAPL'].to_period('M')

In [10]:
# Left-join the AAPL fundamentals onto the monthly price/macro table.
merge2 = pd.merge(merge1, fundm, left_on = merge1.index,
                               right_on = fundm.index,
                               how = 'left')
# joining on index arrays leaves the key in a 'key_0' column
merge2.index = merge2['key_0']
merge2 = merge2.drop('key_0',axis=1)
# .bfill()/.ffill() replace the deprecated fillna(method=...) calls.
# NOTE(review): bfill copies later observations into earlier rows, which
# may leak future information into the features - confirm this is intended.
merge2 = merge2.bfill().ffill()

In [11]:
#simple label threshold
# Flag rows whose change in the 'ABMD' column is at least 0.02, then shift
# the flag one row up so every row carries the label of the following row.
# The final row has no successor and is dropped.
# NOTE(review): diff() is an absolute change, not a percentage return -
# confirm that 0.02 is meant in the column's own units.
y = (
    merge2['ABMD']
    .diff()
    .ge(0.02)
    .astype(int)
    .shift(-1)
    .dropna()
)
# stdev bound label

# All rows but the last (it has no label) and all columns but the first.
# NOTE(review): the first column is skipped by position, not by name -
# verify that this is the column meant to be excluded.
X = merge2.iloc[:-1, 1:]

In [12]:
#daily data
# merge1 = pd.merge(tc, fund['AAPL'], left_on = tc.index, 
#                   right_on = fund['AAPL'].index, 
#                   how = 'left')
# merge1.index = merge1['key_0']
# merge1 = merge1.drop('key_0',axis=1)
# merge1 = merge1.fillna(method = 'bfill').fillna(method='ffill')

In [13]:
# merge2 = pd.merge(merge1, macro, left_on = merge1.index,
#                   right_on = macro.index,
#                   how = 'left')
# merge2.index = merge2['key_0']
# merge2 = merge2.drop('key_0',axis=1)
# merge2 = merge2.fillna(method = 'bfill').fillna(method='ffill')

In [14]:
# #simple label threshold
# y = y.diff().apply(lambda x: 1 if x >= 0 else 0).shift(-1).dropna()
# #stdev bound label

# X = merge2.iloc[:-1,1:]

In [15]:
def rf_training_episode(y, X):
    """Tune a random-forest classifier and score the last row of ``X``.

    Parameters
    ----------
    y : pandas.Series
        Raw target values; turned into binary labels (1 when the value is
        at least 0.02, otherwise 0).
    X : array-like
        Feature rows passed to scikit-learn together with the labels, so
        it must have one row per element of ``y``.

    Returns
    -------
    float
        Predicted probability of label 1 for the final row of ``X``, or
        0.0 when label 1 never occurs in the training labels.
    """
    #labeling
    labels = (y >= 0.02).astype(int)

    #grid search
    grid = {'n_estimators': [200, 500], 'max_depth': [3, 9, 12],
            'max_features': [4, 8, 12], 'random_state': [42]}

    best_model = None
    best_score = float('-inf')
    for params in ParameterGrid(grid):
        # Candidates are ranked by out-of-bag accuracy, i.e. on samples
        # the individual trees were not fitted on. Accuracy on the
        # training data itself would always favour the deepest forest.
        candidate = RandomForestClassifier(oob_score=True, **params)
        candidate.fit(X, labels)
        # strict '>' keeps the first of several equally good candidates
        if candidate.oob_score_ > best_score:
            best_model, best_score = candidate, candidate.oob_score_

    # The winning candidate is already fitted on the full sample, so no
    # second fit is needed.
    last_row_proba = best_model.predict_proba(X)[-1]

    # predict_proba has one column per class seen during fit; with a
    # single-class sample there is no second column to index.
    seen_classes = list(best_model.classes_)
    if 1 not in seen_classes:
        return 0.0
    pred_prob_long = float(last_row_proba[seen_classes.index(1)])
    return pred_prob_long

In [54]:
#loading ext. data -- close prices of SPX
universe = pd.read_csv('../ext_data/00_db_SPX__PX_LAST.csv', index_col = 0, parse_dates = True)
# reverse the row order and keep only columns without any missing value
universe = universe.iloc[::-1].loc[:, universe.notna().all(axis=0)]
# ticker = first space-separated token of each column header
securities = [header.split(' ')[0] for header in universe.columns]
universe.columns = securities
# tickers removed from the universe by hand
missing = ['BF/B','BRK/B','CXO','ETN','LB','VAR','UAL']
universe = universe.drop(columns = missing)
securities = [ticker for ticker in securities if ticker not in missing]

In [55]:
# Build a portfolio from the tickers at position 357 onwards and attach the
# price table loaded above, declared as 'daily'.
# NOTE(review): 357 is an unexplained offset; the results below are written
# to tree_preds_2.pkl, so this is presumably the second batch of tickers -
# confirm and move the offset into a named constant.
port_universe = ps.Portfolio(securities[357:])
port_universe.set_custom_prices(universe, 'daily')

In [56]:
# Configure the prediction model: register the macro table and the
# per-ticker fundamentals (both converted to monthly periods) as features,
# prepare the targets and plug in the random-forest training routine.
pred_model = mod.PredictionModel(port_universe)
pred_model.set_features(macro.to_period('M')) #add macro features
pred_model.set_features({sec : fund[sec].to_period('M') for 
                         sec in fund.keys()}) #add fundamental features
# NOTE(review): prepare_targets receives `macro` with its original index,
# not the monthly-period version used for the features - confirm intended.
pred_model.prepare_targets(macro)
pred_model.set_prediction_model(rf_training_episode)

In [57]:
# Run the rolling prediction. The meaning of the two arguments is defined by
# PredictionModel; presumably 60 periods of history per fit and a step of
# one period - TODO confirm against the library.
pred_model.rolling_model_prediction(estimation_period=60, window = 1)

Training ML models:   0%|          | 0/39 [00:00<?, ?it/s]

In [58]:
## save intermediate results
# Pickle pred_model.prediction_measure to tree_preds_2.pkl in the working
# directory, using the highest pickle protocol available.
with open('tree_preds_2.pkl', 'wb') as handle:
    pickle.dump(pred_model.prediction_measure, handle, protocol= pickle.HIGHEST_PROTOCOL)

In [59]:
# Read back both stored prediction files.
# NOTE(review): tree_preds_1.pkl is not written by any cell visible in this
# notebook; it presumably comes from an earlier run with a different ticker
# slice - a fresh Restart & Run All fails here unless that file exists.
with open('tree_preds_1.pkl', 'rb') as handle:
    preds_1 = pickle.load(handle)
with open('tree_preds_2.pkl', 'rb') as handle:
    preds_2 = pickle.load(handle)


In [60]:
# Combine both prediction mappings; for a key present in both, the entry
# from preds_2 wins.
preds_all = dict(preds_1)
preds_all.update(preds_2)