<font size="6"> **Multifactor Risk Models** </font>

In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

import pickle

from mle_quant_utils import quant_helper, mle_utils


%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)

In [2]:
from zipline.data import bundles
from zipline.utils.calendars import get_calendar
from zipline.pipeline.factors import AverageDollarVolume
from zipline.pipeline import Pipeline
from zipline.pipeline.factors import Returns, SimpleMovingAverage
from zipline.pipeline.data import USEquityPricing

from zipline.pipeline.factors import CustomFactor, DailyReturns, Returns, SimpleMovingAverage, AnnualizedVolatility
from zipline.pipeline.data import USEquityPricing

from zipline.pipeline import Pipeline
from zipline.pipeline.factors import AverageDollarVolume
from zipline.utils.calendars import get_calendar


from zipline.data.data_portal import DataPortal

In [3]:
import yaml
import os

# Retrieve parameters from configuration file
with open("../conf.yml", "r") as ymlfile:
    cfg = yaml.load(ymlfile)

BUNDLE_FOLDER = cfg['quantopian']['dataset2']['bundle_folder']
BUNDLE_NAME = cfg['quantopian']['dataset2']['bundle_name']
SECTOR_FOLDER = cfg['quantopian']['dataset2']['sector_folder']
SECTOR_DATA = cfg['quantopian']['dataset2']['sector_data']
SECTOR_NAMES = cfg['quantopian']['dataset2']['sector_names']

# Specify the bundle path
bundle_path = os.path.join(os.getcwd(), '..', 'data', BUNDLE_FOLDER)
sector_path = os.path.join(os.getcwd(), '..', 'data', SECTOR_FOLDER, SECTOR_DATA)
sector_file = os.path.join(os.getcwd(), '..', 'data', SECTOR_FOLDER, SECTOR_NAMES)
os.path.isdir(bundle_path)

True

In [4]:
MODEL_VERSION = 'v01'
print('Model version: ', MODEL_VERSION)

Model version:  v01


In [5]:
INPATH =  "../data/" + cfg['output']['main'] + "/" + cfg['output']['alpha_factors_ml']['folder'] + "/" + MODEL_VERSION +"/"
INFILE1 = cfg['output']['alpha_factors_ml']['features']
INFILE2 = cfg['output']['alpha_factors_ml']['targets']
INFILE3 = cfg['output']['alpha_factors_ml']['model']

In [6]:
OUTPATH = "../data/" + cfg['output']['main'] + "/" + cfg['output']['raw']['folder'] + "/"
OUTFILE1 = cfg['output']['raw']['pricing']
OUTFILE2 = cfg['output']['raw']['volume']

In [7]:
split_col = cfg['models']['alpha_ml']['split_col']

# Load Data

## Data Bundle

In [8]:
adv_win = cfg['models']['universe']['window']
adv_top = cfg['models']['universe']['adv_top']
print('Universe ADV window: {} and top threshold: {}'.format(adv_win, adv_top))

Universe ADV window: 120 and top threshold: 500


In [9]:
# Data Bundle
os.environ['ZIPLINE_ROOT'] = bundle_path
ingest_func = bundles.csvdir.csvdir_equities(['daily'], BUNDLE_NAME)
bundles.register(BUNDLE_NAME, ingest_func)
print('Data Registered')

Data Registered


In [10]:
universe = AverageDollarVolume(window_length=adv_win).top(adv_top) 
trading_calendar = get_calendar('NYSE') 
bundle_data = bundles.load(BUNDLE_NAME)
engine =  quant_helper.build_pipeline_engine(bundle_data, trading_calendar)

In [11]:
universe_end_date =  pd.Timestamp( cfg['models']['universe']['start'], tz='UTC') # pd.Timestamp('2016-01-05', tz='UTC')
universe_start_date =  pd.Timestamp( cfg['models']['universe']['end_risk'], tz='UTC')  # universe_end_date - pd.DateOffset(years=5)
print('Universe start: {} and end: {} dates'.format(universe_end_date, universe_start_date))

Universe start: 2016-01-05 00:00:00+00:00 and end: 2011-01-05 00:00:00+00:00 dates


In [12]:
universe_tickers = engine\
    .run_pipeline(
        Pipeline(screen=universe),
        universe_end_date,
        universe_end_date)\
    .index.get_level_values(1)\
    .values.tolist()
    
universe_tickers[:5]

[Equity(0 [A]),
 Equity(1 [AAL]),
 Equity(2 [AAP]),
 Equity(3 [AAPL]),
 Equity(4 [ABBV])]

## Data Portal
Not that we have our pipeline built, let's access the returns data. We'll start by building a data portal.

In [13]:
data_portal = DataPortal(
    bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day,
    equity_minute_reader=None,
    equity_daily_reader=bundle_data.equity_daily_bar_reader,
    adjustment_reader=bundle_data.adjustment_reader)

In [14]:
pricing = quant_helper.get_pricing(
        data_portal,
        trading_calendar,
        universe_tickers,
        universe_start_date,
        universe_end_date)
   
volume  = quant_helper.get_pricing(
        data_portal,
        trading_calendar,
        universe_tickers,
        universe_start_date,
        universe_end_date, field='volume')

  end_dt = pd.Timestamp(end_date.strftime('%Y-%m-%d'), tz='UTC', offset='C')
  start_dt = pd.Timestamp(start_date.strftime('%Y-%m-%d'), tz='UTC', offset='C')


In [15]:
five_year_returns = pricing .pct_change()[1:].fillna(0)
pd.concat([five_year_returns.head(2), five_year_returns.tail(2)],axis=0)

Unnamed: 0,Equity(0 [A]),Equity(1 [AAL]),Equity(2 [AAP]),Equity(3 [AAPL]),Equity(4 [ABBV]),Equity(5 [ABC]),Equity(6 [ABT]),Equity(7 [ACN]),Equity(8 [ADBE]),Equity(9 [ADI]),...,Equity(481 [XL]),Equity(482 [XLNX]),Equity(483 [XOM]),Equity(484 [XRAY]),Equity(485 [XRX]),Equity(486 [XYL]),Equity(487 [YUM]),Equity(488 [ZBH]),Equity(489 [ZION]),Equity(490 [ZTS])
2011-01-07 00:00:00+00:00,0.008437,0.01423,0.026702,0.007146,0.0,0.001994,0.004165,0.001648,-0.007127,-0.005818,...,-0.001838,-0.005619,0.005461,-0.004044,-0.013953,0.0,0.012457,-0.000181,-0.010458,0.0
2011-01-10 00:00:00+00:00,-0.004174,0.006195,0.007435,0.018852,0.0,-0.005714,-0.008896,-0.008854,0.028714,0.002926,...,0.000947,0.007814,-0.006081,0.010466,0.009733,0.0,0.00144,0.007784,-0.017945,0.0
2016-01-04 00:00:00+00:00,-0.028282,-0.033988,0.011494,0.000855,-0.027512,-0.017741,-0.044067,-0.025551,-0.020971,-0.015919,...,-0.024767,-0.024922,-0.006276,-0.032711,-0.031051,-0.01152,-0.011489,-0.007604,-0.021614,-0.013564
2016-01-05 00:00:00+00:00,0.004058,-0.009541,-0.00683,-0.025054,-0.004169,0.014629,-0.000247,0.005207,0.004023,-0.007347,...,0.002098,0.014863,0.008511,0.02039,-0.001957,-0.000286,-0.002495,0.02082,-0.010853,0.015647


## ML-Alpha

In [16]:
features_df = pd.read_csv(INPATH + INFILE1, parse_dates=['date'])
features_df['date'] = features_df['date'].dt.tz_localize('utc')
features_df.set_index(['date', 'asset'], inplace=True)
split = features_df.pop(split_col)
features = features_df.columns.tolist()
pd.concat([features_df.head(2), features_df.tail(2)],axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Mean_Reversion_Sector_Neutral_Smoothed,Momentum_1YR_Smoothed,Overnight_Sentiment_Smoothed,adv_120d,adv_20d,dispersion_120d,dispersion_20d,market_vol_120d,market_vol_20d,volatility_120d,...,sector_code_5,sector_code_6,sector_code_7,sector_code_8,sector_code_9,sector_code_10,qtr_1,qtr_2,qtr_3,qtr_4
date,asset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2013-01-03 00:00:00+00:00,A,-0.795709,-1.230279,-1.301799,1.338573,1.397411,0.01327,0.011178,0.127654,0.135452,-0.836546,...,0,0,0,0,0,0,1,0,0,0
2013-01-03 00:00:00+00:00,AAL,0.558847,1.713471,-1.632765,1.139994,1.081155,0.01327,0.011178,0.127654,0.135452,1.639924,...,0,0,0,0,0,0,1,0,0,0
2015-12-28 00:00:00+00:00,ZION,0.773366,-0.188031,1.325552,-0.965002,-0.724635,0.014916,0.014731,0.18368,0.181479,0.547895,...,1,0,0,0,0,0,0,0,0,1
2015-12-28 00:00:00+00:00,ZTS,-0.944433,0.209727,0.441851,0.229762,-0.180275,0.014916,0.014731,0.18368,0.181479,0.413572,...,0,0,0,0,0,0,0,0,0,1


In [17]:
valid_dates = split[split=='valid'].index.get_level_values(0)
valid_start = valid_dates[0].strftime('%Y-%m')
valid_end = valid_dates[-1].strftime('%Y-%m')

In [18]:
test_dates = split[split=='test'].index.get_level_values(0)
test_start = test_dates[0].strftime('%Y-%m')
test_end = test_dates[-1].strftime('%Y-%m')

In [19]:
train_valid_idx = split[(split=='train') | (split=='valid')].index.get_level_values(0).drop_duplicates()

In [20]:
X_train_valid = features_df.loc[(train_valid_idx, slice(None)), :]
pd.concat([X_train_valid.head(2), X_train_valid.tail(2)],axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Mean_Reversion_Sector_Neutral_Smoothed,Momentum_1YR_Smoothed,Overnight_Sentiment_Smoothed,adv_120d,adv_20d,dispersion_120d,dispersion_20d,market_vol_120d,market_vol_20d,volatility_120d,...,sector_code_5,sector_code_6,sector_code_7,sector_code_8,sector_code_9,sector_code_10,qtr_1,qtr_2,qtr_3,qtr_4
date,asset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2013-01-03 00:00:00+00:00,A,-0.795709,-1.230279,-1.301799,1.338573,1.397411,0.01327,0.011178,0.127654,0.135452,-0.836546,...,0,0,0,0,0,0,1,0,0,0
2013-01-03 00:00:00+00:00,AAL,0.558847,1.713471,-1.632765,1.139994,1.081155,0.01327,0.011178,0.127654,0.135452,1.639924,...,0,0,0,0,0,0,1,0,0,0
2015-05-27 00:00:00+00:00,ZION,0.216057,-0.884221,-0.078567,-1.085659,-1.342789,0.012559,0.011522,0.12942,0.104706,0.664252,...,1,0,0,0,0,0,0,1,0,0
2015-05-27 00:00:00+00:00,ZTS,-1.418772,1.393649,1.378501,0.249987,0.378552,0.012559,0.011522,0.12942,0.104706,-0.321412,...,0,0,0,0,0,0,0,1,0,0


In [21]:
with open(INPATH + INFILE3, 'rb') as file:
    model = pickle.load(file)

In [22]:
ml_alpha = pd.Series(index=features_df.index, data=model.predict(features_df))
ml_alpha[split_col] = split

# Write raw data

In [23]:
pricing.columns = [x.symbol for x in pricing.columns]
pricing.index.name = 'date'
pd.concat([pricing.head(2), pricing.tail(2)], axis=0)

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACN,ADBE,ADI,...,XL,XLNX,XOM,XRAY,XRX,XYL,YUM,ZBH,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-06 00:00:00+00:00,57.014,10.822,59.209,42.399,,31.088,19.926,41.27,32.27,30.936,...,19.045,25.095,59.697,33.386,25.945,,30.184,49.601,23.427,
2011-01-07 00:00:00+00:00,57.495,10.976,60.79,42.702,,31.15,20.009,41.338,32.04,30.756,...,19.01,24.954,60.023,33.251,25.583,,30.56,49.592,23.182,
2016-01-04 00:00:00+00:00,130.838,39.933,151.537,100.621,52.526,97.613,40.544,96.984,91.97,51.31,...,36.226,43.194,70.617,58.018,26.056,34.923,49.3,99.714,25.983,46.398
2016-01-05 00:00:00+00:00,131.369,39.552,150.502,98.1,52.307,99.041,40.534,97.489,92.34,50.933,...,36.302,43.836,71.218,59.201,26.005,34.913,49.177,101.79,25.701,47.124


In [24]:
pricing.to_csv(OUTPATH + OUTFILE1)

In [25]:
volume.columns = [x.symbol for x in volume.columns]
volume.index.name = 'date'
pd.concat([volume.head(2), volume.tail(2)], axis=0)

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACN,ADBE,ADI,...,XL,XLNX,XOM,XRAY,XRX,XYL,YUM,ZBH,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-06 00:00:00+00:00,7507200.0,11968097.0,1513500.0,75107200.0,0.0,3344200.0,32471000.0,3859000.0,6230900.0,2987200.0,...,6792000.0,4915000.0,22525300.0,500700.0,1782475.0,0.0,4095300.0,4292700.0,4421700.0,0.0
2011-01-07 00:00:00+00:00,6797000.0,11708043.0,1160600.0,77982800.0,0.0,2971000.0,21604400.0,2914600.0,6828500.0,3963300.0,...,3849700.0,5513900.0,19297700.0,447700.0,2193150.0,0.0,3379100.0,2686900.0,5059400.0,0.0
2016-01-04 00:00:00+00:00,5719241.0,12037151.0,1935307.0,67649387.0,10308397.0,2326986.0,12227458.0,2817024.0,2993832.0,2648908.0,...,2922717.0,3472249.0,20399059.0,1387142.0,2280638.0,1353443.0,3466289.0,2042842.0,3523301.0,2871687.0
2016-01-05 00:00:00+00:00,3349871.0,10514164.0,1127677.0,55790992.0,7179634.0,2049940.0,8245208.0,2408961.0,1821298.0,2689457.0,...,2223793.0,3895832.0,11992697.0,1611309.0,1620568.0,1075354.0,3791087.0,2227988.0,2653585.0,3117022.0


In [26]:
volume.to_csv(OUTPATH + OUTFILE2)