<font size="6"> **Feature Engineering and Targets** </font>
* Alpha Factors
* Universal Quant Features
* Time-based features
* Sector

In [1]:
import yaml
import os

# Retrieve parameters from the configuration file.
# safe_load is used instead of yaml.load: calling yaml.load without an explicit
# Loader is deprecated since PyYAML 5.1, fails on PyYAML 6, and the full loader
# can construct arbitrary Python objects from the file contents.
with open("../conf.yml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# Dataset-specific folder and file names used to locate the bundle and sector data.
BUNDLE_FOLDER = cfg['quantopian']['dataset3']['bundle_folder']
BUNDLE_NAME = cfg['quantopian']['dataset3']['bundle_name']
SECTOR_FOLDER = cfg['quantopian']['dataset3']['sector_folder']
SECTOR_DATA = cfg['quantopian']['dataset3']['sector_data']
SECTOR_NAMES = cfg['quantopian']['dataset3']['sector_names']

# Build the data paths relative to the current working directory.
bundle_path = os.path.join(os.getcwd(), '..', 'data', BUNDLE_FOLDER)
sector_path = os.path.join(os.getcwd(), '..', 'data', SECTOR_FOLDER, SECTOR_DATA)
sector_file = os.path.join(os.getcwd(), '..', 'data', SECTOR_FOLDER, SECTOR_NAMES)
# Displayed as the cell output: True when the bundle folder exists.
os.path.isdir(bundle_path)

True

In [2]:
# Settings of the alpha ML model, read from the shared configuration:
# the name of the target column and the horizon TAU used to build it.
target_col = cfg['models']['alpha_ml']['target_col']
TAU = cfg['models']['alpha_ml']['tau']

In [3]:
import numpy as np
import pandas as pd

from mle_quant_utils import quant_helper, quant_factors, utils

In [4]:
from zipline.data import bundles
from zipline.utils.calendars import get_calendar
from zipline.pipeline.factors import AnnualizedVolatility, AverageDollarVolume, Returns, SimpleMovingAverage, RSI, MACDSignal
from zipline.pipeline.factors.technical import BollingerBands
from zipline.pipeline import Pipeline
from zipline.data.data_portal import DataPortal

In [5]:
import alphalens as al

# Data Loading

## Data Bundle

In [6]:
# Data Bundle
# Point zipline at the local bundle folder before anything is registered or loaded.
os.environ['ZIPLINE_ROOT'] = bundle_path
# Build a csvdir ingest function for daily bars and register it under BUNDLE_NAME.
ingest_func = bundles.csvdir.csvdir_equities(['daily'], BUNDLE_NAME)
bundles.register(BUNDLE_NAME, ingest_func)
print('Data Registered')

Data Registered


In [7]:
# Universe definition parameters: how many assets to keep (adv_top) and the
# average-dollar-volume window length (adv_win) used to rank them.
adv_top = cfg['models']['universe']['adv_top']
adv_win = cfg['models']['universe']['window']
print('Universe ADV window: {} and top threshold: {}'.format(adv_win, adv_top))

Universe ADV window: 120 and top threshold: 500


In [9]:
# NOTE(review): the config keys look inverted relative to the variable names —
# 'start' feeds universe_end_date and 'end' feeds factor_start_date (the recorded
# output shows 2016-01-05 and 2013-01-03 respectively). Confirm against conf.yml.
universe_end_date = pd.Timestamp(cfg['models']['universe']['start'], tz='UTC')  # e.g. pd.Timestamp('2016-01-05', tz='UTC')
factor_start_date = pd.Timestamp(cfg['models']['universe']['end'], tz='UTC')  # e.g. universe_end_date - pd.DateOffset(years=3, days=2)
# Report each date under the label that matches its variable; the previous message
# printed universe_end_date as "start" and factor_start_date as "end".
print('Factor start: {} and universe end: {} dates'.format(factor_start_date, universe_end_date))

Universe start: 2016-01-05 00:00:00+00:00 and end: 2013-01-03 00:00:00+00:00 dates


In [10]:
# Calendar and bundle come first: neither depends on the universe filter.
trading_calendar = get_calendar('NYSE')
bundle_data = bundles.load(BUNDLE_NAME)
engine = quant_helper.build_pipeline_engine(bundle_data, trading_calendar)

# Universe filter: the adv_top assets ranked by average dollar volume over adv_win bars.
universe = AverageDollarVolume(window_length=adv_win).top(adv_top)

In [None]:
# Display the first date of the factor computation window as a sanity check.
factor_start_date

In [None]:
# Two independent pipelines screened by the same universe:
# one collects the target columns, the other the feature columns.
pipeline_target = Pipeline(screen=universe)
pipeline = Pipeline(screen=universe)

## Data Portal

In [None]:
# Data portal backed by the bundle's daily bars; no minute-level reader is provided.
data_portal = DataPortal(
    asset_finder=bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day,
    equity_daily_reader=bundle_data.equity_daily_bar_reader,
    equity_minute_reader=None,
    adjustment_reader=bundle_data.adjustment_reader,
)

# Feature Engineering

## One Hot Encode Sectors
For the model to better understand the sector data, we'll one hot encode this data.

In [None]:
# Sector term built from the sector data file (later added to the pipeline as 'sector_code').
sector = quant_helper.get_sectors(sector_path)

# Lookup from the 'Sector_i' index to the 'Sector' column of the names file.
sector_names = pd.read_csv(sector_file, index_col='Sector_i')
sector_lookup = sector_names['Sector']
sector_lookup

## Alpha Factors

In [None]:
# (column name, factor) pairs, registered on the feature pipeline in this order.
alpha_factors = (
    ('Momentum_1YR_Smoothed',
     quant_factors.momentum_smoothed(252, 5, universe, sector)),
    ('Mean_Reversion_Sector_Neutral_Smoothed',
     quant_factors.mean_reversion_sector_neutral_smoothed(5, universe, sector)),
    ('Overnight_Sentiment_Smoothed',
     quant_factors.overnight_sentiment_smoothed(2, 5, universe)),
)
for column_name, alpha_factor in alpha_factors:
    pipeline.add(alpha_factor, column_name)

## Quant Features

In [None]:
# Ranked and z-scored volatility / dollar-volume features over a short (20) and
# a long (120) window, added as volatility_20d, volatility_120d, adv_20d, adv_120d.
for factor_cls, prefix in ((AnnualizedVolatility, 'volatility'), (AverageDollarVolume, 'adv')):
    for window in (20, 120):
        pipeline.add(
            factor_cls(window_length=window, mask=universe).rank().zscore(),
            '{}_{}d'.format(prefix, window))

# Raw sector code; it is one-hot encoded after the pipeline has run.
pipeline.add(sector, 'sector_code')

## Regime Features

In [None]:
# Moving average of the market dispersion over 20 and 120 bars.
for window in (20, 120):
    dispersion = quant_factors.MarketDispersion(mask=universe)
    pipeline.add(
        SimpleMovingAverage(inputs=[dispersion], window_length=window),
        'dispersion_{}d'.format(window))

# Market volatility over the same two windows.
for window in (20, 120):
    pipeline.add(
        quant_factors.MarketVolatility(window_length=window),
        'market_vol_{}d'.format(window))

In [None]:
# Evaluate every feature column between the factor start date and the universe end date.
all_factors = engine.run_pipeline(
    pipeline,
    start_date=factor_start_date,
    end_date=universe_end_date,
)
all_factors.info()

## Date Features

In [None]:
# Level 0 of the index carries the dates; extract it once for all calendar features.
dates = all_factors.index.get_level_values(0)

all_factors['is_January'] = dates.month == 1
all_factors['is_December'] = dates.month == 12
all_factors['weekday'] = dates.weekday
all_factors['quarter'] = dates.quarter
all_factors['qtr_yr'] = all_factors.quarter.astype('str') + '_' + dates.year.astype('str')

# Flag rows whose date falls on a business month/quarter start or end within the window.
for flag_column, anchor_freq in (('month_end', 'BM'),
                                 ('month_start', 'BMS'),
                                 ('qtr_end', 'BQ'),
                                 ('qtr_start', 'BQS')):
    anchor_dates = pd.date_range(start=factor_start_date, end=universe_end_date, freq=anchor_freq)
    all_factors[flag_column] = dates.isin(anchor_dates)

## OHE Categorical Features

In [None]:
# Share of rows per sector code, largest first, to inspect how balanced the sectors are.
all_factors['sector_code'].value_counts(normalize=True).sort_values(ascending=False)

In [None]:
# One-hot encode the sector code; the generated column names are kept for later selection.
sectors_ohe_df = pd.get_dummies(all_factors['sector_code'], prefix='sector_code')
sectors_ohe_cols = list(sectors_ohe_df.columns)
sectors_ohe_df.iloc[:2]

In [None]:
# One-hot encode the quarter; the generated column names are kept for later selection.
qtr_ohe_df = pd.get_dummies(all_factors['quarter'], prefix='qtr')
qtr_ohe_cols = list(qtr_ohe_df.columns)
qtr_ohe_df.iloc[:2]

In [None]:
# One-hot encode the weekday; the generated column names are kept for later selection.
weekday_ohe_df = pd.get_dummies(all_factors['weekday'], prefix='weekday')
weekday_ohe_cols = list(weekday_ohe_df.columns)
weekday_ohe_df.iloc[:2]

In [None]:
# Attach each block of one-hot columns to the factor frame (sector, quarter, weekday).
for ohe_cols, ohe_df in ((sectors_ohe_cols, sectors_ohe_df),
                         (qtr_ohe_cols, qtr_ohe_df),
                         (weekday_ohe_cols, weekday_ohe_df)):
    all_factors[ohe_cols] = ohe_df

In [None]:
# Final model feature list, grouped by origin; the resulting order is unchanged.
features = (
    # Alpha factors
    ['Mean_Reversion_Sector_Neutral_Smoothed',
     'Momentum_1YR_Smoothed',
     'Overnight_Sentiment_Smoothed']
    # Quant and regime features
    + ['adv_120d', 'adv_20d',
       'dispersion_120d', 'dispersion_20d',
       'market_vol_120d', 'market_vol_20d',
       'volatility_120d', 'volatility_20d']
    # Calendar flags
    + ['is_January', 'is_December',
       'month_start', 'month_end',
       'qtr_end', 'qtr_start']
    # One-hot encoded categoricals
    + sectors_ohe_cols
    + qtr_ohe_cols
    + weekday_ohe_cols
)

In [None]:
# Keep only the model features (drops helper columns such as the raw categoricals).
all_factors = all_factors.loc[:, features].copy()

# Target

In [None]:
# Column names for the raw return and its 2-, 5- and 25-quantile bucketizations.
return_d_raw = 'return_{}d_raw'.format(TAU)
return_d = 'return_{}d'.format(TAU)
return_d_5p = 'return_{}d_5p'.format(TAU)
return_d_25p = 'return_{}d_25p'.format(TAU)

# Returns over a TAU-long window, restricted to the universe.
raw_returns = Returns(window_length=TAU, mask=universe)
pipeline_target.add(raw_returns, return_d_raw)
for n_bins, column_name in ((2, return_d), (5, return_d_5p), (25, return_d_25p)):
    pipeline_target.add(raw_returns.quantiles(n_bins), column_name)

In [None]:
# Evaluate the target pipeline over the same date window as the features.
targets_df = engine.run_pipeline(
    pipeline_target,
    start_date=factor_start_date,
    end_date=universe_end_date,
)
targets_df.head()

In [None]:
# Summary statistics of the raw return within each 2-quantile bucket (rows with NaN dropped).
targets_df.dropna().groupby(return_d)[return_d_raw].describe()

In [None]:
# Cross-tabulate the 2-quantile bucket against the sign of the raw return.
raw_return_is_positive = targets_df[return_d_raw] > 0
pd.crosstab(index=targets_df[return_d], columns=raw_return_is_positive)

Computing two quantiles yields a boundary (the median) that does not necessarily coincide with zero; therefore, a few points with target 0 have positive returns, and vice versa. Any bucketization into an even number of bins should use zero as the threshold; on the other hand, an odd number of quantiles may leave a band around zero if the variable's distribution is well behaved.

In [None]:
# Sign-based label derived from the raw return, used instead of the 2-quantile bucket.
# NOTE(review): utils.safe_positive is defined outside this notebook; presumably it
# maps a return to a positive/non-positive flag while tolerating NaN — confirm.
return_d_raw_is_positive = 'return_{}d_is_positive'.format(TAU)
targets_df[return_d_raw_is_positive] = targets_df[return_d_raw].apply(utils.safe_positive)

In [None]:
# Shift the sign label back by TAU rows within each level-1 group, so that each row
# carries the label observed TAU rows later for the same group.
is_positive_by_asset = targets_df.groupby(level=1)[return_d_raw_is_positive]
targets_df[target_col] = is_positive_by_asset.shift(-TAU)

# Inspect the result ordered by level 1 and then level 0 of the index.
(targets_df[[return_d, target_col]]
 .reset_index()
 .sort_values(['level_1', 'level_0'])
 .head(10))

# Data Preparation

In [None]:
# Drop the last TAU distinct dates: the shifted target has no value for them.
unique_dates = targets_df.index.get_level_values(0).unique()
select_rows = unique_dates[:-TAU]
select_rows[:2]

In [None]:
# Restrict both frames to the selected dates, keeping every level-1 entry for them.
rows_to_keep = pd.IndexSlice[select_rows, :]
targets_df = targets_df.loc[rows_to_keep, :]
all_factors = all_factors.loc[rows_to_keep, :]

In [None]:
# Count missing values per row before filling, so the report reflects the raw data.
all_factors_isna = all_factors.isna().sum(axis=1)

# Interpolate within each asset's own time series. The frame is indexed by
# (date, asset), so a plain DataFrame.interpolate() walks consecutive rows and
# would blend values of neighbouring assets on the same date.
# group_keys=False keeps the original two-level index; the final reindex restores
# the original row order regardless of how the groups are concatenated.
# NOTE(review): linear interpolation also uses the next valid observation, i.e.
# future data for that asset — confirm this is acceptable or switch to a forward fill.
all_factors = (
    all_factors
    .groupby(level=1, group_keys=False)
    .apply(lambda asset_factors: asset_factors.interpolate(method='linear'))
    .reindex(all_factors.index)
)

In [None]:
# Report how many rows had at least one missing feature before interpolation.
rows_with_missing = int((all_factors_isna > 0).sum())
print('Number of rows with any missing value: ', rows_with_missing)

## Dataset conformance checking

In [None]:
# Number of target rows after trimming the last TAU dates.
len(targets_df)

In [None]:
# Number of rows that actually have a target value.
len(targets_df[target_col].dropna())

In [None]:
# Number of feature rows; compare with the number of target rows.
len(all_factors)

In [None]:
# Number of feature rows with no missing value left after interpolation.
len(all_factors.dropna())