# Feature Engineering and Targets

* Alpha Factors
* Universal Quant Features
* Time-based features
* Sector

In [1]:
import yaml
import os

# Retrieve parameters from configuration file.
# safe_load is used instead of yaml.load: calling yaml.load without an explicit
# Loader is deprecated (and an error on recent PyYAML) and can construct
# arbitrary Python objects from the file.
with open("../conf.yml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# All settings used by this notebook live under quantopian -> dataset3.
dataset_cfg = cfg['quantopian']['dataset3']
BUNDLE_FOLDER = dataset_cfg['bundle_folder']
BUNDLE_NAME = dataset_cfg['bundle_name']
SECTOR_FOLDER = dataset_cfg['sector_folder']
SECTOR_DATA = dataset_cfg['sector_data']
SECTOR_NAMES = dataset_cfg['sector_names']

# Specify the bundle path (everything is resolved relative to ../data from the
# notebook's working directory).
data_root = os.path.join(os.getcwd(), '..', 'data')
bundle_path = os.path.join(data_root, BUNDLE_FOLDER)
sector_path = os.path.join(data_root, SECTOR_FOLDER, SECTOR_DATA)
sector_file = os.path.join(data_root, SECTOR_FOLDER, SECTOR_NAMES)

# Displayed as the cell output: sanity check that the bundle folder exists.
os.path.isdir(bundle_path)

True

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from mle_quant_utils import quant_helper, quant_factors

In [3]:
from zipline.data import bundles
from zipline.utils.calendars import get_calendar
from zipline.pipeline.factors import AnnualizedVolatility, AverageDollarVolume, Returns, SimpleMovingAverage, RSI, MACDSignal
from zipline.pipeline.factors.technical import BollingerBands
from zipline.pipeline import Pipeline
from zipline.data.data_portal import DataPortal

In [4]:
import alphalens as al

In [5]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)

In [6]:
# Data Bundle
# Export the bundle folder as ZIPLINE_ROOT before the bundle is registered and
# loaded (assumption: zipline uses this variable to locate bundle data — confirm).
os.environ['ZIPLINE_ROOT'] = bundle_path
# Build a csvdir ingest function for the 'daily' timeframe and register it
# under the configured bundle name so bundles.load(BUNDLE_NAME) can find it.
ingest_func = bundles.csvdir.csvdir_equities(['daily'], BUNDLE_NAME)
bundles.register(BUNDLE_NAME, ingest_func)
print('Data Registered')

Data Registered


In [7]:
# Pipeline engine: bundle data evaluated on the NYSE trading calendar.
bundle_data = bundles.load(BUNDLE_NAME)
trading_calendar = get_calendar('NYSE')
engine = quant_helper.build_pipeline_engine(bundle_data, trading_calendar)

# Stock universe: the 500 names with the highest 120-day average dollar volume.
universe = AverageDollarVolume(window_length=120).top(500)

In [8]:
# Sample window: ends on 2016-01-05 (UTC) and reaches back 3 years and 2 days.
universe_end_date = pd.Timestamp('2016-01-05', tz='UTC')
lookback_window = pd.DateOffset(years=3, days=2)
factor_start_date = universe_end_date - lookback_window

In [9]:
# Two separate pipelines screened to the same universe: `pipeline` collects the
# model features, `pipeline_target` collects the return-based targets.
pipeline, pipeline_target = Pipeline(screen=universe), Pipeline(screen=universe)

## One Hot Encode Sectors
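For the model to better understand the sector data, we'll one-hot encode it. This cell loads the sector classifier and the code-to-name lookup; the encoding itself is done in the OHE section further down.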

In [10]:
# Sector classifier used by the pipeline factors below.
sector = quant_helper.get_sectors(sector_path)

# Lookup from integer sector code (Sector_i) to the sector's name.
sector_names_df = pd.read_csv(sector_file, index_col='Sector_i')
sector_lookup = sector_names_df['Sector']
sector_lookup

Sector_i
0                 Healthcare
1                 Technology
2         Consumer Defensive
3                Industrials
4                  Utilities
5         Financial Services
6                Real Estate
7     Communication Services
8          Consumer Cyclical
9                     Energy
10           Basic Materials
Name: Sector, dtype: object

## Alpha Factors

In [11]:
# Alpha factors, listed as (output column, factor term) and registered in order.
alpha_factors = (
    ('Momentum_1YR',
     quant_factors.momentum_1yr(252, universe, sector)),
    ('Mean_Reversion_Sector_Neutral_Smoothed',
     quant_factors.mean_reversion_5day_sector_neutral_smoothed(20, universe, sector)),
    ('Overnight_Sentiment_Smoothed',
     quant_factors.overnight_sentiment_smoothed(2, 10, universe)),
)
for column_name, factor_term in alpha_factors:
    pipeline.add(factor_term, column_name)

## Quant Features

In [12]:
# Cross-sectional features: each raw factor is ranked and then z-scored.
# Entries are (factor class, window length in days, output column).
ranked_feature_specs = (
    (AnnualizedVolatility, 20, 'volatility_20d'),
    (AnnualizedVolatility, 120, 'volatility_120d'),
    (AverageDollarVolume, 20, 'adv_20d'),
    (AverageDollarVolume, 120, 'adv_120d'),
)
for factor_cls, window, column_name in ranked_feature_specs:
    raw_factor = factor_cls(window_length=window, mask=universe)
    pipeline.add(raw_factor.rank().zscore(), column_name)

# The sector classifier is added as-is; it is one-hot encoded later on.
pipeline.add(sector, 'sector_code')

## Regime Features

In [13]:
# Market dispersion, smoothed with a simple moving average over each window.
for window, column_name in ((20, 'dispersion_20d'), (120, 'dispersion_120d')):
    dispersion = quant_factors.MarketDispersion(mask=universe)
    pipeline.add(
        SimpleMovingAverage(inputs=[dispersion], window_length=window),
        column_name)

# Market volatility over the same two windows.
for window, column_name in ((20, 'market_vol_20d'), (120, 'market_vol_120d')):
    pipeline.add(quant_factors.MarketVolatility(window_length=window), column_name)

In [14]:
# Evaluate every column registered on the feature pipeline between
# factor_start_date and universe_end_date; the result is indexed by
# (date, asset), as the .info() summary below shows.
all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)
# Column overview with non-null counts, to spot factors that have missing values.
all_factors.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 363734 entries, (2013-01-03 00:00:00+00:00, Equity(0 [A])) to (2016-01-05 00:00:00+00:00, Equity(490 [ZTS]))
Data columns (total 12 columns):
Mean_Reversion_Sector_Neutral_Smoothed    360790 non-null float64
Momentum_1YR                              357288 non-null float64
Overnight_Sentiment_Smoothed              363734 non-null float64
adv_120d                                  363734 non-null float64
adv_20d                                   363734 non-null float64
dispersion_120d                           363734 non-null float64
dispersion_20d                            363734 non-null float64
market_vol_120d                           363734 non-null float64
market_vol_20d                            363734 non-null float64
sector_code                               363734 non-null int64
volatility_120d                           363714 non-null float64
volatility_20d                            363714 non-null float64
dtypes: float64(11), int64(1)

## Date Features

In [15]:
# Trading date of every row (level 0 of the (date, asset) MultiIndex),
# extracted once and reused for all calendar features.
row_dates = all_factors.index.get_level_values(0)

all_factors['is_January'] = row_dates.month == 1
all_factors['is_December'] = row_dates.month == 12
all_factors['weekday'] = row_dates.weekday
all_factors['quarter'] = row_dates.quarter
all_factors['qtr_yr'] = all_factors.quarter.astype('str') + '_' + row_dates.year.astype('str')


def _period_boundary_sessions(sessions, period_ids, calendar_starts, calendar_ends):
    """Return the first and last trading session of each period.

    Parameters
    ----------
    sessions : pd.DatetimeIndex
        Sorted, unique trading dates present in the data.
    period_ids : np.ndarray
        One integer per session identifying its period (month or quarter).
    calendar_starts, calendar_ends : pd.DatetimeIndex
        Business-day calendar boundaries, used only for the very first / very
        last session, whose neighbouring session lies outside the sample.

    Returns
    -------
    (pd.DatetimeIndex, pd.DatetimeIndex)
        Sessions that open a period and sessions that close a period.
    """
    if len(sessions) == 0:
        return sessions, sessions
    # A session opens a period when the previous session belongs to another
    # period, and closes one when the next session does.
    period_changes = period_ids[1:] != period_ids[:-1]
    is_first = np.concatenate(([False], period_changes))
    is_last = np.concatenate((period_changes, [False]))
    # Sample edges have no neighbour to compare with: keep the calendar rule there.
    is_first[0] = sessions[0] in calendar_starts
    is_last[-1] = sessions[-1] in calendar_ends
    return sessions[is_first], sessions[is_last]


# Month/quarter boundaries are taken from the sessions that actually traded.
# The pandas business-day frequencies (BM/BMS/BQ/BQS) only skip weekends, so a
# boundary falling on an exchange holiday (e.g. Jan 1, Good Friday) matched no
# row at all and the flag stayed False for that whole period.
sessions = row_dates.unique().sort_values()
calendar_span = dict(start=factor_start_date, end=universe_end_date)
month_ids = np.asarray(sessions.year) * 12 + np.asarray(sessions.month)
quarter_ids = np.asarray(sessions.year) * 4 + np.asarray(sessions.quarter)

month_starts, month_ends = _period_boundary_sessions(
    sessions, month_ids,
    pd.date_range(freq='BMS', **calendar_span),
    pd.date_range(freq='BM', **calendar_span))
qtr_starts, qtr_ends = _period_boundary_sessions(
    sessions, quarter_ids,
    pd.date_range(freq='BQS', **calendar_span),
    pd.date_range(freq='BQ', **calendar_span))

all_factors['month_end'] = row_dates.isin(month_ends)
all_factors['month_start'] = row_dates.isin(month_starts)
all_factors['qtr_end'] = row_dates.isin(qtr_ends)
all_factors['qtr_start'] = row_dates.isin(qtr_starts)

## One-Hot Encode (OHE) Categorical Features

In [16]:
# Share of rows per sector code, largest first (-1 is the code shown for rows
# without a sector in the output below).
sector_share = all_factors['sector_code'].value_counts(normalize=True)
sector_share.sort_values(ascending=False)

 8     0.153401
 3     0.144061
 5     0.138662
 0     0.126469
 1     0.113264
 2     0.072883
 9     0.064517
 6     0.060355
 4     0.056800
 10    0.041624
 7     0.020812
-1     0.007154
Name: sector_code, dtype: float64

In [17]:
# One indicator column per sector code, named sector_code_<code>.
sector_codes = all_factors['sector_code']
sectors_ohe_df = pd.get_dummies(sector_codes, prefix='sector_code')
sectors_ohe_cols = list(sectors_ohe_df.columns)
sectors_ohe_df.head(2)

Unnamed: 0,Unnamed: 1,sector_code_-1,sector_code_0,sector_code_1,sector_code_2,sector_code_3,sector_code_4,sector_code_5,sector_code_6,sector_code_7,sector_code_8,sector_code_9,sector_code_10
2013-01-03 00:00:00+00:00,Equity(0 [A]),0,1,0,0,0,0,0,0,0,0,0,0
2013-01-03 00:00:00+00:00,Equity(1 [AAL]),0,0,0,0,1,0,0,0,0,0,0,0


In [18]:
# One indicator column per calendar quarter, named qtr_<n>.
quarters = all_factors['quarter']
qtr_ohe_df = pd.get_dummies(quarters, prefix='qtr')
qtr_ohe_cols = list(qtr_ohe_df.columns)
qtr_ohe_df.head(2)

Unnamed: 0,Unnamed: 1,qtr_1,qtr_2,qtr_3,qtr_4
2013-01-03 00:00:00+00:00,Equity(0 [A]),1,0,0,0
2013-01-03 00:00:00+00:00,Equity(1 [AAL]),1,0,0,0


In [19]:
# One indicator column per weekday number, named weekday_<n>.
weekdays = all_factors['weekday']
weekday_ohe_df = pd.get_dummies(weekdays, prefix='weekday')
weekday_ohe_cols = list(weekday_ohe_df.columns)
weekday_ohe_df.head(2)

Unnamed: 0,Unnamed: 1,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4
2013-01-03 00:00:00+00:00,Equity(0 [A]),0,0,0,1,0
2013-01-03 00:00:00+00:00,Equity(1 [AAL]),0,0,0,1,0


In [20]:
# Attach every block of one-hot columns to the factor frame, in the same order
# as before: sectors, quarters, weekdays.
ohe_blocks = (
    (sectors_ohe_cols, sectors_ohe_df),
    (qtr_ohe_cols, qtr_ohe_df),
    (weekday_ohe_cols, weekday_ohe_df),
)
for ohe_cols, ohe_df in ohe_blocks:
    all_factors[ohe_cols] = ohe_df

In [21]:
# Factor columns produced by the pipeline run.
pipeline_feature_cols = [
    'Mean_Reversion_Sector_Neutral_Smoothed',
    'Momentum_1YR',
    'Overnight_Sentiment_Smoothed',
    'adv_120d',
    'adv_20d',
    'dispersion_120d',
    'dispersion_20d',
    'market_vol_120d',
    'market_vol_20d',
    'volatility_120d',
    'volatility_20d',
]

# Boolean calendar flags derived from the row dates.
calendar_flag_cols = [
    'is_January',
    'is_December',
    'month_start',
    'month_end',
    'qtr_end',
    'qtr_start',
]

# Final model inputs. The raw sector_code / quarter / weekday / qtr_yr columns
# are left out; only their one-hot encodings are kept.
features = (pipeline_feature_cols + calendar_flag_cols
            + sectors_ohe_cols + qtr_ohe_cols + weekday_ohe_cols)

In [22]:
# Keep only the model inputs, as an independent copy of the selection.
all_factors = all_factors.loc[:, features].copy()

## Target

In [23]:
# Return over a 5-row window, shared by the three target columns:
# the raw value, a 2-bucket quantile label and a 5-bucket quantile label.
returns_5d = Returns(window_length=5, mask=universe)
pipeline_target.add(returns_5d, 'return_5d_raw')
pipeline_target.add(returns_5d.quantiles(2), 'return_5d')
pipeline_target.add(returns_5d.quantiles(5), 'return_5d_p')

In [24]:
# Evaluate the target pipeline over the same date range as the features, so the
# two frames share the same (date, asset) index layout.
targets_df = engine.run_pipeline(pipeline_target, factor_start_date, universe_end_date)
# Preview; in the output below, rows with no raw return carry -1 in both quantile columns.
targets_df.head()

Unnamed: 0,Unnamed: 1,return_5d,return_5d_p,return_5d_raw
2013-01-03 00:00:00+00:00,Equity(0 [A]),0,0,0.01382
2013-01-03 00:00:00+00:00,Equity(1 [AAL]),1,4,0.056681
2013-01-03 00:00:00+00:00,Equity(2 [AAP]),0,2,0.026772
2013-01-03 00:00:00+00:00,Equity(3 [AAPL]),1,4,0.070221
2013-01-03 00:00:00+00:00,Equity(4 [ABBV]),-1,-1,


In [25]:
# Forward-looking label: within each asset (index level 1), pull the quantile
# label from 5 rows ahead back onto the current row. The last 5 rows of every
# asset have no future value and become NaN.
# NOTE(review): a -1 label in return_5d is shifted into `target` unchanged —
# confirm it is filtered out before training.
labels_by_asset = targets_df['return_5d'].groupby(level=1)
targets_df['target'] = labels_by_asset.shift(-5)

# Show one asset's rows in date order to eyeball the shift.
label_check = targets_df[['return_5d','target']].reset_index()
label_check = label_check.sort_values(['level_1', 'level_0'])
label_check.head(10)

Unnamed: 0,level_0,level_1,return_5d,target
0,2013-01-03 00:00:00+00:00,Equity(0 [A]),0,0.0
471,2013-01-04 00:00:00+00:00,Equity(0 [A]),0,0.0
942,2013-01-07 00:00:00+00:00,Equity(0 [A]),0,0.0
1413,2013-01-08 00:00:00+00:00,Equity(0 [A]),0,1.0
1884,2013-01-09 00:00:00+00:00,Equity(0 [A]),0,0.0
2355,2013-01-10 00:00:00+00:00,Equity(0 [A]),0,0.0
2826,2013-01-11 00:00:00+00:00,Equity(0 [A]),0,0.0
3297,2013-01-14 00:00:00+00:00,Equity(0 [A]),0,0.0
3768,2013-01-15 00:00:00+00:00,Equity(0 [A]),1,0.0
4239,2013-01-16 00:00:00+00:00,Equity(0 [A]),0,0.0
