# Feature Engineering and Targets

* Alpha Factors
* Universal Quant Features
* Time-based features
* Sector

In [1]:
import yaml
import os

# Retrieve parameters from the configuration file.
# yaml.safe_load only builds plain Python objects (dicts, lists, scalars).
# The bare yaml.load(f) form can construct arbitrary objects and is rejected
# by PyYAML >= 6 because no Loader is supplied.
with open("../conf.yml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# Every setting used by this notebook lives under quantopian -> dataset3.
BUNDLE_FOLDER = cfg['quantopian']['dataset3']['bundle_folder']
BUNDLE_NAME = cfg['quantopian']['dataset3']['bundle_name']
SECTOR_FOLDER = cfg['quantopian']['dataset3']['sector_folder']
SECTOR_DATA = cfg['quantopian']['dataset3']['sector_data']
SECTOR_NAMES = cfg['quantopian']['dataset3']['sector_names']

# Resolve the bundle and sector paths relative to the sibling ../data directory.
bundle_path = os.path.join(os.getcwd(), '..', 'data', BUNDLE_FOLDER)
sector_path = os.path.join(os.getcwd(), '..', 'data', SECTOR_FOLDER, SECTOR_DATA)
sector_file = os.path.join(os.getcwd(), '..', 'data', SECTOR_FOLDER, SECTOR_NAMES)

# Displayed as the cell output: confirms the bundle directory exists.
os.path.isdir(bundle_path)

True

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import quant_helper, quant_factors

In [39]:
from zipline.data import bundles
from zipline.utils.calendars import get_calendar
from zipline.pipeline.factors import AnnualizedVolatility, AverageDollarVolume, Returns, SimpleMovingAverage, RSI, MACDSignal
from zipline.pipeline.factors.technical import BollingerBands
from zipline.pipeline import Pipeline
from zipline.data.data_portal import DataPortal

In [4]:
import alphalens as al

In [5]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)

In [58]:
# Data Bundle
# Point zipline at the local bundle directory and register the csvdir bundle.
os.environ['ZIPLINE_ROOT'] = bundle_path
ingest_func = bundles.csvdir.csvdir_equities(['daily'], BUNDLE_NAME)

# Register only once: calling register() again for a name that is already
# registered makes zipline warn about overwriting the bundle, which is what a
# re-run of this cell would otherwise do.
if BUNDLE_NAME not in bundles.bundles:
    bundles.register(BUNDLE_NAME, ingest_func)
print('Data Registered')

Data Registered


  after removing the cwd from sys.path.


In [57]:
# Load the registered bundle and build the pipeline engine on the NYSE calendar.
trading_calendar = get_calendar('NYSE')
bundle_data = bundles.load(BUNDLE_NAME)
engine = quant_helper.build_pipeline_engine(bundle_data, trading_calendar)

# Universe filter: top 500 assets by 120-day average dollar volume.
universe = AverageDollarVolume(window_length=120).top(500)

In [56]:
# The pipelines are evaluated over a window that ends on universe_end_date and
# starts three years and two days earlier.
universe_end_date = pd.Timestamp(year=2016, month=1, day=5, tz='UTC')
lookback = pd.DateOffset(years=3, days=2)
factor_start_date = universe_end_date - lookback

In [20]:
# Three independent pipelines, all screened to the same universe:
# main features, prediction targets, and technical indicators.
pipeline, pipeline_target, tech_pl = (Pipeline(screen=universe) for _ in range(3))

## One Hot Encode Sectors
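So that the model treats sector as a category rather than as an ordered number, we'll one-hot encode the sector data. The sector codes and their names are loaded here; the one-hot columns themselves are created further below, once the factor data is available.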

In [10]:
# Sector data from the helper module, plus the sector-code -> sector-name table
# read from the CSV (indexed by the 'Sector_i' column).
sector = quant_helper.get_sectors(sector_path)
sector_lookup = pd.read_csv(sector_file, index_col='Sector_i').loc[:, 'Sector']
sector_lookup

Sector_i
0                 Healthcare
1                 Technology
2         Consumer Defensive
3                Industrials
4                  Utilities
5         Financial Services
6                Real Estate
7     Communication Services
8          Consumer Cyclical
9                     Energy
10           Basic Materials
Name: Sector, dtype: object

## Alpha Factors

In [11]:
# Alpha factors from the local quant_factors module, registered in this order.
alpha_factors = [
    ('Momentum_1YR',
     quant_factors.momentum_1yr(252, universe, sector)),
    ('Mean_Reversion_Sector_Neutral_Smoothed',
     quant_factors.mean_reversion_5day_sector_neutral_smoothed(20, universe, sector)),
    ('Overnight_Sentiment_Smoothed',
     quant_factors.overnight_sentiment_smoothed(2, 10, universe)),
]
for factor_name, factor in alpha_factors:
    pipeline.add(factor, factor_name)

In [12]:
# Built-in technical indicators. Both are masked to the universe, matching
# every other built-in factor in this notebook, so they are only computed for
# assets that survive the pipeline screen.
pipeline.add(
    RSI(window_length=15, mask=universe),
    'RSI_15')
pipeline.add(
    MACDSignal(fast_period=12, slow_period=26, signal_period=9, mask=universe),
    'MACD')

## Quant Features

In [13]:
# Ranked and z-scored volatility and dollar-volume features over a short
# (20-day) and a long (120-day) window, followed by the raw sector code.
for window in (20, 120):
    pipeline.add(
        AnnualizedVolatility(window_length=window, mask=universe).rank().zscore(),
        'volatility_{}d'.format(window))
for window in (20, 120):
    pipeline.add(
        AverageDollarVolume(window_length=window, mask=universe).rank().zscore(),
        'adv_{}d'.format(window))
pipeline.add(sector, 'sector_code')

## Regime Features

In [14]:
# Regime features: moving average of market dispersion, then market
# volatility, each over a 20-day and a 120-day window.
for window in (20, 120):
    dispersion = quant_factors.MarketDispersion(mask=universe)
    pipeline.add(
        SimpleMovingAverage(inputs=[dispersion], window_length=window),
        'dispersion_{}d'.format(window))
for window in (20, 120):
    pipeline.add(
        quant_factors.MarketVolatility(window_length=window),
        'market_vol_{}d'.format(window))

In [15]:
# Evaluate every column registered on the main pipeline between the two dates,
# then summarise the resulting frame (row count, columns, non-null counts).
all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)
all_factors.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 363734 entries, (2013-01-03 00:00:00+00:00, Equity(0 [A])) to (2016-01-05 00:00:00+00:00, Equity(490 [ZTS]))
Data columns (total 14 columns):
MACD                                      363083 non-null float64
Mean_Reversion_Sector_Neutral_Smoothed    360790 non-null float64
Momentum_1YR                              357288 non-null float64
Overnight_Sentiment_Smoothed              363734 non-null float64
RSI_15                                    363711 non-null float64
adv_120d                                  363734 non-null float64
adv_20d                                   363734 non-null float64
dispersion_120d                           363734 non-null float64
dispersion_20d                            363734 non-null float64
market_vol_120d                           363734 non-null float64
market_vol_20d                            363734 non-null float64
sector_code                               363734 non-null int64
volatility_120d   

In [214]:
from zipline.pipeline.data import USEquityPricing

class customBB(BollingerBands):
    """Bollinger Bands extended with two band-breakout indicator outputs.

    Outputs
    -------
    lower, middle, upper
        ``middle`` is the NaN-ignoring mean of the close window; ``upper`` and
        ``lower`` are ``middle`` plus / minus ``k`` NaN-ignoring standard
        deviations of the same window.
    ind_upper
        1 where the last close in the window is above ``upper``, otherwise 0.
    ind_lower
        1 where the last close in the window is below ``lower``, otherwise 0.
    """

    inputs = (USEquityPricing.close,)
    outputs = 'lower', 'middle', 'upper', 'ind_upper', 'ind_lower'

    def compute(self, today, assets, out, close, k):
        """Fill ``out`` with the bands and breakout flags for one date.

        ``close`` is the window of close prices (rows are dates, columns are
        assets) and ``k`` is the band width in standard deviations.
        """
        middle = np.nanmean(close, axis=0)
        band_width = k * np.nanstd(close, axis=0)
        upper = middle + band_width
        lower = middle - band_width

        # Keep the last row of the window in a local variable rather than
        # stashing it on ``out``: ``close`` is not a declared output.
        last_close = close[-1, :]

        out.middle = middle
        out.upper = upper
        out.lower = lower
        out.ind_upper = np.where(last_close > upper, 1, 0)
        out.ind_lower = np.where(last_close < lower, 1, 0)

In [218]:
# Evaluate the technical-indicator pipeline over the same date range and show
# the first row of raw values.
# NOTE(review): no visible cell adds a column to tech_pl, yet the output below
# carries a 'BB_60d' column. Presumably a `tech_pl.add(customBB(...), 'BB_60d')`
# cell was run and later removed; confirm and restore it so that
# Restart & Run All reproduces this output.
tech_factors = engine.run_pipeline(tech_pl, factor_start_date, universe_end_date)
tech_factors.values[0]

array([(59.979418203347045, 63.57166666666667, 67.1639151299863, 0.0, 0.0)],
      dtype=object)

In [219]:
# First value of the 'BB_60d' column: one tuple holding all factor outputs.
tech_factors['BB_60d'].head(1).values

array([(59.979418203347045, 63.57166666666667, 67.1639151299863, 0.0, 0.0)],
      dtype=object)

## Date Features

In [None]:
# Calendar features derived from the date level (level 0) of the factor index.
factor_dates = all_factors.index.get_level_values(0)

all_factors['is_January'] = factor_dates.month == 1
all_factors['is_December'] = factor_dates.month == 12
all_factors['weekday'] = factor_dates.weekday
all_factors['quarter'] = factor_dates.quarter
all_factors['qtr_yr'] = all_factors.quarter.astype('str') + '_' + factor_dates.year.astype('str')

# Flag rows whose date is a business month/quarter boundary.
# NOTE(review): these pandas offsets follow the business-day calendar, not the
# trading calendar, so a boundary that falls on a market holiday flags no row.
# Verify that this is acceptable.
boundary_freqs = [
    ('month_end', 'BM'),
    ('month_start', 'BMS'),
    ('qtr_end', 'BQ'),
    ('qtr_start', 'BQS'),
]
for column, freq in boundary_freqs:
    boundary_dates = pd.date_range(start=factor_start_date, end=universe_end_date, freq=freq)
    all_factors[column] = factor_dates.isin(boundary_dates)

In [None]:
# One boolean indicator column per sector code listed in sector_lookup.
sector_columns = ['sector_code_{}'.format(code) for code in sector_lookup.index]
for sector_i, sector_column in zip(sector_lookup.index, sector_columns):
    all_factors[sector_column] = all_factors['sector_code'] == sector_i

all_factors[sector_columns].head()

In [None]:
# Columns used as features, in three groups: factor / quant / regime columns,
# the one-hot sector indicators (the raw 'sector_code' is deliberately left
# out), and the calendar features.
# NOTE(review): 'month_end' and 'qtr_yr' are computed on all_factors but are
# not listed here; confirm the omission is intentional.
features = (
    [
        'Mean_Reversion_Sector_Neutral_Smoothed',
        'Momentum_1YR',
        'Overnight_Sentiment_Smoothed',
        'adv_120d',
        'adv_20d',
        'dispersion_120d',
        'dispersion_20d',
        'market_vol_120d',
        'market_vol_20d',
        'volatility_120d',
        'volatility_20d',
    ]
    + ['sector_code_{}'.format(code) for code in range(11)]
    + [
        'is_January',
        'is_December',
        'weekday',
        'quarter',
        'month_start',
        'qtr_end',
        'qtr_start',
    ]
)

## Target

In [None]:
# Target columns, all built from the same 5-row Returns factor: the raw
# return, a 2-bucket quantile label, and a 5-bucket quantile label.
window_return = Returns(window_length=5, mask=universe)
pipeline_target.add(window_return, 'return_5d_raw')
pipeline_target.add(window_return.quantiles(2), 'return_5d')
pipeline_target.add(window_return.quantiles(5), 'return_5d_p')


In [None]:
# Evaluate the target pipeline over the same date range as the features and
# preview the first rows.
targets_df = engine.run_pipeline(pipeline_target, factor_start_date, universe_end_date)
targets_df.head()

In [None]:
# Within each asset (index level 1), pull the 'return_5d' label from 5 rows
# ahead back onto the current row; that shifted value is the target.
targets_df['target'] = targets_df['return_5d'].groupby(level=1).shift(-5)

# Show both columns sorted by asset, then date, to check the shift by eye.
(targets_df[['return_5d', 'target']]
    .reset_index()
    .sort_values(['level_1', 'level_0'])
    .head(10))

In [None]:
# Verify the pipeline's raw returns against price data, using the first
# column of prices_df as the ticker to check.
# NOTE(review): prices_df is not defined in any visible cell. Presumably it was
# built in a cell that has since been removed; confirm and restore it so the
# notebook runs from a fresh kernel.
ticker = prices_df.columns[0]
# Dates to spot-check. Business days only, and localized to UTC so the labels
# match the tz-aware, trading-day date level of the pipeline output. A plain
# pd.date_range here would be tz-naive and would include the weekend of
# 2014-01-04/05, which has no rows to look up.
dates = list(pd.bdate_range('2014-01-03', '2014-01-07', tz='UTC'))

In [None]:
# Raw 5-row return reported by the pipeline for the chosen ticker on the
# spot-check dates (MultiIndex lookup: level 0 = date, level 1 = asset).
targets_df.loc[(dates, ticker) , 'return_5d_raw']

In [None]:
# pct_change(tau) = (p[t] - p[t - tau]) / p[t - tau]
# The pipeline's "5d" return uses a 5-row window, which spans 4 price changes,
# so tau = 4 is the matching lag here.
prices_df[ticker].pct_change(4).loc['2014-01-01':].head()

In [None]:
# Manual check for a single date.
# current row: 2014-01-02; tau = 4 rows earlier: 2013-12-26
price_now = prices_df.loc['2014-01-02', ticker]
price_then = prices_df.loc['2013-12-26', ticker]
(price_now - price_then) / price_then

In [None]:
# Print the evaluated date range, then summarise the targets frame.
print('{} {}'.format(factor_start_date, universe_end_date))
targets_df.info()