<font size="6"> **Feature Engineering and Targets** </font>

This notebook runs a full feature engineering pipeline and persists a three-year span of factor data.

Alpha factors and universal quant features are computed using Zipline.

In [1]:
# Default output file names for the factors and the forward returns.
# NOTE: both names are reassigned from the configuration file further down.
OUTFILE1, OUTFILE2 = 'all_factors_3yrs.csv', 'forw_returns_3yrs.csv'



In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local code are picked up.
%load_ext autoreload
%autoreload 2

In [3]:
# Execute the shared notebook configuration script in this namespace.
# NOTE(review): `pd` is used below without an explicit import in this
# notebook; presumably nb_config.py provides it -- confirm.
%run ../nb_config.py

In [4]:
import os

from src import utils
from src.load_data import io_utils as read_utils
from src.feat_eng import date_features, feat_utils
from src.mle_quant_utils import quant_helper, quant_factors, mle_utils

In [5]:
from zipline.data import bundles
from zipline.utils.calendars import get_calendar
from zipline.pipeline.factors import AnnualizedVolatility, AverageDollarVolume, Returns, SimpleMovingAverage, RSI, MACDSignal
from zipline.pipeline.factors.technical import BollingerBands
from zipline.pipeline import Pipeline
from zipline.data.data_portal import DataPortal

import alphalens as al

In [47]:
# Load the project configuration and resolve the market-data locations.
cfg = utils.read_conf()

dataset_cfg = cfg['quantopian']['dataset3']
BUNDLE_FOLDER = dataset_cfg['bundle_folder']
BUNDLE_NAME = dataset_cfg['bundle_name']
SECTOR_FOLDER = dataset_cfg['sector_folder']
SECTOR_DATA = dataset_cfg['sector_data']
SECTOR_NAMES = dataset_cfg['sector_names']

# Every input used here sits under <raw_path>/market_data
market_data_dir = os.path.join(read_utils.raw_path, 'market_data')
bundle_path = os.path.join(market_data_dir, BUNDLE_FOLDER)
sector_path = os.path.join(market_data_dir, SECTOR_FOLDER, SECTOR_DATA)
sector_file = os.path.join(market_data_dir, SECTOR_FOLDER, SECTOR_NAMES)

# Sanity check: the bundle folder must exist
os.path.isdir(bundle_path)

True

In [48]:
# Output file names are taken from the 'interim' output section of the config.
interim_outputs = cfg['output']['interim']
OUTFILE1 = interim_outputs['all_factors']
OUTFILE2 = interim_outputs['all_forwreturns']

In [7]:
# Target horizon (cast to int) and target column name for the alpha ML model.
alpha_ml_cfg = cfg['models']['alpha_ml']
TAU = int(alpha_ml_cfg['tau'])
target_col = alpha_ml_cfg['target_col']

In [8]:
# Name of the partition column and the configured train/valid/test sizes.
alpha_ml_cfg = cfg['models']['alpha_ml']
split_col = alpha_ml_cfg['split_col']
splits = alpha_ml_cfg['splits']

In [9]:
# Universe definition: average-dollar-volume window and number of top names kept.
universe_cfg = cfg['models']['universe']
adv_win = int(universe_cfg['window'])
adv_top = int(universe_cfg['adv_top'])

In [10]:
# Date bounds (UTC) for the pipeline runs.
# NOTE(review): the universe end is read from the 'start' key and the factor
# start from the 'end_alpha' key -- the key names look inverted relative to the
# variable names; confirm against the configuration file.
universe_cfg = cfg['models']['universe']
universe_end_date = pd.Timestamp(universe_cfg['start'], tz='UTC')  # pd.Timestamp('2016-01-05', tz='UTC')
factor_start_date = pd.Timestamp(universe_cfg['end_alpha'], tz='UTC')  # universe_end_date - pd.DateOffset(years=3, days=2)

In [11]:
# Show the resolved date bounds, start first.
print(factor_start_date, universe_end_date)

2013-01-03 00:00:00+00:00 2016-01-05 00:00:00+00:00


# Data Loading

## Universe

In [12]:
# Data Bundle: point ZIPLINE_ROOT at the bundle folder, then register a
# csvdir equities bundle with 'daily' data under BUNDLE_NAME.
os.environ['ZIPLINE_ROOT'] = bundle_path
ingest_func = bundles.csvdir.csvdir_equities(['daily'], BUNDLE_NAME)
bundles.register(BUNDLE_NAME, ingest_func)
print('Data Registered')

Data Registered


In [13]:
# Report the universe parameters read from the configuration.
print('Universe ADV window: {} and top threshold: {}'.format(adv_win, adv_top))

Universe ADV window: 120 and top threshold: 500


In [14]:
# Report the date range used for the pipeline runs. Arguments are passed
# start-first so that each value lines up with its label in the message.
print('Universe start: {} and end: {} dates'.format(factor_start_date, universe_end_date))

Universe start: 2016-01-05 00:00:00+00:00 and end: 2013-01-03 00:00:00+00:00 dates


In [15]:
# Market data handles: NYSE trading calendar and the registered bundle.
trading_calendar = get_calendar('NYSE')
bundle_data = bundles.load(BUNDLE_NAME)

# Universe screen: top `adv_top` assets by average dollar volume computed
# with a window length of `adv_win`.
universe = AverageDollarVolume(window_length=adv_win).top(adv_top)

# Pipeline engine built by the project helper from the bundle and calendar.
engine = quant_helper.build_pipeline_engine(bundle_data, trading_calendar)

In [16]:
# Two separate pipelines screened by the same universe: one collects the
# features, the other collects the return targets.
pipeline = Pipeline(screen=universe)
pipeline_target = Pipeline(screen=universe)

## Data Portal

In [17]:
# Data portal over the bundle's daily bars; the minute reader is left unset.
daily_bar_reader = bundle_data.equity_daily_bar_reader
data_portal = DataPortal(
    bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=daily_bar_reader.first_trading_day,
    equity_daily_reader=daily_bar_reader,
    equity_minute_reader=None,
    adjustment_reader=bundle_data.adjustment_reader,
)

## Sector Look-up

In [18]:
# `sector` is built by the project helper from the sector data file; it is
# later added to the pipeline as the 'sector_code' column.
sector = quant_helper.get_sectors(sector_path)
# Code -> sector name mapping: 'Sector' column indexed by 'Sector_i'.
sector_lookup = pd.read_csv(sector_file, index_col='Sector_i')['Sector']
sector_lookup

Sector_i
0                 Healthcare
1                 Technology
2         Consumer Defensive
3                Industrials
4                  Utilities
5         Financial Services
6                Real Estate
7     Communication Services
8          Consumer Cyclical
9                     Energy
10           Basic Materials
Name: Sector, dtype: object

# Feature Engineering

## Alpha Factors

In [19]:
# Smoothed alpha factors, listed as (output column name, factor) pairs and
# registered on the feature pipeline in that order.
alpha_factors = [
    ('Momentum_1YR_Smoothed',
     quant_factors.momentum_smoothed(252, 5, universe, sector)),
    ('Mean_Reversion_Sector_Neutral_Smoothed',
     quant_factors.mean_reversion_sector_neutral_smoothed(5, universe, sector)),
    ('Overnight_Sentiment_Smoothed',
     quant_factors.overnight_sentiment_smoothed(2, 5, universe)),
]
for alpha_name, alpha_factor in alpha_factors:
    pipeline.add(alpha_factor, alpha_name)

## Quant Features

In [20]:
# Volatility and average-dollar-volume features at 20 and 120 windows, each
# ranked and z-scored within the universe mask.
for quant_factor_cls, quant_prefix in ((AnnualizedVolatility, 'volatility'),
                                       (AverageDollarVolume, 'adv')):
    for quant_window in (20, 120):
        pipeline.add(
            quant_factor_cls(window_length=quant_window, mask=universe).rank().zscore(),
            '{}_{}d'.format(quant_prefix, quant_window))

# Raw sector code, one-hot encoded further down.
pipeline.add(sector, 'sector_code')

## Regime Features

In [21]:
# Moving average of market dispersion over 20 and 120 windows.
for regime_window in (20, 120):
    pipeline.add(
        SimpleMovingAverage(
            inputs=[quant_factors.MarketDispersion(mask=universe)],
            window_length=regime_window),
        'dispersion_{}d'.format(regime_window))

# Market volatility over the same two windows.
for regime_window in (20, 120):
    pipeline.add(
        quant_factors.MarketVolatility(window_length=regime_window),
        'market_vol_{}d'.format(regime_window))

In [22]:
# Run the feature pipeline from factor_start_date to universe_end_date and
# print a summary of the resulting frame.
all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)
all_factors.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 363734 entries, (2013-01-03 00:00:00+00:00, Equity(0 [A])) to (2016-01-05 00:00:00+00:00, Equity(490 [ZTS]))
Columns: 12 entries, Mean_Reversion_Sector_Neutral_Smoothed to volatility_20d
dtypes: float64(11), int64(1)
memory usage: 34.7+ MB


## Date Features

In [23]:
# Calendar-based features computed by the project helper from the factor frame
# and the two date bounds; remember their column names for the feature list.
all_date_features = date_features.compute_date_features(
    all_factors,
    factor_start_date,
    universe_end_date,
)
date_cols = list(all_date_features.columns)
all_date_features.sample(5)

Unnamed: 0,Unnamed: 1,is_January,is_December,month_end,month_start,qtr_end,qtr_start,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,qtr_1,qtr_2,qtr_3,qtr_4
2014-11-25 00:00:00+00:00,Equity(284 [LYB]),0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2015-06-05 00:00:00+00:00,Equity(242 [IR]),0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2013-04-15 00:00:00+00:00,Equity(11 [ADP]),0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
2013-01-30 00:00:00+00:00,Equity(386 [RL]),1,0,0,0,0,0,0,0,1,0,0,1,0,0,0
2014-01-15 00:00:00+00:00,Equity(438 [TXN]),1,0,0,0,0,0,0,0,1,0,0,1,0,0,0


## OHE Sector

In [24]:
# Share of rows per sector code, largest first.
all_factors['sector_code'].value_counts(normalize=True).sort_values(ascending=False)

 8     0.153401
 3     0.144061
 5     0.138662
 0     0.126469
 1     0.113264
 2     0.072883
 9     0.064517
 6     0.060355
 4     0.056800
 10    0.041624
 7     0.020812
-1     0.007154
Name: sector_code, dtype: float64

In [25]:
# One-hot encode the sector code (columns prefixed with 'sector_code') and
# keep the generated column names for the feature list.
sectors_ohe_df = pd.get_dummies(all_factors['sector_code'], prefix='sector_code')
sectors_ohe_cols = list(sectors_ohe_df.columns)
sectors_ohe_df.sample(5)

Unnamed: 0,Unnamed: 1,sector_code_-1,sector_code_0,sector_code_1,sector_code_2,sector_code_3,sector_code_4,sector_code_5,sector_code_6,sector_code_7,sector_code_8,sector_code_9,sector_code_10
2015-08-17 00:00:00+00:00,Equity(38 [AMZN]),0,0,0,0,0,0,0,0,0,1,0,0
2013-11-20 00:00:00+00:00,Equity(80 [CBG]),0,0,0,0,0,0,0,1,0,0,0,0
2013-10-16 00:00:00+00:00,Equity(321 [NEE]),0,0,0,0,0,1,0,0,0,0,0,0
2014-03-28 00:00:00+00:00,Equity(113 [CRM]),0,0,1,0,0,0,0,0,0,0,0,0
2014-05-30 00:00:00+00:00,Equity(306 [MON]),0,0,0,0,0,0,0,0,0,0,0,1


In [26]:
# Append the sector dummies and the date features as extra columns.
# Note: this reassigns `all_factors`, so re-running the cell adds the columns again.
all_factors = pd.concat([all_factors, sectors_ohe_df, all_date_features], axis=1)

In [27]:
# Final feature set: pipeline factors first, then the sector dummies and the
# date features. The raw 'sector_code' column is not part of it.
factor_cols = [
    'Mean_Reversion_Sector_Neutral_Smoothed',
    'Momentum_1YR_Smoothed',
    'Overnight_Sentiment_Smoothed',
    'adv_120d',
    'adv_20d',
    'dispersion_120d',
    'dispersion_20d',
    'market_vol_120d',
    'market_vol_20d',
    'volatility_120d',
    'volatility_20d',
]
features = factor_cols + sectors_ohe_cols + date_cols

In [28]:
# Keep only the selected feature columns, as an independent copy.
all_factors = all_factors[features].copy()

# Target

In [29]:
# Column names for the raw return over a TAU window and its quantile buckets.
return_d_raw = 'return_{}d_raw'.format(TAU)
return_d = 'return_{}d'.format(TAU)
return_d_5p = 'return_{}d_5p'.format(TAU)
return_d_25p = 'return_{}d_25p'.format(TAU)

# Build the returns term once and derive the 2-, 5- and 25-quantile versions from it.
tau_returns = Returns(window_length=TAU, mask=universe)
pipeline_target.add(tau_returns, return_d_raw)
for n_buckets, bucket_col in ((2, return_d), (5, return_d_5p), (25, return_d_25p)):
    pipeline_target.add(tau_returns.quantiles(n_buckets), bucket_col)

In [30]:
# Run the target pipeline over the same date range as the features.
targets_df = engine.run_pipeline(pipeline_target, factor_start_date, universe_end_date)
targets_df.head()

Unnamed: 0,Unnamed: 1,return_5d,return_5d_25p,return_5d_5p,return_5d_raw
2013-01-03 00:00:00+00:00,Equity(0 [A]),0,2,0,0.01382
2013-01-03 00:00:00+00:00,Equity(1 [AAL]),1,22,4,0.056681
2013-01-03 00:00:00+00:00,Equity(2 [AAP]),0,10,2,0.026772
2013-01-03 00:00:00+00:00,Equity(3 [AAPL]),1,24,4,0.070221
2013-01-03 00:00:00+00:00,Equity(4 [ABBV]),-1,-1,-1,


In [31]:
# Summary statistics of the raw return within each 2-quantile bucket
# (rows containing NaN are dropped first).
targets_df.dropna().groupby(return_d)[return_d_raw].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
return_5d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,182042.0,-0.014613,0.024828,-0.50828,-0.025614,-0.010641,0.001088,0.051352
1,181612.0,0.020367,0.02597,-0.099715,0.005502,0.017179,0.031336,0.709748


In [32]:
# Cross-tabulate the 2-quantile bucket against the sign of the raw return
# to see how far the bucket boundary is from zero.
pd.crosstab(index=targets_df[return_d],
            columns=targets_df[return_d_raw]>0)

return_5d_raw,False,True
return_5d,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,80,0
0,132859,49183
1,29628,151984


Splitting the returns into two quantiles places the boundary at the median rather than at zero; as a result, some observations labelled 0 have positive returns and vice versa (see the cross-tabulation above). Any bucketization with an even number of bins should use zero as the threshold instead. With an odd number of quantiles, on the other hand, the middle bucket may leave a band around zero, provided the return distribution is well behaved.

In [33]:
# Forward target: within each asset (index level 1), shift the raw return by
# -TAU rows so each row carries the value found TAU rows later.
# NOTE(review): the shift is by rows, not by dates; it presumably assumes each
# asset has one row per trading day -- verify, since the universe screen may
# drop an asset on some dates.
# NOTE(review): the new column names hard-code '5d' while TAU comes from the
# configuration.
targets_df['return_5d_fshift'] = targets_df.groupby(level=1)[return_d_raw].shift(-TAU)
# Sign flag derived by the project helper feat_utils.safe_positive.
targets_df['return_5d_fshift_is_positive'] = targets_df['return_5d_fshift'].apply(feat_utils.safe_positive)

# Inspect the first rows ordered by asset, then by date.
targets_df[['return_5d_fshift', 'return_5d_fshift_is_positive']].reset_index().sort_values(['level_1', 'level_0']).head(10)

Unnamed: 0,level_0,level_1,return_5d_fshift,return_5d_fshift_is_positive
0,2013-01-03 00:00:00+00:00,Equity(0 [A]),-0.009169,0.0
471,2013-01-04 00:00:00+00:00,Equity(0 [A]),-0.007723,0.0
942,2013-01-07 00:00:00+00:00,Equity(0 [A]),-0.012743,0.0
1413,2013-01-08 00:00:00+00:00,Equity(0 [A]),0.032639,1.0
1884,2013-01-09 00:00:00+00:00,Equity(0 [A]),0.002336,1.0
2355,2013-01-10 00:00:00+00:00,Equity(0 [A]),-0.03568,0.0
2826,2013-01-11 00:00:00+00:00,Equity(0 [A]),0.00133,1.0
3297,2013-01-14 00:00:00+00:00,Equity(0 [A]),-0.01973,0.0
3768,2013-01-15 00:00:00+00:00,Equity(0 [A]),-0.036124,0.0
4239,2013-01-16 00:00:00+00:00,Equity(0 [A]),-0.000665,0.0


# Data Preparation

In [34]:
# Unique dates (index level 0) with the last TAU dates removed; those dates
# have no forward value after the shift above.
select_rows = targets_df.index.get_level_values(0).unique()[:-TAU]
select_rows[:2]

DatetimeIndex(['2013-01-03', '2013-01-04'], dtype='datetime64[ns, UTC]', freq=None)

In [35]:
# Avoid NaNs due to shifting: keep only the selected dates, for every asset,
# in both the targets and the features so they stay aligned.
targets_df = targets_df.loc[(select_rows, slice(None)), :]
all_factors = all_factors.loc[(select_rows, slice(None)), :]

In [36]:
# Count missing values per row before filling them (reported in the next cell).
all_factors_isna = all_factors.isna().sum(axis=1)
# Fill gaps by linear interpolation.
# NOTE(review): interpolation runs down each column in row order, so a gap is
# presumably filled from neighbouring rows that belong to other assets --
# confirm this is intended.
all_factors = all_factors.interpolate(method='linear')

In [37]:
# Rows that had at least one missing value before interpolation.
print('Number of rows with any missing value: ', len(all_factors_isna[all_factors_isna>0]))

Number of rows with any missing value:  6393


## Dataset conformance checking

In [38]:
# Number of target rows.
len(targets_df)

361284

In [39]:
# Number of target rows with a non-missing sign flag; should equal the total above.
len(targets_df['return_5d_fshift_is_positive'].dropna())

361284

In [40]:
# Number of feature rows; should match the number of target rows.
len(all_factors)

361284

In [41]:
# Number of feature rows without any missing value; should equal the total above.
len(all_factors.dropna())

361284

# Data Partitioning

Train/Valid/Test must be partitioned taking into account two facts:
* Data is time series
* Do not split a single day in two

```python
def train_valid_test_split(all_x, all_y, train_size, valid_size, test_size):
    """
    Generate the train, validation, and test dataset. TimeSeries splitting
    Returns
    -------
    x_train, x_valid, x_test: DataFrame (MultiIndex)
    y_train, y_valid, y_test: Pandas Series (MultiIndex)
    """
```

In [42]:
# Time-ordered split using the sizes from the configuration (cast to float).
# The target passed here is the raw forward return, not the sign flag.
X_train, X_valid, X_test, y_train, y_valid, y_test = mle_utils.train_valid_test_split(
    all_factors, targets_df['return_5d_fshift'], float(splits['train']), float(splits['valid']), float(splits['test']))

In [43]:
# Label every row with the partition it belongs to. pd.concat is used instead
# of chained Series.append calls, which are deprecated and removed in recent
# pandas releases. verify_integrity=True still raises if the three indexes
# share any entry.
splits_s = pd.concat(
    [
        pd.Series(index=y_train.index, data=['train'] * len(y_train)),
        pd.Series(index=y_valid.index, data=['valid'] * len(y_valid)),
        pd.Series(index=y_test.index, data=['test'] * len(y_test)),
    ],
    verify_integrity=True,
)
splits_s.name = split_col
# Share of rows per partition.
splits_s.value_counts(normalize=True)

train    0.595147
test     0.203870
valid    0.200983
Name: split, dtype: float64

In [44]:
# Attach the partition label to the features (aligned on the index).
all_factors[split_col] = splits_s

# Write Data

In [45]:
# Convert the frame with the project helper, then write it as CSV to the
# interim path under OUTFILE1.
all_factors_cast = quant_helper.cast_zipline_multidx_to_symbol(all_factors)
all_factors_cast.to_csv(read_utils.interim_path + OUTFILE1)

In [46]:
# Convert the targets frame with the project helper and write it as CSV to the
# interim path under OUTFILE2. The converted targets frame is the one written,
# so OUTFILE2 holds the forward returns rather than a second copy of the factors.
targets_df_cast = quant_helper.cast_zipline_multidx_to_symbol(targets_df)
targets_df_cast.to_csv(read_utils.interim_path + OUTFILE2)