# Feature Engineering and Targets

* Alpha Factors
* Universal Quant Features
* Time-based features
* Sector

In [1]:
import yaml
import os

# Retrieve parameters from configuration file
with open("../conf.yml", "r") as ymlfile:
    cfg = yaml.load(ymlfile)

BUNDLE_FOLDER = cfg['quantopian']['dataset3']['bundle_folder']
BUNDLE_NAME = cfg['quantopian']['dataset3']['bundle_name']
SECTOR_FOLDER = cfg['quantopian']['dataset3']['sector_folder']
SECTOR_DATA = cfg['quantopian']['dataset3']['sector_data']
SECTOR_NAMES = cfg['quantopian']['dataset3']['sector_names']

# Specify the bundle path
bundle_path = os.path.join(os.getcwd(), '..', 'data', BUNDLE_FOLDER)
sector_path = os.path.join(os.getcwd(), '..', 'data', SECTOR_FOLDER, SECTOR_DATA)
sector_file = os.path.join(os.getcwd(), '..', 'data', SECTOR_FOLDER, SECTOR_NAMES)
os.path.isdir(bundle_path)

True

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from mle_quant_utils import quant_helper, quant_factors

In [3]:
from zipline.data import bundles
from zipline.utils.calendars import get_calendar
from zipline.pipeline.factors import AverageDollarVolume, Returns, SimpleMovingAverage, AnnualizedVolatility, DailyReturns, CustomFactor, RSI, MACDSignal
from zipline.pipeline import Pipeline
from zipline.data.data_portal import DataPortal
from zipline.pipeline.data import USEquityPricing

In [4]:
import alphalens as al

In [5]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)

In [6]:
# Data Bundle
os.environ['ZIPLINE_ROOT'] = bundle_path
ingest_func = bundles.csvdir.csvdir_equities(['daily'], BUNDLE_NAME)
bundles.register(BUNDLE_NAME, ingest_func)
print('Data Registered')

Data Registered


In [7]:
sector = quant_helper.get_sectors(sector_path)

In [8]:
universe = AverageDollarVolume(window_length=120).top(500) 
trading_calendar = get_calendar('NYSE') 
bundle_data = bundles.load(BUNDLE_NAME)
engine =  quant_helper.build_pipeline_engine(bundle_data, trading_calendar)

In [9]:
universe_end_date = pd.Timestamp('2016-01-05', tz='UTC')
factor_start_date = universe_end_date - pd.DateOffset(years=3, days=2)

In [10]:
pipeline = Pipeline(screen=universe)

## Alpha Factors
* 1 year momentum
* 5 day momentum
* overnight returns

In [11]:
# 1yr returns
def momentum(window_length, universe, sector):
    """
    Higher past 12-month (252 days) returns are proportional to future return
    
    Parameters
    ----------
    window_length : int
        Returns window length
    universe : Zipline Filter
        Universe of stocks filter
    sector : Zipline Classifier
        Sector classifier

    Returns
    -------
    factor : Zipline Factor
        Mean reversion 5 day sector neutral factor
    """
    return Returns(window_length=window_length, mask=universe) \
        .demean(groupby=sector) \
        .rank() \
        .zscore()

def momentum_smoothed(window_length, smooth_window_length, universe, sector):
    """
    Smoothed version of momentum. window_lenghth is used in returns and smoothing computations
     Parameters
    ----------
    smooth_window_length : int
        smoothing factor to applie to SimpleMovingAverage
    """
    unsmoothed_factor = mean_reversion_sector_neutral(window_length, universe, sector)
    return SimpleMovingAverage(inputs=[unsmoothed_factor], window_length=smooth_window_length) \
        .rank() \
        .zscore()

In [12]:
# 5d returns
def mean_reversion_sector_neutral(window_length, universe, sector):
    """
    Short-term outperformers(underperformers) compared to their sector will revert.
    Generate the mean reversion 5 day sector neutral factor

    Parameters
    ----------
    window_length : int
        Returns window length
    universe : Zipline Filter
        Universe of stocks filter
    sector : Zipline Classifier
        Sector classifier

    Returns
    -------
    factor : Zipline Factor
        Mean reversion 5 day sector neutral factor
    """
    return -Returns(window_length=window_length, mask=universe) \
        .demean(groupby=sector) \
        .rank(method='ordinal', ascending=True) \
        .zscore()


def mean_reversion_sector_neutral_smoothed(window_length, universe, sector):
    """
    Smoothed version of mean_reversion_5day_sector_neutral. window_lenghth is used in returns and smoothing computations
    """
    unsmoothed_factor = mean_reversion_sector_neutral(window_length, universe, sector)
    return SimpleMovingAverage(inputs=[unsmoothed_factor], window_length=window_length) \
        .rank() \
        .zscore()


In [13]:
# Overnight returns
class CTO(Returns):
    """
    Computes the overnight return, per hypothesis from
    https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2554010
    """
    inputs = [USEquityPricing.open, USEquityPricing.close]
    
    def compute(self, today, assets, out, opens, closes):
        """
        The opens and closes matrix is 2 rows x N assets, with the most recent at the bottom.
        As such, opens[-1] is the most recent open, and closes[0] is the earlier close
        """
        out[:] = (opens[-1] - closes[0]) / closes[0]

class TrailingOvernightReturns(Returns):
    """
    Sum of trailing 1m O/N returns
    """
    window_safe = True
    
    def compute(self, today, asset_ids, out, cto):
        out[:] = np.nansum(cto, axis=0)
        
def overnight_sentiment(cto_window_length, trail_overnight_returns_window_length, universe):
    cto_out = CTO(mask=universe, window_length=cto_window_length)
    return TrailingOvernightReturns(inputs=[cto_out], window_length=trail_overnight_returns_window_length) \
        .rank() \
        .zscore()

def overnight_sentiment_smoothed(cto_window_length, trail_overnight_returns_window_length, universe):
    unsmoothed_factor = overnight_sentiment(cto_window_length, trail_overnight_returns_window_length, universe)
    return SimpleMovingAverage(inputs=[unsmoothed_factor], window_length=trail_overnight_returns_window_length) \
        .rank() \
        .zscore()


In [14]:
pipeline.add(
    momentum_1yr(252, universe, sector),
    'Momentum_1YR')

pipeline.add(
    momentum_smoothed(252, 5, universe, sector),
    'Momentum_1YR_Smoothed')

NameError: name 'momentum_1yr' is not defined

In [None]:
pipeline.add(
    mean_reversion_sector_neutral(5, universe, sector),
    'Mean_Reversion_Sector_Neutral')

pipeline.add(
    mean_reversion_sector_neutral_smoothed(5, universe, sector),
    'Mean_Reversion_Sector_Neutral_Smoothed')

In [None]:
pipeline.add(
    overnight_sentiment(2, 5, universe),
    'Overnight_Sentiment')
pipeline.add(
    overnight_sentiment_smoothed(2, 5, universe),
    'Overnight_Sentiment_Smoothed')

## Technical Indicators

In [None]:
from zipline.pipeline.factors.technical import BollingerBands

class customBB(BollingerBands):

    inputs = (USEquityPricing.close,)
    outputs = 'lower', 'middle', 'upper', 'ind_upper' ,'ind_lower'
    
    def compute(self, today, assets, out, close, k):
        std = np.nanstd(close, axis=0)
        difference = k * std
        out.middle = middle = np.nanmean(close, axis=0)
        out.upper = middle + difference
        out.lower = middle - difference
        out.close = close[-1,:]
        out.ind_upper = np.where(out.close>out.upper,1,0)
        out.ind_lower = np.where(out.close<out.lower,1,0)
        
class lower_bb(CustomFactor):

    params=('k',)
    inputs = (USEquityPricing.close,)
    window_length = 60
    window_safe = True
    
    def compute(self, today, assets, out, close, k):
        std = np.nanstd(close, axis=0)
        difference = k * std
        middle = middle = np.nanmean(close, axis=0)
        lower = middle - difference
        out[:] = np.where(close[-1,:]<lower,1,0)
        
class upper_bb(CustomFactor):

    params=('k',)
    inputs = (USEquityPricing.close,)
    window_length = 60
    window_safe = True
    
    def compute(self, today, assets, out, close, k):
        std = np.nanstd(close, axis=0)
        difference = k * std
        middle = middle = np.nanmean(close, axis=0)
        upper = middle + difference
        out[:] = np.where(close[-1,:]>upper,1,0)

In [None]:
tech_pl = Pipeline(screen=universe)

In [None]:
tech_pl.add(
    customBB(window_length=60, mask=universe,k =2),
    'customBB_60d')

tech_pl.add(
    upper_bb(window_length=60, mask=universe,k =2),
    'upperBB_60d')

tech_pl.add(
    lower_bb(window_length=60, mask=universe,k =2),
    'lowerBB_60d')

In [None]:
tech_factors = engine.run_pipeline(tech_pl, factor_start_date, universe_end_date)
tech_factors.values[0:5]

In [None]:
pipeline.add(
    RSI(window_length=15, mask=universe).rank().zscore(),
    'RSI_15')
pipeline.add(
    MACDSignal(fast_period=12, slow_period=26, signal_period=9).rank().zscore(),
    'MACD')

pipeline.add(
    upper_bb(window_length=60, mask=universe,k=2),
    'upperBB_60d')
pipeline.add(
    lower_bb(window_length=60, mask=universe,k=2),
    'lowerBB_60d')

## Quant Features
* Stock volatility
* Average dollar volume

Stock volatility: zipline has a custom factor called AnnualizedVolatility.  The [source code is here](https://github.com/quantopian/zipline/blob/master/zipline/pipeline/factors/basic.py) and also pasted below:

```python 
class AnnualizedVolatility(CustomFactor):
    """
    Volatility. The degree of variation of a series over time as measured by
    the standard deviation of daily returns.
    https://en.wikipedia.org/wiki/Volatility_(finance)
    **Default Inputs:** :data:`zipline.pipeline.factors.Returns(window_length=2)`  # noqa
    Parameters
    ----------
    annualization_factor : float, optional
        The number of time units per year. Defaults is 252, the number of NYSE
        trading days in a normal year.
    """
    inputs = [Returns(window_length=2)]
    params = {'annualization_factor': 252.0}
    window_length = 252

    def compute(self, today, assets, out, returns, annualization_factor):
        out[:] = nanstd(returns, axis=0) * (annualization_factor ** .5)
```

In [None]:
pipeline.add(AnnualizedVolatility(window_length=20, mask=universe).rank().zscore(), 'volatility_20d')
pipeline.add(AnnualizedVolatility(window_length=120, mask=universe).rank().zscore(), 'volatility_120d')
pipeline.add(AverageDollarVolume(window_length=20, mask=universe).rank().zscore(), 'adv_20d')
pipeline.add(AverageDollarVolume(window_length=120, mask=universe).rank().zscore(), 'adv_120d')

## Regime Features
* Market dispersion
* Market volatility

Capture market-wide regimes (look at the aggregate movement of the universe of stocks)

High and low dispersion: dispersion is looking at the dispersion (standard deviation) of the cross section of all stocks at each period of time (on each day).  We'll inherit from [CustomFactor](http://www.zipline.io/appendix.html?highlight=customfactor#zipline.pipeline.CustomFactor).  Feed in [DailyReturns](http://www.zipline.io/appendix.html?highlight=dailyreturns#zipline.pipeline.factors.DailyReturns) as the `inputs`.  

### Market dispersion feature
Create a class that inherits from `CustomFactor`.  Override the `compute` function to calculate the population standard deviation of all the stocks over a specified window of time.

$$\mu = \sum_{t=0}^{T}\sum_{i=1}^{N}r_{i,t}$$

$$\sqrt{\frac{1}{T} \sum_{t=0}^{T}  \frac{1}{N}\sum_{i=1}^{N}(r_{i,t} - \mu)^2}$$

Use [numpy.nanmean](https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.nanmean.html) to calculate the average market return $\mu$ and to calculate the average of the squared differences.


In [None]:
class MarketDispersion(CustomFactor):
    inputs = [DailyReturns()]
    window_length = 1
    window_safe = True

    def compute(self, today, assets, out, returns):
        
        # calculate average returns to simulate market as an equal weighted portfolio
        mean_returns = np.nanmean(returns)
        
        # calculate standard deviation of returns
        out[:] = np.sqrt(np.nanmean((returns - mean_returns)**2))

In [None]:
pipeline.add(SimpleMovingAverage(inputs=[MarketDispersion(mask=universe)], window_length=20), 'dispersion_20d')
pipeline.add(SimpleMovingAverage(inputs=[MarketDispersion(mask=universe)], window_length=120), 'dispersion_120d')


### Market volatility feature 
Class for market volatility, which inherits from [CustomFactor](http://www.zipline.io/appendix.html?highlight=customfactor#zipline.pipeline.CustomFactor).  This will measure the standard deviation of the returns of the "market".  In this case, "market" is approximated as the equal weighted average return of all the stocks in the stock universe.

#### Market return
$$r_{m,t} = \frac{1}{N}\sum_{i=1}^{N}r_{i,t}$$ for each day $t$ in `window_length`.  

#### Average market return
Also calculate the average market return over the `window_length` $T$ of days:  
$$\mu_{m} = \frac{1}{T}\sum_{t=1}^{T} r_{m,t}$$

#### Standard deviation of market return
Then calculate the standard deviation of the market return  
$$\sigma_{m,t} = \sqrt{252 \times \frac{1}{N} \sum_{t=1}^{T}(r_{m,t} - \mu_{m})^2 } $$

In [None]:
class MarketVolatility(CustomFactor):
    inputs = [DailyReturns()]
    window_length = 1
    window_safe = True
    params = {'annualization_factor': 252.0}
    
    def compute(self, today, assets, out, returns, annualization_factor):
        
        """ 
        For each row (each row represents one day of returns), 
        calculate the average of the cross-section of stock returns
        So that market_returns has one value for each day in the window_length
        So choose the appropriate axis (please see hints above)
        """
        mkt_returns = np.nanmean(returns, axis=1) 
        
        # Calculate the mean of market returns
        mkt_returns_mu = np.nanmean(mkt_returns)

        # Calculate the standard deviation of the market returns, then annualize them.
        out[:] = np.sqrt(annualization_factor * np.nanmean((mkt_returns-mkt_returns_mu)**2))

In [None]:
pipeline.add(MarketVolatility(window_length=20), 'market_vol_20d')
pipeline.add(MarketVolatility(window_length=120), 'market_vol_120d')

In [None]:
all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)
all_factors.info()

## Evaluation
* Clean factor and Forward returns
* Compute evaluation metrics

In [None]:
data_portal = DataPortal(
    bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day,
    equity_minute_reader=None,
    equity_daily_reader=bundle_data.equity_daily_bar_reader,
    adjustment_reader=bundle_data.adjustment_reader)

In [None]:
universe_tickers = engine\
    .run_pipeline(
        Pipeline(screen=universe),
        universe_end_date,
        universe_end_date)\
    .index.get_level_values(1)\
    .values.tolist()

In [None]:
pricing = quant_helper.get_pricing(
        data_portal,
        trading_calendar,
        universe_tickers,
        factor_start_date,
        universe_end_date)

In [None]:
factor_names = ['Momentum_1YR', 'Momentum_1YR_Smoothed',
                'Mean_Reversion_Sector_Neutral', 'Mean_Reversion_Sector_Neutral_Smoothed',
                'Overnight_Sentiment', 'Overnight_Sentiment_Smoothed',
                'volatility_20d', 'volatility_120d', 'adv_20d', 'adv_120d',
                'RSI_15', 'MACD']

In [None]:
print("The factor names are {}".format(factor_names))

# Use a dictionary to store each dataframe, one for each factor and its associated forward returns
factor_data = {}
for factor_name in factor_names:
    print("Formatting factor data for: " + factor_name)
    factor_data[factor_name] = al.utils.get_clean_factor_and_forward_returns(
        factor=all_factors[factor_name],
        prices=pricing,
        periods=[5])

In [None]:
df_factor_return, df_sharpe, df_rank_ic, df_fra, df_qr = quant_factors.factor_evaluation(factor_data, factor_names)

### Sharpe Ratio

In [None]:
df_sharpe.sort_values(by='Sharpe ratio', ascending=False)

### Forward Returns

### Quantile Analysis

### Turnover (FRA)