# Feature Engineering and Targets

* Alpha Factors
* Universal Quant Features
* Time-based Features
* Sector

In [1]:
import yaml
import os

# Retrieve parameters from configuration file
with open("../conf.yml", "r") as ymlfile:
    # safe_load restricts parsing to plain YAML types. Calling yaml.load()
    # without an explicit Loader is deprecated (PyYAML 5.1+) and rejected
    # outright by PyYAML >= 6, so the bare call is not portable.
    cfg = yaml.safe_load(ymlfile)

# Every setting used by this notebook is read from quantopian -> dataset3.
BUNDLE_FOLDER = cfg['quantopian']['dataset3']['bundle_folder']
BUNDLE_NAME = cfg['quantopian']['dataset3']['bundle_name']
SECTOR_FOLDER = cfg['quantopian']['dataset3']['sector_folder']
SECTOR_DATA = cfg['quantopian']['dataset3']['sector_data']
SECTOR_NAMES = cfg['quantopian']['dataset3']['sector_names']

# Specify the bundle path
# All paths are resolved relative to the kernel's working directory (../data/...).
bundle_path = os.path.join(os.getcwd(), '..', 'data', BUNDLE_FOLDER)
sector_path = os.path.join(os.getcwd(), '..', 'data', SECTOR_FOLDER, SECTOR_DATA)
sector_file = os.path.join(os.getcwd(), '..', 'data', SECTOR_FOLDER, SECTOR_NAMES)
# Last expression is the cell output: sanity check that the bundle folder exists.
os.path.isdir(bundle_path)

True

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from mle_quant_utils import quant_helper, quant_factors

In [3]:
from zipline.data import bundles
from zipline.utils.calendars import get_calendar
from zipline.pipeline.factors import AverageDollarVolume, Returns, SimpleMovingAverage, AnnualizedVolatility, DailyReturns, CustomFactor, RSI, MACDSignal
from zipline.pipeline import Pipeline
from zipline.data.data_portal import DataPortal
from zipline.pipeline.data import USEquityPricing

In [4]:
import alphalens as al

In [5]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)

In [6]:
# Data Bundle
# ZIPLINE_ROOT is set before registering so that zipline resolves bundle data
# under this project's bundle folder (bundle_path comes from the config cell).
os.environ['ZIPLINE_ROOT'] = bundle_path
# Create the csvdir ingest callable for the 'daily' data frequency and register
# it under BUNDLE_NAME so the bundle can later be loaded by that name.
ingest_func = bundles.csvdir.csvdir_equities(['daily'], BUNDLE_NAME)
bundles.register(BUNDLE_NAME, ingest_func)
print('Data Registered')

Data Registered


In [7]:
# Load sector data from sector_path through the project helper.
# NOTE(review): the returned object's type is defined in mle_quant_utils and is
# not used again in the visible part of this notebook — confirm before relying on it.
sector = quant_helper.get_sectors(sector_path)

In [8]:
# Universe: top 500 assets by AverageDollarVolume over a 120-bar window.
universe = AverageDollarVolume(window_length=120).top(500) 
# NYSE trading calendar, shared by the pipeline engine and the data portal.
trading_calendar = get_calendar('NYSE') 
# Load the bundle registered above and build the pipeline engine on top of it.
bundle_data = bundles.load(BUNDLE_NAME)
engine =  quant_helper.build_pipeline_engine(bundle_data, trading_calendar)

In [9]:
# End of the study window (UTC) and its start: 3 years and 2 days earlier.
universe_end_date = pd.Timestamp('2016-01-05').tz_localize('UTC')
factor_start_date = universe_end_date - pd.DateOffset(days=2, years=3)

## Technical Indicators

In [10]:
from zipline.pipeline.factors.technical import BollingerBands

class customBB(BollingerBands):
    """Bollinger Bands extended with two band-breakout indicators.

    Outputs
    -------
    lower, middle, upper
        ``middle`` is the NaN-ignoring mean of the close window; ``upper`` and
        ``lower`` are ``middle`` plus/minus ``k`` times the NaN-ignoring
        standard deviation of the same window.
    ind_upper
        1 where the last row of the close window is strictly above ``upper``,
        else 0.
    ind_lower
        1 where the last row of the close window is strictly below ``lower``,
        else 0.

    ``k`` is not declared here; it is expected to be supplied as a parameter
    by the ``BollingerBands`` base class.
    """

    inputs = (USEquityPricing.close,)
    outputs = 'lower', 'middle', 'upper', 'ind_upper', 'ind_lower'

    def compute(self, today, assets, out, close, k):
        difference = k * np.nanstd(close, axis=0)
        out.middle = middle = np.nanmean(close, axis=0)
        out.upper = upper = middle + difference
        out.lower = lower = middle - difference

        # Keep the last close in a local: 'close' is not a declared output, so
        # assigning it onto `out` only creates a stray Python attribute on the
        # output array (and would silently become a field write if an output
        # of that name were ever added).
        last_close = close[-1, :]
        # Comparisons against NaN are False, so missing data yields 0.
        out.ind_upper = np.where(last_close > upper, 1, 0)
        out.ind_lower = np.where(last_close < lower, 1, 0)
        
class lower_bb(CustomFactor):
    """Indicator factor: 1 where the last close is below the lower Bollinger band.

    The lower band is the NaN-ignoring mean of the close window minus ``k``
    times its NaN-ignoring standard deviation. The output is 1 where the last
    row of the window is strictly below that band and 0 otherwise (including
    when either side is NaN, since NaN comparisons are False).

    Parameters
    ----------
    k : number
        Multiplier applied to the standard deviation.
    """

    params = ('k',)
    inputs = (USEquityPricing.close,)
    window_length = 60
    window_safe = True

    def compute(self, today, assets, out, close, k):
        middle = np.nanmean(close, axis=0)
        lower = middle - k * np.nanstd(close, axis=0)
        out[:] = np.where(close[-1, :] < lower, 1, 0)
        
class upper_bb(CustomFactor):
    """Indicator factor: 1 where the last close is above the upper Bollinger band.

    The upper band is the NaN-ignoring mean of the close window plus ``k``
    times its NaN-ignoring standard deviation. The output is 1 where the last
    row of the window is strictly above that band and 0 otherwise (including
    when either side is NaN, since NaN comparisons are False).

    Parameters
    ----------
    k : number
        Multiplier applied to the standard deviation.
    """

    params = ('k',)
    inputs = (USEquityPricing.close,)
    window_length = 60
    window_safe = True

    def compute(self, today, assets, out, close, k):
        middle = np.nanmean(close, axis=0)
        upper = middle + k * np.nanstd(close, axis=0)
        out[:] = np.where(close[-1, :] > upper, 1, 0)

In [14]:
# Empty pipeline screened by the universe filter; the technical-indicator
# columns are added to it in the following cells.
tech_pl = Pipeline(screen=universe)

In [15]:
# Register the three 60-bar, k=2 Bollinger columns in one pass.
# NOTE(review): customBB is a multi-output factor, so 'customBB_60d' ends up
# holding one tuple per row (visible in the displayed pipeline values) —
# consider adding each output as its own column.
for bb_column, bb_factor in (
        ('customBB_60d', customBB),
        ('upperBB_60d', upper_bb),
        ('lowerBB_60d', lower_bb)):
    tech_pl.add(bb_factor(window_length=60, mask=universe, k=2), bb_column)

In [16]:
# RSI and MACD signal columns. Both are masked to the universe, matching the
# Bollinger columns added to this pipeline; the pipeline is already screened by
# the same filter, so the mask only limits which assets are computed.
# Raw factor values are used: the .rank().zscore() normalisation that was
# sketched next to these calls is left disabled.
tech_pl.add(
    RSI(window_length=15, mask=universe),
    'RSI_15')

tech_pl.add(
    MACDSignal(fast_period=12, slow_period=26, signal_period=9, mask=universe),
    'MACD')
    

In [22]:
# Run the technical-indicator pipeline over the whole study window.
all_factors = engine.run_pipeline(tech_pl, factor_start_date, universe_end_date)
# Peek at the first five rows as a raw array (the customBB column shows up as tuples).
all_factors.values[0:5]

array([[0.4035546314743341, 57.82508630609895,
        (59.979418203347045, 63.57166666666667, 67.1639151299863, 0.0, 0.0),
        0.0, 0.0],
       [0.18580160400429813, 66.57276995305165,
        (10.73721382067231, 11.979116666666668, 13.221019512661027, 1.0, 0.0),
        0.0, 1.0],
       [-0.7944264680723929, 47.83897643938223,
        (64.04825398510597, 72.33553333333333, 80.62281268156069, 0.0, 0.0),
        0.0, 0.0],
       [-1.5739213819571556, 53.04889442541264,
        (62.286520296786996, 73.31343333333334, 84.34034636987967, 0.0, 0.0),
        0.0, 0.0],
       [nan, nan,
        (28.865000000000002, 28.865000000000002, 28.865000000000002, 0.0, 0.0),
        0.0, 0.0]], dtype=object)

## Evaluation
* Clean the factor data and compute forward returns
* Compute evaluation metrics

In [18]:
# Data portal over the loaded bundle: daily bars and adjustments come from the
# bundle readers; no minute-bar reader is supplied.
data_portal = DataPortal(
    bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day,
    equity_minute_reader=None,
    equity_daily_reader=bundle_data.equity_daily_bar_reader,
    adjustment_reader=bundle_data.adjustment_reader)

In [19]:
# Assets that pass the universe screen on the final date: run a screen-only
# pipeline for that single day and take level 1 of the resulting index.
universe_tickers = (
    engine
    .run_pipeline(Pipeline(screen=universe), universe_end_date, universe_end_date)
    .index
    .get_level_values(1)
    .values
    .tolist()
)

In [20]:
# Fetch pricing for the end-date universe over the factor window via the
# project helper; the result is passed to alphalens as `prices` below.
# NOTE(review): which price field get_pricing returns is defined in
# mle_quant_utils — confirm there.
pricing = quant_helper.get_pricing(
        data_portal,
        trading_calendar,
        universe_tickers,
        factor_start_date,
        universe_end_date)

  end_dt = pd.Timestamp(end_date.strftime('%Y-%m-%d'), tz='UTC', offset='C')
  start_dt = pd.Timestamp(start_date.strftime('%Y-%m-%d'), tz='UTC', offset='C')


In [23]:
# Pipeline columns to evaluate; the Bollinger columns are not included.
factor_names = ['RSI_15', 'MACD']

In [24]:
print("The factor names are {}".format(factor_names))

# Use a dictionary to store each dataframe, one for each factor and its associated forward returns
factor_data = {}
for factor_name in factor_names:
    print("Formatting factor data for: " + factor_name)
    # One alphalens-cleaned frame per factor, keyed by the factor name.
    # periods=[5] requests a single forward-return horizon of 5 periods
    # (presumably trading days, given daily pricing — confirm).
    factor_data[factor_name] = al.utils.get_clean_factor_and_forward_returns(
        factor=all_factors[factor_name],
        prices=pricing,
        periods=[5])

The factor names are ['RSI_15', 'MACD']
Formatting factor data for: RSI_15
Dropped 0.8% entries from factor data: 0.8% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Formatting factor data for: MACD
Dropped 1.0% entries from factor data: 1.0% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!


In [25]:
# Evaluate every factor with the project helper. Per its log output it computes
# factor-weighted returns and Sharpe ratio, ranked IC, FRA and quantile returns;
# the five results are unpacked in that order.
df_factor_return, df_sharpe, df_rank_ic, df_fra, df_qr = quant_factors.factor_evaluation(factor_data, factor_names)

Calculating the factor weighted returns and sharpe-ratio for: RSI_15
Calculating the Ranked IC for: RSI_15
Calculating the FRA for: RSI_15
Calculating Quantile returns for: RSI_15
Calculating the factor weighted returns and sharpe-ratio for: MACD
Calculating the Ranked IC for: MACD
Calculating the FRA for: MACD
Calculating Quantile returns for: MACD


### Sharpe Ratio

In [26]:
# Show the factors ordered by Sharpe ratio, highest first.
df_sharpe.sort_values(by='Sharpe ratio', ascending=False)

Unnamed: 0,Sharpe ratio
MACD,-0.392008
RSI_15,-0.793536


### Forward Returns

### Quantile Analysis

### Turnover (Factor Rank Autocorrelation, FRA)