### Smoothing Exercise
create a universe of stocks of QTradableStocksUS from the following website

https://www.quantopian.com/posts/working-on-our-best-universe-yet-qtradablestocksus

In [1]:
# Imports
from quantopian.pipeline.data import Fundamentals
from quantopian.pipeline.data import morningstar as mstar
from quantopian.pipeline.factors import AverageDollarVolume
from quantopian.pipeline.factors.morningstar import MarketCap
from quantopian.pipeline.classifiers.morningstar import Sector
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline import Pipeline
from quantopian.research import run_pipeline

from quantopian.pipeline.factors import SimpleMovingAverage
from quantopian.pipeline.factors import Returns
from zipline.pipeline.factors import DailyReturns

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame as df
import time

from quantopian.pipeline.experimental import QTradableStocksUS

### Pipeline construction using QTradableStocks as Screen

In [2]:


def make_pipeline():
    
    average_day_dv_200 = AverageDollarVolume(window_length = 200)
    market_cap = Fundamentals.market_cap.latest
    price = USEquityPricing.close.latest
    volume = USEquityPricing.volume.latest
    sector = Sector()
    
    # Ranked REturn
    returns = Returns(window_length = 252)
    ranked_return = returns.rank()
    
    # New factors
    # create a factor of 1 year returns, demeaned by sector, rank, zscore
    factor = (
        Returns(window_length = 252).\
        demean(groupby = Sector()).\
        rank().\
        zscore()
    
    )
    
    # Use the newly created factor as an input into SimpleMovingAverage, with a window length of 5
    # Also rank and zscore (don't need to demean by sector)
    factor_smoothed = (
                    SimpleMovingAverage(inputs = [factor], window_length=5).\
                    rank().\
                    zscore()
    )
    
    
    return Pipeline(
        columns={
            'AverageDollarVolume200': average_day_dv_200,
            'MarketCap': market_cap,
            'Price': price,
            'Volume': volume,
            'Sector': sector,
            '1yrReturns': factor,
            '5dAvgReturns': factor_smoothed,
            'ranked_return': ranked_return
        },
        screen = QTradableStocksUS()
    )

In [3]:
# Pipeline is run over this time range and outputs a dataframe indexed by asset name:
START_DATE = '2017'
END_DATE = '2019-07-30'

QTU_pipline2 = run_pipeline(make_pipeline(), START_DATE, END_DATE, chunksize = 252)



In [4]:
QTU_pipline2

Unnamed: 0,Unnamed: 1,1yrReturns,5dAvgReturns,AverageDollarVolume200,MarketCap,Price,Sector,Volume,ranked_return
2017-01-03 00:00:00+00:00,Equity(2 [ARNC]),-1.663162,-1.694499,1.736107e+08,8.129387e+09,18.550,101,3871123.0,609.0
2017-01-03 00:00:00+00:00,Equity(24 [AAPL]),0.242636,0.270176,3.048351e+09,6.175885e+11,115.840,311,23251752.0,3760.0
2017-01-03 00:00:00+00:00,Equity(31 [ABAX]),0.731982,0.638386,5.583182e+06,1.189172e+09,52.740,206,79145.0,1770.0
2017-01-03 00:00:00+00:00,Equity(41 [ARCB]),0.571808,0.623764,3.861210e+06,7.084623e+08,27.750,310,118470.0,5667.0
2017-01-03 00:00:00+00:00,Equity(52 [ABM]),1.027217,0.888291,8.639352e+06,2.268896e+09,40.680,310,257808.0,6267.0
2017-01-03 00:00:00+00:00,Equity(53 [ABMD]),1.407291,1.366831,4.395544e+07,4.888170e+09,112.700,206,91316.0,5479.0
2017-01-03 00:00:00+00:00,Equity(62 [ABT]),0.527013,0.414403,3.053665e+08,5.655141e+10,38.420,206,8426976.0,1350.0
2017-01-03 00:00:00+00:00,Equity(64 [GOLD]),1.425616,1.415350,2.981478e+08,1.862359e+10,15.990,101,17842566.0,6979.0
2017-01-03 00:00:00+00:00,Equity(67 [ADSK]),0.628819,0.669624,9.727194e+07,1.647140e+10,74.010,311,1035873.0,4775.0
2017-01-03 00:00:00+00:00,Equity(76 [TAP]),-0.091286,-0.100028,1.395543e+08,2.090877e+10,97.330,205,577000.0,3169.0


In [5]:
###  Lets see what stocks had the most gains ytd through
leading_stocks = QTU_pipline2.sort_values(by=['ranked_return'], ascending = False)
leading_stocks.filter(like='2019-07-30', axis = 0).head(50)

Unnamed: 0,Unnamed: 1,1yrReturns,5dAvgReturns,AverageDollarVolume200,MarketCap,Price,Sector,Volume,ranked_return
2019-07-30 00:00:00+00:00,Equity(49607 [AXSM]),1.729176,1.729251,15191030.0,883589900.0,26.5,206,925814.0,7295.0
2019-07-30 00:00:00+00:00,Equity(32726 [EHTH]),1.723426,1.721784,21218470.0,2397177000.0,105.86,103,534068.0,7284.0
2019-07-30 00:00:00+00:00,Equity(42749 [ENPH]),1.717676,1.715561,23389500.0,2565790000.0,21.21,311,3144991.0,7275.0
2019-07-30 00:00:00+00:00,Equity(50477 [IIPR]),1.715759,1.716806,24154530.0,1188126000.0,107.01,104,286071.0,7272.0
2019-07-30 00:00:00+00:00,Equity(50411 [RARX]),1.712564,1.717428,7275392.0,1563282000.0,33.35,206,596464.0,7266.0
2019-07-30 00:00:00+00:00,Equity(48628 [NVTA]),1.711926,1.713072,23092450.0,2413843000.0,25.76,206,1640214.0,7265.0
2019-07-30 00:00:00+00:00,Equity(50288 [TTD]),1.707453,1.706849,198568900.0,11592260000.0,260.187,311,2073258.0,7260.0
2019-07-30 00:00:00+00:00,Equity(50735 [AYX]),1.70362,1.706227,64077370.0,7504983000.0,120.0,311,913884.0,7259.0
2019-07-30 00:00:00+00:00,Equity(44991 [NSTG]),1.708731,1.709338,7289996.0,1133131000.0,32.33,206,280688.0,7254.0
2019-07-30 00:00:00+00:00,Equity(50449 [INSG]),1.701703,1.700627,3444448.0,405478100.0,5.155,311,776937.0,7252.0


### Evaluate Factors
We'll go over some tools that we can use to evaluate alpha factos. To do so, we'll use the [alphalens library](https://github.com/quantopian/alphalens)

### Import alphelens

In [6]:
import alphalens as al

### Get price data
Note we already got the price data and converted it to returns, which we used to calcualte a factor. We'll retrieve the price dta again, but won't covert these to returns. This is because we'll use alphalens functions that take their input as prices and not returns.

   ### Define the list of assets 
   Just to make sure we get the prices for the stocks that have factor values, we'll get the list of assets, which may be a subset of the original universe

In [7]:
# get list of of stocks in our portfolio (tickers that identify each stock)
assets = QTU_pipline2.index.levels[1].values.tolist()
len(assets)

2714

In [8]:
stock_prices = QTU_pipline2.Price.unstack().fillna(0)
stock_prices

Unnamed: 0,Equity(2 [ARNC]),Equity(24 [AAPL]),Equity(31 [ABAX]),Equity(39 [DDC]),Equity(41 [ARCB]),Equity(52 [ABM]),Equity(53 [ABMD]),Equity(62 [ABT]),Equity(64 [GOLD]),Equity(67 [ADSK]),...,Equity(52517 [PLAN]),Equity(52525 [REZI]),Equity(52529 [ACA]),Equity(52537 [STNE]),Equity(52548 [TWST]),Equity(52553 [SWI]),Equity(52571 [YETI]),Equity(52592 [LIN]),Equity(52594 [ETRN]),Equity(52603 [APHA])
2017-01-03 00:00:00+00:00,18.550,115.840,52.74,0.0,27.750,40.680,112.70,38.420,15.990,74.01,...,0.00,0.000,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2017-01-04 00:00:00+00:00,19.190,116.140,53.57,0.0,28.850,40.820,112.37,39.030,16.400,76.17,...,0.00,0.000,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2017-01-05 00:00:00+00:00,19.690,116.020,53.40,0.0,29.100,41.450,115.74,39.300,16.395,77.51,...,0.00,0.000,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2017-01-06 00:00:00+00:00,20.090,116.610,52.61,0.0,28.300,40.580,114.71,39.700,17.380,76.93,...,0.00,0.000,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2017-01-09 00:00:00+00:00,20.700,117.910,53.59,0.0,28.400,39.790,115.33,40.780,16.920,79.30,...,0.00,0.000,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2017-01-10 00:00:00+00:00,20.400,119.000,55.05,0.0,28.100,39.410,117.11,40.750,16.970,79.59,...,0.00,0.000,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2017-01-11 00:00:00+00:00,20.710,119.110,48.59,0.0,29.300,39.570,112.24,41.015,16.795,79.98,...,0.00,0.000,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2017-01-12 00:00:00+00:00,21.090,119.740,50.18,0.0,29.850,40.220,111.27,41.080,16.680,80.87,...,0.00,0.000,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2017-01-13 00:00:00+00:00,21.000,119.250,49.47,0.0,29.850,39.770,112.41,40.710,16.890,80.63,...,0.00,0.000,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2017-01-17 00:00:00+00:00,21.120,119.040,49.16,0.0,31.350,40.480,115.65,40.930,16.885,81.32,...,0.00,0.000,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [9]:
stock_prices.shape

(647, 2714)

### compare to the factor data for a single stock

In [10]:
stock_index_name = QTU_pipline2.index.get_level_values(1)[1]
single_stock_factor_df = QTU_pipline2[np.in1d(QTU_pipline2.index.get_level_values(1), stock_index_name)]

In [11]:
single_stock_factor_df.head(2)

Unnamed: 0,Unnamed: 1,1yrReturns,5dAvgReturns,AverageDollarVolume200,MarketCap,Price,Sector,Volume,ranked_return
2017-01-03 00:00:00+00:00,Equity(24 [AAPL]),0.242636,0.270176,3048351000.0,617588500000.0,115.84,311,23251752.0,3760.0
2017-01-04 00:00:00+00:00,Equity(24 [AAPL]),0.332272,0.29045,3044491000.0,619348200000.0,116.14,311,23411860.0,4009.0


### Quiz 1:
if you have factor values calcualted before time t, what price would you use to calcuate the factor return on that data?

### Answer 1:
Use data from time t to time t+1 to calcuate forward returns

### Prepare data for use in alphalens
Alphalens make sure the data is formatted properly so that other neat alphalens functions can work with the data. For instance, it lines up th price data and facto rdata and calculates forward returns taht are associated with each factor value. We'll use [alphalens.utils.get_clean_factor_and forward_returns](https://github.com/quantopian/alphalens/blob/master/alphalens/utils.py)

The source code describes what it's used for (im showing just the parameters that we'll use here):
```
def get_clean_factor_and_forward_returns(factor,
                                         prices,
                                         ...
                                         periods=(1, 5, 10),
                                         ...
                                         ):

...
```
We'll give it three inputs: the factor, prices, and periods.
* The factor is the Series containing the factor scores for each stock on each date.
* The prices are the Series of prices for each stock on eahc date (the same dates as for the factor). Note that if the period we give is greater than 1, we'll want ot make sure to pad our price data by the period amount so that forward returns can be calcualted. Keep reading for details about "periods" parameter
* periods: this is the period for which we'll compute forward returns. For instance if prices and facto rdata have one data point per day (daily data), and if we wich to calculate the return of our factor-weighted portfolio every day, then the period woudl be 1, and hte inptu as a list [1]. if we want ot calculate the weekly return we would input[5]. If we wanted both daily and weekly, we coudl input [1,5]

* returns: multi-index Pandas DataFrame containing the cleaned version of the data.

###  Quiz 2:
What alphalens funciton does get_clean_factor_and_forward_returns call to get the forward returns?

### Answer 2:
The function `compute_forward_returns` computes forward returns.

### Quiz 3 
Clean and line up the factors and forward returns using alphlens

In this case we hve the unsmoothed and smoothed factors

In [15]:
df = QTU_pipline2[['1yrReturns', '5dAvgReturns']]
df.head()

Unnamed: 0,Unnamed: 1,1yrReturns,5dAvgReturns
2017-01-03 00:00:00+00:00,Equity(2 [ARNC]),-1.663162,-1.694499
2017-01-03 00:00:00+00:00,Equity(24 [AAPL]),0.242636,0.270176
2017-01-03 00:00:00+00:00,Equity(31 [ABAX]),0.731982,0.638386
2017-01-03 00:00:00+00:00,Equity(41 [ARCB]),0.571808,0.623764
2017-01-03 00:00:00+00:00,Equity(52 [ABM]),1.027217,0.888291


In [20]:
factor_names = df.columns

# usa a dictionary to store each dataframe, one for each factor and its associated forward returns
factor_data = {}

for factor_name in factor_names:
    print("Formating factor data for: " + factor_name)
    
    # Todo: get clean factor and forward returns for each factor
    # choose a single period returns (daily returns)
    
    factor_data[factor_name] = al.utils.get_clean_factor_and_forward_returns(
        factor = df[factor_name],
        prices = stock_prices,
        periods = [1]
    )

Formating factor data for: 1yrReturns
Dropped 1.3% entries from factor data: 1.3% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Formating factor data for: 5dAvgReturns
Dropped 1.3% entries from factor data: 1.3% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!


### Inspect the cleaned data

In [22]:
cleaned_smoothed_factor = factor_data[factor_names[1]]
cleaned_smoothed_factor.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,1D,factor,factor_quantile
date,asset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-03 00:00:00+00:00,Equity(2 [ARNC]),0.034501,-1.694499,1
2017-01-03 00:00:00+00:00,Equity(24 [AAPL]),0.00259,0.270176,3
2017-01-03 00:00:00+00:00,Equity(31 [ABAX]),0.015738,0.638386,4
2017-01-03 00:00:00+00:00,Equity(41 [ARCB]),0.03964,0.623764,4
2017-01-03 00:00:00+00:00,Equity(52 [ABM]),0.003441,0.888291,4


### Quiz 4
What do you think the '1D' column represents?

### Answer 4:
The 1D column represents the forward returns