In [None]:
import pandas as pd
import numpy as np
import datetime as dt


import pandas_datareader.data as web
from statsmodels.regression.rolling import RollingOLS
import statsmodels.api as sm
import seaborn as sns

from talib import RSI, BBANDS, MACD

from pykalman import KalmanFilter
import pywt

from alphalens.utils import get_clean_factor_and_forward_returns
from alphalens.performance import *
from alphalens.plotting import *
from alphalens.tears import *

In [None]:
# Location of the HDF5 asset store and the first/last year labels used to slice prices.
DATA_STORE = '../data/assets.h5'
START = 2000
END = 2018
idx = pd.IndexSlice

# Open the store read-only: the default mode ('a') would create an empty file when
# the path is wrong, and the failure would only show up later as a KeyError.
with pd.HDFStore(DATA_STORE, mode='r') as store:
    # Adjusted closes between str(START) and str(END) on the first index level,
    # reshaped so that every ticker becomes its own column.
    prices = (store['quandl/wiki/prices']
              .loc[idx[str(START):str(END), :], 'adj_close']
              .unstack('ticker'))
    # Only the metadata columns needed downstream.
    stocks = store['us_equities/stocks'].loc[:, ['marketcap', 'ipoyear', 'sector']]

In [None]:
def normalized_historical_returns(input_df, data_frequency="D", return_intervals=(1, 4)):
    """Compute per-period (geometric average) returns over several look-back intervals.

    For every interval ``i`` the ``i``-period percentage change of each column is
    converted to a per-period rate: ``(1 + pct_change(i)) ** (1 / i) - 1``.
    Daily data therefore yields daily-equivalent returns, monthly data
    monthly-equivalent returns, and so on.

    Parameters
    ----------
    input_df : pd.DataFrame
        Price data with one column per asset. It is not modified.
    data_frequency : str, default "D"
        Label for the sampling frequency of ``input_df``; only used to build
        the output column names.
    return_intervals : iterable of int, default (1, 4)
        Look-back lengths, expressed in rows of ``input_df``.

    Returns
    -------
    pd.DataFrame
        Columns named ``'{col}_{i}{data_frequency}_return'``, ordered by
        interval first and by input column second, on the index of ``input_df``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, matching the
    # old behaviour where a repeated interval simply overwrote identical columns.
    per_interval = [
        input_df.pct_change(i).add(1).pow(1 / i).sub(1)
        .add_suffix(f'_{i}{data_frequency}_return')
        for i in dict.fromkeys(return_intervals)
    ]

    if not per_interval:
        # Nothing requested: return a frame with no columns instead of letting
        # pd.concat raise on an empty list.
        return pd.DataFrame(index=input_df.index)

    # A single concat avoids the fragmentation (and PerformanceWarning) caused by
    # inserting thousands of columns one at a time.
    return pd.concat(per_interval, axis=1)

In [None]:
def _with_date_index(frame):
    """Return a copy of ``frame`` whose index is a DatetimeIndex named 'date'."""
    out = frame.copy()
    index = out.index
    if not isinstance(index, pd.DatetimeIndex):
        # Same conversion as before (e.g. PeriodIndex -> timestamps), but applied
        # to the copy so the caller's frame keeps its own index.
        index = index.to_timestamp()
    out.index = index.rename('date')
    return out


#For this function to work, x_df and y_df need to share the same dates: they are joined on their index
def rolling_OLS(x_df, y_df, rolling_window):
    """Estimate rolling OLS slope coefficients of ``y_df`` on ``x_df``.

    Parameters
    ----------
    x_df : pd.DataFrame
        Explanatory variables. The index must be a DatetimeIndex or an index
        offering ``to_timestamp()``.
    y_df : pd.DataFrame
        Dependent variable; its first column is used as the regressand. Same
        index requirements as ``x_df``.
    rolling_window : int
        Desired window length; capped at ``len(joined data) - 1``.

    Returns
    -------
    pd.DataFrame
        Rolling coefficient estimates with the intercept ('const') column removed.

    Notes
    -----
    The inputs are no longer modified: index conversion and renaming happen on
    copies.
    """
    x_dated = _with_date_index(x_df)
    y_dated = _with_date_index(y_df)

    y_name = y_dated.columns[0]
    # Left join on the 'date' index, then order chronologically for the rolling fit.
    regression_df = x_dated.join(y_dated).sort_index()

    regression = RollingOLS(
        endog=regression_df[y_name],
        exog=sm.add_constant(regression_df.drop(y_name, axis=1)),
        # Never ask for a window longer than the available history.
        window=min(rolling_window, len(regression_df) - 1),
    ).fit(params_only=True)

    # Callers only want the slopes, so the intercept column is dropped.
    return regression.params.drop('const', axis=1)

In [None]:
#How to pull Fama French data
# One pipeline: take the first table of the 5-factor data set, drop the 'RF'
# column, convert the index to timestamps, rescale by 1/100 and take monthly means.
factor_data = (
    web.DataReader('F-F_Research_Data_5_Factors_2x3', 'famafrench', start='2000')[0]
    .drop(columns='RF')
    .to_timestamp()
    .div(100)
    .resample('M')
    .mean()
)

In [None]:
#BBANDS doesn't handle missing values well: drop them first (or fill each gap with the average of its edges)
up, mid, low = BBANDS(
    prices['AAPL'].dropna(),
    timeperiod=10,
    nbdevup=2,
    nbdevdn=2,
    matype=0,
)

In [None]:
#Relative strength index: >=70 is read as overbought, <=30 as oversold
rsi = RSI(
    prices.loc[:, 'AAPL'].dropna(),
    timeperiod=14,
)

In [None]:
#MACD is the difference between the fast EMA and the slow EMA
#macdsignal is the moving average of the MACD; its length is set by "signalperiod"
#macdhist is the difference between the MACD and macdsignal

macd, macdsignal, macdhist = MACD(prices['AAPL'].dropna(), fastperiod=12, slowperiod=26, signalperiod=9)

# Pass the series as `data=` explicitly: the meaning of the first positional
# argument of lineplot differs between seaborn versions.
sns.lineplot(data=macdsignal);

In [None]:
#Rolling Pandas Functions

#Example below: rolling() supplies a moving window of values that is then passed to an aggregation function (mean in this case)
#df.rolling(window=...).mean()

In [None]:
#Kalman Filters

# Scalar state observed directly: transition and observation matrices are both [1].
kf = KalmanFilter(transition_matrices=[1],
                  observation_matrices=[1],
                  initial_state_mean=0,          #suggested in book
                  initial_state_covariance=1,    #suggested in book
                  observation_covariance=1,
                  transition_covariance=.01)

# The filter input is the daily price series of one stock. `prices_kf` was never
# defined, so the cell failed on a fresh kernel; AAPL (missing values dropped)
# is used to match the other indicator examples in this notebook.
prices_kf = prices['AAPL'].dropna()
state_means, _ = kf.filter(prices_kf)

In [None]:
# Forward-return horizons and number of factor quantiles handed to alphalens.
HOLDING_PERIODS = (5, 10, 21, 42)
QUANTILES = 5

# Input requirements (author's notes):
# - factor_df may be a DataFrame but must hold a single factor. Its index must be a
#   MultiIndex with "date" first and "asset" second, and "date" needs a timezone,
#   so there are three "columns" in total: two index levels plus the factor itself.
# - prices must be a "wide" frame with one column per asset and a single datetime
#   index WITH timezone.
factor_df = pd.read_csv('sample_factor_data.csv')

# Parse the dates and promote date/asset to the index in one non-mutating step
# (replaces the previous set_index + in-place drop of the leftover 'date' column).
factor_df = (factor_df
             .assign(date=pd.to_datetime(factor_df['date']))
             .set_index(['date', 'asset']))

# NOTE: this rebinds `prices`, which earlier cells use for the HDF-store prices.
prices = pd.read_csv('stock_prices_daily.csv')
prices = (prices
          .assign(**{'Unnamed: 0': pd.to_datetime(prices['Unnamed: 0'])})
          .set_index('Unnamed: 0'))

alphalens_data = get_clean_factor_and_forward_returns(factor=factor_df,
                                                      prices=prices,
                                                      periods=HOLDING_PERIODS,
                                                      quantiles=QUANTILES)

# The cumulative-returns-by-quantile plot is produced by the following cell; the
# verbatim copy that used to sit here only recomputed and redrew the same figure.

In [None]:
# Cumulative returns by factor quantile. One holding period has to be chosen and
# given twice: as the column that is selected and as the `period` argument.
mean_return_by_q_daily, std_err = mean_return_by_quantile(
    alphalens_data,
    by_date=True,
)
plot_cumulative_returns_by_quantile(
    mean_return_by_q_daily['5D'],
    period='5D',
    freq=None,
)
plt.tight_layout()

#The resulting plot shows what an ideal factor looks like, i.e. quantile 5 performs best and quantile 1 performs worst

In [None]:
# Information coefficient of the factor, computed from the prepared alphalens data.
ic = factor_information_coefficient(alphalens_data)

# Time-series plot for the 5D column only (kept as a one-column DataFrame).
plot_ic_ts(ic.loc[:, ['5D']])
plt.tight_layout()
sns.despine();

#Alternative plot below; comment out the plot above to see this one
# ic_by_year = ic.resample('A').mean()
# ic_by_year.index = ic_by_year.index.year
# ic_by_year.plot.bar(figsize=(14, 6))
# plt.tight_layout();

#The plot above shows the information coefficient. It takes the initial alphalens data as input, so it covers one factor at a time
#NOTE: This is NOT the IR (Information Ratio), just a proxy metric