In [1]:
import pandas as pd
from pandas.tseries.offsets import MonthEnd
import numpy as np
from beakerx import *
from beakerx.object import beakerx
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sci
from copy import copy
from IPython.display import HTML
from multiprocessing import Pool, cpu_count

import sys
sys.path.append('../Code/')
from utils import *

# Emit a small jQuery snippet with the cell output: once the document is ready it hides every
# `div.input` element (the code cells), and the rendered link calls `code_toggle()` to show or
# hide them again.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

# Introduction

In most stock markets, there is a long tail of very small firms. It is well known that even though small firms comprise a large share of all companies, they account for a vanishingly small share of total market cap. This fact is highly problematic for implementing factor portfolios that construct anomaly deciles by equalizing *counts* of companies within anomaly deciles. This notebook explores the implications of instead constructing the momentum portfolio by equalizing the market cap within each decile. In doing so I create a better approximation to a momentum portfolio that can be implemented at scale.

In [2]:
# Load the merged stock panel. The next cell reads the second index level as dates
# (`get_level_values(1).month`), so the stored frame is expected to carry a MultiIndex.
stocks = pd.read_hdf('../Output/merged.h5')

In [3]:
# Quantile levels reported in the firm-size distribution plots.
break_quantiles = [0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99, 1]

# Keep only the January observations (the second index level holds the dates).
januaries = stocks.loc[
    stocks.index.get_level_values(1).month == 1
]


I use the CRSP-Compustat merged database. Below I show the top 5 companies by market cap as of January 2018 to give transparency into the data I'm using.

In [4]:
# Largest five firms by market cap in the 2018-01-31 cross-section.
(
    januaries
    .xs('2018-01-31', level = 'datadate', drop_level = False)
    .sort_values(['Market Cap (Billions, CRSP)'], ascending = False)
    .loc[:, ['Company Name', 'Market Cap (Billions, CRSP)']]
    .iloc[0:5, :]
)

### The Distribution of Firm Size

I first plot the distribution of market caps for the NYSE sample, weighting all companies equally. The median market cap is around \$3 billion as of January 2018. This is not tiny, but it is still very small!

In [5]:
def all_quantiles(quantiles, plot_title, **kwargs):
    """
    Draws one line per column of "quantiles" on a single time plot.

    :param quantiles -- a dataframe whose first index level supplies the x-axis values and whose columns are the series to draw
    :param plot_title -- the title to use for the plot
    :param kwargs -- forwarded unchanged to TimePlot (e.g. logY, logYBase)
    :return -- the populated plot
    """
    chart = TimePlot(
        title = plot_title,
        legendLayout = LegendLayout.HORIZONTAL,
        legendPosition = LegendPosition(position = LegendPosition.Position.TOP),
        initWidth = 1000,
        **kwargs
    )

    for column in quantiles.columns:
        # Each column label doubles as the legend entry for its line.
        series_line = Line(
            displayName = column,
            x = quantiles.index.get_level_values(0),
            y = quantiles[column],
        )
        chart.add(series_line)
    return chart
# Equal-weighted quantiles of market cap per date, restricted to 'Exchange Code' == 1
# (the NYSE sample referred to in the text), reshaped to one column per quantile level.
raw_quantiles = (
    januaries.loc[januaries['Exchange Code'] == 1, 'Market Cap (Billions, CRSP)']
    .groupby(by = ['datadate'])
    .quantile(q = break_quantiles)
    .unstack()
)
all_quantiles(
    raw_quantiles,
    'Distribution of Firm Size (Unweighted)',
    logY = True,
    logYBase = np.exp(1),
)

Next, I compute the weighted quantiles, using market caps as weights. Thus the 50th percentile of the weighted distribution is the firm size such that half of all market capitalization is held by firms smaller than that size. These weighted quantiles are much higher. Roughly 5\% of total market cap is contained in firms with market cap less than \$3.5 billion. Thus the total market cap of the firms that make up the bottom 50\% of the distribution of firm size is less than 5\% of total market capitalization.

In [6]:
def weighted_quantiles_to_dataframe(x, quantiles, weights = None):
    """
    Packs the output of weighted_quantile into a one-row dataframe.

    :param x -- the values to take quantiles of; its "name" attribute becomes the row label
    :param quantiles -- the quantile levels to evaluate; they become the column labels
    :param weights -- passed through to weighted_quantile as its third argument
    :return -- a dataframe with a single row holding one value per quantile level
    """
    values = weighted_quantile(x, quantiles, weights)
    # One single-element column per quantile level.
    columns = {level: [value] for level, value in zip(quantiles, values)}
    frame = pd.DataFrame.from_dict(columns)
    frame.index = [x.name]
    return frame

# Same sample as the unweighted plot, but each firm's market cap is also used as its own weight.
weighted_market_cap_quantiles = (
    januaries.loc[januaries['Exchange Code'] == 1, 'Market Cap (Billions, CRSP)']
    .groupby(by = ['datadate'])
    .apply(lambda caps: weighted_quantiles_to_dataframe(caps, break_quantiles, caps))
)
all_quantiles(
    weighted_market_cap_quantiles,
    'Distribution of Firm Size (Weighted)',
    logY = True,
    logYBase = np.exp(1),
)

## Rebuilding anomaly portfolios

In [7]:
# Market cap from the previous row of the same firm: shift by one within each 'Permco' group.
# Grouping a Series by the name 'Permco' requires 'Permco' to be an index level at this point.
stocks['Lagged Market Cap'] = stocks['Market Cap (Billions, CRSP)'].groupby(['Permco']).shift(1)

In [242]:
# Momentum signal: the firm's cumulative return 3 rows back minus its cumulative return 13 rows
# back, with both shifts taken within each 'Permco' group.
# NOTE(review): grouping a Series by the name 'Permco' only works while 'Permco' is an index
# level. The `KeyError: 'Permco'` captured below this cell is presumably from running it after
# `stocks` had been re-indexed on 'datadate' alone — confirm by re-running on a fresh kernel.
stocks['Cumulative Return at t - 13'] = stocks['Cumulative Return'].groupby(['Permco']).shift(13)
stocks['Cumulative Return at t - 3'] = stocks['Cumulative Return'].groupby(['Permco']).shift(3)
stocks['Return Momentum'] = stocks['Cumulative Return at t - 3'] - stocks['Cumulative Return at t - 13']

KeyError: 'Permco'

In [244]:
# Index on ('Permco', 'datadate') so the cells below can select by the index level names.
# `safe_index` is not a pandas method; presumably it is provided by the `utils` star import — confirm.
stocks = stocks.safe_index(['Permco', 'datadate'])

In [246]:
def simple_plot(dataframe, variable, plot_title, **kwargs):
    """
    Plots a single column of "dataframe" against its 'datadate' index level.

    :param dataframe -- a dataframe whose index has a level named 'datadate'
    :param variable -- the column to plot; also used as the legend label of the line
    :param plot_title -- the title to use for the plot
    :param kwargs -- forwarded unchanged to TimePlot
    :return -- the populated plot
    """
    chart = TimePlot(
        title = plot_title,
        legendLayout = LegendLayout.HORIZONTAL,
        legendPosition = LegendPosition(position = LegendPosition.Position.TOP),
        initWidth = 500,
        **kwargs
    )
    series_line = Line(
        displayName = variable,
        x = dataframe.index.get_level_values('datadate'),
        y = dataframe[variable],
    )
    chart.add(series_line)
    return chart

# Spot check on a single firm: select Permco 7 (labelled "Apple" in the plot title) and keep
# the 'Permco' level in the index so `simple_plot` still finds the 'datadate' level by name.
apple = stocks.xs(7, level = 'Permco', drop_level = False)
simple_plot(apple, 'Cumulative Return', 'Apple')

In [10]:
# Same firm, now plotting the 'Return Momentum' column; it must already exist on `stocks`
# when `apple` is extracted.
simple_plot(apple, 'Return Momentum', 'Momentum')

In [100]:
# NOTE(review): `df_in_time` is not assigned at notebook scope in any visible cell; the name
# only appears as a parameter of `build_portfolio_point_in_time`, defined in a later cell.
# This looks like a leftover debugging cell that would raise NameError on a fresh kernel —
# consider removing it.
df_in_time.head()

In [301]:
def normalize_positive_and_negative(array_like):
    """
    Rescales the positive and the negative entries of a vector separately.

    A float64 copy of the input is made. Its strictly positive entries are divided by their sum, so they add up to 1,
    and its strictly negative entries are divided by the absolute value of their sum, so they add up to -1.
    Entries that are neither > 0 nor < 0 (zeros, NaN) are left as they are. The input itself is not modified.

    :param array_like -- anything np.array can convert to float64
    :return -- a new numpy array holding the rescaled values
    """
    scaled = np.array(array_like, dtype = np.float64)
    long_leg = np.nonzero(scaled > 0)
    short_leg = np.nonzero(scaled < 0)

    long_total = float(scaled[long_leg].sum())
    # Flip the sign so that dividing keeps the short leg negative.
    short_total = float(-scaled[short_leg].sum())

    scaled[long_leg] = scaled[long_leg] / long_total
    scaled[short_leg] = scaled[short_leg] / short_total
    return scaled

def _sort_inclusion_factor(sorting_variable, breakpoints):
    """
    Buckets "sorting_variable" by "breakpoints" and returns the bucket number minus one
    (so -1 / 0 / +1 when four breakpoints define three buckets).

    Only the interior breakpoints decide the bucket: the outermost edges are replaced by -inf / +inf.
    With the original closed edges, pd.cut returns NaN for the minimum value (the first bin is open on the left)
    and for any value outside the breakpoint range, which silently removed those stocks from the portfolio.
    """
    edges = np.array(breakpoints, dtype = np.float64)
    edges[0] = -np.inf
    edges[-1] = np.inf
    return pd.cut(sorting_variable, edges, labels = False) - 1

def build_portfolio_point_in_time(df_in_time, anomaly_variable, min_stock_count = 100):
    """
    Builds the three long-short portfolios for one cross-section of stocks.

    Rows with a missing anomaly variable or a missing 'Lagged Market Cap' are dropped. The anomaly variable is then
    centred on the middle cut returned by weighted_quantile (levels 0.05 / 0.5 / 0.95, weighted by lagged market cap)
    and stored as 'Sorting Variable'. Three inclusion factors are derived from it:

    * 'Unweighted Sort' -- buckets from breakpoints computed on rows with 'Exchange Code' == 1 only (NYSE breakpoints)
    * 'Weighted Sort' -- buckets from breakpoints computed on all rows, weighted by lagged market cap
    * 'Linear Factor' -- the sorting variable passed through winsorize_at_explicit_input with the outer two cuts

    Each inclusion factor is multiplied by lagged market cap, rescaled with normalize_positive_and_negative, and
    multiplied by 'Return' to give the per-stock contribution.

    :param df_in_time -- the cross-section; needs the columns 'Company Name', 'Permco', 'Lagged Market Cap', 'Return',
                         'Exchange Code' and the anomaly variable. It is not modified.
    :param anomaly_variable -- name of the column to sort on
    :param min_stock_count -- minimum number of usable rows; below it a placeholder is returned
    :return -- a dataframe with the diagnostic, weight and contribution columns for every usable row, or a single
               all-NaN placeholder row (index named 'datadate') when fewer than min_stock_count rows are usable
    """
    types = ['Unweighted Sort', 'Weighted Sort', 'Linear Factor']
    diag_vars = ['Company Name', 'Permco', 'Lagged Market Cap', 'Return', 'Sorting Variable']
    weight_vars = [w + ' Weight' for w in types]
    contrib_vars = [w + ' Contribution' for w in types]
    all_cols = diag_vars + weight_vars + contrib_vars

    has_inputs = df_in_time[anomaly_variable].notnull() & df_in_time['Lagged Market Cap'].notnull()
    # Work on an explicit copy: columns are added below and must not write into the caller's frame.
    eligible = df_in_time.loc[has_inputs].copy()

    if eligible.shape[0] < min_stock_count:
        # Label the placeholder with the first usable row, falling back to the unfiltered frame
        # when every row was dropped (indexing an empty index would raise IndexError).
        if eligible.shape[0] > 0:
            placeholder_index = [eligible.index[0]]
        elif df_in_time.shape[0] > 0:
            placeholder_index = [df_in_time.index[0]]
        else:
            placeholder_index = []
        ret = pd.DataFrame(index = placeholder_index, columns = all_cols)
        ret.index.name = 'datadate'
        return ret

    # Centre the anomaly variable on its weighted median and express the trim points relative to it
    anomaly_cuts = weighted_quantile(eligible[anomaly_variable], [0.05, 0.5, 0.95], eligible['Lagged Market Cap'])
    eligible['Sorting Variable'] = eligible[anomaly_variable] - anomaly_cuts[1]
    anomaly_cuts = anomaly_cuts - anomaly_cuts[1]

    # Now do all the sorts
    unweighted_sort_breakpoints = weighted_quantile(eligible.loc[eligible['Exchange Code'] == 1, 'Sorting Variable'], [0, 0.33, 0.67, 1]) # Use NYSE breakpoints
    weighted_sort_breakpoints = weighted_quantile(eligible['Sorting Variable'], [0, 0.33, 0.67, 1], sample_weight = eligible['Lagged Market Cap'])

    eligible['Unweighted Sort Inclusion Factor'] = _sort_inclusion_factor(eligible['Sorting Variable'], unweighted_sort_breakpoints)
    eligible['Weighted Sort Inclusion Factor'] = _sort_inclusion_factor(eligible['Sorting Variable'], weighted_sort_breakpoints)
    eligible['Linear Factor Inclusion Factor'] = winsorize_at_explicit_input(eligible['Sorting Variable'], anomaly_cuts[0], anomaly_cuts[2])

    for weight_type in types:
        eligible['Unnormalized ' + weight_type + ' Portfolio Weights'] = eligible[weight_type + ' Inclusion Factor'] * eligible['Lagged Market Cap']
        eligible[weight_type + ' Weight'] = normalize_positive_and_negative(eligible['Unnormalized ' + weight_type + ' Portfolio Weights'])
        eligible[weight_type + ' Contribution'] = eligible[weight_type + ' Weight'] * eligible['Return']

    ret = eligible[all_cols]
    return ret

In [284]:
def build_portfolio_through_time(crsp_data, anomaly_variable):
    """
    Runs build_portfolio_point_in_time on every 'datadate' group of "crsp_data".

    :param crsp_data -- the stock panel; 'datadate' must be resolvable by groupby
    :param anomaly_variable -- name of the column to sort on, passed to each per-date call
    :return -- whatever groupby-apply assembles from the per-date results
    """
    per_date = crsp_data.groupby(by = ['datadate'])
    return per_date.apply(build_portfolio_point_in_time, anomaly_variable = anomaly_variable)

def calc_return_series(portfolio, ret_vars = ['Unweighted Sort Contribution', 'Weighted Sort Contribution', 'Linear Factor Contribution']):
    """
    Adds up the per-stock contributions within each 'datadate' to get one return per date and portfolio.

    :param portfolio -- a dataframe holding the contribution columns, with 'datadate' resolvable by groupby
    :param ret_vars -- the contribution columns to sum (the default list is only read, never modified)
    :return -- a dataframe with one row per date and one column per entry of ret_vars
    """
    contributions = portfolio.loc[:, ret_vars]
    by_date = contributions.groupby(['datadate'])
    return by_date.sum()

In [None]:
# Switch `stocks` to a 'datadate' index before building the portfolios.
# NOTE(review): after this cell 'Permco' is presumably no longer an index level, so the earlier
# `groupby(['Permco'])` cells cannot be re-run without restoring the index first — confirm
# against the `safe_index` helper (not a pandas method; presumably from `utils`).
stocks = stocks.safe_index(['datadate'])

In [302]:
# Build the per-date portfolios sorted on the 'Return Momentum' column.
momentum = build_portfolio_through_time(stocks, 'Return Momentum')

## Loading the Fama French Momentum Factor

In [303]:
# Sum the per-stock contributions within each date: one return per date for each portfolio type.
return_series = calc_return_series(momentum)

In [304]:
# Reshape from one column per portfolio type to long format: one row per (date, portfolio type).
long_returns = return_series.reset_index().melt(
    id_vars = ['datadate'],
    var_name = 'Portfolio Type',
    value_name = 'Return',
)

In [305]:
# Convert simple returns to log returns so that a running sum gives the cumulative log return.
long_returns['Log Return'] = np.log(long_returns['Return'] + 1)
# Index by portfolio type so the groupby below can refer to 'Portfolio Type' by name
# (`safe_index` is not a pandas method; presumably it comes from `utils` — confirm).
long_returns = long_returns.safe_index(['Portfolio Type'])
# Running sum within each portfolio type, taken in the frame's existing row order.
long_returns['Cumulative Return'] = long_returns['Log Return'].groupby(by = ['Portfolio Type']).cumsum()

In [306]:
# Preview the first rows only; the long-format table has one row per date and portfolio type,
# so rendering it in full adds bulk to the notebook without adding information.
long_returns.head()

In [307]:
# Back to wide format for plotting: one row per date and one column of cumulative log return
# per portfolio type.
small_sample_returns = long_returns.reset_index().pivot(index = 'datadate', columns = 'Portfolio Type', values = 'Cumulative Return')

In [308]:
# Preview the first rows of the wide cumulative-return table rather than rendering all of it.
small_sample_returns.head()

In [312]:
def comparison_plot(dataframe, **kwargs):
    """
    Draws every column of "dataframe" as its own line on one plot titled 'Comparison of Portfolios'.

    :param dataframe -- a dataframe whose index has a level named 'datadate'; each column becomes one line, labelled with the column name
    :param kwargs -- forwarded unchanged to TimePlot
    :return -- the populated plot
    """
    chart = TimePlot(
        title = 'Comparison of Portfolios',
        legendLayout = LegendLayout.HORIZONTAL,
        legendPosition = LegendPosition(position = LegendPosition.Position.TOP),
        initWidth = 1000,
        **kwargs
    )

    for column in dataframe.columns:
        series_line = Line(
            displayName = column,
            x = dataframe.index.get_level_values('datadate'),
            y = dataframe[column],
        )
        chart.add(series_line)
    return chart

# Plot the cumulative return of each portfolio construction on a single chart.
comparison_plot(small_sample_returns)