In [1]:
import pandas as pd
import numpy as np
from beakerx import *
from beakerx.object import beakerx
import seaborn as sns
import matplotlib.pyplot as plt

In [44]:
returns = read_hdf('/Users/yichuanwang/Data/CRSP_MF/data.h5', key = 'returns')

In [3]:
summary = read_hdf('/Users/yichuanwang/Data/CRSP_MF/data.h5', key = 'summary')

In [4]:
# Filter down to the important variables
salient_var = summary[['caldt', 'crsp_portno', 'crsp_fundno', 'tna_latest', 'nav_latest', 'per_com', 'per_cash', 'per_bond', \
                       'fund_name', 'crsp_obj_cd', 'lipper_class_name', 'lipper_asset_cd', 'index_fund_flag', \
                       'mgmt_fee']]
salient_var = salient_var.reset_index()

## Building Return Data

Logically, we should be summing up over portfolios. Therefore I'm going to first find a way to study the returns at the fund level and then see what's going on

First we figure out if there are duplicate observations

In [56]:
np.where(returns.reset_index().set_index(['crsp_fundno', 'caldt']).duplicated())

(array([    610,     611,     612, ..., 4056269, 4056271, 4056272]),)

In [45]:
# First null out the -99 aum
returns.loc[returns['mtna'] == -99] = np.nan

In [46]:
# Build a simple version of the return based just on the price data
returns = returns.reset_index().set_index('caldt')
returns['nav_lag'] = returns.groupby(['crsp_fundno'])['mnav'].shift(1)
returns = returns.assign(px_ret = returns.mnav / returns.nav_lag - 1)

It certainly looks like the levered ETF's are causing some trouble. Won't worry too much about them. There are some other return anomalies as well. I think if the return is really big but the price return is small, I'll take the price return. But otherwise I'll just trust the return number.

In [47]:
RET_THRESHOLD = 0.2
DIFF_THRESHOLD = 0.2

returns['ret_clean'] = [x[0] if abs(x[0]) < RET_THRESHOLD else x[1] if (abs(x[1]) < RET_THRESHOLD and abs(x[1] - x[0]) > DIFF_THRESHOLD) else x[0] \
                        for x in zip(returns['mret'], returns['px_ret'])]

In [48]:
returns = returns.assign(ret_diff = returns.mret - returns.px_ret)
returns[(abs(returns.ret_diff) > 0.2) * (np.abs(returns.mret) > 0.3)]

  .format(op=op_str, alt_op=unsupported[op_str]))


## Build the flow indicators

The idea here is pretty simple. For a given fund, define the flow as the residual from compounding the past AUM at that return

In [53]:
returns.reset_index().set_index('caldt', 'crsp_fundno').index.is_unique

False

In [51]:
returns['lag_aum'] = returns.groupby('crsp_fundno')['mtna'].shift(1)
returns = returns.assign(net_flow = returns.mtna - returns.lag_aum * (1 + returns.ret_clean),
                         log_ret = np.log(1 + returns.ret_clean))
returns['smoothed_ret'] = returns.groupby('crsp_fundno')['log_ret'].rolling(6).sum()

Exception: cannot handle a non-unique multi-index!

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x1a31273710>

In [262]:
fund_portfolio_map = summary[['crsp_fundno', 'crsp_portno', 'fund_name']]

In [263]:
fund_portfolio_map = fund_portfolio_map.loc[fund_portfolio_map.groupby(['crsp_fundno', 'crsp_portno']).cumcount() == 0, ]
fund_portfolio_map[0:100]

In [265]:
fund_portfolio = fund_portfolio_map.set_index('crsp_fundno')
returns = returns.set_index('crsp_fundno')

In [266]:
identified_returns = pd.DataFrame.join(returns, fund_portfolio, on = 'crsp_fundno')
identified_returns[0:40]

In [None]:
returns.set

In [220]:
TARGET_DATE = '2017-12-31'
big_on_target_date = salient_var.loc[salient_var.caldt == TARGET_DATE]

In [221]:
big_on_target_date = big_on_target_date.loc[big_on_target_date.per_com > 50].sort_values(by = 'tna_latest', ascending = False)

In [222]:
big_on_target_date[0:20]

In [223]:
contrafund = big_on_target_date.loc[big_on_target_date['fund_name'].str.startswith('Fidelity Contrafund', na = False)]

In [224]:
SPECIAL_FUND = 11809
aum_of_fund = summary.reset_index().query('crsp_fundno == ' + str(SPECIAL_FUND))
aum_of_fund = aum_of_fund.set_index('caldt').sort_index()
aum_of_fund = aum_of_fund.reset_index()

In [225]:
aum_of_fund

In [226]:
aum_of_fund['tna_latest'].values

array([4.4500000e+01, 3.8400000e+01, 2.9298000e+01, 2.9264000e+01,
       2.9447000e+01, 2.8529000e+01, 2.2602000e+01, 1.7963000e+01,
       2.8851000e+01, 5.7929000e+01, 4.3500000e+01, 4.2000000e+01,
       4.7000000e+01, 6.2000000e+01, 6.3233000e+01, 7.2500000e+01,
       8.6200000e+01, 7.9600000e+01, 8.6800000e+01, 8.4400000e+01,
       8.7700000e+01, 1.0500000e+02, 2.9660000e+02, 3.3210000e+02,
       1.0024000e+03, 1.9742000e+03, 6.1933000e+03, 8.6824000e+03,
       1.4831700e+04, 2.3797900e+04, 3.0808512e+04, 3.8821300e+04,
       4.6927000e+04, 4.0284800e+04, 3.2320900e+04, 2.7695100e+04,
       3.6051400e+04, 4.4484500e+04, 6.0093800e+04, 6.8565200e+04,
       8.0864000e+04, 4.5195200e+04, 5.7153400e+04, 6.1430600e+04,
       5.4719200e+04, 5.8819100e+04, 7.5076300e+04, 7.6029600e+04,
       7.7869100e+04, 7.3241400e+04, 9.0147300e+04])

In [228]:
aum_plot = TimePlot(title = 'AUM of Fund: ' + str(SPECIAL_FUND), logY = True)
aum_plot.add(Line(x = aum_of_fund['caldt'],
                  y = aum_of_fund['tna_latest']))
