In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
np.set_printoptions(suppress=True)

def get_ret_year(mf_rets, st, ed):
    """Pivot one date window of long-format fund returns into a wide table.

    Parameters
    ----------
    mf_rets : pandas.DataFrame
        Long-format returns with at least the columns ``crsp_fundno``,
        ``date`` and ``mret``.
    st, ed : values comparable with the ``date`` column
        Inclusive lower and upper bounds of the window.

    Returns
    -------
    pandas.DataFrame
        One column per fund (in order of first appearance in the window) and
        one row per distinct date in the window, sorted ascending.  Cells for
        which a fund has no record are NaN.  If a fund has several records
        for the same date, the last record wins.
    """
    in_window = (mf_rets['date'] >= st) & (mf_rets['date'] <= ed)
    window = mf_rets[in_window]

    # Row position of every distinct date in the output table.
    ordered_dates = sorted(set(window['date']))
    row_of = {day: pos for pos, day in enumerate(ordered_dates)}

    # fund -> {date -> return}; filled in record order so that a later
    # duplicate (same fund, same date) overwrites an earlier one.
    by_fund = {}
    for record in window.to_dict('records'):
        by_fund.setdefault(record['crsp_fundno'], {})[record['date']] = record['mret']

    missing = float('nan')
    columns = {}
    for fund, date_to_ret in by_fund.items():
        series = [missing] * len(ordered_dates)
        for day, value in date_to_ret.items():
            series[row_of[day]] = value
        columns[fund] = series
    return pd.DataFrame.from_dict(columns)


def get_qu_ls(quts, fund_retavg):
    """Sort funds into ten buckets delimited by consecutive cut points.

    Parameters
    ----------
    quts : sequence of numbers
        Cut points; entries ``0..10`` are read.  Bucket ``k`` is the
        half-open interval ``[quts[k], quts[k+1])``.
    fund_retavg : dict
        Maps a fund identifier to its average return.

    Returns
    -------
    list of list of int
        Ten lists; list ``k`` holds ``int(fund_id)`` for every fund whose
        average falls into bucket ``k``.  A fund whose average lies in no
        bucket (for example NaN) is not listed anywhere.
    """
    buckets = [[] for _ in range(10)]
    for fund_id, avg in fund_retavg.items():
        for k in range(10):
            lower, upper = quts[k], quts[k + 1]
            if lower <= avg < upper:
                buckets[k].append(int(fund_id))
    return buckets
        
def get_group_ls(fundno_rets_year1):
    """Split funds into ten groups by their mean return over the given table.

    Parameters
    ----------
    fundno_rets_year1 : pandas.DataFrame
        Wide return table with one column per fund.  Funds (columns) with
        any missing value are excluded from the ranking.

    Returns
    -------
    list of list of int
        Ten lists of fund ids ordered from lowest to highest mean return, as
        produced by ``get_qu_ls``.
    """
    complete = fundno_rets_year1.dropna(axis=1, how='any')
    fund_retavg = complete.mean().to_dict()

    # Interior cut points: the 10th, 20th, ..., 90th percentile of the means.
    inner_cuts = np.percentile(list(fund_retavg.values()), list(range(10, 100, 10)))
    # Sentinels close the lowest and highest bucket.
    quts = [-8888, *inner_cuts, 8888]
    return get_qu_ls(quts, fund_retavg)

def get_returns(ret_i, n_months=12):
    """Compute the month-by-month return series of one fund portfolio.

    The portfolio starts equally weighted across the funds that have a
    return in the first row.  Each month's portfolio return is the weighted
    sum of fund returns (missing returns count as 0).  After every month
    except the last, the weights are recomputed from the grown position
    values, and funds with no return in the following month get weight 0.

    Parameters
    ----------
    ret_i : pandas.DataFrame
        Wide return table: one row per month, one column per fund.  It must
        have at least ``n_months`` rows.  The frame is not modified.
    n_months : int, default 12
        Number of leading rows (months) to process.

    Returns
    -------
    list of float
        ``n_months`` portfolio returns, in row order.

    Raises
    ------
    IndexError
        If ``ret_i`` has fewer than ``n_months`` rows.
    """
    # Work on a private float copy so the NaN handling below can never write
    # through to the caller's frame (``to_numpy()`` may hand back a view).
    data = np.array(ret_i.to_numpy(), dtype=float)
    missing = np.isnan(data)
    filled = np.where(missing, 0.0, data)

    # Empty or all-missing portfolios produce 0/0 below; the resulting
    # inf/NaN values are propagated silently instead of emitting warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        alive = ~missing[0]
        weights = np.where(alive, 1.0 / alive.sum(), 0.0)

        rets = []
        for m in range(n_months):
            rets.append(np.dot(filled[m], weights))
            if m < n_months - 1:
                # Boolean mask of funds that still report next month.  It has
                # to be boolean: a 0/1 integer array used as an index would
                # select elements 0 and 1 repeatedly instead of masking.
                alive_next = ~missing[m + 1]
                # NOTE(review): ``+ 1`` treats returns as decimal fractions,
                # while this notebook scales ``mret`` by 100 before use --
                # confirm the intended units.
                values = (filled[m] + 1) * weights
                # Add the mean position value of the surviving funds to every
                # position.  NOTE(review): this pulls the weights towards
                # equal weighting every month -- confirm it is the intended
                # rebalancing rule.
                values = values + values[alive_next].sum() / alive_next.sum()
                values[~alive_next] = 0
                weights = values / values.sum()
    return rets


def get_pf_rets(pf_ids, fundno_rets_year2):
    """Compute the return series of every portfolio for one holding year.

    Parameters
    ----------
    pf_ids : list of list
        One list of fund ids (column labels) per portfolio.
    fundno_rets_year2 : pandas.DataFrame
        Wide return table of the holding year; must contain a column for
        every listed fund id.

    Returns
    -------
    numpy.ndarray
        Array with one row per month and one column per portfolio.
    """
    per_portfolio = [get_returns(fundno_rets_year2[members]) for members in pf_ids]
    # ``per_portfolio`` has one row per portfolio; transpose to month rows.
    return np.array(per_portfolio).T


def get_all_rets(mf_rets):
    """Build the monthly return table of the ten ranked fund portfolios.

    For each of 55 ranking years starting in 1964, funds are grouped on that
    year's returns and the groups are held over the following calendar year.

    Parameters
    ----------
    mf_rets : pandas.DataFrame
        Long-format fund returns with columns ``crsp_fundno``, ``date``
        (integer YYYYMM) and ``mret``.

    Returns
    -------
    pandas.DataFrame
        Columns ``p1`` .. ``p10`` with the monthly portfolio returns and a
        ``date`` column (integer YYYYMM) covering the holding years.
    """
    first_year, n_years = 1964, 55
    cols = ['p{}'.format(k + 1) for k in range(10)]
    NANs = [float('nan')] * 12

    yearly_blocks = []
    months = []
    for offset in tqdm(range(n_years)):
        rank_year = first_year + offset
        hold_year = rank_year + 1

        # Ranking year: January..December as YYYYMM integers.
        fundno_rets_year1 = get_ret_year(mf_rets, rank_year * 100 + 1, rank_year * 100 + 12)
        pf_group_ls = get_group_ls(fundno_rets_year1)

        fundno_rets_year2 = get_ret_year(mf_rets, hold_year * 100 + 1, hold_year * 100 + 12)
        # Funds seen in the ranking year but absent from the holding year get
        # an all-NaN column so that selecting them by id does not fail.
        for col_i in set(fundno_rets_year1.columns) - set(fundno_rets_year2.columns):
            fundno_rets_year2[col_i] = NANs

        yearly_blocks.append(get_pf_rets(pf_group_ls, fundno_rets_year2))
        months.extend(hold_year * 100 + month for month in range(1, 13))

    rets = np.vstack(yearly_blocks)
    df_pf_rets = pd.DataFrame(rets, columns=cols)
    df_pf_rets['date'] = months
    return df_pf_rets

# 1 process data

In [88]:
# Load the raw monthly fund returns and discard rows with any missing field.
mf_rets = pd.read_csv('./original_data/MFrets_1962to2012.csv').dropna(axis=0, how='any')

# Build a YYYYMM key from the year and the zero-padded month.
years = list(mf_rets['year'])
months = list(mf_rets['month'].astype(str))
dates = [str(year) + str(month).zfill(2) for year, month in zip(years, months)]
mf_rets['date'] = dates

# Express returns in percent, drop the now redundant columns and persist.
mf_rets['mret'] = mf_rets['mret']*100
mf_rets = mf_rets.drop(['year', 'month'], axis=1)
mf_rets.to_csv('./MF_rets.csv')

In [61]:
mom = pd.read_csv('./original_data/Mom.csv')

# Three-factor rows from 192701 onwards; reset_index() renumbers the rows
# from 0 and keeps the old row labels in an 'index' column.
df3 = (
    pd.read_csv('./original_data/FF3.csv')
    .loc[lambda frame: frame['Unnamed: 0'] >= 192701]
    .reset_index()
)
# Index-aligned assignment: row k of `mom` is paired with row k of `df3`.
# NOTE(review): this presumes Mom.csv starts at 192701 -- confirm.
df3['Mom'] = mom['Mom']
dates1 = sorted(set(dates))
df3['date'] = df3['Unnamed: 0']
df3 = df3.drop(columns=['Unnamed: 0', 'index'])
df3.to_csv('FF4.csv')

# 2 regression:

$$
\begin{gathered}
r_{i t}=\alpha_{i T}+\beta_{i T} \mathrm{VWRF}_{t}+e_{i t} \quad t=1,2, \cdots, T \\
r_{i t}=\alpha_{i T}+b_{i T} \mathrm{RMRF}_{t}+s_{i T} \mathrm{SMB}_{t}+h_{i T} \mathrm{HML}_{t}+e_{i t} \quad t=1,2, \cdots, T \\
r_{i t}=\alpha_{i T}+b_{i T} \mathrm{RMRF}_{t}+s_{i T} \mathrm{SMB}_{t}+h_{i T} \mathrm{HML}_{t}+p_{i T} \mathrm{PR1YR}_{t}+e_{i t} \\
t=1,2, \cdots, T
\end{gathered}
$$

## 2.1 variable meaning:

- $r_{i t}$ is the return on a portfolio in excess of the one-month T-bill return;
- VWRF is the excess return on the CRSP value-weighted portfolio of all NYSE, Amex, and Nasdaq stocks;
- RMRF is the excess return on a value-weighted aggregate market proxy;
- SMB, HML, and PR1YR are returns on value-weighted, zero-investment, factor-mimicking portfolios for size, book-to-market equity, and one-year momentum in stock returns.


## 2.2 portfolio  construction:

On January 1 of each year, I form ten equal-weighted portfolios of mutual funds, using reported returns. I hold the portfolios for one year, then re-form them. This yields a time series of monthly returns on each decile portfolio from 1963 to 1993. Funds that disappear during the course of the year are included in the equal-weighted average until they disappear, then the portfolio weights are readjusted appropriately.

Mutual funds are sorted on January 1 each year from 1963 to 1993 into decile portfolios based on their previous calendar year's return. The portfolios are equally weighted monthly so the weights are readjusted whenever a fund disappears. Funds with the highest past one-year return comprise decile 1 and funds with the lowest comprise decile 10. Deciles 1 and 10 are further subdivided into thirds on the same measure. VWRF is the excess return on the CRSP value-weight market proxy. (Note: the code in this notebook numbers the portfolios in the opposite direction, so `p1` holds the funds with the lowest past-year return and `p10` those with the highest, and it does not subdivide the extreme deciles.)

RMRF, SMB, and HML are Fama and French's (1993) market proxy and factor-mimicking portfolios for size and book-to-market equity. PR1YR is a factor-mimicking portfolio for one-year return momentum. Alpha is the intercept of the model. The t-statistics are in parentheses.

In [2]:
# Reload the intermediate CSV files; 'Unnamed: 0' is the row index that
# to_csv() stored alongside the data and is not needed here.
ff4_df = pd.read_csv('./FF4.csv').drop(columns='Unnamed: 0')
mf_rets = pd.read_csv('./MF_rets.csv').drop(columns='Unnamed: 0')

In [5]:
# Build the monthly return table of the ten ranked portfolios
# (columns p1..p10 plus an integer YYYYMM 'date' column).
df_pf_rets = get_all_rets(mf_rets)

100%|███████████████████████████████████████████| 55/55 [00:29<00:00,  1.85it/s]


In [6]:
# Keep only months in which every portfolio has a return.  The row labels are
# deliberately left as they are (no reset_index).
df_pf_rets = df_pf_rets.dropna(axis='index', how='any')

In [7]:
# Restrict the sample to months up to and including 2018-12.
df_pf_rets = df_pf_rets.loc[df_pf_rets['date'] < 201901]

In [8]:
# Spread portfolio: highest-ranked bucket (p10) minus lowest-ranked (p1).
df_pf_rets['ls'] = df_pf_rets['p10'] - df_pf_rets['p1']

# Factor rows for 196501..201812 (both ends included).  reset_index()
# renumbers the rows from 0 and keeps the old labels in an 'index' column.
in_sample = ff4_df['date'].between(196501, 201812)
ff4_df1 = ff4_df[in_sample].drop(columns='date').reset_index()

In [9]:
# Attach the factor columns (everything except the leading 'index' column
# that reset_index() produced).  The assignment aligns on row labels, not on
# 'date'.  NOTE(review): this relies on the labels of df_pf_rets and ff4_df1
# both counting months from 196501 -- confirm, or merge on 'date' instead.
factor_cols = ff4_df1.columns[1:]
df_pf_rets[factor_cols] = ff4_df1[factor_cols]
df_pf_rets1 = df_pf_rets.copy()
cls = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10', 'ls']
for cls_i in cls:
    # 'ls' is p10 - p1, a zero-investment spread in which the risk-free rate
    # already cancels; subtracting RF again would bias it downwards.
    if cls_i == 'ls':
        continue
    # Decile portfolios: convert raw returns to returns in excess of RF.
    df_pf_rets1[cls_i] = df_pf_rets1[cls_i] - df_pf_rets1['RF']
df_pf_rets1['MK'] = df_pf_rets1['Mkt-RF']
# Sub-samples: up to 1993-12, and 1994-01 .. 2018-12.  The upper bound is
# inclusive so that December 2018, which is part of the full sample, is not
# dropped from the second sub-sample.
df_pf_rets2 = df_pf_rets1[df_pf_rets1['date']<=199312]
df_pf_rets3 = df_pf_rets1[(df_pf_rets1['date']>199312) & (df_pf_rets1['date']<=201812)]

In [10]:
# Write the full-sample table and the two date-split tables to CSV.
for out_path, frame in (
    ('pf_rets1.csv', df_pf_rets1),
    ('pf_rets2.csv', df_pf_rets2),
    ('pf_rets3.csv', df_pf_rets3),
):
    frame.to_csv(out_path)