In [1]:
pip install linearmodels


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import wrds
from linearmodels.panel import PanelOLS
from scipy.stats.mstats import winsorize
from statsmodels.regression.rolling import RollingOLS

In [3]:
import warnings

# Blanket suppression: every warning category is hidden from here on,
# including library deprecation notices.
warnings.filterwarnings(action='ignore')

In [8]:
class FlowSensitivity(object):
    """
    Class to replicate the flow sensitivity analysis in Goldstein et al. (2017).
    You will need to implement the methods in this class, wherever there is a TODO.

    """

    def __init__(self):
        self.db = None
        self.merge = None

    def _download_fund_summary(self, start_date='1992-01-01', end_date='2014-12-31'):
        """
        Download the fund summary data from WRDS
        """
        print('Downloading fund_summary_full...')
        t0 = time.time()
        df_fund_summary_query = f"SELECT * FROM crsp.fund_summary2 WHERE (caldt BETWEEN '{start_date}' AND '{end_date}')"
        df = self.db.raw_sql(df_fund_summary_query)
        df.to_csv("fund_summary_full.csv", index=False)
        t1 = time.time()
        print("Downloaded fund_summary_full Successfully in", t1 - t0, 's')
        return None

    def _download_monthly_returns(self, start_date='1992-01-01', end_date='2014-12-31'):
        """
        Download the monthly returns data from WRDS
        """
        print('Downloading MonthlyReturns')
        t0 = time.time()
        df_query = f"SELECT * FROM crsp.monthly_tna_ret_nav WHERE (caldt BETWEEN '{start_date}' AND '{end_date}')"
        df = self.db.raw_sql(df_query)
        df.to_csv("MonthlyReturns.csv", index=False)
        t1 = time.time()
        print("Downloaded MonthlyReturns Successfully in", t1 - t0, 's')

    def crsp_data_download(self, username, start_date='1992-01-01', end_date='2014-12-31'):
        """
        Connect to WRDS and download the data
        Note: you do NOT need to change any code here

        """
        self.db = wrds.Connection(wrds_username=username)

        self._download_fund_summary(start_date=start_date, end_date=end_date)
        self._download_monthly_returns(start_date=start_date, end_date=end_date)

        return None

    def _import_clean_mret(self):
        """
        Load and clean MonthlyReturns.csv

        """
        mret = pd.read_csv("MonthlyReturns.csv")
        # Rename specified columns
        mret.rename(columns={'crsp_fundno': 'Id', 'mtna': 'tna', 'mret': 'ret'}, inplace=True)
        # Convert 'caldt' column to datetime format
        mret['caldt'] = pd.to_datetime(mret['caldt'])
        # Create a new column 'date' (first day in the month of 'caldt') to merge with other datasets
        mret['date'] = mret['caldt'].dt.to_period('M').dt.to_timestamp()
        # Check primary key
        assert mret.duplicated(subset=['date', 'Id']).sum() == 0

        return mret

    def balanced_panel(self, mret):
        """
        Fills in skipping monthly dates for each unique Id in the DataFrame.

        """
        # Ensure 'date' is in datetime format and sort
        mret.sort_values(by=['Id', 'date'], inplace=True)
        # Generate a complete sequence of monthly dates for each Id
        def generate_date_range(group):
            min_date = group['date'].min()
            max_date = group['date'].max()
            all_dates = pd.date_range(start=min_date, end=max_date, freq='MS')  # MS is month start frequency
            return pd.DataFrame({'Id': group['Id'].iloc[0], 'date': all_dates})
        # Apply function to each group and concatenate the results
        mret_new = pd.concat([generate_date_range(group) for _, group in mret.groupby('Id')])
        # Merge back to include the newly generated dates
        mret_new = pd.merge(mret_new, mret, on=['Id', 'date'], how='left')
        # Sort and reset index
        mret_new.sort_values(by=['Id', 'date'], inplace=True)
        mret_new.reset_index(drop=True, inplace=True)

        return mret_new


    def _import_clean_qinfo(self):
        # Import data
        qinfo = pd.read_csv("fund_summary_full.csv")
        # Select and rename specified columns
        qinfo = qinfo[['crsp_fundno', 'caldt', 'exp_ratio', 'first_offer_dt',
                        'lipper_obj_cd', 'si_obj_cd', 'wbrger_obj_cd', 'crsp_obj_cd',
                        'index_fund_flag', 'et_flag', 'crsp_cl_grp']]
        qinfo.rename(columns={
            'crsp_fundno': 'Id',
            'caldt': 'date',
            'exp_ratio': 'expenseratio'
        }, inplace=True)
        # Convert dates and handle missing values
        qinfo['date'] = pd.to_datetime(qinfo['date']).dt.to_period('M').dt.to_timestamp()
        qinfo['first_offer_dt'] = pd.to_datetime(qinfo['first_offer_dt'], errors='coerce')
        # Handle NA values for numeric and string columns differently
        qinfo['expenseratio'] = qinfo['expenseratio'].fillna(-999)
        string_columns = ['index_fund_flag', 'et_flag', 'lipper_obj_cd', 'si_obj_cd', 'wbrger_obj_cd', 'crsp_obj_cd']
        qinfo[string_columns] = qinfo[string_columns].fillna("")
        # Drop duplicates
        qinfo = qinfo.groupby(['Id', 'date']).first().reset_index()
        # Check primary key
        assert qinfo.duplicated(subset=['date', 'Id']).sum() == 0

        return qinfo
    
    def _import_clean_benchmarkrate(self):
        # Import data
        benchmarkrate = pd.read_csv("stage1_vars.csv")
        # Convert dates
        benchmarkrate['date'] = pd.to_datetime(benchmarkrate['date'])

        return benchmarkrate

    def import_merge_and_standarize_data(self):
        """
        Merges mret with qinfo and benchmarkrate, then fills forward missing values
        in specified columns.

        """
        # Import data
        mret = self._import_clean_mret()
        mret = self.balanced_panel(mret)
        qinfo = self._import_clean_qinfo()
        benchmarkrate = self._import_clean_benchmarkrate()
        # Perform left joins based on 'date', and then on both 'date' and 'Id'
        merge = pd.merge(mret, qinfo, on=['date', 'Id'], how='left')
        merge = pd.merge(merge, benchmarkrate, on='date', how='left')
        # Sort by 'Id' and 'date'
        merge.sort_values(by=['Id', 'date'], inplace=True)
        # Fill forward missing values in specified columns
        columns_to_fill = ['expenseratio', 'first_offer_dt', 'lipper_obj_cd', 'si_obj_cd', 
                           'wbrger_obj_cd', 'crsp_obj_cd', 'index_fund_flag', 'et_flag']
        merge[columns_to_fill] = merge.groupby('Id')[columns_to_fill].fillna(method='bfill')
        
        return merge

    def calculate_flow_ratio(self, merge):
        """
        Calculates the flow ratio and one-period lagged flow ratio for each row in the DataFrame
        based on total net assets (tna), return (ret), and an identifier (Id).
        The flow ratio is calculated as:
            (tna - lag(tna) * (1 + ret)) / lag(tna)
        """
        # TODO: Write code here to calculates the flow ratio and one period lagged flow ratio        
        merge['prev_tna'] = merge.groupby('Id')['tna'].shift()
        merge['flow_ratio'] = (merge['tna'] - merge['prev_tna'] * (1 + merge['ret'])) / merge['prev_tna']
        merge['lagged_flow_ratio'] = merge.groupby('Id')['flow_ratio'].shift()

        return merge

    def calculate_excess_return(self, merge):
        # TODO: Write code here to calculates excess returns
        merge['ret_rf'] = merge['ret'].subtract(merge['rf'])

        return merge
    
    def filter_corporate_nonPassiveETF(self, merge):
        """
        Filters and adjusts the merge DataFrame based on various criteria.

        """
        # TODO: Write code here to filter for corporate, non-passive and non-ETF/ETN funds
        exclude_conditions = {
            'et_flag': {'F', 'N'},
            'index_fund_flag': {'B', 'D', 'E'}
        }
        # Applying filter conditions
        for column, values in exclude_conditions.items():
            merge = merge[~merge[column].isin(values)]
        
        corporate_bond_criteria = (
            merge['lipper_obj_cd'].isin({'A', 'BBB', 'HY', 'SII', 'SID', 'IID'}) |
            merge['si_obj_cd'].isin({'CGN', 'CHQ', 'CHY', 'CIM', 'CMQ', 'CPR', 'CSM'}) |
            merge['wbrger_obj_cd'].isin({'CBD', 'CHY'}) |
            merge['crsp_obj_cd'].str.startswith('IC')
        )
        merge = merge[corporate_bond_criteria]

        return merge

    def filter_ids_with_insufficient_data(self, merge, min_periods=12):
        """
        Filter DataFrame to include only Ids with at least `min_periods`
        non-missing entries in the 'ret_rf' column.

        """
        # TODO: Write code here to include only Ids with at least `min_periods`
        #       non-missing entries in the 'ret_rf' column.
        valid_ids = merge.groupby('Id').filter(lambda x: x['ret_rf'].count() >= min_periods)['Id'].unique()
        merge = merge[merge['Id'].isin(valid_ids)]
        return merge
    
    def rolling_capm(self, group, group_id):
        """
        Run a rolling two-factor regression for one fund and attach the
        estimates, shifted down by one row.

        Regresses 'ret_rf' on a constant, 'mkt_rf' and 'vbnd_ret_rf' over a
        12-row rolling window (RollingOLS), then writes the intercept and
        slopes to 'alpha_next', 'beta_bond_next' and 'beta_market_next'.
        After the one-row shift, each row carries the coefficients estimated
        for the row before it.

        Parameters:
        - group: DataFrame for a single fund; modified in place and returned.
        - group_id: the fund's Id; accepted but not used in the computation.
        """
        rolling_window = 12

        # Regressors: both factors plus an intercept column ('const')
        exog = sm.add_constant(group[['mkt_rf', 'vbnd_ret_rf']])
        estimates = RollingOLS(group['ret_rf'], exog, window=rolling_window).fit().params

        # output column -> regression parameter it is taken from
        parameter_for = (
            ('alpha_next', 'const'),
            ('beta_bond_next', 'vbnd_ret_rf'),
            ('beta_market_next', 'mkt_rf'),
        )

        for target, _ in parameter_for:
            group[target] = np.nan
        for target, source in parameter_for:
            group.loc[estimates.index, target] = estimates[source].values
        # Shift so each row only sees coefficients estimated on earlier rows
        for target, _ in parameter_for:
            group[target] = group[target].shift()

        return group
    
    
    def lag_capm_paras(self, merge):
        # Ensure 'date' is in datetime format and sort values
        merge = merge.sort_values(by=['Id', 'date'])
        # Group by 'Id' and calculate lagged values
        merge['alpha'] = merge.groupby('Id')['alpha_next'].shift(1)
        merge['beta_bnd'] = merge.groupby('Id')['beta_bond_next'].shift(1)
        merge['beta_equity'] = merge.groupby('Id')['beta_market_next'].shift(1)

        return merge
    

    def rolling_regression(self, merge):
        # Balnaced panel
        merge = self.balanced_panel(merge)
        # Apply the filtering to ensure sufficient non-missing 'ret_rf' data
        merge = self.filter_ids_with_insufficient_data(merge)
        # Apply the rolling regression function
        merge = merge.groupby('Id').apply(lambda x: self.rolling_capm(x, x.name)).reset_index(drop=True)
        # lag one period so that we don't use unavailable information
        merge = self.lag_capm_paras(merge)

        return merge
    

    def filter_out_early_last_history(self, merge):
        """
        Filters and adjusts the merge DataFrame based on various criteria.
        """
        # Filter out first year and last year history of each mutual fund
        ## Calculate age in years
        merge['age'] = (pd.to_datetime(merge['date']) - pd.to_datetime(merge['first_offer_dt'])).dt.days / 365
        ## Calculate the 'end_age' as the number of years between 'date' and 'last_date'
        merge['last_date'] = merge.groupby('Id')['date'].transform('max')
        merge['end_age'] = (merge['last_date'] - merge['date']).dt.days / 365
        # Apply filters based on the provided lists and conditions
        conditions = (
            (merge['age'] > 1) &
            (merge['end_age'] > 1) 
        )
        merge = merge.loc[conditions]

        return merge

    def select_merged_data(self, merge):
        """
        Keep specific columns and drop NAs and Infs

        """
        # select specific columns
        merge = merge[['Id', 'date', 'flow_ratio', 'lagged_flow_ratio',
                    'alpha','beta_bnd', 'beta_equity',
                    'age', 'tna', 'expenseratio']]
        # drop NAs (negative expenseratio is also NAs) and Infs
        merge = merge[merge['expenseratio'] > 0]
        merge.replace([np.inf, -np.inf], np.nan, inplace=True)
        merge.dropna(inplace=True)

        return merge

    def winsorize_merged_data(self, merge):
        # TODO: Write code here to winsorize flow ratio and alpha at 1% and 99% level
        for column in ['flow_ratio', 'alpha','lagged_flow_ratio']:
            merge[f'{column}_w'] = winsorize(merge[column], limits=[0.01, 0.01])

        return merge

    def generate_all_variables(self, merge):
        merge['log_tna'] = np.log(merge['tna'])
        merge['log_age'] = np.log(merge['age'])
        merge['neg_alpha'] = (merge['alpha'] < 0).astype(int)
        merge['alpha_w_neg_alpha'] = merge['alpha_w'] * merge['neg_alpha']
        # drop NAs and Infs
        merge.replace([np.inf, -np.inf], np.nan, inplace=True)
        merge.dropna(inplace=True)

        return merge

    def flow_sensitivity_regression(self, merge):
        # TODO: write code here to run the regression and print the results

        merge.set_index(['Id', 'date'], inplace=True)

        regression_formula = 'flow_ratio_w ~ 1 + alpha_w + alpha_w_neg_alpha + neg_alpha + lagged_flow_ratio_w + log_tna + log_age + expenseratio + TimeEffects'
        regression_model = PanelOLS.from_formula(regression_formula, data=merge)
        regression_results = regression_model.fit(cov_type='clustered', cluster_entity=True)

        print(regression_results)
        return regression_results

    def save_results(self, results):
        """
        Save the results to a text file.
        Note: the format *does not matter*, as long as the results are printed in an
             understandable and readable format. Just printing the regression results
             /output directly and saving that to the file is enough

        """
        # TODO: Upload this file along with the submission template to the autograder
        #       (please do not change the name of the file)

        f = open("flow_sensitivity_regression_results.txt", "w")
        print(results, file=f)

In [9]:
hw = FlowSensitivity()

# Download "MonthlyReturns.csv" and "fund_summary_full.csv" from WRDS, but only
# when they are not already on disk: the full extract is slow, so a re-run of
# the notebook reuses the saved files. Delete the CSVs to force a fresh download
# (for example after changing the date range).
# NOTE: they must be in the same directory as this script, with the same names
# TODO: make sure to change the username, and pick relevant start and end dates
# The WRDS_USERNAME environment variable, when set, overrides the default below.
wrds_username = os.environ.get('WRDS_USERNAME', 'wl2834')
wrds_extracts = ('MonthlyReturns.csv', 'fund_summary_full.csv')
if all(os.path.exists(name) for name in wrds_extracts):
    print('WRDS extracts already on disk; skipping download.')
else:
    hw.crsp_data_download(wrds_username, start_date='1992-01-01', end_date='2014-12-31')

Loading library list...
Done
Downloading fund_summary_full...


In [9]:

hw = FlowSensitivity()
# Homework begins here:
# Step 1: Download and clean data
merge = hw.import_merge_and_standarize_data()

# Steps 2-5 each take the panel and return the next version of it,
# so they are applied in order.
pipeline = (
    hw.filter_corporate_nonPassiveETF,    # Step 2: corporate bond mutual funds only
    hw.calculate_flow_ratio,              # Step 3: flow ratio ...
    hw.calculate_excess_return,           #         ... and excess return
    hw.rolling_regression,                # Step 4: rolling CAPM alpha and beta
    hw.filter_out_early_last_history,     # Step 5: further cleanup (filter,
    hw.select_merged_data,                #         select columns,
    hw.winsorize_merged_data,             #         winsorize,
    hw.generate_all_variables,            #         build regressors)
)
for transform in pipeline:
    merge = transform(merge)

# Step 6: Estimation
results = hw.flow_sensitivity_regression(merge)
hw.save_results(results)


                          PanelOLS Estimation Summary                           
Dep. Variable:           flow_ratio_w   R-squared:                        0.0567
Estimator:                   PanelOLS   R-squared (Between):              0.3277
No. Observations:               16021   R-squared (Within):              -0.0362
Date:                Fri, Apr 05 2024   R-squared (Overall):              0.0557
Time:                        15:10:37   Log-likelihood                 1.987e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      137.55
Entities:                        1694   P-value                           0.0000
Avg Obs:                       9.4575   Distribution:                 F(7,16004)
Min Obs:                       1.0000                                           
Max Obs:                      10.0000   F-statistic (robust):             54.087
                            