In [1]:
import pandas as pd

In [26]:
def Bin_Portfolio(dataframe, vars_to_bin=('mkt',), go_long=(True,), number_of_bins=5):
    """Select rows by sequentially sorting on one or more variables.

    The variables are processed in order. For each one, the remaining rows
    are split into ``number_of_bins`` quantile bins with ``pd.qcut`` and only
    the highest or lowest bin is kept, so every later variable is binned
    *within* the rows that survived the earlier ones. Quantile breakpoints
    are computed over all rows currently in the frame.

    Parameters:
        - dataframe (pd.DataFrame): must contain every column named in
          ``vars_to_bin`` plus a ``'RET'`` column. It is not modified.
        - vars_to_bin (sequence of str): columns to bin on, in sort order.
        - go_long (sequence of bool): one flag per entry of ``vars_to_bin``;
          truthy keeps the highest bin, falsy keeps the lowest bin.
        - number_of_bins (int): number of quantile bins per variable.

    Returns:
        - portfolio (pd.DataFrame): the surviving rows, restricted to the
          ``vars_to_bin`` columns, ``'RET'`` and one ``<var>_bins`` column
          per binned variable. Rows with a missing value in any of the used
          columns are dropped before binning and the index is reset.
        - average return (pd.Series): mean of ``'RET'`` over the surviving
          rows multiplied by 100 (equal weights), indexed by the selected
          bin labels.

    Raises:
        - ValueError: if ``go_long`` and ``vars_to_bin`` differ in length.
          ``pd.qcut`` itself also raises ValueError when the quantile edges
          of a variable are not unique.
    """
    # Normalise to lists so tuples (including the defaults) work with the
    # list concatenation below and the defaults can never be mutated.
    vars_to_bin = list(vars_to_bin)
    go_long = list(go_long)

    if len(go_long) != len(vars_to_bin):
        raise ValueError(f"The length of go_long needs to be the same as the length of vars_to_bin. {len(go_long)} != {len(vars_to_bin)}")

    # Keep only the columns we need and the rows where all of them are present.
    df = dataframe[vars_to_bin + ['RET']].dropna().reset_index(drop=True)

    bin_names = []
    for bin_var, take_high in zip(vars_to_bin, go_long):
        column_name = f"{bin_var}_bins"
        bin_names.append(column_name)
        df[column_name] = pd.qcut(df[bin_var], number_of_bins, labels=False)

        # Based on go_long, take the highest or lowest bin.
        best_bin = number_of_bins - 1 if take_high else 0

        # Explicit copy: the next pass adds a column to this frame, and
        # assigning into a boolean-mask slice triggers SettingWithCopyWarning.
        df = df[df[column_name] == best_bin].copy()

    # NOTE: with exactly two variables the returned series has a two-level
    # index and can be reshaped with .unstack(level=bin_names[-1]) for display.
    return df, df.groupby(bin_names)['RET'].mean() * 100

In [18]:
# Load the data set into a pandas DataFrame. The path is relative to the
# current working directory.
df = pd.read_parquet('Data/BoQ_Data.parquet')
# Show the first rows as a quick check of what was loaded.
df.head()

Unnamed: 0,A2ME,BEME,BEME_adj,BEME_ind,CEI,DATE,DEC_ME,DEC_SHROUT,Div,Div_ann,...,s2invt,s2rect,sale,sale_ind,sales_g,sga2s,spread_mean,std_turn,std_vol,suv
0,,,,0.824752,,1986-01-31,,,,,...,,,,,,,,,,
1,,,,0.825905,,1986-02-28,,,0.0,,...,,,,,,,0.076998,0.21208,7804.560824,
2,,,,0.825905,,1986-03-31,,,0.0,,...,,,,,,,0.055511,0.107977,3973.567637,-0.363314
3,,,,0.825015,,1986-04-30,,,0.0,,...,,,,,,,0.037231,0.174533,6422.82587,-0.571056
4,,,,0.824672,,1986-05-31,,,0.0,,...,,,,,,,0.048336,0.150228,5527.502665,-0.708679


In [32]:
# Columns to sort on, in order; each later variable is binned only within
# the rows kept by the earlier ones.
vars_to_bin = ['ME', 'BEME']
# One flag per variable: True keeps the highest bin, False the lowest.
go_long = [True, True]
# number_of_bins is left at its default of 5.
portfolio, avg_returns = Bin_Portfolio(dataframe=df, vars_to_bin=vars_to_bin,go_long=go_long)
# Display the selected rows.
portfolio

Unnamed: 0,ME,BEME,RET,ME_bins,BEME_bins
5951,1.015796e+06,0.801141,0.032370,4,4
5952,1.088226e+06,0.801141,0.063048,4,4
5953,1.048880e+06,0.801141,-0.036156,4,4
5954,1.106256e+06,0.801141,0.050016,4,4
5955,1.096549e+06,0.801141,-0.008775,4,4
...,...,...,...,...,...
2723459,1.790098e+06,1.871800,-0.034660,4,4
2723460,1.644891e+06,1.871800,-0.081117,4,4
2723461,1.721065e+06,1.871800,0.046310,4,4
2723462,1.178323e+06,2.265606,-0.315353,4,4


In [33]:
# Mean RET (times 100) of the selected rows, indexed by the chosen bin labels.
avg_returns

ME_bins  BEME_bins
4        4            2.014249
Name: RET, dtype: float64