In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.impute import SimpleImputer
from scipy.stats.mstats import winsorize

In [2]:
# Fraction (between 0 and 1) of each month's total market capitalization to retain. Stocks are
# ranked from largest to smallest market cap and kept while their cumulative market cap stays
# strictly below this fraction of the total; everything past that point is a "micro-stock".
micro_stock_limit = .98

# Maximum fraction (between 0 and 1) of NaNs tolerable to still use a particular feature for a
# particular month. The comparison is strict: the NaN count must be below this fraction of rows.
use_feature_limit = .1

# Fractions of feature data to winsorize at the (lower, upper) extremes.
winsorize_limits = (.01, .01)

In [6]:
def full_index(start, end):
    '''
    Build the list of every month from start to end (both inclusive).

    start and end are yyyymm values (anything whose str() is the year followed by a two-digit
    month). The result is a list of Python ints in yyyymm form, in ascending order; it is empty
    when end precedes start.
    '''
    first, last = str(start), str(end)

    # Map each endpoint to a running month count (12 * year + zero-based month) so that
    # stepping by one crosses year boundaries without special cases.
    first_serial = 12 * int(first[:-2]) + int(first[-2:]) - 1
    last_serial = 12 * int(last[:-2]) + int(last[-2:]) - 1

    months = []
    for serial in range(first_serial, last_serial + 1):
        year, month_offset = divmod(serial, 12)
        months.append(int(f"{year}{month_offset + 1:02d}"))
    return months

def lag_returns(df):
    '''
    Lag the "ret" column so that it behaves as a dependent variable for prediction.

    df must contain observations associated with one stock only, indexed by a
    (permno, yyyymm) MultiIndex. The input frame is not modified.

    Returns a new frame indexed by yyyymm in which each row's "ret" holds the return of the
    following calendar month. Rows whose following month has no return (including the last
    observed month and any month directly before a gap) are dropped.
    '''
    # droplevel returns a new frame, so the group passed in by groupby.apply is left untouched.
    observed = df.droplevel("permno")

    # Expand to a gap-free monthly index so shift(-1) always means "next calendar month"
    # rather than "next available observation".
    all_months = full_index(observed.index.min(), observed.index.max())
    lagged = observed.reindex(all_months)
    lagged["ret"] = lagged["ret"].shift(-1)

    # The reindex inserted all-NaN filler rows for missing months; those rows would otherwise
    # pick up the next month's return and survive the dropna with no features at all.
    lagged = lagged.loc[lagged.index.isin(observed.index)]

    # A list-valued subset works across pandas versions (a bare string needs a newer release).
    return lagged.dropna(subset=["ret"])

def clean_monthly_data(df):
    '''
    Clean the cross-section of stocks for a single month.

    First, filter out "micro-stocks": stocks are sorted by ascending "Size", exp(-Size) is
    treated as each stock's market cap (presumably Size is the negative log of market equity;
    confirm against the data source), and a stock is kept only while the cumulative market cap
    stays strictly below micro_stock_limit times the month's total. Rows with a NaN "Size"
    never satisfy that comparison and are therefore dropped. Then, clean_signal_data is applied
    to each remaining column.

    df must be indexed by a MultiIndex containing a "yyyymm" level; the input frame is not
    modified. Returns a new frame without the "yyyymm" index level.
    '''
    # droplevel returns a new frame, so the group passed in by groupby.apply is left untouched.
    month = df.droplevel("yyyymm").sort_values(by="Size")

    # Compute market cap once and reuse it for both the running and the overall total.
    market_cap = np.exp(-month["Size"])
    is_not_micro = market_cap.cumsum() < micro_stock_limit * market_cap.sum()

    return month[is_not_micro].apply(clean_signal_data)

def clean_signal_data(series):
    '''
    Impute and winsorize one signal's values for a single month.

    The series is processed only when its name is a key of is_continuous_signal and strictly
    fewer than use_feature_limit of its values are NaN. In that case the NaNs are imputed (mean
    for continuous signals, most frequent value for discrete ones) and the result is winsorized
    using winsorize_limits.

    Otherwise the series is returned unchanged, since this feature will be discarded during
    backtesting if even a single NaN is present.

    Always returns a pandas Series with the same index and name as the input.
    '''
    is_known_signal = series.name in is_continuous_signal
    if not is_known_signal or series.isna().sum() >= use_feature_limit * series.shape[0]:
        return series

    strategy = "mean" if is_continuous_signal[series.name] else "most_frequent"
    # SimpleImputer works on 2-D input, so treat the series as a single-column matrix.
    imputed = SimpleImputer(strategy=strategy).fit_transform(series.to_numpy().reshape(-1, 1)).ravel()

    # winsorize returns a numpy MaskedArray; wrap it back into a Series so both branches return
    # the same type and the row labels are preserved.
    clipped = winsorize(imputed, limits=winsorize_limits)
    return pd.Series(np.asarray(clipped), index=series.index, name=series.name)

In [7]:
# Load the signal documentation and build a lookup from each predictor's acronym to
# True (continuous signal) or False (any other form)
signal_doc = pd.read_csv("data/SignalDoc.csv")
is_continuous_signal = (
    signal_doc.loc[signal_doc["Cat.Signal"].eq("Predictor")]
    .set_index("Acronym")["Cat.Form"]
    .eq("continuous")
    .to_dict()
)

# Load the joined signals-and-prices dataset, indexed by stock (permno) and month (yyyymm)
df = pd.read_csv("data/predictors_returns_joined.csv", index_col=["permno", "yyyymm"])

In [8]:
# Keep only rows that pass all three screens at once:
#   - common shares (share code 10 or 11)
#   - non-financial shares (SIC code outside 6000-6999)
#   - strictly positive price (used here as the low-liquidity screen)
df = df[
    df["shrcd"].isin([10, 11])
    & ~df["siccd"].between(6000, 6999)
    & (df["prc"] > 0)
]

# Drop the screening columns, turn infinities into NaN, then lag returns stock by stock
# and clean the cross-section month by month using the functions defined above
df = (
    df.drop(labels=["shrcd", "siccd", "prc"], axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .groupby("permno").apply(lag_returns)
    .groupby("yyyymm").apply(clean_monthly_data)
)

In [None]:
# Write the cleaned dataset to disk as CSV
df.to_csv("data/cleaned_data.csv")

# Pickle the lookup that records, for each signal, whether it is continuous or discrete
with open("data/is_continuous_signal.pkl", mode="wb") as pickle_file:
    pickle.dump(is_continuous_signal, pickle_file)