In [21]:
import pandas as pd
import numpy as np

### Read Data

In [22]:
# Load the preprocessed DataFrame that an earlier step stored as a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (ideally one produced by this project's own pipeline).
df = pd.read_pickle('../data/preprocessed_data_2.pkl')

In [23]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,gvkey,datacqtr,cusip,tic,gsector,announcement_date,analyst_date,eps_actual,eps_predicted_mean,eps_predicted_median,...,txdbq,txpq,txtq,wcapq,xaccq,xintq,xoprq,xrdq,xsgaq,prccq
0,1004,1985Q4,361105,AIR,20,1985-12-19,1985-11-14,0.1732,0.13,0.13,...,,,2.33,57.965,,0.723,54.194,,8.243,24.0
1,1004,1986Q1,361105,AIR,20,1986-03-21,1985-12-19,0.1419,0.13,0.13,...,,,2.35,59.787,,0.751,57.559,,8.746,20.375
2,1004,1986Q2,361105,AIR,20,1986-07-08,1986-03-20,0.1599,0.15,0.15,...,,1.054,2.46,70.657,,0.799,61.242,,10.526,23.625
3,1004,1986Q3,361105,AIR,20,1986-09-24,1986-07-17,0.1466,0.16,0.16,...,,,2.53,88.918,,1.1,58.741,,8.962,23.5
4,1004,1986Q4,361105,AIR,20,1986-12-16,1986-10-16,0.1665,0.18,0.18,...,,,3.36,91.195,,0.7,66.944,,9.385,23.5


### Prepare Data

In [24]:
# Count of missing entries per column; despite the name this is a Series
# indexed by column label, not a DataFrame.
missing_df = df.isnull().sum()

In [25]:
# Columns that contain at least one missing value, in frame order, with
# 'eps_actual' excluded from the list of columns to impute.
columns_to_impute = [
    name
    for name in missing_df[missing_df != 0].index
    if name != 'eps_actual'
]
print(len(columns_to_impute))

66


In [26]:
# Z-score every column that will be imputed (subtract the column mean, divide
# by the column standard deviation) so errors are comparable across columns.
df[columns_to_impute] = df[columns_to_impute].pipe(
    lambda block: (block - block.mean()) / block.std()
)

In [27]:
# One entry per column to impute: the (quarter, firm, value) rows where the
# value is actually observed, stored under the 'data' key.
columns_dict = {}
for col in columns_to_impute:
    observed = df[['datacqtr', 'gvkey', col]].dropna()
    columns_dict[col] = {'data': observed}
len(columns_dict)

66

In [28]:
def quarter_to_float(yq):
    """Convert a quarter label such as ``'1985Q4'`` to a fractional year.

    Everything before the last two characters is read as the year and the
    final character as the quarter number; quarter ``k`` adds ``(k - 1) / 4``
    to the year, so ``'1985Q4'`` becomes ``1985.75``.
    """
    year_part = float(yq[:-2])
    quarter_offset = int(yq[-1]) - 1
    return year_part + quarter_offset * 0.25

In [29]:
# For every column, keep for each firm (gvkey) only its longest run of
# consecutive quarters with an observed value, so that masking and imputing
# later operate on gap-free sequences.
for col in columns_dict.keys():
    col_df = columns_dict[col]['data'].copy(deep=True)

    col_df['yq_float'] = col_df['datacqtr'].apply(quarter_to_float)
    col_df = col_df.sort_values(['gvkey', 'yq_float'])

    # A run breaks whenever the step from the previous row of the same firm is
    # not exactly one quarter (0.25). The first row of each firm has a NaN
    # step, which also compares unequal to 0.25 and therefore opens a run.
    col_df['diff'] = col_df.groupby('gvkey')['yq_float'].diff()
    col_df['break'] = (col_df['diff'] != 0.25).astype(int)

    # Cumulative break count within a firm gives each run its own id.
    col_df['sequence_id'] = col_df.groupby('gvkey')['break'].cumsum()
    sequence_counts = col_df.groupby(['gvkey', 'sequence_id']).size()
    # idxmax returns one (gvkey, sequence_id) tuple per firm; ties resolve to
    # the first (earliest) run of maximal length.
    max_sequence_id = sequence_counts.groupby('gvkey').idxmax()

    # Select the winning runs with a single inner merge instead of filtering
    # the whole frame once per firm, which was quadratic in the number of firms.
    longest_runs = pd.DataFrame(max_sequence_id.tolist(), columns=['gvkey', 'sequence_id'])
    col_df2 = col_df.merge(longest_runs, on=['gvkey', 'sequence_id'], how='inner')

    col_df2 = col_df2.drop(columns=['yq_float', 'diff', 'break', 'sequence_id'])
    col_df2 = col_df2.sort_values(by=['gvkey', 'datacqtr']).reset_index(drop=True)

    columns_dict[col]['data'] = col_df2

In [30]:
# Fixed seed so the same cells are hidden on every run.
np.random.seed(42)

# Share of observed values to hide in each column.
missing_perc = 0.2
for col, entry in columns_dict.items():
    masked_df = entry['data'].copy(deep=True)
    # One uniform draw per row; rows whose draw is not above the threshold
    # get their value replaced by NaN.
    draws = np.random.rand(masked_df.shape[0])
    masked_df[col] = masked_df[col].mask(draws <= missing_perc)

    entry['data_missing'] = masked_df

### Impute missing values

In [31]:
# Method label -> integer parameter. For rolling averages the window is one
# larger than the number in the label; for forward fill the number is the
# fill limit itself.
rolling_methods = dict((f'Rolling Avg. {n}', n + 1) for n in range(2, 9))
forward_methods = dict((f'Forward fill {n}', n) for n in range(1, 9))

# Rolling methods first, then forward-fill methods.
all_methods = dict(rolling_methods)
all_methods.update(forward_methods)
all_methods

{'Rolling Avg. 2': 3,
 'Rolling Avg. 3': 4,
 'Rolling Avg. 4': 5,
 'Rolling Avg. 5': 6,
 'Rolling Avg. 6': 7,
 'Rolling Avg. 7': 8,
 'Rolling Avg. 8': 9,
 'Forward fill 1': 1,
 'Forward fill 2': 2,
 'Forward fill 3': 3,
 'Forward fill 4': 4,
 'Forward fill 5': 5,
 'Forward fill 6': 6,
 'Forward fill 7': 7,
 'Forward fill 8': 8}

In [32]:
def impute_missing_with_avg(series, window):
    """Return the rolling mean of ``series`` over ``window`` observations.

    A position gets a value only when at least ``window - 1`` entries inside
    its window are non-missing; otherwise the result there is NaN.
    """
    required = window - 1
    windowed = series.rolling(window=window, min_periods=required)
    return windowed.mean()

def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``.

    The mean is taken with the ``.mean()`` method of the squared difference,
    so for pandas Series the positions where either input is NaN are skipped.
    """
    residual = y_true - y_pred
    squared = residual ** 2
    return np.sqrt(squared.mean())

In [33]:
# Result container: one independent, initially empty dict per column.
methods_dict = {}
for col in columns_dict:
    methods_dict[col] = {}
len(methods_dict)

66

In [34]:
# Apply every imputation method to every masked column and record, per
# (column, method), the RMSE against the unmasked data and the share of
# masked values that the method could not fill.
for col in columns_dict.keys():
    for method in all_methods.keys():
        # Rolling window size or forward-fill limit, depending on the method.
        window = all_methods[method]
        col_filled_df = columns_dict[col]['data_missing'].copy(deep=True)

        if 'Rolling' in method:
            # Fill each gap with the mean of the non-missing values in the
            # window ending at that row, computed within the same firm.
            col_filled_df[col] = col_filled_df.groupby('gvkey')[col].transform(
                lambda x: x.fillna(x.rolling(window=window, min_periods=1).mean())
            )
        elif 'Forward' in method:
            # Series.ffill(limit=...) replaces the deprecated
            # Series.fillna(method='ffill', limit=...) with the same result.
            col_filled_df[col] = col_filled_df.groupby('gvkey')[col].transform(
                lambda x: x.ffill(limit=window)
            )

        # NOTE(review): the RMSE is taken over the whole column, so rows that
        # were never masked contribute zero error and rows still NaN after
        # filling are skipped; methods with different fill rates are therefore
        # scored on different sets of rows.
        rmse_value = rmse(columns_dict[col]['data'][col], col_filled_df[col])

        # Despite its name, this is the fraction of masked values that are
        # STILL missing after imputation (the summary later reports 1 - mean).
        masked_count = columns_dict[col]['data_missing'][col].isna().sum()
        still_missing = col_filled_df[col].isna().sum()
        # A column with no masked values yields NaN explicitly rather than
        # relying on a 0/0 division and its runtime warning.
        missing_filled_perc = still_missing / masked_count if masked_count else np.nan

        methods_dict[col][method] = [rmse_value, missing_filled_perc]

### Optimal imputation method

In [35]:
# Split the [rmse, still-missing share] pairs into two nested dicts keyed by
# column and then by method.
methods_rmse = {
    col: {method: result[0] for method, result in per_method.items()}
    for col, per_method in methods_dict.items()
}
methods_miss_perc = {
    col: {method: result[1] for method, result in per_method.items()}
    for col, per_method in methods_dict.items()
}

In [36]:
# Outer keys (columns) become frame columns, inner keys (methods) the index.
methods_rmse_df = pd.DataFrame(data=methods_rmse)
methods_miss_perc_df = pd.DataFrame(data=methods_miss_perc)

In [37]:
# Average each method's metrics across all imputed columns.
mean_rmse = methods_rmse_df.mean(axis='columns')
# The stored value is the share still missing, so its complement is the share filled.
filled_perc_miss = 1 - methods_miss_perc_df.mean(axis='columns')

result_df = pd.DataFrame(
    {
        'Mean RMSE': mean_rmse,
        'Filled Percent of Missing Values': filled_perc_miss,
    }
)

In [38]:
# Turn the method labels from the index into a 'Method' column, then split
# each label into its name and its trailing period count.
result_df = result_df.reset_index().rename(columns={'index':'Method'})

# Parse the trailing integer with a regex instead of slicing the last
# character, so labels with two or more digits (e.g. '... 10') split correctly.
label_parts = result_df['Method'].str.extract(r'^(?P<name>.*?)\s*(?P<periods>\d+)$')
# Stored as int so that sorting by period is numeric rather than lexicographic.
result_df['Period(s)'] = label_parts['periods'].astype(int)
result_df['Method'] = label_parts['name'].str.strip()
result_df.loc[result_df['Method']=='Rolling Avg.','Method'] = 'Rolling average'

In [39]:
# Fix the column order, sort by the number of periods and renumber the rows.
report_columns = ['Method', 'Period(s)', 'Mean RMSE', 'Filled Percent of Missing Values']
result_df = (
    result_df[report_columns]
    .sort_values(by=['Period(s)'])
    .reset_index(drop=True)
)
# Display only; the stored frame keeps full precision.
result_df.round(3)

Unnamed: 0,Method,Period(s),Mean RMSE,Filled Percent of Missing Values
0,Forward fill,1,0.134,0.77
1,Rolling average,2,0.15,0.919
2,Forward fill,2,0.15,0.919
3,Rolling average,3,0.152,0.948
4,Forward fill,3,0.153,0.948
5,Rolling average,4,0.149,0.953
6,Forward fill,4,0.153,0.953
7,Rolling average,5,0.154,0.954
8,Forward fill,5,0.153,0.954
9,Rolling average,6,0.159,0.955


In [40]:
# Persist the unrounded summary; the row index is written as the first column.
result_df.to_csv(path_or_buf='../results/imputation_methods_performance.csv')