In [1]:
import pandas as pd

### Read Data

In [2]:
# Load the pickled frame and order it by gvkey, then by datacqtr.
# NOTE: read_pickle can execute arbitrary code -- only load trusted files.
df = (
    pd.read_pickle('../data/preprocessed_data_2.pkl')
    .sort_values(by=['gvkey', 'datacqtr'])
)

In [3]:
# Preview the first rows of the loaded, sorted frame.
df.head()

Unnamed: 0,gvkey,datacqtr,cusip,tic,gsector,announcement_date,analyst_date,eps_actual,eps_predicted_mean,eps_predicted_median,...,txdbq,txpq,txtq,wcapq,xaccq,xintq,xoprq,xrdq,xsgaq,prccq
0,1004,1985Q4,361105,AIR,20,1985-12-19,1985-11-14,0.1732,0.13,0.13,...,,,2.33,57.965,,0.723,54.194,,8.243,24.0
1,1004,1986Q1,361105,AIR,20,1986-03-21,1985-12-19,0.1419,0.13,0.13,...,,,2.35,59.787,,0.751,57.559,,8.746,20.375
2,1004,1986Q2,361105,AIR,20,1986-07-08,1986-03-20,0.1599,0.15,0.15,...,,1.054,2.46,70.657,,0.799,61.242,,10.526,23.625
3,1004,1986Q3,361105,AIR,20,1986-09-24,1986-07-17,0.1466,0.16,0.16,...,,,2.53,88.918,,1.1,58.741,,8.962,23.5
4,1004,1986Q4,361105,AIR,20,1986-12-16,1986-10-16,0.1665,0.18,0.18,...,,,3.36,91.195,,0.7,66.944,,9.385,23.5


### Add rows for missing quarters

In [4]:
# Flag every row that exists in the source data. Placeholder rows inserted
# for missing quarters will not carry this flag, which is how they are
# filtered back out once the fill is done.
df['original'] = True

In [5]:
def generate_quarters(start, end):
    """Return every quarter label from ``start`` through ``end``, inclusive.

    Parameters
    ----------
    start, end : str
        Labels shaped like ``'1985Q4'``: characters 0-3 are parsed as the
        year and character 5 as the quarter digit.

    Returns
    -------
    list of str
        Consecutive ``'<year>Q<quarter>'`` labels in chronological order.
        The list is empty when ``start`` is later than ``end``.
    """
    year, quarter = int(start[:4]), int(start[5])
    last = (int(end[:4]), int(end[5]))

    labels = []
    # Tuple comparison orders by year first, then by quarter.
    while (year, quarter) <= last:
        labels.append(f'{year}Q{quarter}')
        if quarter == 4:
            # Q4 rolls over into Q1 of the following year.
            year, quarter = year + 1, 1
        else:
            quarter += 1
    return labels

In [6]:
# Put every gvkey onto a gap-free quarterly calendar between its first and
# last observed quarter, so that the row-based rolling window used further
# down spans consecutive quarters rather than whichever rows happen to exist.
company_frames = []
for code, code_df in df.groupby('gvkey'):
    # A single observation has no interior gaps to fill.
    if len(code_df) < 2:
        company_frames.append(code_df)
        continue

    all_quarters = generate_quarters(code_df['datacqtr'].min(), code_df['datacqtr'].max())
    all_quarters_df = pd.DataFrame({'datacqtr': all_quarters})

    # The left merge keeps one row per calendar quarter. Quarters absent from
    # the data come back all-NaN, including the 'original' flag. merge()
    # returns a new frame and never mutates its inputs, so no defensive copy
    # of code_df is required.
    merged_df = all_quarters_df.merge(code_df, on='datacqtr', how='left')
    merged_df['gvkey'] = code  # restore the key on the inserted rows

    company_frames.append(merged_df)

# ignore_index=True gives the result a unique RangeIndex. Each merged frame
# restarts its index at 0, so a plain concat would leave heavily duplicated
# labels, and index-aligned operations on such a frame are fragile.
df2 = pd.concat(company_frames, ignore_index=True)

In [7]:
# Row count after placeholder rows for missing quarters have been added.
len(df2)

116937

### Fill missing values using a trailing rolling average (up to 4 preceding quarters)

In [8]:
# Order rows chronologically within each gvkey; the rolling fill below is
# positional and relies on this ordering.
sort_keys = ['gvkey', 'datacqtr']
df2 = df2.sort_values(by=sort_keys)
df2.head()

Unnamed: 0,datacqtr,gvkey,cusip,tic,gsector,announcement_date,analyst_date,eps_actual,eps_predicted_mean,eps_predicted_median,...,txpq,txtq,wcapq,xaccq,xintq,xoprq,xrdq,xsgaq,prccq,original
0,1985Q4,1004,361105,AIR,20,1985-12-19,1985-11-14,0.1732,0.13,0.13,...,,2.33,57.965,,0.723,54.194,,8.243,24.0,True
1,1986Q1,1004,361105,AIR,20,1986-03-21,1985-12-19,0.1419,0.13,0.13,...,,2.35,59.787,,0.751,57.559,,8.746,20.375,True
2,1986Q2,1004,361105,AIR,20,1986-07-08,1986-03-20,0.1599,0.15,0.15,...,1.054,2.46,70.657,,0.799,61.242,,10.526,23.625,True
3,1986Q3,1004,361105,AIR,20,1986-09-24,1986-07-17,0.1466,0.16,0.16,...,,2.53,88.918,,1.1,58.741,,8.962,23.5,True
4,1986Q4,1004,361105,AIR,20,1986-12-16,1986-10-16,0.1665,0.18,0.18,...,,3.36,91.195,,0.7,66.944,,9.385,23.5,True


In [9]:
# Count NaNs per column on the frame *before* placeholder rows were added
# (df, not df2): in df2 every column has NaNs on the inserted rows.
missing = df.isna().sum()
complete_columns = set(missing[missing == 0].index)

# Columns that are left untouched even though they contain missing values.
never_fill = {'eps_actual', 'mkvaltq'}

columns_to_fill = [
    name
    for name in df.columns
    if name not in complete_columns and name not in never_fill
]
print(len(columns_to_fill))
columns_to_fill

65


['acomincq',
 'acoq',
 'actq',
 'ancq',
 'aocipenq',
 'aoq',
 'apq',
 'capxy',
 'cheq',
 'ciotherq',
 'cogsq',
 'cshopq',
 'dcomq',
 'diladq',
 'dlcq',
 'dlttq',
 'doq',
 'dpactq',
 'dpq',
 'drcq',
 'drltq',
 'dvpq',
 'dvy',
 'esopctq',
 'fincfy',
 'gdwlq',
 'intanq',
 'invchy',
 'invtq',
 'ivchy',
 'ivltq',
 'ivncfy',
 'lcoq',
 'lctq',
 'lltq',
 'loq',
 'mibq',
 'mibtq',
 'miiq',
 'nopiq',
 'oancfy',
 'oibdpq',
 'ppentq',
 'pstkq',
 'rcpq',
 'rdipq',
 'recchy',
 'recdq',
 'rectq',
 'req',
 'sivy',
 'spiq',
 'sppivy',
 'stkcoq',
 'tstkq',
 'txdbq',
 'txpq',
 'txtq',
 'wcapq',
 'xaccq',
 'xintq',
 'xoprq',
 'xrdq',
 'xsgaq',
 'prccq']

In [10]:
# The window covers the current row plus the 4 rows before it. At a position
# being filled the current value is NaN and is skipped by mean(), so the fill
# value is the average of whatever is available in the 4 preceding rows.
LOOKBACK_QUARTERS = 4
ROLLING_WINDOW = LOOKBACK_QUARTERS + 1


def _fill_from_trailing_mean(series):
    """Replace NaNs in ``series`` with the rolling mean ending at that row.

    The rolling mean is computed on the unfilled series, so filled values
    are not fed forward into later windows; a window holding only NaNs
    leaves the gap as NaN.
    """
    trailing_mean = series.rolling(window=ROLLING_WINDOW, min_periods=1).mean()
    return series.fillna(trailing_mean)


# Apply the fill independently inside each gvkey so that values never leak
# from one gvkey's rows into another's.
for col in columns_to_fill:
    df2[col] = df2.groupby('gvkey')[col].transform(_fill_from_trailing_mean)

df2.head()

Unnamed: 0,datacqtr,gvkey,cusip,tic,gsector,announcement_date,analyst_date,eps_actual,eps_predicted_mean,eps_predicted_median,...,txpq,txtq,wcapq,xaccq,xintq,xoprq,xrdq,xsgaq,prccq,original
0,1985Q4,1004,361105,AIR,20,1985-12-19,1985-11-14,0.1732,0.13,0.13,...,,2.33,57.965,,0.723,54.194,,8.243,24.0,True
1,1986Q1,1004,361105,AIR,20,1986-03-21,1985-12-19,0.1419,0.13,0.13,...,,2.35,59.787,,0.751,57.559,,8.746,20.375,True
2,1986Q2,1004,361105,AIR,20,1986-07-08,1986-03-20,0.1599,0.15,0.15,...,1.054,2.46,70.657,,0.799,61.242,,10.526,23.625,True
3,1986Q3,1004,361105,AIR,20,1986-09-24,1986-07-17,0.1466,0.16,0.16,...,1.054,2.53,88.918,,1.1,58.741,,8.962,23.5,True
4,1986Q4,1004,361105,AIR,20,1986-12-16,1986-10-16,0.1665,0.18,0.18,...,1.054,3.36,91.195,,0.7,66.944,,9.385,23.5,True


In [11]:
# Discard the placeholder rows (their 'original' flag is NaN, which compares
# unequal to True) and then remove the helper flag itself.
df2 = df2[df2['original']==True].drop(columns=['original'])

### Save Data

In [12]:
# Persist the filled frame (source rows only, helper flag removed) to disk.
df2.to_pickle('../data/preprocessed_data_3.pkl')