In [1]:
import pandas as pd
import numpy as np

### Read Data

In [2]:
# Load the frame produced by the previous preprocessing step.
# NOTE: unpickling can execute arbitrary code; only read pickle files from a trusted source.
input_path = '../data/preprocessed_data_1.pkl'
df = pd.read_pickle(input_path)

In [3]:
# Basic size summary: distinct 'gvkey' values, row count and column count.
n_companies = df["gvkey"].nunique()
n_rows, n_columns = df.shape
print(f'Number of Companies: {n_companies}')
print(f'Number of Rows: {n_rows}')
print(f'Number of Columns: {n_columns}')

Number of Companies: 2545
Number of Rows: 111528
Number of Columns: 85


### Fix Outliers (replace values outside the 1st–99th percentile with NaN)

In [4]:
# Identifier / date / forecast columns that are never subject to outlier handling.
general_variables = [
    'gsector', 'gvkey', 'datacqtr', 'cusip', 'tic',
    'announcement_date', 'analyst_date',
    'eps_predicted_mean', 'eps_predicted_median',
]

# 'atq' and 'mkvaltq' are excluded as well; every remaining column is processed,
# in the frame's original column order.
excluded_columns = set(general_variables) | {'atq', 'mkvaltq'}
columns_to_fix = [name for name in df.columns if name not in excluded_columns]

In [5]:
# Sanity check: how many columns were selected for outlier handling.
len(columns_to_fix)

74

In [6]:
# For every selected column, blank out (set to NaN) the values whose reference
# series lies outside its own 1%-99% quantile range.
# The reference series is the column divided by 'atq', except for 'eps_actual'
# and 'prccq', which are compared on their raw values.
for column in columns_to_fix:
    if column in ('eps_actual', 'prccq'):
        reference = df[column]
    else:
        reference = df[column] / df['atq']

    low_cut = reference.quantile(0.01)
    high_cut = reference.quantile(0.99)

    # Comparisons against NaN are False, so missing reference values are never flagged.
    is_outlier = (reference < low_cut) | (reference > high_cut)
    df[column] = df[column].mask(is_outlier, np.nan)

In [7]:
# Show the summary statistics of columns whose standard deviation is exactly 0,
# i.e. columns in which every non-missing value is the same.
# describe() is computed once and the std row is selected by label rather than
# by its position in the summary table.
summary_stats = df[columns_to_fix].describe()
summary_stats.loc[:, summary_stats.loc['std'] == 0]

Unnamed: 0,acchgq,esoptq,xiq
count,107738.0,105745.0,109729.0
mean,0.0,0.0,0.0
std,0.0,0.0,0.0
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,0.0,0.0,0.0


In [8]:
# Remove the three columns that the summary above showed to have zero standard deviation.
zero_std_columns = ['acchgq', 'esoptq', 'xiq']
df = df.drop(columns=zero_std_columns)

### Drop samples containing missing values in crucial financial variables

In [9]:
# Keep only rows in which none of these variables is missing.
crucial_variables = ['atq', 'ltq', 'seqq', 'niq', 'revtq', 'chechy']
df = df.dropna(axis='index', how='any', subset=crucial_variables)

### Save Data

In [10]:
# Preview the first rows of the processed frame before it is written out.
df.head()

Unnamed: 0,gvkey,datacqtr,cusip,tic,gsector,announcement_date,analyst_date,eps_actual,eps_predicted_mean,eps_predicted_median,...,txdbq,txpq,txtq,wcapq,xaccq,xintq,xoprq,xrdq,xsgaq,prccq
0,1004,1985Q4,361105,AIR,20,1985-12-19,1985-11-14,0.1732,0.13,0.13,...,,,2.33,57.965,,0.723,54.194,,8.243,24.0
1,1004,1986Q1,361105,AIR,20,1986-03-21,1985-12-19,0.1419,0.13,0.13,...,,,2.35,59.787,,0.751,57.559,,8.746,20.375
2,1004,1986Q2,361105,AIR,20,1986-07-08,1986-03-20,0.1599,0.15,0.15,...,,1.054,2.46,70.657,,0.799,61.242,,10.526,23.625
3,1004,1986Q3,361105,AIR,20,1986-09-24,1986-07-17,0.1466,0.16,0.16,...,,,2.53,88.918,,1.1,58.741,,8.962,23.5
4,1004,1986Q4,361105,AIR,20,1986-12-16,1986-10-16,0.1665,0.18,0.18,...,,,3.36,91.195,,0.7,66.944,,9.385,23.5


In [12]:
# Write the processed frame to a pickle file.
output_path = '../data/preprocessed_data_2.pkl'
df.to_pickle(output_path)