In [1]:
import pandas as pd

### Read Data

In [2]:
# Load the merged dataset from its pickle file (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code — only load pickles from a trusted source.
merged_data_path = '../data/merged_data.pkl'
df = pd.read_pickle(merged_data_path)

In [3]:
# Summarise the loaded frame: distinct gvkey values, row count, and column count.
print(
    f'Number of Companies: {df["gvkey"].nunique()}',
    f'Number of Rows: {len(df)}',
    f'Number of Columns: {len(df.columns)}',
    sep='\n',
)

Number of Companies: 3165
Number of Rows: 141178
Number of Columns: 85


### Reorder columns (general variables first)

In [4]:
# Columns to place at the front of the frame, in this exact order.
general_variables = [
    'gvkey',
    'datacqtr',
    'cusip',
    'tic',
    'gsector',
    'announcement_date',
    'analyst_date',
    'eps_actual',
    'eps_predicted_mean',
    'eps_predicted_median',
]

# Lead with the general columns; all remaining columns keep their existing relative order.
df = df[[*general_variables, *(col for col in df.columns if col not in general_variables)]]

### Check column types

In [5]:
# Temporarily lift the row-display cap so the dtype of every column is listed.
column_dtypes = df.dtypes
with pd.option_context('display.max_rows', None):
    display(column_dtypes)

gvkey                     int64
datacqtr                 object
cusip                    object
tic                      object
gsector                  object
announcement_date        object
analyst_date             object
eps_actual              float64
eps_predicted_mean      float64
eps_predicted_median    float64
mkvaltq                 float64
acchgq                  float64
acomincq                float64
acoq                    float64
actq                    float64
ancq                    float64
aocipenq                float64
aoq                     float64
apq                     float64
atq                     float64
capxy                   float64
chechy                  float64
cheq                    float64
ciotherq                float64
cogsq                   float64
cshopq                  float64
dcomq                   float64
diladq                  float64
dlcq                    float64
dlttq                   float64
doq                     float64
dpactq  

### Check missing values

In [6]:
# Percentage of missing values per column, sorted from least to most missing.
missing_df = (
    (df.isna().sum() / len(df) * 100)
    .sort_values()
    .to_frame(name='perc')
)
# First ten rows, i.e. the columns with the fewest missing values.
missing_df.iloc[:10]

Unnamed: 0,perc
gvkey,0.0
datacqtr,0.0
cusip,0.0
tic,0.0
announcement_date,0.0
analyst_date,0.0
eps_actual,0.0
eps_predicted_mean,0.0
eps_predicted_median,0.0
gsector,0.007792


In [7]:
# Last ten rows of the missing-value table.
missing_df.iloc[-10:]

Unnamed: 0,perc
drltq,38.198586
dpactq,39.212909
ivltq,41.241553
drcq,44.151355
cshopq,51.904688
txdbq,56.92105
xrdq,57.722875
recdq,65.044837
xaccq,72.810211
rcpq,80.007508


### Remove companies from utility and finance sectors

In [8]:
# Drop rows whose gsector is '40' or '55' — per the section heading these are the finance and
# utility sectors (presumably GICS 40 = Financials, 55 = Utilities; confirm against the data source).
# Rows with a missing gsector match neither code and are therefore kept.
in_excluded_sector = df['gsector'].isin(['40', '55'])
df = df.loc[~in_excluded_sector].reset_index(drop=True)

In [9]:
# Size of the frame after the sector filter: distinct gvkey values and row count.
print(
    f'Number of Companies: {df["gvkey"].nunique()}',
    f'Number of Rows: {len(df)}',
    sep='\n',
)

Number of Companies: 2551
Number of Rows: 111678


### Require non-missing total assets

In [10]:
# Number of rows where atq (total assets, per the section heading) is missing.
df['atq'].isnull().sum()

150

In [11]:
# Keep only rows with a reported atq value. `notna()` replaces the `isna() == False`
# comparison, and the index is reset so the result has a contiguous RangeIndex, the same
# way the sector filter earlier in this notebook resets it after dropping rows.
df = df[df['atq'].notna()].reset_index(drop=True)

In [12]:
# Size of the frame after requiring non-missing atq: distinct gvkey values and row count.
print(
    f'Number of Companies: {df["gvkey"].nunique()}',
    f'Number of Rows: {len(df)}',
    sep='\n',
)

Number of Companies: 2545
Number of Rows: 111528


### Save Data

In [13]:
# Write the filtered frame to disk as a pickle (path is relative to the working directory).
preprocessed_path = '../data/preprocessed_data_1.pkl'
df.to_pickle(preprocessed_path)