# Dataset Controls

In [2]:
# Notebook identifier; it is also the name of this notebook's folder under 2_pipeline.
NAME = "07-01_dataset_control_variables"
# Name of the project directory; used to locate the working directory.
PROJECT = "conference-calls-sentiment"
# Python version noted for this notebook (not referenced by the cells visible here).
PYTHON_VERSION = "3.7.0"

### Imports  

In [3]:
import os
import re
import numpy as np
import pandas as pd

### Settings

In [4]:
# Switch to the project root: keep the current path up to and including the
# first occurrence of PROJECT and drop everything after it. The pattern is a
# raw string (so "\w"/"\W" are not treated as string escapes) and PROJECT is
# escaped so that it is always matched literally.
workdir = re.sub(r"(?<={})[\w\W]*".format(re.escape(PROJECT)), "", os.getcwd())
os.chdir(workdir)

# Per-notebook pipeline folder with its 'out', 'store' and 'tmp' subfolders.
# exist_ok=True makes the cell safe to re-run and also recreates a subfolder
# that is missing from an already existing pipeline folder.
pipeline = os.path.join('2_pipeline', NAME)
for folder in ['out', 'store', 'tmp']:
    os.makedirs(os.path.join(pipeline, folder), exist_ok=True)

---
# Main Code 

## Compustat
*I construct the following control variables from Compustat data:*
- *Size*: `ATQ`
- *Earnings*: `NIQ`
- *Loss*: `NIQ` < 0 

In [4]:
# Load the Compustat quarterly extract; the path is relative to the current working directory.
compustat = pd.read_csv(os.path.join('0_data', 'compustat', 'compustat_quarterly_2000-2020.csv'))

In [78]:
def lag_controls(data):
    """Relabel each row with the quarter of the next row of the same firm.

    Rows are sorted by ``gvkey``, ``ticker`` and ``quarter``. Within each
    (``gvkey``, ``ticker``) group the ``quarter`` value is then replaced by
    the ``quarter`` of the following row, so the values observed in one
    period are attached to the next observed period. Rows that have no
    following row in their group (the last row of each group) receive a
    missing quarter and are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``gvkey``, ``ticker`` and
        ``quarter``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``data`` with the shifted ``quarter`` column and
        without the rows whose shifted quarter is missing.
    """
    lagged_controls = (data
                       .sort_values(['gvkey', 'ticker', 'quarter'])
                       # The callable receives the *sorted* frame, so the shift
                       # follows quarter order even if `data` arrives unsorted.
                       .assign(quarter=lambda df: (df.groupby(['gvkey', 'ticker'])['quarter']
                                                     .shift(-1)))
                       .dropna(subset=['quarter']))
    return lagged_controls

In [None]:
# Clean the Compustat extract:
#   1. derive a quarterly period column from `datacqtr`,
#   2. keep only the identifier and accounting columns used further below,
#   3. rename `tic` to `ticker`,
#   4. keep the last row for every (gvkey, quarter) pair,
#   5. shift the quarter labels with lag_controls.
compustat_clean = (
    compustat
    .assign(quarter=lambda df: pd.PeriodIndex(df['datacqtr'], freq='Q'))
    .filter(items=['gvkey', 'tic', 'quarter', 'atq', 'ibq',
                   'cshoq', 'prccq', 'capxy', 'dlttq'])
    .rename(columns={'tic': 'ticker'})
    .drop_duplicates(subset=['gvkey', 'quarter'], keep='last')
    .pipe(lag_controls)
)
compustat_clean

## Earnings Surprise

In [None]:
# Load the IBES earnings-surprise extract (path relative to the current working directory)
# and display it.
surprise = pd.read_csv(os.path.join('0_data', 'ibes', 'ibes_earnings-surprise_2000-2020.csv'))
surprise

In [None]:
# Frequency of each distinct `PMON` value, ordered by value and shown as a single wide row.
surprise['PMON'].value_counts().to_frame().sort_index().T

In [52]:
def add_quarter(row):
    """Add a ``quarter`` column derived from the ``anndats`` column.

    The dates are read from the frame that is passed in, not from a
    module-level variable, so the function works on any frame.

    Parameters
    ----------
    row : pandas.DataFrame
        Despite its name, a whole frame. It must contain an ``anndats``
        column whose values parse with the format ``%Y%m%d``. The frame is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``quarter`` holding the quarterly period of
        each ``anndats`` date.
    """
    date = pd.to_datetime(row['anndats'], format='%Y%m%d')
    row['quarter'] = date.dt.to_period('Q')
    return row

In [None]:
# Clean the IBES extract: attach the announcement quarter, keep ticker, quarter
# and mean surprise, and keep the first row for every (ticker, quarter) pair.
# add_quarter writes into the frame it receives, hence the copy.
surprise_clean = (
    add_quarter(surprise.copy())
    .filter(items=['OFTIC', 'quarter', 'surpmean'])
    .rename(columns={'OFTIC': 'ticker'})
    .drop_duplicates(subset=['ticker', 'quarter'], keep='first')
)
surprise_clean

## Construct controls dataset

In [None]:
# Build the control-variable panel. Every Compustat row is kept (left join);
# the surprise data is matched on ticker and quarter.
#   surprise : surpmean / prccq, in percent
#   size     : natural log of cshoq * prccq
#   roa      : ibq / atq, in percent
#   leverage : dlttq / atq
#   loss     : 1 if ibq < 0, otherwise 0
#   capex    : capxy / atq, in percent
# NOTE(review): a missing `ibq` gives loss = 0, because a comparison with NaN
# is False - confirm that this is intended.
# NOTE(review): zero or negative denominators / products are not handled and
# would yield inf or NaN values - confirm against the data.
controls = (
    compustat_clean
    .merge(surprise_clean, how='left', on=['ticker', 'quarter'])
    .assign(
        surprise=lambda x: x['surpmean'].div(x['prccq']).mul(100),
        size=lambda x: np.log(x['cshoq'].mul(x['prccq'])),
        roa=lambda x: x['ibq'].div(x['atq']).mul(100),
        leverage=lambda x: x['dlttq'].div(x['atq']),
        loss=lambda x: x['ibq'].lt(0).astype('int'),
        capex=lambda x: x['capxy'].div(x['atq']).mul(100),
    )
    .filter(items=['gvkey', 'quarter', 'surprise', 'size', 'roa',
                   'leverage', 'loss', 'capex'])
    .sort_values(by=['gvkey', 'quarter'])
    .reset_index(drop=True)
)

controls.head()

In [None]:
# Number of missing values in each column of the controls panel.
controls.isna().sum()

In [83]:
# Save the controls panel as a feather file in this notebook's 'out' folder.
controls.to_feather(os.path.join(pipeline, 'out', 'control_variables.feather'))