# Dataset Construction

In [5]:
# Identifier of this notebook; it names the output folder under '2_pipeline'.
NAME = "07-02_dataset_construction"
# Project folder name; the settings cell searches for it in the working
# directory path to locate the project root.
PROJECT = "conference-calls-sentiment"
# Python version recorded for this notebook (not referenced by the code shown here).
PYTHON_VERSION = "3.7.0"

### Imports  

In [6]:
import os
import re
import numpy as np
import pandas as pd

### Settings

In [7]:
# Switch to the project root: keep the working-directory path up to and
# including the first occurrence of the project name and drop the rest.
# The pattern is a raw string so `\w\W` reaches the regex engine unchanged
# (no invalid-escape warning), and PROJECT is escaped so that it is always
# matched literally.
# NOTE(review): if PROJECT does not occur in the current path, nothing is
# stripped and the notebook keeps running from the current directory.
workdir = re.sub(r"(?<={})[\w\W]*".format(re.escape(PROJECT)), "", os.getcwd())
os.chdir(workdir)

# Folder that holds this notebook's artifacts.
pipeline = os.path.join('2_pipeline', NAME)
# Create every sub-folder idempotently. Checking only for the top-level folder
# would leave a partially created pipeline directory (e.g. one missing 'out')
# unrepaired and make the save cells fail later.
for folder in ['out', 'store', 'tmp']:
    os.makedirs(os.path.join(pipeline, folder), exist_ok=True)

---
# Main Code 

## Firm-level

### Tone

In [None]:
# Firm-level tone measures written by the '04-03_tone_measures' step.
tone_by_firm_path = os.path.join('2_pipeline', '04-03_tone_measures', 'out', 'tone_by_firm.feather')
tone_by_firm = pd.read_feather(tone_by_firm_path)
# Preview the first rows.
tone_by_firm.head()

### Abnormal Return Variable

In [None]:
# Abnormal returns written by the '05-01_abnormal_returns' step.
abnormal_returns_path = os.path.join('2_pipeline', '05-01_abnormal_returns', 'out', 'abnormal_returns.feather')
abnormal_returns = pd.read_feather(abnormal_returns_path)
# Preview the first rows.
abnormal_returns.head()

### Control Variables

In [None]:
# Control variables written by the '07-01_dataset_control_variables' step.
controls_path = os.path.join('2_pipeline', '07-01_dataset_control_variables', 'out', 'control_variables.feather')
controls = pd.read_feather(controls_path)
# Preview the first rows.
controls.head()

### Merge

In [None]:
# Step 1: attach abnormal returns to the tone measures. No `on=` is given, so
# pandas joins on every column the two frames share; validate='1:1' raises if
# those keys are not unique on both sides. Tone rows without a match are kept
# (left join).
firm_with_returns = pd.merge(tone_by_firm, abnormal_returns,
                             how='left', validate='1:1')

# Step 2: attach the control variables by firm and quarter. Several rows on
# the left may share one control row (m:1); unmatched rows are kept.
firm_with_controls = pd.merge(firm_with_returns, controls,
                              how='left', on=['gvkey', 'quarter'],
                              validate='m:1')

# Step 3: add the calendar year of the event date, then order the rows by
# firm and date and renumber the index from zero.
firm_with_year = firm_with_controls.assign(year=firm_with_controls['event_date'].dt.year)
dataset = firm_with_year.sort_values(['gvkey', 'event_date']).reset_index(drop=True)
dataset

In [None]:
# Number of non-missing values per column, displayed as a one-row table.
firm_non_missing = dataset.notna().sum()
firm_non_missing.to_frame().T

In [16]:
# Save the firm-level dataset in both Feather and Stata formats.
firm_out_dir = os.path.join(pipeline, 'out')
dataset.to_feather(os.path.join(firm_out_dir, 'dataset.feather'))
# The Stata export leaves out the 'quarter' column.
# NOTE(review): presumably its dtype is not supported by to_stata; confirm.
firm_stata_frame = dataset.drop(columns='quarter')
firm_stata_frame.to_stata(os.path.join(firm_out_dir, 'dataset.dta'))

## Analyst Dataset

### Sentiment

In [None]:
# Speaker-level tone measures written by the '04-03_tone_measures' step.
tone_by_speaker_path = os.path.join('2_pipeline', '04-03_tone_measures', 'out', 'tone_by_speaker.feather')
tone_by_speaker = pd.read_feather(tone_by_speaker_path)

# Keep only rows spoken by analysts.
analyst_rows = tone_by_speaker.query("speaker_role == 'Analyst'")
# Rename the speaker columns to analyst terminology; the role column is
# constant after the filter, so it is dropped.
analyst_column_names = {'speaker_name': 'analyst',
                        'speaker_firm': 'brokerage'}
analyst_rows = analyst_rows.rename(columns=analyst_column_names)
analyst_rows = analyst_rows.drop(columns=['speaker_role'])
# Keep the first row for each (firm, event date, analyst) combination.
tone_by_analyst = analyst_rows.drop_duplicates(subset=['gvkey', 'event_date', 'analyst'])
tone_by_analyst.head()

### IBES Variables

In [None]:
# Analyst output variables written by the '06-04_analysts_output' step.
analysts_output_path = os.path.join('2_pipeline', '06-04_analysts_output', 'out', 'analysts_output.feather')
analysts_output = pd.read_feather(analysts_output_path)
# Keep the first row for each (firm, event date, analyst) combination.
analysts_output = analysts_output.drop_duplicates(subset=['gvkey', 'event_date', 'analyst'])
analysts_output.head()

### Controls

In [None]:
# Load the control variables from the '07-01_dataset_control_variables' step
# for the analyst-level dataset.
analyst_controls_path = os.path.join('2_pipeline', '07-01_dataset_control_variables', 'out', 'control_variables.feather')
controls = pd.read_feather(analyst_controls_path)
# Preview the first rows.
controls.head()

### Merge

In [None]:
# Step 1: match each analyst's tone to that analyst's output variables.
# how='inner' is pandas' default, spelled out here: rows without a match on
# either side are dropped. validate='1:1' raises if the key is not unique on
# both sides.
analyst_keys = ['gvkey', 'event_date', 'analyst']
analysts_with_output = pd.merge(tone_by_analyst, analysts_output,
                                how='inner', on=analyst_keys,
                                validate='1:1')

# Step 2: attach the control variables by firm and quarter (inner join again;
# many analyst rows may share one control row).
analysts_with_controls = pd.merge(analysts_with_output, controls,
                                  how='inner', on=['gvkey', 'quarter'],
                                  validate='m:1')

# Step 3: order the rows by firm and event date and renumber the index.
analysts_dataset = (analysts_with_controls
                    .sort_values(['gvkey', 'event_date'])
                    .reset_index(drop=True))
analysts_dataset

In [None]:
# Number of non-missing values per column, displayed as a one-row table.
analyst_non_missing = analysts_dataset.notna().sum()
analyst_non_missing.to_frame().T

In [15]:
# Save the analyst-level dataset in both Feather and Stata formats.
analyst_out_dir = os.path.join(pipeline, 'out')
analysts_dataset.to_feather(os.path.join(analyst_out_dir, 'analysts_dataset.feather'))
# The Stata export leaves out the 'brokerage' and 'quarter' columns.
# NOTE(review): presumably these columns are not supported by to_stata; confirm.
analyst_stata_frame = analysts_dataset.drop(columns=['brokerage', 'quarter'])
analyst_stata_frame.to_stata(os.path.join(analyst_out_dir, 'analysts_dataset.dta'))