# Tone - LM Dictionary
*Source: [Loughran & McDonald (2011)](https://sraf.nd.edu/textual-analysis/resources/#Master%20Dictionary)*

In [1]:
NAME = '04-01_tone_lm'
PROJECT = 'conference-calls-sentiment'
PYTHON_VERSION = '3.7.0'

### Imports  

In [2]:
%%capture
import os
import re
import pickle
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Progress bar
from tqdm import tqdm
tqdm.pandas()

# NLP
from sklearn.feature_extraction.text import CountVectorizer

### Settings

In [3]:
workdir = re.sub("(?<={})[\w\W]*".format(PROJECT), "", os.getcwd())
os.chdir(workdir)

pipeline = os.path.join('2_pipeline', NAME)
if not os.path.exists(pipeline):
    os.makedirs(pipeline)
    for folder in ['out', 'store', 'tmp']:
        os.makedirs(os.path.join(pipeline, folder))

---
# Main Code 

In [None]:
cc = pd.read_feather(os.path.join('2_pipeline', '02-02_conference_calls_preprocess', 'out', 'cc_transcripts.feather'))
cc.head()

## Loughran & McDonald (2011)

In [5]:
with open(os.path.join('2_pipeline', '03-01_model_dictionaries', 'out', 'lm_positive.pickle'), 'rb') as f:
    lm_positive = pickle.load(f)

with open(os.path.join('2_pipeline', '03-01_model_dictionaries', 'out', 'lm_negative.pickle'), 'rb') as f:
    lm_negative = pickle.load(f)

print(f"# Positive: {len(lm_positive)}\n# Negative: {len(lm_negative)}")

# Positive: 354
# Negative: 2355


In [6]:
lm_vocabulary = {j: i for i, j in enumerate(lm_positive + lm_negative)}
cv = CountVectorizer(vocabulary=lm_vocabulary)

cv_transcripts = cv.fit_transform(cc['transcript'])

In [7]:
# Vocabulary mapping
lm_positive_mapping = np.array([True if f in lm_positive else False for f in cv.get_feature_names()])
lm_negative_mapping = np.array([True if f in lm_negative else False for f in cv.get_feature_names()])

def sentiment_count(row):
    positive_count = cv_transcripts[row.name, lm_positive_mapping].sum()
    negative_count = cv_transcripts[row.name, lm_negative_mapping].sum()
    return positive_count, negative_count

In [8]:
cc['lm_positive'], cc['lm_negative'] = zip(*cc.progress_apply(sentiment_count, axis=1))

100%|██████████| 6072868/6072868 [50:27<00:00, 2006.16it/s]


## Process Sentiment

In [9]:
def get_lm_tone(row):
    if (row['lm_positive'] + row['lm_negative']) > 0:  # avoid zero division error
        tone_lm = (row['lm_positive']-row['lm_negative']) / (row['lm_positive']+row['lm_negative'])
    else:
        tone_lm = 0
    return tone_lm

### Tone by Speaker

In [None]:
lm_tone_by_speaker = (cc
                      .groupby(['gvkey', 'ticker', 'event_date', 'speaker_role', 'speaker_name', 'speaker_firm'])
                      .agg(num_words=('num_words', np.sum),
                           num_sentences=('num_words', 'count'),
                           lm_positive=('lm_positive', np.sum),
                           lm_negative=('lm_negative', np.sum))
                      .assign(lm_tone=lambda x: x.apply(get_lm_tone, axis=1),
                              lm_tone_norm=lambda x: (x['lm_tone']-x['lm_tone'].mean()) 
                                          / x['lm_tone'].std())
                      .reset_index())
lm_tone_by_speaker

In [11]:
# Save
lm_tone_by_speaker.to_feather(os.path.join(pipeline, 'out', 'lm_tone_by_speaker.feather'))

### Tone by Role

In [12]:
lm_tone_by_role = (cc
                   .groupby(['gvkey', 'ticker', 'event_date', 'speaker_role'])
                   .agg(num_words=('num_words', np.sum),
                        num_sentences=('num_words', 'count'),
                        num_speakers=('speaker_name', 'nunique'),
                        lm_positive=('lm_positive', np.sum),
                        lm_negative=('lm_negative', np.sum))
                   .assign(lm_tone=lambda x: x.apply(get_lm_tone, axis=1),
                           lm_tone_norm=lambda x: (x['lm_tone']-x['lm_tone'].mean()) 
                         / x['lm_tone'].std())
                   .reset_index())

lm_tone_by_role

Unnamed: 0,gvkey,ticker,event_date,speaker_role,num_words,num_sentences,num_speakers,lm_positive,lm_negative,lm_tone,lm_tone_norm
0,1013,ADCT,2004-02-18,Analyst,1511,87,12,13,21,-0.235294,-0.877963
1,1013,ADCT,2004-02-18,Management,2371,130,2,34,17,0.333333,0.774359
2,1013,ADCT,2004-05-19,Analyst,1306,70,7,7,14,-0.333333,-1.162846
3,1013,ADCT,2004-05-19,Management,2414,125,2,33,15,0.375000,0.895435
4,1013,ADCT,2004-08-25,Analyst,1248,60,8,6,24,-0.600000,-1.937729
...,...,...,...,...,...,...,...,...,...,...,...
52139,316056,ALLE,2020-04-23,Management,3934,229,2,83,48,0.267176,0.582118
52140,316056,ALLE,2020-07-23,Analyst,811,41,7,10,11,-0.047619,-0.332615
52141,316056,ALLE,2020-07-23,Management,3225,176,2,80,34,0.403509,0.978276
52142,316056,ALLE,2020-10-22,Analyst,1130,67,8,12,7,0.263158,0.570443


In [13]:
# Save
lm_tone_by_role.to_feather(os.path.join(pipeline, 'out', 'lm_tone_by_role.feather'))

### Sentiment by Firm

In [None]:
lm_tone_by_firm = (cc
                   .groupby(['gvkey', 'ticker', 'event_date'])
                   .agg(num_words=('num_words', np.sum),
                        num_sentences=('num_words', 'count'),
                        num_speakers=('speaker_name', 'nunique'),
                        lm_positive=('lm_positive', np.sum),
                        lm_negative=('lm_negative', np.sum))
                   .assign(lm_tone=lambda x: x.apply(get_lm_tone, axis=1),
                           lm_tone_norm=lambda x: (x['lm_tone']-x['lm_tone'].mean()) 
                                            / x['lm_tone'].std())
                   .reset_index())
lm_tone_by_firm

In [15]:
# Save
lm_tone_by_firm.to_feather(os.path.join(pipeline, 'out', 'lm_tone_by_firm.feather'))