# Tone - FinBERT

In [1]:
# Identifier of this notebook; used below to name its folder under `2_pipeline`.
NAME = '04-02_tone_finbert'
# Project folder name; the settings cell uses it to locate the project root in the cwd.
PROJECT = 'conference-calls-sentiment'
# Python version recorded for reference; not read by any of the visible cells.
PYTHON_VERSION = '3.7.0'

### Imports  

In [2]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from finbert_utils import preprocess_for_finbert, finbert_predict

Using GPU GeForce GTX 1060 6GB


### Settings

In [3]:
# Change to the project root: drop everything that follows the project name in
# the current working directory so the relative paths below always resolve.
# The pattern is a raw string (no invalid `\w` string escapes) and PROJECT is
# escaped so that any regex metacharacters in the name are matched literally.
workdir = re.sub(r"(?<={})[\w\W]*".format(re.escape(PROJECT)), "", os.getcwd())
os.chdir(workdir)

# Output location of this notebook. Each subfolder is created independently
# with exist_ok=True, so a pipeline directory that already exists but is missing
# one of its subfolders is repaired instead of being skipped.
pipeline = os.path.join('2_pipeline', NAME)
for folder in ['out', 'store', 'tmp']:
    os.makedirs(os.path.join(pipeline, folder), exist_ok=True)

---
# Main Code 

In [4]:
# Transcript table stored in the `out` folder of the 02-02 preprocessing step.
CC_PATH = os.path.join(
    '2_pipeline',
    '02-02_conference_calls_preprocess',
    'out',
    'cc_transcripts.feather',
)

# Read the table and preview its first rows.
cc = pd.read_feather(CC_PATH)
cc.head()

Unnamed: 0,gvkey,ticker,event_date,coname,speaker_role,speaker_name,speaker_firm,transcript,num_words,transcript_id,quarter,year
0,1013,ADCT,2004-02-18,ADC TELECOMMUNICATIONS INC,Analyst,BUCK E,JANCO PARTNERS,first i think near the beginning of your discu...,35,137638020605,2004Q1,2004
1,1013,ADCT,2004-02-18,ADC TELECOMMUNICATIONS INC,Analyst,BUCK E,JANCO PARTNERS,would you mind running through that list once ...,9,137638020605,2004Q1,2004
2,1013,ADCT,2004-02-18,ADC TELECOMMUNICATIONS INC,Analyst,BUCK E,JANCO PARTNERS,and secondly that obviously means youre pretty...,15,137638020605,2004Q1,2004
3,1013,ADCT,2004-02-18,ADC TELECOMMUNICATIONS INC,Analyst,BUCK E,JANCO PARTNERS,and if you looked at all those without identif...,27,137638020605,2004Q1,2004
4,1013,ADCT,2004-02-18,ADC TELECOMMUNICATIONS INC,Analyst,BUCK E,JANCO PARTNERS,i had assumed that but i was just trying to ge...,16,137638020605,2004Q1,2004


## Preprocess for BERT

In [6]:
# Mapping from class index to tone label.
# NOTE(review): not referenced by the visible cells — presumably mirrors the
# mapping used inside finbert_utils; confirm they agree.
LABELS = {0: 'neutral', 1 :'positive', 2: 'negative'}
NUM_LABELS = len(LABELS)
# NOTE(review): MAX_SEQ_LENGTH is not passed to preprocess_for_finbert below —
# confirm the helper applies the same sequence length.
MAX_SEQ_LENGTH = 64
# Location of the FinBERT vocabulary handed to the preprocessing helper.
VOCAB_PATH = os.path.join('0_data', 'finbert', 'finbert_vocab')

In [7]:
# Encode every transcript row for FinBERT; tqdm wraps the column only to show
# progress. The three results are combined into a TensorDataset in the next
# cell (presumably tensors with one row per transcript — TODO confirm in
# finbert_utils). The recorded run took about an hour.
input_ids, token_type_ids, attention_masks = preprocess_for_finbert(tqdm(cc['transcript']), VOCAB_PATH)

100%|██████████| 6072868/6072868 [1:02:59<00:00, 1606.87it/s]


In [8]:
# Bundle the three encoded inputs so they are batched together.
dataset = TensorDataset(input_ids, token_type_ids, attention_masks)
# No shuffling is requested (DataLoader's default), so batches keep the row
# order of `cc`; the prediction cell assigns results back onto `cc` by position.
dataloader = DataLoader(dataset, batch_size=64)

## Predict with FinBERT

In [9]:
# Load FinBERT
# Reads the object stored in the `out` folder of the 03-02 model step
# (presumably the fine-tuned model — TODO confirm).
# NOTE(review): torch.load unpickles the file, so only load it from a trusted source.
finbert = torch.load(os.path.join('2_pipeline', '03-02_model_finbert', 'out', 'finbert'))

In [10]:
# Attach the predicted label to every row of `cc`. This relies on
# finbert_predict returning one prediction per input row, in dataloader order
# — TODO confirm against finbert_utils.
# NOTE(review): the recorded run took ~8h25m and the result is not cached, so a
# Restart & Run All repeats the full inference.
cc['finbert'] = finbert_predict(finbert, dataloader)

100%|██████████| 94889/94889 [8:25:00<00:00,  3.13it/s]


## Process FinBERT Tone

In [11]:
# Distribution of predicted labels over all rows of `cc`, as a quick sanity check.
cc['finbert'].value_counts()

neutral     4100238
positive    1300205
negative     672425
Name: finbert, dtype: int64

In [12]:
def get_finbert_tone(row):
    """Return the net tone of a row holding 'positive' and 'negative' counts.

    The tone is (positive - negative) / (positive + negative). When the sum of
    the two counts is not greater than zero, 0 is returned instead, which
    avoids a division by zero.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'positive' and 'negative'
        (e.g. a pandas Series passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    number
        The tone ratio, or 0 for rows without positive/negative counts.
    """
    n_pos = row['positive']
    n_neg = row['negative']
    n_polar = n_pos + n_neg

    # Guard clause: nothing polar to compare, so the tone is defined as 0.
    if not (n_polar > 0):
        return 0
    return (n_pos - n_neg) / n_polar

### Tone by Speaker

In [13]:
# Count the rows of `cc` per FinBERT label for each speaker on each call,
# compute the net tone per speaker, then standardise it (z-score) across all
# speaker rows.
finbert_tone_by_speaker = (
    cc.groupby(['gvkey', 'ticker', 'event_date', 'speaker_role',
                'speaker_name', 'speaker_firm', 'finbert'])
    .size()
    .unstack(fill_value=0)  # one column per label, 0 where a label never occurs
    .assign(finbert_tone=lambda counts: counts.apply(get_finbert_tone, axis=1))
    .assign(finbert_tone_norm=lambda counts: (
        (counts['finbert_tone'] - counts['finbert_tone'].mean())
        / counts['finbert_tone'].std()
    ))
    .reset_index()
)
finbert_tone_by_speaker

finbert,gvkey,ticker,event_date,speaker_role,speaker_name,speaker_firm,negative,neutral,positive,finbert_tone,finbert_tone_norm
0,1013,ADCT,2004-02-18,Analyst,BUCK E,JANCO PARTNERS,0,6,0,0.000000,-0.024677
1,1013,ADCT,2004-02-18,Analyst,CHURCH R,WACHOVIA SECURITIES,1,8,1,0.000000,-0.024677
2,1013,ADCT,2004-02-18,Analyst,COLEMAN S,MORGAN STANLEY,0,7,1,1.000000,1.458168
3,1013,ADCT,2004-02-18,Analyst,COOPERSCHMIDT M,LEHMAN BROTHERS,1,9,1,0.000000,-0.024677
4,1013,ADCT,2004-02-18,Analyst,GOULD M,CSFB,0,7,2,1.000000,1.458168
...,...,...,...,...,...,...,...,...,...,...,...
308182,316056,ALLE,2020-10-22,Analyst,SNYDER C,UBS INVESTMENT BANK,2,6,0,-1.000000,-1.507521
308183,316056,ALLE,2020-10-22,Analyst,WEST C,LONGBOW RESEARCH LLC,2,3,1,-0.333333,-0.518958
308184,316056,ALLE,2020-10-22,Analyst,WOJS T,ROBERT W. BAIRD & CO. INCORPORATED,0,5,1,1.000000,1.458168
308185,316056,ALLE,2020-10-22,Management,PETRATIS D,ALLEGION PLC,16,48,51,0.522388,0.749943


In [14]:
# Save
# Written to the `out` folder of this notebook's pipeline directory.
finbert_tone_by_speaker.to_feather(os.path.join(pipeline, 'out', 'finbert_tone_by_speaker.feather'))

### Tone by Role

In [15]:
# Count the rows of `cc` per FinBERT label for each speaker role on each call,
# compute the net tone per role, then standardise it (z-score) across all
# role rows.
finbert_tone_by_role = (
    cc.groupby(['gvkey', 'ticker', 'event_date', 'speaker_role', 'finbert'])
    .size()
    .unstack(fill_value=0)  # one column per label, 0 where a label never occurs
    .assign(finbert_tone=lambda counts: counts.apply(get_finbert_tone, axis=1))
    .assign(finbert_tone_norm=lambda counts: (
        (counts['finbert_tone'] - counts['finbert_tone'].mean())
        / counts['finbert_tone'].std()
    ))
    .reset_index()
)
finbert_tone_by_role

finbert,gvkey,ticker,event_date,speaker_role,negative,neutral,positive,finbert_tone,finbert_tone_norm
0,1013,ADCT,2004-02-18,Analyst,7,72,8,0.066667,-0.202309
1,1013,ADCT,2004-02-18,Management,15,91,24,0.230769,0.180165
2,1013,ADCT,2004-05-19,Analyst,4,63,3,-0.142857,-0.690647
3,1013,ADCT,2004-05-19,Management,7,88,30,0.621622,1.091126
4,1013,ADCT,2004-08-25,Analyst,11,47,2,-0.692308,-1.971253
...,...,...,...,...,...,...,...,...,...
52139,316056,ALLE,2020-04-23,Management,32,127,70,0.372549,0.510612
52140,316056,ALLE,2020-07-23,Analyst,3,32,6,0.333333,0.419212
52141,316056,ALLE,2020-07-23,Management,15,90,71,0.651163,1.159978
52142,316056,ALLE,2020-10-22,Analyst,10,45,12,0.090909,-0.145807


In [16]:
# Save
# Written to the `out` folder of this notebook's pipeline directory.
finbert_tone_by_role.to_feather(os.path.join(pipeline, 'out', 'finbert_tone_by_role.feather'))

### Tone by Firm

In [17]:
# Count the rows of `cc` per FinBERT label for each firm on each call date,
# compute the net tone per call, then standardise it (z-score) across all
# firm/date rows.
finbert_tone_by_firm = (
    cc.groupby(['gvkey', 'ticker', 'event_date', 'finbert'])
    .size()
    .unstack(fill_value=0)  # one column per label, 0 where a label never occurs
    .assign(finbert_tone=lambda counts: counts.apply(get_finbert_tone, axis=1))
    .assign(finbert_tone_norm=lambda counts: (
        (counts['finbert_tone'] - counts['finbert_tone'].mean())
        / counts['finbert_tone'].std()
    ))
    .reset_index()
)
finbert_tone_by_firm

finbert,gvkey,ticker,event_date,negative,neutral,positive,finbert_tone,finbert_tone_norm
0,1013,ADCT,2004-02-18,22,163,32,0.185185,-0.433520
1,1013,ADCT,2004-05-19,11,151,33,0.500000,0.723564
2,1013,ADCT,2004-08-25,22,105,18,-0.100000,-1.481702
3,1013,ADCT,2004-12-14,11,128,29,0.450000,0.539792
4,1013,ADCT,2005-02-28,27,165,40,0.194030,-0.401012
...,...,...,...,...,...,...,...,...
26232,316056,ALLE,2019-10-24,13,70,51,0.593750,1.068137
26233,316056,ALLE,2020-02-18,19,60,48,0.432836,0.476706
26234,316056,ALLE,2020-04-23,38,164,78,0.344828,0.153236
26235,316056,ALLE,2020-07-23,18,122,77,0.621053,1.168486


In [18]:
# Save
# Written to the `out` folder of this notebook's pipeline directory.
finbert_tone_by_firm.to_feather(os.path.join(pipeline, 'out', 'finbert_tone_by_firm.feather'))