# Create LLM Features

In this notebook, we use the FinBERT LLM to process the earnings calls and extract sentiment features from each call. FinBERT is trained on exactly the type of data being used in this analysis: financial earnings transcripts. For this reason, we use the LLM as-is and do not fine-tune it further.

When finished, we drop the transcript column from our working dataframe as it is not needed in our regression analysis.

In [1]:
import datetime
import logging
import os, contextlib
import re
import time
import warnings

import numpy as np
import pandas as pd
import yfinance as yf
from transformers import AutoModelForSequenceClassification

import finbert
from finbert.finbert import *
import finbert.utils as tools

In [2]:
def divide_text(text, chunks=1, sep='.'):
    """Split ``text`` into ``chunks`` consecutive pieces, cutting after ``sep`` where possible.

    The text is first cut at evenly spaced character offsets. Every interior
    cut is then pushed forward to just past the next occurrence of ``sep`` so
    that, with the default separator, pieces end on a sentence boundary. When
    no separator follows a cut, the evenly spaced offset is kept.

    Parameters
    ----------
    text : str
        The text to split. May be empty or shorter than ``chunks``.
    chunks : int, default 1
        Number of pieces to produce; must be at least 1.
    sep : str, default '.'
        Marker after which a cut is preferred.

    Returns
    -------
    list of str
        Exactly ``chunks`` strings whose concatenation equals ``text``.
        Pieces can be empty when the text is short or separators are sparse.

    Raises
    ------
    ValueError
        If ``chunks`` is smaller than 1.
    """
    if chunks < 1:
        raise ValueError('chunks must be a positive integer')

    total = len(text)
    step = total // chunks

    # Nominal cut points 0, step, 2*step, ... followed by the end of the text.
    # Building them arithmetically (instead of with a stepped range) guarantees
    # chunks + 1 boundaries even when the text is shorter than `chunks`.
    breaks = [k * step for k in range(chunks)]
    breaks.append(total)

    # Move each interior cut to just after the next separator. The adjusted
    # cuts stay in non-decreasing order because a later search start can only
    # find the same separator or a later one.
    for k in range(1, chunks):
        loc = text.find(sep, breaks[k])
        if loc != -1:
            breaks[k] = loc + 1

    return [text[breaks[k]:breaks[k + 1]] for k in range(chunks)]

In [3]:
# Load the working dataframe; the scoring loop further down reads its
# 'transcript' column and adds the sentiment feature columns to this frame.
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# produced by this project's own pipeline.
llm = pd.read_pickle('data/stock_earnings_df.pkl.bz2')

In [4]:
# Build the sequence-classification model from the checkpoint named by
# BASE_MODEL, with a three-label output head.
BASE_MODEL = "ProsusAI/finbert"
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=3,
    cache_dir=None,
)

In [8]:
# Illustrative input for trying the LLM on a short passage.
# This example is not used as part of the project.
text = (
    "Later that day Apple said it was revising down its earnings expectations in "
    "the fourth quarter of 2018, largely because of lower sales and signs of economic weakness in China. "
    "The news rapidly infected financial markets. Apple’s share price fell by around 7% in after-hours "
    "trading and the decline was extended to more than 10% when the market opened. The dollar fell "
    "by 3.7% against the yen in a matter of minutes after the announcement, before rapidly recovering "
    "some ground. Asian stockmarkets closed down on January 3rd and European ones opened lower. "
    "Yields on government bonds fell as investors fled to the traditional haven in a market storm."
)
# Uncomment to score the passage:
#result = finbert.finbert.predict(text,model)
#result

In [5]:
# Use the LLM to generate sentiment features for every earnings-call transcript.
# This is extremely time consuming, even on a GPU, so a partial checkpoint is
# written every CHECKPOINT_EVERY transcripts.
#
# Each transcript is scored once per entry in CHUNK_COUNTS. Feature columns are
# named '<feature>_<chunk>_<chunks>', e.g. 'sentiment_2_3' holds the median
# sentiment score of the second of three chunks.
logging.disable(logging.CRITICAL)   # suppress log records up to and including CRITICAL
warnings.filterwarnings('ignore')   # suppress Python warnings for the rest of the session

CHUNK_COUNTS = (1, 3, 4)
LABELS = ('positive', 'negative', 'neutral')
CHECKPOINT_EVERY = 1000
CHECKPOINT_PATH = 'data/llm_partial.pkl.bz2'


def _sentiment_features(result, suffix):
    """Summarise one prediction frame into a dict of feature-name -> value.

    Parameters
    ----------
    result : pandas.DataFrame
        Output of ``finbert.finbert.predict``; must provide the columns
        'sentiment_score' and 'prediction'.
    suffix : str
        Text appended to every feature name to identify the chunk.

    Returns
    -------
    dict
        'sentiment<suffix>' is the median of 'sentiment_score'; for each label
        in LABELS, '<label><suffix>' is the share of rows predicted as that
        label, or 0.0 when the label does not occur.
    """
    features = {'sentiment' + suffix: result['sentiment_score'].median()}
    shares = result['prediction'].value_counts(normalize=True)
    for label in LABELS:
        # An explicit default replaces a bare `except:`, which would also have
        # swallowed unrelated errors and a keyboard interrupt.
        features[label + suffix] = float(shares.get(label, 0.0))
    return features


start_time = time.time()
for idx, (index, row) in enumerate(llm.iterrows(), start=1):
    index_start = time.time()
    text = row['transcript']
    for chunks in CHUNK_COUNTS:
        text_blocks = divide_text(text, chunks)
        for chunk, block in enumerate(text_blocks, start=1):
            suffix = '_%d_%d' % (chunk, chunks)
            result = finbert.finbert.predict(block, model, use_gpu=True)
            for column, value in _sentiment_features(result, suffix).items():
                llm.loc[index, column] = value

    index_end = time.time()
    index_elapsed = index_end - index_start

    print('completed row %4d (%.1f kb) in %.1f sec' % (idx, len(text)/1024, index_elapsed))
    if idx % CHECKPOINT_EVERY == 0:
        print('='*80)
        print('total runtime: %.1f sec' % (index_end - start_time))
        llm.to_pickle(CHECKPOINT_PATH)

# Measured from the clock rather than the last row's end time, so this also
# works when the frame has no rows.
print('='*80)
print('total runtime: %.1f sec' % (time.time() - start_time))
print('='*80)

completed row    1 (34.9 kb) in 11.7 sec
completed row    2 (29.1 kb) in 8.1 sec
completed row    3 (51.8 kb) in 15.3 sec
completed row    4 (26.6 kb) in 6.5 sec
completed row    5 (48.6 kb) in 14.4 sec
completed row    6 (61.8 kb) in 17.9 sec
completed row    7 (54.3 kb) in 15.7 sec
completed row    8 (31.1 kb) in 9.6 sec
completed row    9 (36.0 kb) in 11.4 sec
completed row   10 (43.2 kb) in 12.2 sec
completed row   11 (71.3 kb) in 29.1 sec
completed row   12 (50.2 kb) in 12.8 sec
completed row   13 (65.1 kb) in 20.4 sec
completed row   14 (50.4 kb) in 15.7 sec
completed row   15 (53.5 kb) in 15.2 sec
completed row   16 (28.6 kb) in 7.1 sec
completed row   17 (54.4 kb) in 16.5 sec
completed row   18 (55.0 kb) in 18.3 sec
completed row   19 (40.0 kb) in 13.3 sec
completed row   20 (51.9 kb) in 15.1 sec
completed row   21 (58.2 kb) in 17.9 sec
completed row   22 (45.2 kb) in 10.9 sec
completed row   23 (65.4 kb) in 19.7 sec
completed row   24 (38.7 kb) in 10.6 sec
completed row   25 (

In [6]:
# The raw transcript text is not needed in the regression analysis, so drop it
# before saving. errors='ignore' keeps this cell re-runnable: on a second
# execution the column is already gone and a plain drop would raise KeyError.
llm = llm.drop(columns=['transcript'], errors='ignore')

In [7]:
# Persist the finished frame (transcript column dropped in the previous cell)
# as a bz2-compressed pickle.
llm.to_pickle('data/llm_complete.pkl.bz2')