In [15]:
!pip install transformers

from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

model_name = "ProsusAI/finbert"
model_finbert = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer_finbert = AutoTokenizer.from_pretrained(model_name)

def output_probabilities(text):
  global tokenizer_finbert
  global model_finbert
  inputs = tokenizer_finbert(text, return_tensors="pt")
  outputs = model_finbert(**inputs)
  return torch.nn.functional.softmax(outputs.logits, dim=-1)

def convert_finbert_to_vader(text):
  distribution = output_probabilities(text)
  return float(distribution[0][0]-distribution[0][2])



In [33]:
!pip install finvader

from finvader import finvader

def get_vader_score(text):
  return finvader(text, use_sentibignomics = True, use_henry = True, indicator = 'compound')



In [34]:
from google import colab
colab.drive.mount('./MyDrive')

import json
with open('./MyDrive/MyDrive/combined_selected_sentences_no_duplicates.json', 'r', encoding='utf-8') as f:
  sentences = json.load(f)

Drive already mounted at ./MyDrive; to attempt to forcibly remount, call drive.mount("./MyDrive", force_remount=True).


In [35]:
# Just a test
text1='Revenues have risen on an average by 40 % with margins in late 30s .'
print(text1)
print(convert_finbert_to_vader(text1))
print(get_vader_score(text1))

print('\n')

text2='However , the growth margin slowed down due to the financial crisis .'
print(text2)
print(convert_finbert_to_vader(text2))
print(get_vader_score(text2))

Revenues have risen on an average by 40 % with margins in late 30s .
0.9285646677017212
0.3549


However , the growth margin slowed down due to the financial crisis .
-0.00876725185662508
-0.0194


In [40]:
from tqdm import tqdm

In [41]:
finbert_and_vader={}
for report in tqdm(sentences):
  finbert_and_vader[report]={}
  finbert_and_vader[report]['finbert']=list(map(convert_finbert_to_vader, sentences[report]))
  finbert_and_vader[report]['vader']=list(map(get_vader_score, sentences[report]))

100%|██████████| 6/6 [05:10<00:00, 51.70s/it]


In [42]:
finbert_and_vader

{'f22_sentences': {'finbert': [0.938607931137085,
   0.039770789444446564,
   -0.5722103118896484,
   -0.8694009780883789,
   -0.8872555494308472,
   0.005516743287444115,
   -0.8734026551246643,
   -0.8977593779563904,
   -0.913379967212677,
   -0.8906866312026978,
   -0.13444530963897705,
   -0.9196404218673706,
   -0.8666003346443176,
   0.03522757068276405,
   0.9291642308235168,
   0.9114267230033875,
   0.1139395534992218,
   -0.8784705400466919,
   -0.9329143166542053,
   -0.564735472202301,
   -0.8758662939071655,
   -0.7496278882026672,
   -0.6308491826057434,
   -0.8909431099891663,
   -0.9073465466499329,
   0.005863039754331112,
   0.9363362193107605,
   -0.7966012358665466,
   -0.5361589789390564,
   -0.9321939945220947,
   -0.009459605440497398,
   -0.00774404127150774,
   -0.014840480871498585,
   0.9281219244003296,
   -0.896694540977478,
   0.020486507564783096,
   -0.9186574816703796,
   -0.318280965089798,
   -0.9135357737541199,
   -0.7559205293655396,
   -0.9117060

In [45]:
sentences.keys()

dict_keys(['f22_sentences', 'f23_sentences', 'tsla22_sentences', 'tsla23_sentences', 'ups22_sentences', 'ups23_sentences'])

In [47]:
df_to_plot=pd.DataFrame({'lst1':finbert_and_vader['ups22_sentences']['finbert'], 'lst2':finbert_and_vader['ups22_sentences']['vader'], 'lst_hover':sentences['ups22_sentences']}).sort_values('lst1')

NameError: name 'pd' is not defined

In [48]:
import plotly.graph_objects as go
import pandas as pd

df_to_plot=pd.DataFrame({'lst1':finbert_and_vader['ups22_sentences']['finbert'], 'lst2':finbert_and_vader['ups22_sentences']['vader'], 'lst_hover':sentences['ups22_sentences']}).sort_values('lst1')

lst1 = list(df_to_plot['lst1'])
lst2 = list(df_to_plot['lst2'])
lst_hover = list(df_to_plot['lst_hover'])

x_ticks = list(range(1, len(lst1) + 1))

trace1 = go.Scatter(
    x=x_ticks,
    y=lst1,
    mode='lines',
    name='FinBert',
    line=dict(color='blue'),
    hoverinfo='text',
    text=lst_hover
)

trace2 = go.Scatter(
    x=x_ticks,
    y=lst2,
    mode='lines',
    name='FinVader',
    line=dict(color='red'),
    hoverinfo='text',
    text=lst_hover
)

fig = go.Figure(data=[trace1, trace2])

fig.update_layout(
    title='Sentiment scores of FinBert and FinVader on UPS 2022 report',
    xaxis_title='Index',
    yaxis_title='Sentiment scores',
    hovermode='closest'  # Configure hover mode
)

fig.show()

In [50]:
import plotly.graph_objects as go
import pandas as pd

df_to_plot=pd.DataFrame({'lst1':finbert_and_vader['tsla23_sentences']['finbert'], 'lst2':finbert_and_vader['tsla23_sentences']['vader'], 'lst_hover':sentences['tsla23_sentences']}).sort_values('lst1')

lst1 = list(df_to_plot['lst1'])
lst2 = list(df_to_plot['lst2'])
lst_hover = list(df_to_plot['lst_hover'])

x_ticks = list(range(1, len(lst1) + 1))

trace1 = go.Scatter(
    x=x_ticks,
    y=lst1,
    mode='lines',
    name='FinBert',
    line=dict(color='blue'),
    hoverinfo='text',
    text=lst_hover
)

trace2 = go.Scatter(
    x=x_ticks,
    y=lst2,
    mode='lines',
    name='FinVader',
    line=dict(color='red'),
    hoverinfo='text',
    text=lst_hover
)

fig = go.Figure(data=[trace1, trace2])

fig.update_layout(
    title='Sentiment scores of FinBert and FinVader on Tesla 2023 report',
    xaxis_title='Index',
    yaxis_title='Sentiment scores',
    hovermode='closest'  # Configure hover mode
)

fig.show()

Conclusion:
- **There remained too many unrelevant sentences. (Semantic Search)**
- The unrelevant sentences received mostly random scores.
- Maybe our conversion between FinBert and FinVader is also wrong.
- The models produce similar results in relevant positive sentences.


In [51]:
df_to_plot

Unnamed: 0,lst1,lst2,lst_hover
166,-0.935774,0.5267,"Upon vesting and exercise, including the payme..."
84,-0.932369,-0.1154,Stock options granted under the 2019 Plan may ...
168,-0.932267,0.0039,Portions of the registrant’s Proxy Statement f...
195,-0.931386,0.0258,"Market for Registrant's Common Equity, Related..."
66,-0.931017,0.3182,"MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED..."
...,...,...,...
128,0.927997,0.3612,"R&D expenses increased $894 million, or 29%, i..."
46,0.928434,0.6187,Automotive regulatory credits revenue increase...
178,0.929424,0.3612,"In 2023, we recognized total revenues of $96.7..."
139,0.932809,0.6411,Automotive sales revenue increased $11.30 bill...


In [52]:
convert_finbert_to_vader('Upon vesting and exercise, including the payment of the exercise price of $23.34 per share as adjusted to give effect to the 2020 Stock Split and the 2022 Stock Split, our CEO must hold shares that he acquires for five years post-exercise, other than a cashless exercise where shares are simultaneously sold to pay for the exercise price and any required tax withholding.')

-0.9357743263244629