<a href="https://colab.research.google.com/github/mprewarski/stock-models/blob/main/fin_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install any dependencies

In [1]:
! pip install -q yfinance

## Import Libraries

In [138]:
%matplotlib inline

import numpy as np
import polars as pl
import plotly.express as px
from datetime import datetime
import yfinance as yf
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
!pip install datasets
import datasets




In [5]:


# Load the 'train' split of the 'ashraq/financial-news' dataset via the
# `datasets` library. The rest of the notebook reads headlines from it.
dataset = datasets.load_dataset('ashraq/financial-news', split='train')


Repo card metadata block was not found. Setting CardData to empty.


In [11]:
dataset

Dataset({
    features: ['headline', 'url', 'publisher', 'date', 'stock'],
    num_rows: 1845559
})

In [8]:
# Convert the dataset to a pandas DataFrame with its built-in exporter.
# `pd` is never imported in this notebook, so the previous
# `pd.DataFrame(dataset)` raised NameError on a fresh kernel.
df = dataset.to_pandas()

In [9]:
df.head()

Unnamed: 0,headline,url,publisher,date,stock
0,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A
1,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A
2,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A
3,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A
4,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A


In [12]:
# Convert the dataset to a polars DataFrame; the filtering and sentiment
# cells below work on this frame rather than on the pandas `df`.
df_pl = dataset.to_polars()

In [14]:
df_pl.head()

headline,url,publisher,date,stock
str,str,str,str,str
"""Agilent Techno…","""http://www.gur…","""GuruFocus""","""2020-06-01 00:…","""A"""
"""Agilent (A) Ge…","""http://www.zac…","""Zacks""","""2020-05-18 00:…","""A"""
"""J.P. Morgan As…","""http://www.gur…","""GuruFocus""","""2020-05-15 00:…","""A"""
"""Pershing Squar…","""http://www.gur…","""GuruFocus""","""2020-05-15 00:…","""A"""
"""Agilent Awards…","""http://www.gur…","""GuruFocus""","""2020-05-12 00:…","""A"""


In [27]:
df_pl.describe()

describe,headline,url,publisher,date,stock
str,str,str,str,str,str
"""count""","""1845559""","""1845559""","""1845559""","""1845559""","""1845559"""
"""null_count""","""0""","""0""","""0""","""0""","""0"""
"""mean""",,,,,
"""std""",,,,,
"""min""","""""$2-Billion Ma…","""http://marketf…","""Accesswire""","""1969-12-31 00:…","""A"""
"""25%""",,,,,
"""50%""",,,,,
"""75%""",,,,,
"""max""","""補倉(""Bǔcāng"") =…","""https://www.za…","""ycharts""","""2020-06-04 00:…","""ZX"""


In [43]:
# Distinct ticker symbols present in the dataset, as a Python list.
# "stock" is a plain string column, so no explode() is needed before unique().
tickers = df_pl["stock"].unique().to_list()

In [59]:
'CPRT' in tickers

True

In [66]:
# Keep only the rows whose "stock" column equals the ticker "CPRT".
df_cprt = df_pl.filter(pl.col("stock") == "CPRT")

In [67]:
df_cprt.shape

(594, 5)

In [15]:
# Checkpoint identifier passed to both from_pretrained() calls below; per its
# name, a DistilRoBERTa model fine-tuned for financial-news sentiment analysis.
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
# Load the tokenizer and the sequence-classification model for that checkpoint.
# These notebook-level globals are used later by df_sentiment().
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

In [68]:
def calc_sentiment(model, tokenizer, text, max_length=512):
    """Score one piece of text with a sequence-classification sentiment model.

    Args:
        model: Model invoked as ``model(**inputs)``; its output must expose a
            ``logits`` attribute (consumed by ``map_sf_to_single``).
        tokenizer: Tokenizer paired with ``model``; called with
            ``return_tensors="pt"`` so it yields PyTorch tensors.
        text: The text to score (e.g. a news headline).
        max_length: Maximum number of tokens kept after truncation.
            Defaults to 512.
            NOTE(review): the previous hard-coded 1022 looks larger than a
            RoBERTa-style position table allows, so very long inputs could
            fail inside the model — confirm against
            ``model.config.max_position_embeddings``.

    Returns:
        float: The scalar sentiment value computed by ``map_sf_to_single``
        from the model output.
    """
    import torch  # local import: torch is not imported at the top of the notebook

    inputs = tokenizer(
        text, return_tensors="pt", max_length=max_length, truncation=True
    )
    # Inference only: disable autograd so no gradient graph is built per call.
    with torch.no_grad():
        outputs = model(**inputs)
    return map_sf_to_single(outputs)

In [118]:
def map_sf_to_single(sf_out):
    """Collapse a 3-class model output into one scalar score.

    The logits are turned into probabilities with a softmax over dim 1, then
    combined as ``1 * p[2] + 0.5 * p[1]`` (class 0 contributes nothing).
    NOTE(review): this assumes index 1 is "neutral" and index 2 is "positive"
    — confirm against ``model.config.id2label``.

    Args:
        sf_out: Model output exposing a ``logits`` tensor. The indexing below
            expects a single example, i.e. three values after ``squeeze()``.

    Returns:
        float: The weighted score as a plain Python number.
    """
    class_probs = sf_out.logits.softmax(dim=1).squeeze()
    neutral_prob = class_probs[1]
    positive_prob = class_probs[2]
    score = 1 * positive_prob + 0.5 * neutral_prob
    return score.item()

In [119]:
def df_sentiment(df, col_name):
    """Compute a sentiment score for every value in one column of a DataFrame.

    Relies on the notebook-level ``model`` and ``tokenizer`` globals and calls
    ``calc_sentiment`` once per value.

    Args:
        df: DataFrame supporting ``df[col_name]`` and iteration over that
            column's values.
        col_name: Name of the column holding the texts to score.

    Returns:
        list: One score per row, in the column's iteration order.
    """
    # Honour col_name: the column used to be hard-coded to 'headline', which
    # silently ignored the argument.
    return [calc_sentiment(model, tokenizer, text) for text in df[col_name]]

In [120]:
# Score every CPRT headline; df_sentiment runs the model once per row, so
# this cell's cost grows with the number of rows in df_cprt.
sentiment = df_sentiment(df_cprt, 'headline')

In [123]:
sentiment[:20]


[0.49999183416366577,
 0.49998709559440613,
 0.4999935030937195,
 0.4999890625476837,
 0.4999898076057434,
 0.49999159574508667,
 0.4999912679195404,
 0.49998873472213745,
 0.49998676776885986,
 0.49998968839645386,
 0.9997230172157288,
 0.49998968839645386,
 0.9776378870010376,
 0.49999019503593445,
 0.9994962215423584,
 0.49999627470970154,
 0.9749326705932617,
 0.9995773434638977,
 0.5000318884849548,
 0.49998754262924194]

In [127]:
# Attach the scores as a "sentiment" column. This relies on `sentiment` being
# in the same row order as df_cprt, which holds because it was computed by
# iterating over df_cprt's headline column.
df_cprt = df_cprt.with_columns(pl.Series(name="sentiment", values=sentiment))

In [128]:
df_cprt.head(4)

headline,url,publisher,date,stock,sentiment
str,str,str,str,str,f64
"""SkyTop Capital…","""http://www.gur…","""GuruFocus""","""2020-05-15 00:…","""CPRT""",0.499992
"""Gobi Capital L…","""http://www.gur…","""GuruFocus""","""2020-05-15 00:…","""CPRT""",0.499987
"""Foyston, Gordo…","""http://www.gur…","""GuruFocus""","""2020-05-15 00:…","""CPRT""",0.499994
"""Avenir Corp Bu…","""http://www.gur…","""GuruFocus""","""2020-05-14 00:…","""CPRT""",0.499989


In [134]:
import plotly.express as px

In [137]:
# Bar chart of the CPRT headline sentiment scores by date, with a title and
# readable axis labels so the figure stands on its own.
fig = px.bar(
    df_cprt,
    x="date",
    y="sentiment",
    title="CPRT headline sentiment by date",
    labels={"date": "Date", "sentiment": "Sentiment score"},
)
fig.show()