In [1]:
import pandas as pd
import torch
from scipy.special import softmax
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

In [2]:
# Load the 'ProsusAI/finbert' checkpoint (tokenizer + sequence classifier) and
# wrap both in a batched sentiment-analysis pipeline.
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
# Run on the first GPU when CUDA is usable and fall back to CPU (-1) otherwise,
# so the notebook also runs on machines without a GPU instead of failing on a
# hard-coded device=0.
pipe = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    batch_size=32,
)

In [3]:
# Hand-written sample texts used to sanity-check the sentiment pipelines.
articles = [
    "The stock market saw a huge increase today as tech companies led the gains.",
    "The economic downturn is putting a lot of pressure on businesses across sectors.",
    "This tweet is neutral.",
    "this tweet is both positive and negative but it is not neutral.",
    "Results were not as good as expected but they weren't that bad even if people fear..."
]

# Score all texts in one call. Truncation is enabled (capped at the model's
# position-embedding limit) so that a text longer than the model can accept is
# cut instead of making the forward pass fail; short texts are unaffected.
results = pipe(
    articles,
    truncation=True,
    max_length=pipe.model.config.max_position_embeddings,
)

# Each result is a dict with the winning 'label' and its 'score'.
for position, (text, result) in enumerate(zip(articles, results), start=1):
    print(f"Article {position}:")
    print(f"Text: {text}")
    print(f"Sentiment: {result['label']}, Score: {result['score']}\n")


Article 1:
Text: The stock market saw a huge increase today as tech companies led the gains.
Sentiment: positive, Score: 0.9221598505973816

Article 2:
Text: The economic downturn is putting a lot of pressure on businesses across sectors.
Sentiment: negative, Score: 0.951231062412262

Article 3:
Text: This tweet is neutral.
Sentiment: neutral, Score: 0.9087401032447815

Article 4:
Text: this tweet is both positive and negative but it is not neutral.
Sentiment: neutral, Score: 0.8927180171012878

Article 5:
Text: Results were not as good as expected but they weren't that bad even if people fear...
Sentiment: positive, Score: 0.4589930772781372



  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [11]:
# Turn the pipeline output (one {'label', 'score'} dict per text) into a frame
# with a single signed sentiment column.
res_df = pd.DataFrame(results)
# Pin the label categories so get_dummies always emits the three label_*
# columns, even when a batch contains no negative (or no positive) prediction;
# without this the subtraction below raises KeyError on the missing column.
res_df["label"] = pd.Categorical(
    res_df["label"], categories=["negative", "neutral", "positive"]
)
res_df = pd.get_dummies(res_df).astype(float)
# Signed confidence: +score for positive, -score for negative, 0 for neutral.
res_df["sent_fbert"] = res_df["score"] * (res_df["label_positive"] - res_df["label_negative"])
# Next step (idea): copy the signed score onto the main frame, e.g.
# df_d["sent_fbert"] = res_df["sent_fbert"].to_list()
# NOTE(review): df_d is not defined in this notebook section -- confirm its row
# order matches the texts that were scored.
res_df

Unnamed: 0,score,label_negative,label_neutral,label_positive,sent_fbert
0,0.92216,0.0,0.0,1.0,0.92216
1,0.951231,1.0,0.0,0.0,-0.951231
2,0.90874,0.0,1.0,0.0,0.0
3,0.892718,0.0,1.0,0.0,0.0
4,0.458993,0.0,0.0,1.0,0.458993


In [31]:
# Checkpoint used for the Twitter-RoBERTa comparison (plain string: the
# original f-string had no placeholders).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Pass the device explicitly: without it the pipeline stays on CPU even when a
# GPU is available. Falls back to CPU (-1) when CUDA is not usable.
pipe_twt = pipeline(
    "sentiment-analysis",
    model=MODEL,
    tokenizer=MODEL,
    device=0 if torch.cuda.is_available() else -1,
)
results_twt = pipe_twt(articles)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [32]:
pd.DataFrame(results_twt)

Unnamed: 0,label,score
0,positive,0.978731
1,negative,0.739658
2,neutral,0.827707
3,neutral,0.560319
4,neutral,0.388296


In [39]:
# Recompute the full class distribution by hand for every sample text.
# The tokenizer is loaded here from the same checkpoint as the model: in
# top-to-bottom order the name `tokenizer` would otherwise still refer to the
# FinBERT BertTokenizer created earlier, whose vocabulary does not match this
# model, and the scores would be meaningless.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
res = []

for article in articles:
    encoded_input = tokenizer(article, return_tensors='pt')
    output = model(**encoded_input)
    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].detach().numpy())
    res.append(scores)

# Column order assumes class ids 0/1/2 map to negative/neutral/positive
# -- TODO confirm against the checkpoint's config.id2label.
pd.DataFrame(res, columns=['Negative', 'Neutral', 'Positive'])

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,Negative,Neutral,Positive
0,0.002184,0.019085,0.978731
1,0.739658,0.243607,0.016735
2,0.048503,0.827707,0.12379
3,0.40501,0.560319,0.034671
4,0.289988,0.388296,0.321716


In [35]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with '@' and has
    more than one character becomes '@user'; a token that starts with 'http'
    becomes 'http'. All other tokens are kept as they are, and the tokens are
    joined back with single spaces.
    """
    def _placeholder(token):
        # A lone '@' is left untouched; only '@something' counts as a mention.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))
# Checkpoint shared by the tokenizer, config and model loaded below.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PyTorch model (saving it back with model.save_pretrained(MODEL) stays disabled).
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Score only the first sample text, after masking mentions and links.
text = articles[0]
print(text)
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# output[0] holds the logits; [0] selects the single sequence in the batch.
scores = softmax(output[0][0].detach().numpy())
# A TensorFlow variant (not run here) would load
# TFAutoModelForSequenceClassification, tokenize with return_tensors='tf' and
# read the scores with output[0][0].numpy().

# Print the labels from most to least probable.
ranking = np.argsort(scores)[::-1]
for position, class_id in enumerate(ranking, start=1):
    label = config.id2label[class_id]
    prob = scores[class_id]
    print(f"{position}) {label} {np.round(float(prob), 4)}")


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The stock market saw a huge increase today as tech companies led the gains.
1) positive 0.9787
2) neutral 0.0191
3) negative 0.0022
