In [None]:
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

In [None]:
# Load your tweets
# Relative path: resolved against the notebook's current working directory.
TWEETS_CSV = 'data/stock_tweets.csv'
df_twitter = pd.read_csv(TWEETS_CSV)

# Fail fast with a readable message: the sentiment-scoring cell reads
# df_twitter['Tweet'], and discovering a missing column only after the model
# has been loaded wastes time.
if 'Tweet' not in df_twitter.columns:
    raise KeyError(
        f"{TWEETS_CSV} has no 'Tweet' column; found columns: {list(df_twitter.columns)}"
    )

In [None]:
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single space characters only, so other whitespace
    (tabs, newlines) stays inside its token and runs of spaces are preserved
    when the tokens are joined back together.

    Args:
        text: The raw string to normalise.

    Returns:
        The string with every token that starts with ``@`` (and is longer than
        the bare ``@``) replaced by ``'@user'``, and every token that starts
        with ``http`` replaced by ``'http'``. All other tokens are unchanged.
    """
    def _mask(token):
        # A lone "@" is not a mention, so it is left as-is.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [None]:
# Score every tweet in df_twitter['Tweet'] with a pretrained sentiment
# classifier and attach the per-label probabilities ('negative', 'neutral',
# 'positive') to df_twitter as new columns.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Upper bound on tokens per tweet (special tokens included). Passed explicitly
# so over-long inputs are truncated instead of raising inside the model.
# NOTE(review): 512 is the usual RoBERTa-base limit — confirm against the
# checkpoint's config if MODEL is changed.
MAX_TOKENS = 512
# Number of tweets scored per forward pass; lower this if memory is tight.
BATCH_SIZE = 32

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Lists to store scores for all tweets
negative_scores = []
neutral_scores = []
positive_scores = []

# pandas reads empty cells as NaN (a float), which preprocess() cannot split.
# Coerce everything to str and treat missing tweets as empty text so that
# every row still receives a score and row alignment is kept.
texts = [preprocess(tweet) for tweet in df_twitter['Tweet'].fillna('').astype(str)]

# Process the tweets in batches
for start in range(0, len(texts), BATCH_SIZE):
    batch = texts[start:start + BATCH_SIZE]
    encoded_input = tokenizer(
        batch,
        return_tensors='pt',
        padding=True,           # pad to the longest tweet in this batch
        truncation=True,
        max_length=MAX_TOKENS,
    )
    # Inference only: skip building the autograd graph for each forward pass.
    with torch.no_grad():
        output = model(**encoded_input)
    # One row of logits per tweet -> softmax across the label axis.
    batch_scores = softmax(output.logits.numpy(), axis=1)

    for scores in batch_scores:
        # Map scores to labels
        sentiment_score = {
            config.id2label[i]: float(np.round(score, 4))
            for i, score in enumerate(scores)
        }

        negative_scores.append(sentiment_score.get('negative', 0.0))
        neutral_scores.append(sentiment_score.get('neutral', 0.0))
        positive_scores.append(sentiment_score.get('positive', 0.0))

# Create a new DataFrame with sentiment scores
df_sentiments = pd.DataFrame({
    'negative': negative_scores,
    'neutral': neutral_scores,
    'positive': positive_scores
})

# Combine with the original dataframe. Any sentiment columns left over from a
# previous run of this cell are dropped first, so re-running the cell does not
# append duplicate 'negative'/'neutral'/'positive' columns.
df_twitter = pd.concat(
    [
        df_twitter.drop(columns=df_sentiments.columns, errors='ignore').reset_index(drop=True),
        df_sentiments,
    ],
    axis=1,
)

# Display the first few rows (bare last expression -> rich notebook rendering)
df_twitter.head()