Implement RoBERTa sentiment analysis on my Twitter dataset.

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import pandas as pd
import torch

# Checkpoint identifier shared by the tokenizer and the classifier below.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Both objects are created from the same identifier; the two loads are
# independent of each other.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Function to preprocess text
def preprocess(text):
    """Mask user mentions and links in a tweet before tokenization.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character is replaced by ``@user``; a token that starts
    with ``http`` is replaced by ``http``. Every other token is kept as is,
    and the tokens are joined back with single spaces, so the original
    spacing is preserved.

    Args:
        text: The tweet text. Any non-string value (for example the float
            NaN that pandas produces for an empty cell, or a numeric cell)
            is converted with ``str()`` first instead of raising.

    Returns:
        The normalised text as a ``str``.
    """
    # Coerce every non-string value, not only floats, so numeric or otherwise
    # non-text cells do not fail on ``.split``.
    if not isinstance(text, str):
        text = str(text)

    masked_tokens = []
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        elif token.startswith('http'):
            token = 'http'
        masked_tokens.append(token)
    return " ".join(masked_tokens)

# Read the tweet export; the file is decoded as latin-1.
df = pd.read_csv(
    '/content/sample_data/company_tweet_new.csv',
    encoding='latin-1',
)

# Run preprocess() over every entry of the 'body' column, store the result
# as a new column, and keep a plain Python list of it for tokenization.
df = df.assign(preprocessed_body=df['body'].apply(preprocess))
texts = df['preprocessed_body'].tolist()

# Number of texts per forward pass; adjust based on available memory.
batch_size = 16

# Tokenize the data in batches
def tokenize_batch(texts):
    """Tokenize a batch of texts with the module-level ``tokenizer``.

    Args:
        texts: The texts to encode, passed to the tokenizer unchanged.

    Returns:
        Whatever the tokenizer returns when called with padding and
        truncation enabled, ``max_length=512`` and ``return_tensors="pt"``.
    """
    encoding_options = {
        "padding": True,
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encoding_options)

# Put the model in evaluation mode before any inference is run.
model.eval()

# Slice `texts` into consecutive chunks of at most `batch_size` items;
# the last chunk may be shorter.
batch_starts = range(0, len(texts), batch_size)
text_batches = [texts[start:start + batch_size] for start in batch_starts]

# Function to get sentiment scores
def get_sentiment_scores(text_batch):
    """Return the classifier's raw logits for one batch of texts.

    The batch is tokenized with ``tokenize_batch`` and passed through the
    module-level ``model`` with gradient tracking disabled.

    Args:
        text_batch: The texts to score, passed to ``tokenize_batch``.

    Returns:
        A NumPy array holding ``outputs.logits`` for the batch (the caller
        treats it as one row per text and applies softmax along axis 1).
    """
    # Move the encoded inputs to whatever device the model lives on, so the
    # function keeps working if the model has been moved to a GPU.
    inputs = tokenize_batch(text_batch).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    # .cpu() is a no-op for CPU tensors and is required before .numpy() for
    # tensors that live on an accelerator.
    return outputs.logits.detach().cpu().numpy()

# Run the classifier batch by batch and collect one row of logits per text.
all_scores = []
for text_batch in text_batches:
    batch_scores = get_sentiment_scores(text_batch)
    all_scores.extend(batch_scores)

# Mapping from the numeric class index to its sentiment label.
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Convert scores to probabilities. With no input rows np.array([]) would be
# 1-D and the axis=1 reductions below would raise, so an explicit
# (0, n_labels) array is used in that case.
if all_scores:
    score_matrix = np.array(all_scores)
else:
    score_matrix = np.empty((0, len(label_map)))
probs = softmax(score_matrix, axis=1)

# Index of the most probable class per row, and that class's probability.
# NOTE: despite its name, 'polarity_score' is the winning-class probability,
# not a signed negative-to-positive polarity value.
predicted_labels = np.argmax(probs, axis=1)
polarity_scores = np.max(probs, axis=1)

predicted_sentiments = [label_map[pred] for pred in predicted_labels]

# Add the results to the dataframe
df['sentiment'] = predicted_sentiments
df['polarity_score'] = polarity_scores

# Save the results to a new CSV file (without the index column)
df.to_csv('/content/sample_data/twitter_data_with_sentiments.csv', index=False)

print("Sentiment analysis complete. Results saved to 'twitter_data_with_sentiments.csv'.")


In [None]:
# Reload the scored tweets. This file is produced by DataFrame.to_csv with no
# encoding argument, which pandas documents as UTF-8, so it must be decoded
# as UTF-8; reading it as latin-1 would garble every non-ASCII character.
df = pd.read_csv('/content/sample_data/twitter_data_with_sentiments.csv', encoding='utf-8')

# One indicator column per distinct value found in the 'sentiment' column.
one_hot = pd.get_dummies(df['sentiment'])

# Append the indicator columns to the right of the existing columns.
df = pd.concat([df, one_hot], axis=1)

# The categorical 'sentiment' column is dropped now that it is encoded.
df = df.drop('sentiment', axis=1)

df.to_csv('/content/sample_data/updated_twitter_data_with_sentiments.csv', index=False)

print("One-hot encoding complete. Updated dataset saved to 'updated_twitter_data_with_sentiments.csv'.")
