# In [None]:
# -*- coding: utf-8 -*-
"""Transformers_Sentiment_Analysis.ipynb

This notebook performs sentiment analysis on a dataset of comments using a pre-trained transformer model.

Automatically generated by Colab.
"""

# Import necessary libraries
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

# Hugging Face checkpoint shared by the tokenizer and the classifier
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Class names, looked up by the index of the highest-scoring model output
LABELS = ['negative', 'neutral', 'positive']

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the tokenizer and classifier, placing the classifier on the chosen device
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

def get_batch_sentiments(input_ids, attention_mask):
    """
    Classify one tokenized batch with the module-level pre-trained model.

    Args:
        input_ids: Token-id tensor for the batch, forwarded to the model as
            ``input_ids``. The caller is expected to have placed it on the
            same device as the model.
        attention_mask: Matching mask tensor, forwarded as ``attention_mask``.

    Returns:
        A ``(sentiments, softmax_scores)`` tuple. ``sentiments`` is a list
        holding one entry of ``LABELS`` per input row; ``softmax_scores`` is a
        NumPy array with one row of class probabilities per input row.
    """
    # Inference only, so skip gradient tracking
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # The first element of the model output is treated as the logits;
    # bring it back to the CPU as a NumPy array
    logits = outputs[0].cpu().numpy()

    # Numerically stable softmax: subtracting each row's maximum leaves the
    # result unchanged mathematically but keeps exp() from overflowing to
    # inf (and the division from producing NaN) on large logits.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    softmax_scores = exp_scores / exp_scores.sum(axis=1, keepdims=True)

    # Map the most probable class index of every row to its label
    sentiments = [LABELS[idx] for idx in softmax_scores.argmax(axis=1)]

    return sentiments, softmax_scores

# Load the labeled dataset
data_file_path = "/content/drive/MyDrive/labeled_k_comments.csv"
k_df_labeled = pd.read_csv(data_file_path)

# Prepare data: coerce the 'Comments' column to strings.
# NOTE(review): depending on the pandas version, missing comments may become
# the literal text "nan" here and get scored like real comments - confirm
# whether such rows should be dropped or blanked first.
comments = k_df_labeled['Comments'].astype(str).tolist()

batch_size = 16  # Adjusted for memory limits


def _tokenize_batch(texts):
    """Tokenize one batch of comments, padding only to that batch's longest entry."""
    return tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)


# Tokenize lazily, one batch at a time. Tokenizing the whole dataset in a
# single call pads every comment to the longest comment in the file (up to
# 512 tokens), which inflates both memory use and per-batch compute.
dataloader = DataLoader(comments, batch_size=batch_size, shuffle=False, collate_fn=_tokenize_batch)

# Initialize lists to store results
all_sentiments = []
all_scores = []

# Process the dataset in batches, with the model in evaluation mode
model.eval()
for batch in tqdm(dataloader, total=len(dataloader)):
    # Move the tokenized batch to the same device as the model
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Get sentiment predictions for the current batch
    sentiments, scores = get_batch_sentiments(input_ids, attention_mask)

    # Store the results; extending with the 2-D score array appends one
    # per-comment row of scores at a time
    all_sentiments.extend(sentiments)
    all_scores.extend(scores)

    # Release cached GPU memory between batches (only relevant on CUDA)
    if device.type == "cuda":
        torch.cuda.empty_cache()

# Add the sentiment results and scores to the DataFrame
k_df_labeled['Sentiment_Result'] = all_sentiments
k_df_labeled['Sentiment_Scores'] = all_scores

# Save the results to a new CSV file
output_file_path = "/content/drive/MyDrive/kendrick_comment_data_with_sentiment.csv"
k_df_labeled.to_csv(output_file_path, index=False)

print(f"Sentiment analysis complete. Results saved to: {output_file_path}")
