In [1]:
import numpy as np 
import os
for dirname, _, filenames in os.walk('C:\\Users\\91787\\Programming\\Sentiment-Analysis_Roberta'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
# Step 0: Read in Data and NLTK Basics
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from scipy.special import softmax

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Read in data
df = pd.read_csv("C:\\Users\\91787\\Programming\\Projects\\Sentiment-Analysis_Roberta\\data\\Reviews.csv")
df = df.head(500)

# Step 1: VADER Sentiment Scoring
sia = SentimentIntensityAnalyzer()

# Run the polarity score on the entire dataset
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row['Text']
    myid = row['Id']
    res[myid] = sia.polarity_scores(text)

vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index': 'Id'})
vaders = vaders.merge(df, how='left')

# Step 3: Roberta Pretrained Model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.to(device)

def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    encoded_text = encoded_text.to(device)
    
    output = model(**encoded_text)
    scores = output.logits.detach().cpu().numpy()
    scores = softmax(scores, axis=1)  # Ensure softmax is applied along the correct axis
    
    scores_dict = {
        'roberta_neg' : scores[0, 0],  # Adjust indexing
        'roberta_neu' : scores[0, 1],  # Adjust indexing
        'roberta_pos' : scores[0, 2],  # Adjust indexing
    }
    return scores_dict

res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text = row['Text']
        myid = row['Id']
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
            vader_result_rename[f"vader_{key}"] = value
        roberta_result = polarity_scores_roberta(text)
        both = {**vader_result_rename, **roberta_result}
        res[myid] = both
    except RuntimeError:
        print(f'Broke for id {myid}')

results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index': 'Id'})
results_df = results_df.merge(df, how='left')


Using device: cpu


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Broke for id 83
Broke for id 187


In [3]:
def sentiment_analysis(text):
    # Test VADER Sentiment Analysis
    sample_vader_result = sia.polarity_scores(text)

    # Test Roberta Pretrained Model
    sample_roberta_result = polarity_scores_roberta(text)

    # Determine overall sentiment
    if sample_vader_result['compound'] >= 0 and sample_roberta_result['roberta_pos'] > sample_roberta_result['roberta_neg']:
        overall_sentiment = "Positive"
        confidence_percent = sample_vader_result['pos'] * 100
    else:
        overall_sentiment = "Negative"
        confidence_percent = sample_vader_result['neg'] * 100

    print(f"For the text: {text}")
    print(f"\nThe sentiment is {overall_sentiment} with a confidence of {confidence_percent:.2f}%.")

In [4]:
# Save the tokenizer and model
tokenizer.save_pretrained("C:\\Users\\91787\\Programming\\Jupyter\\Models")
model.save_pretrained("C:\\Users\\91787\\Programming\\Jupyter\\Models")


In [7]:
# Sample text for testing sentiment
sample_text = "i hate this college "

# Call the sentiment_analysis function
sentiment_analysis(sample_text)

For the text: i hate this college 

The sentiment is Negative with a confidence of 64.90%.
