In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import pandas as pd
from sklearn.metrics import accuracy_score
import torch

# Load the pre-trained model and tokenizer.
# MODEL is the checkpoint identifier handed to every `from_pretrained` call below,
# so the tokenizer, config and classification model all refer to the same checkpoint.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# NOTE(review): `config` is loaded here but is not referenced again in this cell.
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Function to preprocess text
def preprocess(text):
    """Mask user mentions and links in a tweet before tokenization.

    The text is split on single spaces; every token that starts with '@' and
    is longer than one character becomes '@user', and every token that starts
    with 'http' becomes 'http'. All other tokens are kept unchanged.

    Args:
        text: The raw tweet. Any non-string value is converted with ``str()``
            first, so a missing value read as float NaN becomes the literal
            text "nan" and numeric cells no longer raise ``AttributeError``.

    Returns:
        The masked text, re-joined with single spaces.
    """
    # Previously only floats were coerced; other non-string cell values
    # (ints, bools, ...) crashed on `.split`.
    if not isinstance(text, str):
        text = str(text)

    masked_tokens = []
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        elif token.startswith('http'):
            token = 'http'
        masked_tokens.append(token)
    return " ".join(masked_tokens)

# Load your dataset (latin-1 encoded CSV; the 'text' and 'sentiment' columns are used below)
df = pd.read_csv('/content/sample_data/test.csv',  encoding='latin-1')

# Apply the preprocess function to the text column
df['preprocessed_text'] = df['text'].apply(preprocess)
texts = df['preprocessed_text'].tolist()
labels = df['sentiment'].tolist()

# Ensure the model is in evaluation mode
model.eval()

# Tokenize and score the data in fixed-size batches.
# Encoding the whole dataset as one padded tensor and pushing it through the
# model in a single call makes peak memory grow with the number of rows;
# batching keeps it bounded by `batch_size`.
batch_size = 16  # Adjust based on available memory
batch_scores = []
with torch.no_grad():
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        outputs = model(**inputs)
        batch_scores.append(outputs.logits.detach().numpy())

# Stack the per-batch logits back into one array, in the same row order as `texts`
scores = np.concatenate(batch_scores, axis=0)

# Apply softmax to get probabilities (one row per text, one column per class)
probs = softmax(scores, axis=1)

# Get the predicted labels
predicted_labels = np.argmax(probs, axis=1)

# Define the mapping from numeric labels to sentiment labels
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
predicted_sentiments = [label_map[pred] for pred in predicted_labels]

# Calculate the accuracy (exact string match between gold and predicted sentiment)
accuracy = accuracy_score(labels, predicted_sentiments)
print(f'Accuracy: {accuracy * 100:.2f}%')



Twitter Sentiment Analysis with RoBERTa

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import pandas as pd
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader, TensorDataset

# Load the pre-trained model and tokenizer.
# The same MODEL identifier is used for the tokenizer, config and model so that
# all three come from one checkpoint.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# NOTE(review): `config` is loaded here but is not referenced again in this cell.
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Function to preprocess text
def preprocess(text):
    """Replace user mentions and links with fixed placeholder tokens.

    Tokens are obtained by splitting on single spaces. A token starting with
    '@' (and longer than one character) is replaced by '@user'; a token
    starting with 'http' is replaced by 'http'. Everything else is kept.

    Args:
        text: The raw tweet. Non-string values are converted with ``str()``
            (float NaN therefore becomes "nan"); previously only floats were
            converted and other types raised ``AttributeError``.

    Returns:
        The placeholder-substituted text, joined with single spaces.
    """
    if not isinstance(text, str):
        text = str(text)

    cleaned = []
    for word in text.split(" "):
        if word.startswith('@') and len(word) > 1:
            word = '@user'
        elif word.startswith('http'):
            word = 'http'
        cleaned.append(word)
    return " ".join(cleaned)

# Load your dataset.
# The CSV is read as latin-1; its 'text' and 'sentiment' columns are used below.
df = pd.read_csv('/content/sample_data/test.csv',  encoding='latin-1')

# Apply the preprocess function to the text column. The result goes into a new
# 'preprocessed_text' column, so the raw 'text' column is left as loaded.
df['preprocessed_text'] = df['text'].apply(preprocess)
# Plain Python lists: `texts` is what gets tokenized, `labels` holds the gold sentiments
texts = df['preprocessed_text'].tolist()
labels = df['sentiment'].tolist()

# Tokenize the data in batches
# `batch_size` is the number of rows sliced into each batch further down.
batch_size = 16  # You can adjust the batch size based on available memory

def tokenize_batch(texts):
    """Encode one batch of strings with the notebook-level `tokenizer`.

    Padding and truncation are enabled and the encoding is requested as
    PyTorch tensors ("pt").
    """
    encoding_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encoding_options)

# Slice the texts and the labels into consecutive chunks of `batch_size` items
text_batches = []
for start in range(0, len(texts), batch_size):
    text_batches.append(texts[start:start + batch_size])
label_batches = []
for start in range(0, len(labels), batch_size):
    label_batches.append(labels[start:start + batch_size])

# Ensure the model is in evaluation mode
model.eval()

predicted_labels = []

# Score one batch at a time; gradients are not needed for inference
with torch.no_grad():
    for text_batch in text_batches:
        inputs = tokenize_batch(text_batch)
        outputs = model(**inputs)
        scores = outputs[0].detach().numpy()
        probs = softmax(scores, axis=1)
        # Index of the most probable class for every row of the batch
        batch_predicted_labels = probs.argmax(axis=1)
        predicted_labels.extend(batch_predicted_labels)

# Translate numeric class ids into the sentiment names used by the gold labels
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
predicted_sentiments = []
for pred in predicted_labels:
    predicted_sentiments.append(label_map[pred])

# Undo the batching of the gold labels so they line up with the predictions
flat_labels = []
for sublist in label_batches:
    flat_labels.extend(sublist)

# Fraction of rows whose predicted sentiment equals the gold sentiment
accuracy = accuracy_score(flat_labels, predicted_sentiments)
print(f'Accuracy: {accuracy * 100:.2f}%')  # "cardiffnlp/twitter-roberta-base-sentiment-latest"


Twitter Sentiment Analysis with DistilBERT

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import pandas as pd
from sklearn.metrics import accuracy_score
import torch

# Load DistilBERT model and tokenizer.
# Tokenizer, config and model are all resolved from the same MODEL identifier.
MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# NOTE(review): `config` is loaded here but is not referenced again in this cell.
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Function to preprocess text
def preprocess(text):
    """Normalize mentions and URLs in a tweet to placeholder tokens.

    The input is split on single spaces. Tokens beginning with '@' that are
    longer than one character become '@user'; tokens beginning with 'http'
    become 'http'; all other tokens pass through untouched.

    Args:
        text: The raw tweet. Anything that is not already a string is passed
            through ``str()`` first (float NaN becomes "nan"). The earlier
            float-only check let other non-string values fail on `.split`.

    Returns:
        The normalized text, joined with single spaces.
    """
    if not isinstance(text, str):
        text = str(text)

    pieces = []
    for piece in text.split(" "):
        if piece.startswith('@') and len(piece) > 1:
            piece = '@user'
        elif piece.startswith('http'):
            piece = 'http'
        pieces.append(piece)
    return " ".join(pieces)

# Load your dataset.
# The CSV is read as latin-1; its 'text' and 'sentiment' columns are used below.
df = pd.read_csv('/content/sample_data/test.csv',  encoding='latin-1')

# Apply the preprocess function to the text column. The result is written to a
# separate 'preprocessed_text' column, leaving the raw 'text' column as loaded.
df['preprocessed_text'] = df['text'].apply(preprocess)
# Plain Python lists: `texts` is what gets tokenized, `labels` holds the gold sentiments
texts = df['preprocessed_text'].tolist()
labels = df['sentiment'].tolist()

# Define batch size: the number of rows sliced into each batch further down
batch_size = 16  # Adjust based on available memory

# Tokenize the data in batches
def tokenize_batch(texts):
    """Run the notebook-level `tokenizer` over one batch of strings.

    The call enables padding and truncation and asks for PyTorch ("pt")
    tensors; whatever the tokenizer returns is handed back unchanged.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded

# Create batches of texts and labels (consecutive slices of `batch_size` rows)
text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
label_batches = [labels[i:i + batch_size] for i in range(0, len(labels), batch_size)]

# Ensure the model is in evaluation mode
model.eval()

predicted_labels = []

# Process each batch
for text_batch in text_batches:
    inputs = tokenize_batch(text_batch)
    with torch.no_grad():
        outputs = model(**inputs)
        # `.logits` is the named form of `outputs[0]`, matching the ALBERT cell
        scores = outputs.logits.detach().numpy()
        probs = softmax(scores, axis=1)
        batch_predicted_labels = np.argmax(probs, axis=1)
        predicted_labels.extend(batch_predicted_labels)

# Define the mapping from numeric labels to sentiment labels
label_map = {0: 'negative', 1: 'positive'}  # Adjust if needed, as SST-2 is binary (positive/negative)
predicted_sentiments = [label_map[pred] for pred in predicted_labels]

# Flatten the label batches to a single list
flat_labels = [item for sublist in label_batches for item in sublist]

# Calculate the accuracy over every row (headline number, unchanged)
accuracy = accuracy_score(flat_labels, predicted_sentiments)
print(f'Accuracy: {accuracy * 100:.2f}%')  # "distilbert-base-uncased-finetuned-sst-2-english"

# This model can only answer 'negative' or 'positive'. Any row whose gold label
# is something else (for example the 'neutral' class that other cells in this
# notebook predict) is always counted as an error in the figure above, so also
# report the accuracy on the rows the model is able to get right.
binary_classes = set(label_map.values())
binary_pairs = [
    (gold, pred)
    for gold, pred in zip(flat_labels, predicted_sentiments)
    if gold in binary_classes
]
if binary_pairs:  # skip when no gold label is positive/negative
    binary_gold = [gold for gold, _ in binary_pairs]
    binary_pred = [pred for _, pred in binary_pairs]
    binary_accuracy = accuracy_score(binary_gold, binary_pred)
    print(f'Accuracy on {len(binary_pairs)} positive/negative rows: {binary_accuracy * 100:.2f}%')


Twitter Sentiment Analysis with ALBERT

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import pandas as pd
from sklearn.metrics import accuracy_score
import torch

# Load ALBERT model and tokenizer.
# Both are resolved from the same MODEL identifier so they belong to one checkpoint.
MODEL = "textattack/albert-base-v2-SST-2"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Function to preprocess text
def preprocess(text):
    """Substitute placeholders for user handles and links in a tweet.

    Splits on single spaces, then rewrites each token: a token that starts
    with '@' and has more than one character becomes '@user', a token that
    starts with 'http' becomes 'http', anything else is left alone.

    Args:
        text: The raw tweet. Non-string inputs are stringified with ``str()``
            (float NaN turns into "nan"); the former float-only check raised
            ``AttributeError`` for other non-string values.

    Returns:
        The rewritten text, joined with single spaces.
    """
    if not isinstance(text, str):
        text = str(text)

    rewritten = []
    for part in text.split(" "):
        if part.startswith('@') and len(part) > 1:
            part = '@user'
        elif part.startswith('http'):
            part = 'http'
        rewritten.append(part)
    return " ".join(rewritten)

# Load your dataset.
# The CSV is read as latin-1; its 'text' and 'sentiment' columns are used below.
df = pd.read_csv('/content/sample_data/test.csv',  encoding='latin-1')

# Apply the preprocess function to the text column, storing the result in a new
# 'preprocessed_text' column next to the raw 'text' column.
df['preprocessed_text'] = df['text'].apply(preprocess)
# Plain Python lists: `texts` is what gets tokenized, `labels` holds the gold sentiments
texts = df['preprocessed_text'].tolist()
labels = df['sentiment'].tolist()

# Define batch size: the number of rows sliced into each batch further down
batch_size = 16  # Adjust based on available memory

# Tokenize the data in batches
def tokenize_batch(texts):
    """Tokenize one batch of strings via the notebook-level `tokenizer`.

    Padding and truncation are switched on, and PyTorch tensors are requested
    through ``return_tensors="pt"``.
    """
    tensor_format = "pt"
    return tokenizer(texts, padding=True, truncation=True, return_tensors=tensor_format)

# Create batches of texts and labels (consecutive slices of `batch_size` rows)
text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
label_batches = [labels[i:i + batch_size] for i in range(0, len(labels), batch_size)]

# Ensure the model is in evaluation mode
model.eval()

predicted_labels = []

# Process each batch
for text_batch in text_batches:
    inputs = tokenize_batch(text_batch)
    with torch.no_grad():
        outputs = model(**inputs)
        scores = outputs.logits.detach().numpy()
        probs = softmax(scores, axis=1)
        batch_predicted_labels = np.argmax(probs, axis=1)
        predicted_labels.extend(batch_predicted_labels)

# Define the mapping from numeric labels to sentiment labels
label_map = {0: 'negative', 1: 'positive'}  # Adjust if needed, as SST-2 is binary (positive/negative)
predicted_sentiments = [label_map[pred] for pred in predicted_labels]

# Flatten the label batches to a single list
flat_labels = [item for sublist in label_batches for item in sublist]

# Calculate the accuracy over every row (headline number, unchanged)
accuracy = accuracy_score(flat_labels, predicted_sentiments)
print(f'Accuracy: {accuracy * 100:.2f}%')  #"textattack/albert-base-v2-SST-2"

# The label map above has only 'negative' and 'positive', so a row whose gold
# label is anything else (for example the 'neutral' class that other cells in
# this notebook predict) can never be scored as correct. Report the accuracy on
# the positive/negative rows as well so the model is not judged only on a
# figure that includes rows it cannot answer.
binary_classes = set(label_map.values())
binary_pairs = [
    (gold, pred)
    for gold, pred in zip(flat_labels, predicted_sentiments)
    if gold in binary_classes
]
if binary_pairs:  # skip when no gold label is positive/negative
    binary_gold = [gold for gold, _ in binary_pairs]
    binary_pred = [pred for _, pred in binary_pairs]
    binary_accuracy = accuracy_score(binary_gold, binary_pred)
    print(f'Accuracy on {len(binary_pairs)} positive/negative rows: {binary_accuracy * 100:.2f}%')


Twitter Sentiment Analysis with VADER

In [None]:
# !pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.metrics import accuracy_score

# Load your dataset (read as latin-1; the 'text' and 'sentiment' columns are used below)
df = pd.read_csv('/content/sample_data/test.csv', encoding='latin-1')

# Ensure 'sentiment' and 'text' columns are string type
# NOTE(review): astype(str) presumably turns missing values into the literal
# string 'nan' - confirm whether such rows should be kept in the evaluation.
df['sentiment'] = df['sentiment'].astype(str)
df['text'] = df['text'].astype(str)

# Initialize VADER sentiment analyzer (a single instance, used by vader_sentiment for every row)
analyzer = SentimentIntensityAnalyzer()

# Preprocess and analyze sentiment
def vader_sentiment(text):
    """Label `text` from the 'compound' score of the notebook-level `analyzer`.

    Returns 'positive' when the compound score is at least 0.05, 'negative'
    when it is at most -0.05, and 'neutral' for anything in between.
    """
    compound = analyzer.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

# Label every text with VADER and store the result as a string column in one step
df['predicted_sentiment'] = df['text'].apply(vader_sentiment).astype(str)

# Fraction of rows whose predicted sentiment equals the gold 'sentiment' value
accuracy = accuracy_score(y_true=df['sentiment'], y_pred=df['predicted_sentiment'])
print(f'VADER Accuracy: {accuracy * 100:.2f}%')


Twitter Sentiment Analysis with TextBlob

In [None]:
from textblob import TextBlob
import pandas as pd
from sklearn.metrics import accuracy_score

# Load your dataset (read as latin-1; the 'text' and 'sentiment' columns are used below)
df = pd.read_csv('/content/sample_data/test.csv', encoding='latin-1')

# Ensure 'sentiment' and 'text' columns are string type
# NOTE(review): astype(str) presumably turns missing values into the literal
# string 'nan' - confirm whether such rows should be kept in the evaluation.
df['sentiment'] = df['sentiment'].astype(str)
df['text'] = df['text'].astype(str)

# Preprocess and analyze sentiment
def textblob_sentiment(text):
    """Label `text` by the sign of its TextBlob polarity.

    A float input is converted with ``str()`` before analysis. The result is
    'positive' for a polarity above zero, 'negative' for a polarity below
    zero, and 'neutral' otherwise.
    """
    raw_text = str(text) if isinstance(text, float) else text
    polarity = TextBlob(raw_text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

# Label every text with TextBlob, then store the labels as a string column
textblob_labels = df['text'].apply(textblob_sentiment)
df['predicted_sentiment'] = textblob_labels.astype(str)

# Fraction of rows whose predicted sentiment equals the gold 'sentiment' value
gold_sentiment = df['sentiment']
accuracy = accuracy_score(gold_sentiment, df['predicted_sentiment'])
print(f'TextBlob Accuracy: {accuracy * 100:.2f}%')


Twitter Sentiment Analysis with Flair

In [None]:
# !pip install flair
from flair.models import TextClassifier
from flair.data import Sentence
import pandas as pd
from sklearn.metrics import accuracy_score

# Load your dataset (read as latin-1; the 'text' and 'sentiment' columns are used below)
df = pd.read_csv('/content/sample_data/test.csv', encoding='latin-1')

# Ensure 'sentiment' and 'text' columns are string type
# NOTE(review): astype(str) presumably turns missing values into the literal
# string 'nan' - confirm whether such rows should be kept in the evaluation.
df['sentiment'] = df['sentiment'].astype(str)
df['text'] = df['text'].astype(str)

# Load the sentiment classifier once; flair_sentiment reuses this instance for every text
classifier = TextClassifier.load('sentiment-fast')

# Preprocess and analyze sentiment
def flair_sentiment(text, default='neutral'):
    """Classify `text` with the notebook-level Flair `classifier`.

    Args:
        text: The text to classify. Non-string values are converted with
            ``str()`` first.
        default: Label returned when the classifier attaches no label to the
            sentence. Defaults to 'neutral'.

    Returns:
        The value of the first predicted label, lower-cased, or `default` when
        there is no predicted label.
    """
    if not isinstance(text, str):
        text = str(text)
    sentence = Sentence(text)
    classifier.predict(sentence)

    predicted = sentence.labels
    # Indexing `labels[0]` unconditionally raised IndexError whenever no label
    # was attached (presumably for empty or whitespace-only text - confirm),
    # which aborted the whole `.apply` over the dataset.
    if not predicted:
        return default
    return predicted[0].value.lower()

# Classify every text with Flair, then store the labels as a string column
flair_labels = df['text'].apply(flair_sentiment)
df['predicted_sentiment'] = flair_labels.astype(str)

# Fraction of rows whose predicted sentiment equals the gold 'sentiment' value
accuracy = accuracy_score(y_true=df['sentiment'], y_pred=df['predicted_sentiment'])
print(f'Flair Accuracy: {accuracy * 100:.2f}%')
