In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset

# Default device for this cell. The model does not exist yet at this point in
# the notebook, so it cannot be moved to a device here (the previous
# `model.to(device)` call raised NameError on a fresh kernel).
device = torch.device("cpu")

# the CSV should have columns: 'id', 'label', 'tweet'
df = pd.read_csv("/Users/neilkadian/Downloads/Default Safari Downloads Folder/sentiment_analysis.csv")

# verify the first few rows
print(df.head())

# define common negation words ("n't" stands for any contraction ending in n't)
NEGATION_WORDS = ["not", "never", "no", "none", "n't", "cannot", "neither", "nor"]

def mark_negation_scope(text, negation_words=None):
    """Prefix every token inside a negation scope with ``NEG_``.

    A scope opens at a negation word (which is itself marked) and closes at
    the next clause punctuation mark (one of ``, . ! ?``).

    Args:
        text: Whitespace-separated input string.
        negation_words: Optional iterable of negation cues. The special entry
            ``"n't"`` matches any contraction ending in ``n't``. Defaults to
            the module-level ``NEGATION_WORDS``.

    Returns:
        The text re-joined with single spaces, with in-scope tokens prefixed
        by ``NEG_``.
    """
    if negation_words is None:
        negation_words = NEGATION_WORDS

    scope_terminators = (",", ".", "!", "?")

    # Build a single alternation once per call instead of one search per
    # negation word per token.
    patterns = [
        r"\b\w*n't\b" if neg_word == "n't" else rf"\b{re.escape(neg_word)}\b"
        for neg_word in negation_words
    ]
    # An empty alternation would match everything, so keep "no cues" explicit.
    negation_re = re.compile("|".join(patterns)) if patterns else None

    negated = False
    marked_text = []

    for word in text.split():
        # check for negation word match
        if negation_re is not None and negation_re.search(word.lower()):
            negated = True
            marked_text.append("NEG_" + word)
        elif negated and word.startswith(scope_terminators):
            # A token that begins with punctuation closes the scope and is
            # left unmarked.
            negated = False
            marked_text.append(word)
            continue
        elif negated:
            marked_text.append("NEG_" + word)
        else:
            marked_text.append(word)

        # str.split() leaves punctuation attached to the preceding word
        # ("happy."), so the scope must also close on trailing punctuation;
        # otherwise the rest of the text stays negated.
        if negated and word.endswith(scope_terminators):
            negated = False

    return " ".join(marked_text)

# Add a column holding the negation-marked version of every tweet
df['marked_tweet'] = df['tweet'].apply(mark_negation_scope)

# Shuffle the dataset; a fraction of 1 keeps every row
sample_fraction = 1
df_sampled = df.sample(frac=sample_fraction, random_state=42)

# Split marked tweets and labels into equally sized train and test halves
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_sampled["marked_tweet"],
    df_sampled["label"],
    test_size=0.5,
    random_state=42,
)

for split_message in (
    f"Size of train_texts: {len(train_texts)}",
    f"Size of test_texts: {len(test_texts)}",
):
    print(split_message)

# load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# add NEG_ token to vocabulary
tokenizer.add_tokens(["NEG_"])

# load DistilBERT model for sequence classification.
# The model has to exist before its embedding matrix can be resized, and the
# resize has to happen on the very model that is trained; otherwise the id of
# the newly added token falls outside the embedding table.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# tokenize the data
def tokenize_function(texts):
    """Tokenize an iterable of strings, padded/truncated to 128 tokens."""
    return tokenizer(list(texts), padding="max_length", truncation=True, max_length=128)

train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": list(train_labels)
})

test_dataset = Dataset.from_dict({
    "input_ids": test_encodings["input_ids"],
    "attention_mask": test_encodings["attention_mask"],
    "labels": list(test_labels)
})

# Evaluation metrics reported for every evaluation pass
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy, F1, precision and recall.

    The predicted class is the arg-max over the second axis of the logits;
    precision, recall and F1 use scikit-learn's ``average="binary"`` mode.
    """
    raw_scores, gold = eval_pred
    predicted = torch.tensor(raw_scores).argmax(dim=1)

    prec, rec, f_score, _ = precision_recall_fscore_support(gold, predicted, average="binary")
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f_score,
        "precision": prec,
        "recall": rec,
    }

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best F1 when training finishes.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # optimisation settings
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # evaluation / checkpoint schedule and best-model selection
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# NOTE(review): eval_dataset is the test split, so the "best" checkpoint is
# chosen on the same data that is reported below -- confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then score the resulting model on the evaluation dataset
trainer.train()

eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


In [None]:
import torch

# Use the MPS backend when PyTorch reports it as available, otherwise the CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Place the model on the selected device
model = model.to(device)

# function to test the model on multiple inputs
def test_model_on_multiple_inputs(input_texts, model, tokenizer, device):
    predicted_sentiments = []  # List to store predictions for each input
    sentiment_map = {1: "Negative", 0: "Positive"}  # Sentiment mapping
    
    for input_text in input_texts:
        # tokenize the input
        encoding = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        
        # Move the tokenized input to the correct device
        encoding = {key: value.to(device) for key, value in encoding.items()}
        
        # pass the tokenized input through the model
        with torch.no_grad():  # Disable gradient calculation
            outputs = model(**encoding)
        
        # geet the predicted label
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).item()
        
        # map the predicted label to sentiment and add it to the list
        predicted_sentiments.append(sentiment_map[predicted_label])
    
    return predicted_sentiments

input_texts = [
    "#OnePlusOne is beast of a phone, finally bought one & now just can't get my hands off it. #enjoyingmyself #thankyou #Oneplus #inlove",
    "I don't think I've ever loved my iPhone more than with the new iOS5 update!! #apple #appleiosupdate",
    "I have to give some to #apple for the #Iphone . I dropped my #iphone in the sink today and not one problem. Thank you #Apple !",
    "@JamesDawute I like tablets but personally I have no use for it So in that sense Im not a fan but I think its quite a gd product",
    "I just can't do it. Every time I look at a PC laptop, it only makes me want a MacBook Pro even more... Apple",
    "Now my girl will never pay any attention to me now she just got an iPhone 5"
]

# The model was fine-tuned on negation-marked tweets, so the same marking has
# to be applied at inference time; feeding raw text gives the model inputs
# that differ from what it was trained on.
marked_input_texts = [mark_negation_scope(text) for text in input_texts]

predicted_sentiments = test_model_on_multiple_inputs(marked_input_texts, model, tokenizer, device)

# print results for each input (the original, unmarked text is shown)
for input_text, sentiment in zip(input_texts, predicted_sentiments):
    print(f"Input Text: {input_text}")
    print(f"Predicted Sentiment: {sentiment}\n")

In [None]:
def contains_negation(text, negation_words=None):
    """Check if a text contains any negation words.

    Works on raw text and on text whose tokens carry a ``NEG_`` prefix: the
    prefix is stripped before matching, because ``_`` is a word character and
    ``\\bnot\\b`` would otherwise never match ``neg_not``.

    Args:
        text: Input string.
        negation_words: Optional iterable of negation cues. The special entry
            ``"n't"`` matches at the end of a word. Defaults to the
            module-level ``NEGATION_WORDS``.

    Returns:
        True if at least one negation cue occurs in the text, else False.
    """
    if negation_words is None:
        negation_words = NEGATION_WORDS

    # Lower-case, then drop the scope marker so cues are matched on the bare word.
    text = re.sub(r"\bneg_", "", text.lower())

    return any(
        re.search(rf"{re.escape(neg_word)}\b" if neg_word == "n't" else rf"\b{re.escape(neg_word)}\b", text)
        for neg_word in negation_words
    )

import numpy as np
import matplotlib.pyplot as plt

# Define the categories for analysis (row order of the matrix below)
categories = [
    "Negative Label, Not Negated",
    "Negative Label, Negated",
    "Positive Label, Not Negated",
    "Positive Label, Negated",
]

# predict on the test dataset
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# process test data into input text and true labels
test_texts = test_texts.reset_index(drop=True)  # Ensure positional indexing
test_labels = test_labels.reset_index(drop=True)

# Rows follow `categories`; column 0 = predicted label 1 (negative),
# column 1 = any other predicted label (positive).
matrix = np.zeros((4, 2), dtype=int)

# fill the matrix
for text, true_label, predicted_label in zip(test_texts, test_labels, predicted_labels):
    # Fail loudly on an unexpected label instead of silently reusing the row
    # computed for the previous sample.
    if true_label not in (0, 1):
        raise ValueError(f"Unexpected label {true_label!r}; expected 0 (positive) or 1 (negative)")

    negation_flag = contains_negation(text)

    # Negative labels occupy rows 0-1, positive labels rows 2-3; the negated
    # variant is the second row of each pair.
    row = (0 if true_label == 1 else 2) + int(negation_flag)

    # determine the column index based on predicted label
    col = 0 if predicted_label == 1 else 1

    # Update the matrix
    matrix[row, col] += 1

# calculate total number of test samples
total_test_samples = len(test_texts)

# calculate percentages
percent_matrix = (matrix / total_test_samples) * 100

# rows holding negated and non-negated inputs
negated_rows = [1, 3]
non_negated_rows = [0, 2]

# correctly labeled inputs for negated and non-negated cases
correct_negated = matrix[1, 0] + matrix[3, 1]  # negated: negative->negative + positive->positive
correct_non_negated = matrix[0, 0] + matrix[2, 1]  # non-negated: negative->negative + positive->positive

# total negated and non-negated inputs
total_negated = matrix[negated_rows].sum()
total_non_negated = matrix[non_negated_rows].sum()

# calculate per-group accuracy, guarding against empty groups
percent_correct_negated = (correct_negated / total_negated) * 100 if total_negated > 0 else 0
percent_correct_non_negated = (correct_non_negated / total_non_negated) * 100 if total_non_negated > 0 else 0

# calculate total number of correct predictions
correct_predictions = correct_negated + correct_non_negated

# calculate overall accuracy
overall_accuracy = (correct_predictions / total_test_samples) * 100

# Draw the count matrix as a heat map
fig, ax = plt.subplots(figsize=(8, 5))
im = ax.imshow(matrix, cmap="Blues", aspect="auto")

# Write the count and its share of all test samples into every cell
for (i, j), count in np.ndenumerate(matrix):
    percent = percent_matrix[i, j]
    ax.text(j, i, f"{count}\n({percent:.2f}%)", ha="center", va="center", color="black")

# Tick positions and labels for both axes
ax.set_xticks([0, 1])
ax.set_xticklabels(["Predicted Negative", "Predicted Positive"])
ax.set_yticks(range(4))
ax.set_yticklabels(categories)

# Title and axis captions
ax.set_title("Performance Visualization Matrix (Counts and Percentages)")
ax.set_xlabel("Predicted Sentiment")
ax.set_ylabel("Input Categories")

# Summary lines placed just below the axes (negative y is below the figure edge)
summary_text = (
    f"Correctly Labeled Negated Inputs: {percent_correct_negated:.2f}%\n"
    f"Correctly Labeled Non-Negated Inputs: {percent_correct_non_negated:.2f}%\n"
    f"Overall Accuracy: {overall_accuracy:.2f}%"
)
fig.text(0.55, -0.08, summary_text, ha="center", fontsize=12, color="black")

plt.colorbar(im, ax=ax)
plt.tight_layout()
plt.show()


In [None]:
examples = [
    "I am not happy.",
    "I don't like this product.",
    "neg_not neg_happy with the service."
]

# Show how the tokenizer splits each example sentence
for sentence in examples:
    tokens = tokenizer.tokenize(sentence)
    # encode() with add_special_tokens=True also emits the [CLS] and [SEP] ids
    token_ids = tokenizer.encode(sentence, add_special_tokens=True)
    # decode one id at a time to see the surface form of every token
    decoded_tokens = [tokenizer.decode([token_id]) for token_id in token_ids]

    report_lines = [
        f"Original Sentence: {sentence}",
        f"Tokenized: {tokens}",
        f"Token IDs: {token_ids}",
        f"Decoded Tokens: {decoded_tokens}",
        "-" * 50,
    ]
    print("\n".join(report_lines))
