In [1]:
import pandas as pd

df = pd.read_csv("datasets/Train.csv")
df_test = pd.read_csv("datasets/Test.csv")
df_valid = pd.read_csv("datasets/Valid.csv")

In [None]:
import re, emoji, html
from bs4 import BeautifulSoup

def clean(text):
    text = BeautifulSoup(text, "html.parser").get_text(" ")  # strip HTML tags
    text = html.unescape(text)                               # e.g. &amp; → &
    text = emoji.demojize(text, delimiters=(" ", " "))       # 😀 →  smile_face 
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # remove URLs
    text = re.sub(r"@\w+", "", text)                          # remove mentions
    text = re.sub(r"#\w+", "", text)                          # remove hashtags
    text = re.sub(r"[^\w\s.,!?;:]", "", text)       # remove special characters
    text = text.lower()                                       # convert to lowercase    
    text = re.sub(r"\s+", " ", text).strip()                 # collapse spaces
    return text

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class sentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text  = self.texts[idx]
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [5]:
train_texts_raw = df["text"].astype(str).tolist()
train_texts = [clean(t) for t in train_texts_raw]
train_labels = df["label"].tolist()

test_texts_raw = df_test["text"].astype(str).tolist()
test_texts = [clean(t) for t in test_texts_raw]
test_labels = df_test["label"].tolist()

valid_texts_raw = df_valid["text"].astype(str).tolist()
valid_texts = [clean(t) for t in valid_texts_raw]
valid_labels = df_valid["label"].tolist()

train_dataset = sentimentDataset(train_texts, train_labels, tokenizer)
test_dataset  = sentimentDataset(test_texts,  test_labels,  tokenizer)
valid_dataset = sentimentDataset(valid_texts, valid_labels, tokenizer)

# (optional) Slightly faster host→GPU copies
cuda = torch.cuda.is_available()
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True,  pin_memory=cuda)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False, pin_memory=cuda)
test_loader  = DataLoader(test_dataset,  batch_size=16, shuffle=False, pin_memory=cuda)

In [6]:
# --- TRAIN & EVAL (drop-in) -----------------------------------------------
import os, random, numpy as np, torch
from torch.optim import AdamW
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from transformers import get_linear_schedule_with_warmup

# Reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer & Scheduler
epochs = 3                    # bump to 4–5 if you have time
lr = 2e-5
optimizer = AdamW(model.parameters(), lr=lr)
num_training_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps
)

# Mixed precision
device_type = "cuda" if torch.cuda.is_available() else "cpu"
amp_enabled = (device_type == "cuda")
scaler = torch.amp.GradScaler(enabled=amp_enabled)

# ----------------- Evaluation helper -----------------
def evaluate(dataloader):
    model.eval()
    total_loss = 0.0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval", leave=False):
            # to device
            batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
            # forward
            with torch.amp.autocast(device_type=device_type, enabled=amp_enabled):
                outputs = model(**batch)  # expects input_ids, attention_mask, labels
                loss = outputs.loss
            total_loss += loss.item()

            preds = outputs.logits.argmax(dim=-1).detach().cpu().numpy()
            labels = batch["labels"].detach().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)

    avg_loss = total_loss / max(1, len(dataloader))
    acc = accuracy_score(all_labels, all_preds)
    p, r, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="macro", zero_division=0)
    return {"loss": avg_loss, "accuracy": acc, "precision": p, "recall": r, "f1": f1}, np.array(all_labels), np.array(all_preds)

# ----------------- Training loop -----------------
best_f1 = 0.0
os.makedirs("checkpoints", exist_ok=True)

for epoch in range(1, epochs + 1):
    model.train()
    running_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}"):
        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}

        optimizer.zero_grad(set_to_none=True)

        with torch.amp.autocast(device_type=device_type, enabled=amp_enabled):
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        running_loss += loss.item()

    train_loss = running_loss / max(1, len(train_loader))
    val_metrics, y_true_val, y_pred_val = evaluate(valid_loader)
    print(f"Epoch {epoch}: train_loss={train_loss:.4f} | "
          f"val_loss={val_metrics['loss']:.4f} | "
          f"val_acc={val_metrics['accuracy']:.4f} | "
          f"val_f1={val_metrics['f1']:.4f}")

    # Save best by macro-F1
    if val_metrics["f1"] > best_f1:
        best_f1 = val_metrics["f1"]
        torch.save(model.state_dict(), "checkpoints/bert_sentiment_best.pt")
        print("✓ Saved new best model")

# ----------------- Final test -----------------
# load best
model.load_state_dict(torch.load("checkpoints/bert_sentiment_best.pt", map_location=device))
test_metrics, y_true, y_pred = evaluate(test_loader)
print("\n=== TEST METRICS ===")
print(test_metrics)

print("\n=== CLASSIFICATION REPORT ===")
print(classification_report(y_true, y_pred, digits=4))

print("\n=== CONFUSION MATRIX ===")
print(confusion_matrix(y_true, y_pred))


Epoch 1/3: 100%|██████████| 2500/2500 [29:34<00:00,  1.41it/s]
                                                       

Epoch 1: train_loss=0.2447 | val_loss=0.1716 | val_acc=0.9358 | val_f1=0.9357
✓ Saved new best model


Epoch 2/3: 100%|██████████| 2500/2500 [34:28<00:00,  1.21it/s]
                                                       

Epoch 2: train_loss=0.1088 | val_loss=0.1686 | val_acc=0.9424 | val_f1=0.9424
✓ Saved new best model


Epoch 3/3: 100%|██████████| 2500/2500 [33:55<00:00,  1.23it/s]
                                                       

Epoch 3: train_loss=0.0413 | val_loss=0.1985 | val_acc=0.9448 | val_f1=0.9448
✓ Saved new best model


                                                       


=== TEST METRICS ===
{'loss': 0.1837186083054771, 'accuracy': 0.9474, 'precision': 0.947521024112429, 'recall': 0.9473841895367582, 'f1': 0.9473949478107877}

=== CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

           0     0.9544    0.9395    0.9469      2495
           1     0.9406    0.9553    0.9479      2505

    accuracy                         0.9474      5000
   macro avg     0.9475    0.9474    0.9474      5000
weighted avg     0.9475    0.9474    0.9474      5000


=== CONFUSION MATRIX ===
[[2344  151]
 [ 112 2393]]


In [7]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import re, emoji, html
from bs4 import BeautifulSoup

# -----------------------------
# 1) Config: labels & checkpoint
# -----------------------------
# Edit this to your classes. Order must match your training label IDs (0..K-1)
# Example for 5 classes:
label_names = ["negative", "positive"]
# Example for 3 classes (uncomment instead):
# label_names = ["negative", "neutral", "positive"]

NUM_LABELS = len(label_names)
checkpoint_path = "checkpoints/bert_sentiment_best.pt"   # change if different
pretrained_name = "bert-base-uncased"                    # must match training

# -----------------------------
# 2) Same cleaner you used before
# -----------------------------
def clean(text: str) -> str:
    text = BeautifulSoup(text, "html.parser").get_text(" ")  # strip HTML tags
    text = html.unescape(text)                               # e.g. &amp; → &
    text = emoji.demojize(text, delimiters=(" ", " "))       # 😀 →  smile_face 
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # URLs
    text = re.sub(r"@\w+", "", text)                         # mentions
    text = re.sub(r"#\w+", "", text)                         # hashtags
    text = re.sub(r"[^\w\s.,!?;:]", "", text)                # special chars
    text = text.lower()                                      # lowercase
    text = re.sub(r"\s+", " ", text).strip()                 # collapse spaces
    return text

# -----------------------------
# 3) Create some example reviews
# -----------------------------
example_reviews = [
    "This was painfully boring. I almost fell asleep halfway through.",
    "Terrible plot but the cinematography is decent.",
    "It's okay overall—nothing special, nothing awful.",
    "I really enjoyed the characters, solid pacing and great music!",
    "Masterpiece. Best film I've seen in years.",
    "Mediocre at best; some scenes work but most don't.",
    "Awful acting, clumsy script. Would not recommend.",
    "Charming and heartfelt with a satisfying ending."
]

# -----------------------------
# 4) Load tokenizer & model
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

model = BertForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=NUM_LABELS
)
# Load your fine-tuned weights
state = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(state)
model.to(device)
model.eval()

# -----------------------------
# 5) Prediction helpers
# -----------------------------
def predict_reviews(reviews, max_length=512, batch_size=16):
    # clean
    texts = [clean(str(t)) for t in reviews]

    all_preds, all_probs = [], []
    # simple batching for large lists
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        enc = {k: v.to(device) for k, v in enc.items()}

        with torch.no_grad():
            logits = model(**enc).logits
            probs = F.softmax(logits, dim=-1)
            preds = probs.argmax(dim=-1)

        all_preds.extend(preds.detach().cpu().tolist())
        all_probs.extend(probs.detach().cpu().tolist())

    # build a results dataframe
    top_prob = [max(p) for p in all_probs]
    pred_label = [label_names[i] for i in all_preds]
    df = pd.DataFrame({
        "review": reviews,
        "review_clean": texts,
        "pred_label": pred_label,
        "pred_id": all_preds,
        "pred_confidence": top_prob
    })
    return df, all_probs

def predict_one(review_text):
    df, _ = predict_reviews([review_text])
    row = df.iloc[0]
    return row["pred_label"], row["pred_confidence"], row

# -----------------------------
# 6) Run on the sample reviews
# -----------------------------
results_df, probs = predict_reviews(example_reviews)
print(results_df[["review", "pred_label", "pred_confidence"]])

# (Optional) show full probability distribution per review
for i, r in enumerate(example_reviews):
    print(f"\nReview: {r}")
    for k, p in zip(label_names, probs[i]):
        print(f"  {k:>15}: {p:.4f}")

# -----------------------------
# 7) How to use with your own text(s)
# -----------------------------
# Example:
# label, conf, row = predict_one("The movie was outstanding, I loved every minute!")
# print(label, conf)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                              review pred_label  \
0  This was painfully boring. I almost fell aslee...   negative   
1    Terrible plot but the cinematography is decent.   negative   
2  It's okay overall—nothing special, nothing awful.   positive   
3  I really enjoyed the characters, solid pacing ...   positive   
4         Masterpiece. Best film I've seen in years.   positive   
5  Mediocre at best; some scenes work but most do...   negative   
6  Awful acting, clumsy script. Would not recommend.   negative   
7   Charming and heartfelt with a satisfying ending.   positive   

   pred_confidence  
0         0.997285  
1         0.989064  
2         0.844082  
3         0.999103  
4         0.989832  
5         0.995147  
6         0.998520  
7         0.998122  

Review: This was painfully boring. I almost fell asleep halfway through.
         negative: 0.9973
         positive: 0.0027

Review: Terrible plot but the cinematography is decent.
         negative: 0.989

In [8]:
trick_reviews = [
    r"I’m thrilled to report that this film redefined cinema for me—specifically, it redefined the precise moment I should have left the theater. The opening shot is breathtaking if your breath is taken by beige hallways and unlit rooms. Performances are “committed,” in the sense that the actors should be committed for agreeing to this script. The director clearly loves long pauses; unfortunately, so did my streaming app when I tried to fast-forward. The plot twist arrives like a surprise party thrown for someone who moved away last year: nobody asked for it and the cake is stale. Still, the soundtrack works—if your playlist is “Elevator Drones Vol. 3.” Credit where due: the credits rolled eventually. I can’t wait to recommend this to people I don’t like, so we have something new not to talk about.",
    r"On paper, this should have been my favorite release of the year. The lighting is meticulous, the production design has that lived-in feel, and a few lines sparkle with wit. Yet scene after scene drifts by like a well-polished boat going nowhere. I admired the craft while checking the time; I enjoyed the score while wondering why the characters sounded like they were reading stage directions. The lead’s vulnerability is disarming, but it mostly disarms tension. Even the climax feels carefully underlined in pencil—technically clear, emotionally faint. I left thinking, “That was… competent?” I wouldn’t call it bad; I’d call it an elegantly framed shrug, the kind of movie you recommend with a soft maybe and a strong caveat. If you need something pretty for a lazy Sunday, it’s fine. If you want to feel anything sharper than polite approval, you may want to look elsewhere.",
    r"I didn’t exactly dislike it, which isn’t to say I liked it—more that I can’t claim I wasn’t not entertained. The pacing isn’t entirely without momentum, and the jokes aren’t completely devoid of timing, though they rarely land in a way that would be impossible to ignore. The performances don’t lack effort, but the effort doesn’t quite not show, if you follow. It’s not the kind of movie I wouldn’t avoid recommending to someone who’s not uninterested in background noise. By the time the not-unexpected ending arrived, I was neither unmoved nor moved, just not unready for the credits. In fairness, the cinematography isn’t unappealing, and the score isn’t unlistenable; both do a job that isn’t unneeded. I suppose calling it “not terrible” wouldn’t be inaccurate, as long as “not good” doesn’t feel unkind. If ambiguity was the goal, mission… not unaccomplished.",
    r"Wow, what an experience 🤯—I learned exactly how long ninety minutes can feel when time stops. The trailer promised thrills 😍, but the movie delivered responsible seatbelt usage and a deep respect for indoor voices. The romance is “simmering” if by simmering you mean two strangers who occasionally nod. I loved the soundtrack in the sense that silence is technically a sound 👂. Credit to the editor for keeping every scene long enough to wonder if my remote died. There’s a twist ending 🎁, which is adorable—like when a toddler hides behind a curtain and says, “Guess where I am!” Still, I’m grateful: the closing credits gave me the year’s most honest character arc—me, leaving. If you crave excitement, try watching paint dry, then watch this to cool down. Five stars for existing ⭐ (calm down, algorithm, that’s sarcasm), zero for joy.",
    r"If mediocrity were an art, this would hang in a tasteful gallery next to a plaque that reads, “Adequate, 2025.” Compared to the director’s earlier disaster, it’s practically a comeback; compared to anything genuinely good, it’s a carefully ironed bedsheet—smooth, flat, and forgettable. The performances are fine in that way airplanes are fine: you arrive, you don’t applaud. I admired how the script avoids clichés by replacing them with footnotes, each scene explaining why it exists instead of earning the right to. Yet I can’t deny a strange comfort: it’s competent, even considerate, never insulting my intelligence so much as politely ignoring it. Tomorrow I’ll remember a line or two; next week I’ll struggle to recall the title. Would I watch it again? Only if I’ve already seen everything else and need something to not mind missing."
    ]
results_df, probs = predict_reviews(trick_reviews)
print(results_df[["review", "pred_label", "pred_confidence"]])

                                              review pred_label  \
0  I’m thrilled to report that this film redefine...   positive   
1  On paper, this should have been my favorite re...   negative   
2  I didn’t exactly dislike it, which isn’t to sa...   negative   
3  Wow, what an experience 🤯—I learned exactly ho...   positive   
4  If mediocrity were an art, this would hang in ...   negative   

   pred_confidence  
0         0.988354  
1         0.997311  
2         0.670189  
3         0.896984  
4         0.995633  
