In [None]:
import pandas as pd

# Read the training split and coerce the rating column to integers in a single
# chained expression.
df = pd.read_csv("train.csv").astype({"rating": int})

In [None]:
# Bar chart of how many reviews carry each distinct rating value.
df["rating"].value_counts().plot.bar()

In [None]:
from transformers import MobileBertTokenizerFast


# Hugging Face checkpoint id; passed to from_pretrained() for the tokenizer here and
# for the classification model in a later cell.
ckpt = "google/mobilebert-uncased"

tokenizer = MobileBertTokenizerFast.from_pretrained(ckpt)

# Tokenize every review in one call, padding/truncating to max_length=512 and
# requesting PyTorch tensors.
# NOTE(review): `inputs` is not referenced by any later cell visible here
# (TextDataset tokenizes each review again on access), and it holds the whole
# padded dataset in memory at once — confirm whether this call is still needed.
inputs = tokenizer(
    df["review"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

In [None]:
def convert_binary(rating, threshold=6):
    """Collapse a numeric rating into a binary class label.

    Args:
        rating: Numeric rating to classify.
        threshold: Smallest rating that counts as the positive class.
            Defaults to 6, the cut-off used throughout this notebook, so
            single-argument calls (e.g. ``Series.map(convert_binary)``)
            behave exactly as before.

    Returns:
        int: 1 when ``rating >= threshold``, otherwise 0.
    """
    return int(rating >= threshold)


# Overwrite the numeric ratings with the binary labels produced by convert_binary.
# NOTE(review): this replaces the column in place, so the cell is not idempotent —
# running it a second time feeds the existing 0/1 labels back through
# convert_binary and every row becomes 0 (both values are below the cut-off).
# Re-load `df` before re-running this cell.
df["rating"] = df["rating"].map(convert_binary)

In [None]:
# Bar chart of the label counts now that the ratings have been binarised.
df["rating"].value_counts().plot.bar()

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader


class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Iterable of raw strings (list, pandas Series, ...).
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Hugging Face-style tokenizer; it is called with a single
            string plus padding/truncation keyword arguments and must return
            a mapping with "input_ids" and "attention_mask" tensors.
        max_length: Every item is padded or truncated to this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialise both sequences as plain lists so that __getitem__ indexes
        # by POSITION. A pandas Series indexes integers by label, which raises
        # KeyError (or silently returns the wrong row) as soon as the frame has
        # been filtered, shuffled or otherwise lost its default RangeIndex,
        # whereas DataLoader samplers always hand out positions 0..len-1.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it as a dict of 1-D tensors.

        Returns:
            dict with "input_ids" and "attention_mask" (each flattened to one
            dimension) and "labels" (scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which is also what the rest of this notebook uses.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            # return_tensors="pt" adds a leading batch dimension of 1; drop it
            # so the DataLoader can stack items into (batch, max_length).
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [None]:
import torch
from transformers import (
    MobileBertForSequenceClassification,
    get_linear_schedule_with_warmup,
)

# Fine-tuning hyper-parameters.
EPOCHS = 20
BATCH_SIZE = 8
# One output class per distinct label value currently in the training frame.
NUM_LABELS = df["rating"].unique().size

# Training examples are tokenized on access by TextDataset and served in
# shuffled batches.
dataset = TextDataset(texts=df["review"], labels=df["rating"], tokenizer=tokenizer)
dataloader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

# Load the pretrained checkpoint with a classification head sized to
# NUM_LABELS, then move it to the GPU.
model = MobileBertForSequenceClassification.from_pretrained(ckpt, num_labels=NUM_LABELS)
model = model.cuda()

# The scheduler is stepped once per batch, so its horizon is the total number
# of batches across all epochs; no warm-up steps are used.
total_steps = EPOCHS * len(dataloader)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_training_steps=total_steps, num_warmup_steps=0
)

In [None]:
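# Optional: uncomment to freeze the MobileBERT backbone (`model.mobilebert`) so that
# its weights are not updated during training.
# for param in model.mobilebert.parameters():
#     param.requires_grad = False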

In [None]:
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

import os

# TensorBoard writer that receives the average training loss of each epoch.
writer = SummaryWriter("runs/mobilebert")

# torch.save() does not create missing parent directories. Create the checkpoint
# folder up front so the first save cannot fail after a full epoch of training
# has already been spent.
os.makedirs("./ckpts", exist_ok=True)

model.train()

for epoch in range(EPOCHS):
    total_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].cuda()
        attention_mask = batch["attention_mask"].cuda()
        labels = batch["labels"].cuda()

        # Passing `labels` makes the model compute the classification loss itself.
        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        # The learning-rate schedule advances once per batch, not once per epoch.
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    writer.add_scalar("Training epoch loss", avg_loss, epoch)

    # One checkpoint per epoch.
    # NOTE(review): the scheduler state is not part of the checkpoint, so a run
    # resumed from one of these files would restart the learning-rate schedule —
    # confirm whether resuming is a requirement.
    torch.save(
        {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": avg_loss,
        },
        f"./ckpts/model_epoch_{epoch}.pt",
    )

writer.close()

In [None]:
# Load the test split and binarise its ratings with convert_binary, the same
# mapping applied to the training frame.
test = pd.read_csv("test.csv").assign(
    rating=lambda frame: frame["rating"].map(convert_binary)
)

# Evaluation batches are served in file order (no shuffling).
test_dataset = TextDataset(texts=test["review"], labels=test["rating"], tokenizer=tokenizer)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np


def evaluate(model, dataloader):
    """Run inference over ``dataloader`` and collect predicted and true class ids.

    The model is switched to eval mode and left in it. Input tensors are moved
    to the device the model's parameters live on, so the function works on CPU
    and on any GPU rather than only on the default CUDA device.

    Args:
        model: Module whose forward pass accepts ``input_ids`` and
            ``attention_mask`` keyword arguments and returns an object with a
            2-D ``logits`` attribute (one row per example).
        dataloader: Iterable of dict batches holding "input_ids",
            "attention_mask" and "labels" tensors.

    Returns:
        tuple: ``(predictions, true_labels)`` as aligned 1-D NumPy arrays, where
        ``predictions`` is the argmax over the logits of each example. Both
        arrays are empty when ``dataloader`` yields no batches.
    """
    model.eval()

    # Follow the model's own device. Only a parameter-less model falls back to
    # the previous behaviour of preferring CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            predictions.append(outputs.logits.detach().cpu().numpy())
            # Labels are only compared on the host, so they never need to be
            # copied to the accelerator and back.
            true_labels.append(batch["labels"].detach().cpu().numpy())

    # np.concatenate raises on an empty list; report "nothing evaluated" instead.
    if not predictions:
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.int64)

    predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1)
    true_labels = np.concatenate(true_labels, axis=0)

    return predictions, true_labels

In [None]:
# Run the model over the test dataloader and score its predictions.
predictions, true_labels = evaluate(model, test_dataloader)

accuracy = accuracy_score(true_labels, predictions)
# NOTE(review): no `average=` argument is passed, so per scikit-learn's documented
# default these are presumably per-class arrays rather than single scalars —
# confirm that is what should be printed below.
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

In [None]:
from sklearn.metrics import classification_report

# Print scikit-learn's text classification report for the same test predictions.
print(classification_report(true_labels, predictions))