In [None]:
!pip install -q transformers==4.44.2 datasets accelerate scikit-learn beautifulsoup4 emoji
import os
import gdown
import pandas as pd, numpy as np, re, emoji
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from datasets import Dataset
import torch
from torch import nn
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix


In [None]:
# Google Drive file id of the training CSV and the local path it is saved to.
train_file_id = '1-AlW7oNJHaqi3xk_9dWHUS52Dzl_FmFW'
train_output_path = 'train.csv'

# The CSV is several hundred MB, so only fetch it when no local copy exists;
# this keeps a full re-run of the notebook from downloading it again.
if not os.path.exists(train_output_path):
    downloaded_path = gdown.download(
        f'https://drive.google.com/uc?id={train_file_id}', train_output_path, quiet=False
    )
    # NOTE(review): some gdown releases return None on failure instead of
    # raising; stop here with a clear message rather than letting read_csv
    # fail later on a missing file.
    if downloaded_path is None:
        raise RuntimeError(
            f"Could not download Google Drive file {train_file_id!r} to {train_output_path!r}"
        )

df = pd.read_csv(train_output_path)
# Shift the 'overall' column down by one so class ids start at 0
# (presumably 1-5 star ratings -> labels 0-4; confirm against the data).
df['label'] = df['overall'] - 1

Downloading...
From (original): https://drive.google.com/uc?id=1-AlW7oNJHaqi3xk_9dWHUS52Dzl_FmFW
From (redirected): https://drive.google.com/uc?id=1-AlW7oNJHaqi3xk_9dWHUS52Dzl_FmFW&confirm=t&uuid=287051bf-99c0-40cc-ae5e-f2ee5062bae2
To: /content/train.csv
100%|██████████| 635M/635M [00:02<00:00, 223MB/s]
  df = pd.read_csv(train_output_path)


In [None]:
# Cap every class at `target_per_class` rows. Classes that have fewer rows
# than the cap are kept in full, so the result is only as balanced as the
# smallest class allows.
target_per_class = 25000

class_frames = []
for cls in sorted(df['label'].unique()):
    class_rows = df[df['label'] == cls]
    if len(class_rows) >= target_per_class:
        # Fixed seed so the same rows are drawn on every run.
        class_rows = class_rows.sample(n=target_per_class, random_state=42)
    class_frames.append(class_rows)

# Shuffle the stacked per-class frames so classes are interleaved.
balanced_df = pd.concat(class_frames).sample(frac=1, random_state=42)


def clean_text(text):
    """Normalise one raw review into lowercase plain text.

    Steps, in order: strip HTML markup, drop URLs, spell emoji out as words,
    squeeze runs of three or more identical non-digit characters down to two,
    replace every character other than letters, digits, whitespace and
    ``.,!?`` with a space, collapse whitespace, then trim and lowercase.

    Args:
        text: Raw review value. Non-string values are converted with ``str``.
            ``None`` and float NaN (what pandas yields for an empty cell) are
            treated as an empty review.

    Returns:
        The cleaned, lowercased string; ``""`` when the input is missing.
    """
    # str(float('nan')) is "nan": without this guard a missing review would be
    # turned into the literal word "nan" and treated as real review text.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""
    text = str(text)
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'http\S+|www\S+', '', text)
    # Emoji become their names surrounded by spaces; the underscores inside
    # those names are turned into spaces by the character filter below.
    text = emoji.demojize(text, delimiters=(" ", " "))
    # Squeeze elongated runs ("soooo" -> "soo", "!!!!" -> "!!"). Digits are
    # excluded so that numbers keep their value ("1000" must not become "100").
    text = re.sub(r'([^\d\n])\1{2,}', r'\1\1', text)
    text = re.sub(r'[^A-Za-z0-9\s.,!?]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()


In [None]:
# Clean every raw review once and store the result next to the original text.
raw_reviews = balanced_df['reviewText']
balanced_df['cleaned_review'] = raw_reviews.apply(clean_text)


In [None]:
# Hold out 20% of the balanced data for validation, stratified on the label so
# both splits keep the same class proportions.
review_texts = balanced_df['cleaned_review']
review_labels = balanced_df['label']

train_texts, val_texts, train_labels, val_labels = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
    stratify=review_labels,
)


In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize a batch's 'text' entries, padding/truncating each to 160 tokens."""
    return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=160)


def _build_torch_dataset(texts, labels):
    """Wrap texts/labels in a Dataset, tokenize it, and expose torch tensors."""
    dataset = Dataset.from_dict({'text': texts.tolist(), 'label': labels.tolist()})
    dataset = dataset.map(tokenize, batched=True)
    # Only the model inputs and the label are returned as tensors.
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


train_ds = _build_torch_dataset(train_texts, train_labels)
val_ds = _build_torch_dataset(val_texts, val_labels)



Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
# 'balanced' class weights computed from the training labels, kept as a float
# tensor so the weighted loss can consume them directly.
observed_classes = np.unique(train_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=observed_classes,
    y=train_labels,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

In [None]:
# Pretrained DistilBERT checkpoint with a 5-way sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=5,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    The per-class weights are read from the module-level ``class_weights_tensor``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run. Trainer may pass a wrapped model here
                (e.g. ``nn.DataParallel``), so nothing below relies on
                attributes that only the bare model has.
            inputs: Batch mapping; ``"labels"`` holds the targets and every
                other entry is forwarded to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted so the override stays callable from
                Trainer versions that pass this keyword; it is not used.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # Labels are withheld from the forward pass so the loss is computed
        # here instead of inside the model.
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        # Take the device from the logits rather than `model.device`: a
        # DataParallel-wrapped model has no `.device` attribute.
        weights = class_weights_tensor.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        # The last logits dimension is the number of classes.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
def compute_metrics(pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over axis 1 is the predicted class).

    Returns:
        Dict with keys ``'accuracy'`` and ``'f1_macro'``.
    """
    true_labels = pred.label_ids
    predicted_labels = np.argmax(pred.predictions, axis=1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(true_labels, predicted_labels)
    metrics['f1_macro'] = f1_score(true_labels, predicted_labels, average='macro')
    return metrics

In [None]:
# Stage 1: freeze the DistilBERT encoder so that only the parameters outside
# `model.distilbert` (the classification head) are updated.
model.distilbert.requires_grad_(False)

stage1_settings = dict(
    output_dir="./results_stage1",
    evaluation_strategy="epoch",
    save_strategy="no",  # no checkpoints are written during this stage
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    fp16=True,
    logging_dir="./logs_stage1",
    logging_strategy="epoch",
)
training_args_stage1 = TrainingArguments(**stage1_settings)

trainer_stage1 = CustomTrainer(
    model=model,
    args=training_args_stage1,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("--- Stage 1 ---")
trainer_stage1.train()



--- Stage 1 ---


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.3235,1.236004,0.46048,0.440159
2,1.2465,1.216619,0.46988,0.461123
3,1.2369,1.212089,0.47132,0.46312


TrainOutput(global_step=9375, training_loss=1.2689383072916667, metrics={'train_runtime': 547.8638, 'train_samples_per_second': 547.581, 'train_steps_per_second': 17.112, 'total_flos': 1.241948304e+16, 'train_loss': 1.2689383072916667, 'epoch': 3.0})

In [None]:
# Stage 2: unfreeze the DistilBERT encoder and fine-tune the whole model.
model.distilbert.requires_grad_(True)

stage2_settings = dict(
    output_dir="./results_stage2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    num_train_epochs=3,
    # 16 samples per step x 2 accumulation steps per optimizer update.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=32,
    fp16=True,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs_stage2",
    logging_strategy="epoch",
)
training_args_stage2 = TrainingArguments(**stage2_settings)

trainer_stage2 = CustomTrainer(
    model=model,
    args=training_args_stage2,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("--- Stage 2 ---")
trainer_stage2.train()



--- Stage 2 ---


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.0392,0.941771,0.59492,0.585355
2,0.8628,0.904926,0.61368,0.61369
3,0.7574,0.934443,0.61396,0.613909


TrainOutput(global_step=9375, training_loss=0.8864675520833334, metrics={'train_runtime': 1663.9755, 'train_samples_per_second': 180.291, 'train_steps_per_second': 5.634, 'total_flos': 1.241948304e+16, 'train_loss': 0.8864675520833334, 'epoch': 3.0})

In [None]:
# Final report: score the stage-2 trainer's model on the validation split.
preds = trainer_stage2.predict(val_ds)
y_true, y_pred = preds.label_ids, np.argmax(preds.predictions, axis=1)

print(f"Final Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print(f"Final F1-macro: {f1_score(y_true, y_pred, average='macro'):.4f}")
# Rows are true classes, columns are predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

Final Accuracy: 0.6137
Final F1-macro: 0.6137
Confusion Matrix:
 [[3427 1181  270   57   65]
 [1139 2440 1211  147   63]
 [ 303  983 2793  794  127]
 [  54  117  977 2894  958]
 [  52   33  144  983 3788]]
