Create a sentiment analyzer based on Anthropolgie reviews
Expecting the following scale

1 - Mostly Negative

2 - Partially Negative

3 - Neutral 

4 - Partially Positive

5 - Mostly Positive

In [1]:
import pandas as pd
import sys
import os

In [2]:
from datasets import load_dataset

seed = r"data\tune_set.csv"
full = r"data\train_val_scored_annotated.csv"

dataset = load_dataset("csv", data_files={
    "seed": seed,  # 200 labeled samples
    "full": full,   # 800 rows to split
})


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import DatasetDict, concatenate_datasets

# Shuffle the 200-row “seed” set
shuffled_seed = dataset["seed"].shuffle(seed=42)

# Split the 800-row “full” set 80/20
split_full = dataset["full"].train_test_split(test_size=0.2, seed=42)

# Concatenate into final train/validation
combined_dataset = DatasetDict({
    "train": concatenate_datasets([
        shuffled_seed.select(range(160)),
        split_full["train"]
    ]),
    "validation": concatenate_datasets([
        shuffled_seed.select(range(160, 200)),
        split_full["test"]
    ])
})

print(combined_dataset)

DatasetDict({
    train: Dataset({
        features: ['RETURN_COMMENT', 'sentiment_score'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['RETURN_COMMENT', 'sentiment_score'],
        num_rows: 200
    })
})


In [4]:
from transformers import AutoTokenizer

# Subtract 1 from sentiment_score to shift from 1-5 to 0-4
def shift_labels(example):
    example["sentiment_score"] = example["sentiment_score"] - 1
    return example

combined_dataset = combined_dataset.map(shift_labels)

# 1. Rename your label column
combined_dataset = combined_dataset.rename_column("sentiment_score", "labels")

# 2. Tokenizer (as before)
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_fn(examples):
    return tokenizer(
        examples["RETURN_COMMENT"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# 3. Apply it, dropping only the RETURN_COMMENT text column
tokenized_datasets = combined_dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["RETURN_COMMENT"],
)

# 4. Set format for PyTorch
tokenized_datasets.set_format("torch")

print(tokenized_datasets)


Map: 100%|██████████| 800/800 [00:00<00:00, 21276.57 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 13323.71 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 20902.54 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 12897.02 examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
})





In [5]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import transformers, accelerate, torch
print(transformers.__version__)
print(torch.__version__)
import accelerate
print(accelerate.__version__)


def run_trainings(epochs, lr, i):
    print(f"Training for {epochs} epochs with learning rate {lr}")
    # 1. Load a fresh DistilBERT for sequence classification with 5 labels
    model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=5,
    )
    # 2. Define training arguments
    
    training_args = TrainingArguments(
        output_dir=f"./fine_tuned_sentiment_i_{i}",    # where to save checkpoints & final model
        eval_strategy="epoch",            # run evaluation at end of each epoch
        save_strategy="epoch",                  # save checkpoint after each epoch
        learning_rate=lr,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,            # keep the best checkpoint
        metric_for_best_model="accuracy",
    )

    # 3. Define a metrics function
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        acc = accuracy_score(labels, preds)
        # For multiclass F1 (macro)
        f1 = f1_score(labels, preds, average="macro")
        return {"accuracy": acc, "f1": f1}

    # 4. Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # 5. Launch training
    trainer.train()
    i+=1

epochs = [3, 4, 5]
lr = 2e-5
i = 0
for epoch in epochs:
    run_trainings(epoch, lr, i)
    i += 1

4.53.0
2.7.1+cpu
1.8.1
Training for 3 epochs with learning rate 2e-05


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.716604,0.81,0.179006
2,No log,0.640365,0.81,0.179006
3,No log,0.606375,0.81,0.179006


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training for 4 epochs with learning rate 2e-05


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 