In [11]:
# Import libraries
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments, pipeline
from datasets import load_dataset
import torch
import evaluate
import numpy as np


# Load the IMDB dataset through `datasets.load_dataset`
IMDB_DATASET_NAME = "imdb"
dataset = load_dataset(IMDB_DATASET_NAME)

# Load the tokenizer stored with the `bert-base-uncased` checkpoint
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The module-level ``tokenizer`` is called with ``padding="max_length"``,
    ``truncation=True`` and ``max_length=128``, i.e. it is asked to pad or
    truncate every text to 128 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry (used with
            ``Dataset.map(..., batched=True)`` in this script).

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    texts = examples["text"]
    return tokenizer(texts, **encoding_options)

# Tokenize in batches and rename "label" -> "labels" in one chain.
# NOTE(review): `dataset.map` runs the tokenizer over every split of `dataset`
# in full, even though only small subsets are selected below.
tokenized_datasets = dataset.map(tokenize_function, batched=True).rename_column(
    "label", "labels"
)

# Prepare data for PyTorch: expose only these columns, as torch tensors
tokenized_datasets.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Shuffle each split with a fixed seed, then keep the first N rows
# (500 train / 100 test; a smaller 200 / 50 variant was tried earlier).
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
train_dataset = shuffled_train.select(range(500))

shuffled_test = tokenized_datasets["test"].shuffle(seed=42)
test_dataset = shuffled_test.select(range(100))


# Load the pre-trained BERT model with a two-class classification head.
# The head's weights are not part of the checkpoint and are newly initialized
# (see the warning emitted on load), so the model must be fine-tuned below.
NUM_LABELS = 2
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_LABELS,
)

# Metrics: loaded once here and reused by `compute_metrics`
accuracy_metric, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation prediction pair.

    Args:
        eval_pred: A two-item iterable ``(logits, labels)``.

    Returns:
        A dict with keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(logits, axis=-1)

    shared_inputs = {"predictions": predicted_classes, "references": labels}
    accuracy_score = accuracy_metric.compute(**shared_inputs)["accuracy"]
    f1_score = f1_metric.compute(**shared_inputs, average="weighted")["f1"]

    return {"accuracy": accuracy_score, "f1": f1_score}


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate at the end of every epoch.
    eval_strategy="epoch",
    # Checkpoint once per epoch, in step with evaluation. The previous
    # `save_steps=10` (with the default step-based save strategy) wrote a full
    # model + optimizer checkpoint every 10 optimizer steps; for a run this
    # short (500 examples, batch size 8, 3 epochs) that is well over a dozen
    # BERT checkpoints on disk and a noticeable slowdown from the extra I/O.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    # Log the training loss every 10 steps.
    logging_steps=10,
)

# Define a Trainer instance.
# The test subset doubles as the evaluation set, and `compute_metrics`
# supplies accuracy and F1 on top of the loss.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Fine-tune the model
trainer.train()

# Run a final evaluation pass and print the resulting metrics
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")




# PART 4
# Classify a handful of hand-written reviews with the fine-tuned model.
sentiment_pipeline = pipeline(
    task="text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
)

data = [
    "I love this product! It's so good!",
    "This is the worst thing I've bought, definelty don't recommend to others.",
    "Its decent, not the best but not the worst either.",
    "Total waste of money, broke after one day of light use",
    "I mean you get what you pay for, its fine.",
    "Ohh I quite like this product, it's really good quality."
]

# Map the pipeline's label strings to readable names; any other label
# is printed unchanged.
label_to_sentiment = {"LABEL_0": "Negative", "LABEL_1": "Positive"}
separator = "-" * 50

predictions = sentiment_pipeline(data)
for review, result in zip(data, predictions):
    raw_label = result["label"]
    sentiment = label_to_sentiment.get(raw_label, raw_label)
    score = result["score"]

    print(f"Review: {review}")
    print(f"Sentiment: {sentiment} (Confidence: {score:.2f})")
    print(separator)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6056,0.587164,0.75,0.741944
2,0.3217,0.370307,0.82,0.819279
3,0.239,0.371421,0.83,0.829541




Device set to use mps:0


Evaluation results: {'eval_loss': 0.3714207112789154, 'eval_accuracy': 0.83, 'eval_f1': 0.8295405865278751, 'eval_runtime': 0.8204, 'eval_samples_per_second': 121.89, 'eval_steps_per_second': 15.846, 'epoch': 3.0}
Review: I love this product! It's so good!
Sentiment: Positive (Confidence: 0.94)
--------------------------------------------------
Review: This is the worst thing I've bought, definelty don't recommend to others.
Sentiment: Negative (Confidence: 0.90)
--------------------------------------------------
Review: Its decent, not the best but not the worst either.
Sentiment: Negative (Confidence: 0.69)
--------------------------------------------------
Review: Total waste of money, broke after one day of light use
Sentiment: Negative (Confidence: 0.81)
--------------------------------------------------
Review: I mean you get what you pay for, its fine.
Sentiment: Positive (Confidence: 0.56)
--------------------------------------------------
Review: Ohh I quite like this product,