In [None]:
!pip install transformers datasets evaluate torch

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate
import torch
import os

In [16]:
# Set the WANDB_DISABLED flag for this process before any training code runs
# (presumably to keep Weights & Biases logging switched off — confirm against the
# installed transformers version).
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Fetch the "imdb" dataset; the "train" and "test" splits are the ones consumed
# further down in this notebook.
dataset = load_dataset(path="imdb")

In [None]:
# One checkpoint name drives both the tokenizer and the classifier, so the
# vocabulary always matches the pretrained weights.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Binary sentiment task: two output labels (positive | negative).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [19]:
# Tokenization
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 256 tokens, so all
    encoded examples share the same length.

    Args:
        examples: a batch exposing a "text" entry, as passed by ``dataset.map``
            when ``batched=True``.

    Returns:
        The tokenizer's encoding for the batch.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Apply the tokenizer to every split, one batch of examples at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [20]:
# Work on fixed-seed random subsets rather than the full splits:
# 10,000 shuffled training examples and 2,000 shuffled test examples.
train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(10000))
)
test_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(range(2000))
)

In [21]:
# Evaluation metric: plain classification accuracy.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: a pair ``(logits, labels)``; the highest-scoring entry along
            the last axis of ``logits`` is taken as the predicted class.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes
        measured against ``labels``.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    return accuracy.compute(references=labels, predictions=predicted_classes)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [22]:
# Tuning settings
# Hyperparameters for the fine-tuning run: evaluate and checkpoint once per epoch,
# log every 10 steps, and do not report to any external tracker.
# NOTE(review): the paths point under /content/drive/MyDrive, presumably a mounted
# Google Drive — the mount step is not shown in this notebook, confirm it exists.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/sentiment classification/results",
    # `evaluation_strategy` is the deprecated spelling of this option; current
    # transformers releases only accept `eval_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Matches the evaluation cadence so every evaluated epoch has a checkpoint.
    save_strategy="epoch",
    logging_dir="/content/drive/MyDrive/sentiment classification/logs",
    logging_steps=10,
    push_to_hub=False,
    report_to="none",
)



In [23]:
# Trainer
# Wire the model, hyperparameters, data subsets and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated on Trainer (it triggers a FutureWarning and is
    # scheduled for removal); `processing_class=` is the replacement argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [24]:
# Model training
# Runs the fine-tuning loop configured on `trainer`. Because the call is the cell's
# last expression, its return value is displayed beneath the cell.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2203,0.228345,0.9075
2,0.0981,0.272759,0.9045
3,0.1835,0.343507,0.9105


TrainOutput(global_step=1875, training_loss=0.1888594190677007, metrics={'train_runtime': 1460.0356, 'train_samples_per_second': 20.547, 'train_steps_per_second': 1.284, 'total_flos': 3946665830400000.0, 'train_loss': 0.1888594190677007, 'epoch': 3.0})

In [25]:
# Persist the fine-tuned weights and the tokenizer files side by side in one
# directory so both can be reloaded from the same path. The tokenizer call stays
# last so its return value (the written file paths) is shown as the cell output.
model.save_pretrained(
    save_directory="/content/drive/MyDrive/sentiment classification/fine-tuned model"
)
tokenizer.save_pretrained(
    save_directory="/content/drive/MyDrive/sentiment classification/fine-tuned model"
)

('/content/drive/MyDrive/sentiment classification/fine-tuned model/tokenizer_config.json',
 '/content/drive/MyDrive/sentiment classification/fine-tuned model/special_tokens_map.json',
 '/content/drive/MyDrive/sentiment classification/fine-tuned model/vocab.txt',
 '/content/drive/MyDrive/sentiment classification/fine-tuned model/added_tokens.json',
 '/content/drive/MyDrive/sentiment classification/fine-tuned model/tokenizer.json')

# Prediction

In [26]:
# Reload the classifier and its tokenizer from the fine-tuned model directory,
# replacing the in-memory `model` and `tokenizer` objects.
model_path = "/content/drive/MyDrive/sentiment classification/fine-tuned model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [27]:
# Two hand-written reviews used to try out the model: the first is worded
# favourably, the second unfavourably.
texts = [
    "This movie was fantastic! The characters were well-developed and the story was engaging.",
    "I didn't like the movie at all. It was boring and poorly acted.",
]

In [28]:
# Encode the sample texts as PyTorch tensors, truncated/padded to 256 tokens.
inputs = tokenizer(
    texts,
    return_tensors="pt",
    max_length=256,
    padding="max_length",
    truncation=True,
)

In [29]:
# Put the model in evaluation mode and run the forward pass without gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Predicted class = index of the largest logit along the last dimension.
predictions = outputs.logits.argmax(dim=-1)

In [30]:
# Prediction results: translate each predicted class index into a readable label
# (index 0 -> "Negative", index 1 -> "Positive") and print it next to its text.
labels = ["Negative", "Positive"]
for text, pred in zip(texts, predictions):
    # A trailing empty item plus sep="\n" leaves one blank line after each entry.
    print(f"Text: {text}", f"Prediction: {labels[pred]}", "", sep="\n")

Text: This movie was fantastic! The characters were well-developed and the story was engaging.
Prediction: Positive

Text: I didn't like the movie at all. It was boring and poorly acted.
Prediction: Negative

