In [None]:
!pip install transformers datasets torch --quiet



In [None]:
# 0. Disable WandB
import os

# The flag is placed in the process environment ahead of the transformers
# import further down in this cell.
os.environ.update({"WANDB_DISABLED": "true"})

# 1. Imports
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# 2. Load dataset (IMDB reviews; the 'train' and 'test' splits are used below)
dataset = load_dataset("imdb")

# 3. Load tokenizer and model
# Both are built from the same checkpoint name so the vocabulary and the
# weights always match; the classifier is configured with two output labels.
_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    _CHECKPOINT,
    num_labels=2,
)

# 4. Tokenize data
def tokenize(batch, max_length=128):
    """Tokenize a batch of examples into fixed-length model inputs.

    Args:
        batch: Mapping with a 'text' entry holding the raw strings, as
            supplied by ``Dataset.map(..., batched=True)``.
        max_length: Number of tokens every example is truncated or padded to.

    Returns:
        The tokenizer's encoding for the batch, with every sequence exactly
        ``max_length`` tokens long.
    """
    # padding='max_length' (rather than padding=True) keeps every row the same
    # length. padding=True only pads to the longest row of the current map
    # batch, so rows from different map batches could end up with different
    # lengths and fail to stack into one tensor at training time.
    return tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

# Tokenize the train split first, then the test split.
train_dataset, test_dataset = (
    dataset[split_name].map(tokenize, batched=True)
    for split_name in ('train', 'test')
)

# Expose only the model inputs and the label, as torch tensors.
_TENSOR_COLUMNS = ('input_ids', 'attention_mask', 'label')
for _split_data in (train_dataset, test_dataset):
    _split_data.set_format('torch', columns=list(_TENSOR_COLUMNS))

# 5. Define training arguments
# The settings are collected in one mapping and then expanded into
# TrainingArguments, so they can be inspected or tweaked in one place.
_training_kwargs = {
    'output_dir': './results',
    'num_train_epochs': 1,  # for demo purposes
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'logging_dir': './logs',
    'logging_steps': 50,
    'report_to': [],  # disables WandB and other logging integrations
}
training_args = TrainingArguments(**_training_kwargs)

# 6. Trainer
def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Args:
        eval_pred: The evaluation output handed over by ``Trainer``; its
            ``predictions`` attribute holds the logits and ``label_ids`` the
            reference labels.

    Returns:
        A dict with a single ``"accuracy"`` entry in the range [0, 1].
    """
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


# Without compute_metrics, trainer.evaluate() reports only the loss, which
# says little about how well the classifier actually performs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# 7. Train
# Runs training with the settings in `training_args` on `train_dataset`,
# both of which were handed to the Trainer when it was constructed.
trainer.train()

# 8. Evaluate
# Evaluates on the `eval_dataset` given to the Trainer (the test split) and
# prints the returned metrics.
results = trainer.evaluate()
print(results)

# 9. Simple prediction function
def predict_sentiment(text):
    """Classify a single piece of text as positive or negative.

    Args:
        text: The review text to classify (one string).

    Returns:
        "Positive" if the model predicts label 1, otherwise "Negative".
    """
    # The Trainer may have moved the model to an accelerator, so the inputs
    # are placed on whatever device the model currently lives on.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, padding=True
    ).to(model.device)

    # Run inference in eval mode (dropout off) and without autograd, then put
    # the model back into whatever mode it was in before the call.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # Softmax is monotonic, so the argmax of the logits is the predicted class.
    pred = torch.argmax(logits, dim=-1).item()
    return "Positive" if pred == 1 else "Negative"

# Test predictions: one clearly positive and one clearly negative review.
for _sample_review in (
    "This movie was fantastic!",
    "I did not like this film at all.",
):
    print(predict_sentiment(_sample_review))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]



Step,Training Loss
50,0.6254
100,0.5374
150,0.477
200,0.4565
250,0.4647
300,0.3847
350,0.4121
400,0.4331
450,0.3738
500,0.4222
