In [1]:
# In Google Colab or local Jupyter Notebook
!pip install transformers[torch] datasets torch -U
!pip install accelerate -U
!pip install tqdm
!pip install torch
!pip install datasets



In [2]:
import os
import logging

# Output locations used by the rest of the notebook
results_dir = './results'
logs_dir = './logs'

# Create both folders up front; folders that already exist are left alone
for directory in (results_dir, logs_dir):
    os.makedirs(directory, exist_ok=True)

# INFO-level logging, plus a message confirming the folders are ready
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info(f"Directories '{results_dir}' and '{logs_dir}' are created.")


INFO:__main__:Directories './results' and './logs' are created.


In [3]:
from datasets import load_dataset
from transformers import BertTokenizer

# Fetch the IMDb dataset
dataset = load_dataset('imdb')

# Shuffle each split with a fixed seed (42), then keep only the first
# 5000 training rows and the first 1000 test rows
shuffled_train = dataset['train'].shuffle(seed=42)
train_data = shuffled_train.select(range(5000))
shuffled_test = dataset['test'].shuffle(seed=42)
test_data = shuffled_test.select(range(1000))

# Tokenizer loaded from the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(examples):
    """Tokenize the 'text' field of ``examples``.

    Passes ``examples['text']`` to the notebook-level ``tokenizer`` with
    truncation enabled and ``padding='max_length'``, and returns the
    tokenizer's output unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

# Columns exposed as torch tensors once the splits are formatted
TORCH_COLUMNS = ('input_ids', 'attention_mask', 'label')

# Tokenize both splits in batches
train_data = train_data.map(tokenize_data, batched=True)
test_data = test_data.map(tokenize_data, batched=True)

# Apply the same torch formatting to the training split, then the test split
for split in (train_data, test_data):
    split.set_format(type='torch', columns=list(TORCH_COLUMNS))


INFO:datasets:PyTorch version 2.3.1 available.


In [4]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# BERT sequence classifier loaded from 'bert-base-uncased' with a 2-label head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training configuration, collected in one mapping before building TrainingArguments
training_config = {
    'output_dir': results_dir,            # where checkpoints are written
    'num_train_epochs': 3,                # number of passes over the training set
    'per_device_train_batch_size': 8,     # training batch size per device
    'per_device_eval_batch_size': 16,     # evaluation batch size per device
    'eval_strategy': 'epoch',             # evaluate at the end of every epoch
    'logging_dir': logs_dir,              # where logs are written
    'logging_steps': 10,                  # log every 10 steps
    'report_to': "none",                  # do not report to external services
}
training_args = TrainingArguments(**training_config)

# Trainer wiring the model, the configuration and both dataset splits together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:

# Define training arguments for a shorter, single-epoch run.
# NOTE: this cell replaces the `training_args` / `trainer` objects created in the
# previous cell, so every setting that should survive must be repeated here.
training_args = TrainingArguments(
    output_dir=results_dir,           # Reuse the notebook-level results directory
    num_train_epochs=1,               # Number of training epochs
    per_device_train_batch_size=8,    # Batch size for training
    per_device_eval_batch_size=16,    # Batch size for evaluation
    eval_strategy='epoch',            # Evaluate at the end of each epoch
    logging_dir=logs_dir,             # Reuse the notebook-level logs directory
    logging_steps=10,                 # Log every 10 steps
    report_to="none",                 # Keep external reporting disabled, as in the previous cell
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                      # The pre-trained BERT model
    args=training_args,               # Training arguments
    train_dataset=train_data,         # Training dataset
    eval_dataset=test_data,           # Evaluation dataset
)



In [None]:
# Fine-tune the model, bracketing the run with log messages
logger.info("Starting training...")
train_output = trainer.train()  # keep the returned value available for later inspection
logger.info("Training complete.")


INFO:__main__:Starting training...


In [None]:
# Define the function to compute metrics
def compute_metrics(p):
    """Compute accuracy and support-weighted F1 for a set of predictions.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is a 2-D array of
            per-class scores (the predicted class is the argmax along axis 1);
            ``labels`` is the matching 1-D array of true class ids.

    Returns:
        dict: ``{'accuracy': float, 'f1': float}`` where ``f1`` is the per-class
        F1 averaged with each class weighted by its number of true instances.
        Both values are ``0.0`` when there are no labels.
    """
    # Local import: the notebook never imports a metrics library, so the scores
    # are computed directly with numpy instead of undefined helper functions.
    import numpy as np

    pred, labels = p
    pred = np.asarray(pred).argmax(axis=1)
    labels = np.asarray(labels)

    # Nothing to score: avoid dividing by zero below
    if labels.size == 0:
        return {'accuracy': 0.0, 'f1': 0.0}

    acc = float((pred == labels).mean())

    # Only classes that occur in `labels` contribute: a class with no true
    # instances has zero weight in a support-weighted average.
    weighted_sum = 0.0
    for cls in np.unique(labels):
        tp = int(np.sum((pred == cls) & (labels == cls)))
        fp = int(np.sum((pred == cls) & (labels != cls)))
        fn = int(np.sum((pred != cls) & (labels == cls)))
        support = tp + fn  # > 0 because cls occurs in labels
        weighted_sum += support * (2 * tp / (2 * tp + fp + fn))
    f1 = float(weighted_sum / labels.size)

    return {'accuracy': acc, 'f1': f1}

# Evaluate the model.
# `Trainer.evaluate()` does not accept a `compute_metrics` argument; the metric
# function is read from the trainer itself, so attach it before evaluating.
trainer.compute_metrics = compute_metrics
trainer.evaluate(eval_dataset=test_data, metric_key_prefix="eval")


In [None]:
# Implement core functionality for prediction
def predict_sentiment(text):
    """Return the predicted class index for ``text``.

    Tokenizes ``text`` with the notebook-level ``tokenizer``, runs the
    notebook-level ``model`` on it and returns the index of the highest
    scoring class as a Python int.

    Args:
        text: The input text to classify.

    Returns:
        int: Index of the class with the highest probability.
    """
    # Local import: torch is used below but is not imported in the visible cells
    import torch

    # The model's weights may not live on the CPU (e.g. after training on an
    # accelerator); the inputs must be on the same device as the weights.
    device = next(model.parameters()).device

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    model.eval()  # disable dropout so the prediction is deterministic
    with torch.no_grad():  # inference only: no gradients needed
        outputs = model(**inputs)

    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return predictions.argmax().item()

# Example usage: class index 1 is reported as "Positive", anything else as "Negative"
text = "The movie was fantastic!"
sentiment_label = "Positive" if predict_sentiment(text) == 1 else "Negative"
print(f'Sentiment: {sentiment_label}')
