<a href="https://colab.research.google.com/github/monkrus/pytorch_example/blob/main/pytorch_bert_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook provides a complete example of using BERT for text classification, from dataset preparation through fine-tuning to making predictions. You can customize the dataset, model parameters, and other settings to fit your specific use case, or to resolve any warnings raised while running it.

In [None]:
#install libraries
!pip install torch transformers

In [None]:
#prepare dataset and dataloader
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pandas as pd

# Toy sentiment corpus: ten short movie reviews with binary labels
# (1 = positive, 0 = negative); texts and labels are aligned by position.
data = dict(
    text=[
        'I love this movie!',
        'This film was terrible.',
        'What a fantastic performance!',
        'I did not enjoy the plot.',
        'The acting was okay, but the story was boring.',
        'Great movie with an excellent cast.',
        'Worst movie I have ever seen.',
        'Absolutely loved the direction and the screenplay.',
        'Not my cup of tea.',
        'An enjoyable experience overall.',
    ],
    label=[1, 0, 1, 0, 0, 1, 0, 1, 0, 1],
)

df = pd.DataFrame(data)

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Tokenizer that matches the pre-trained checkpoint used for fine-tuning
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Iterable of raw strings, aligned by position with ``labels``.
        labels: Iterable of integer class ids, one per text.
        tokenizer: Object exposing ``encode_plus(...)`` that returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Copy into plain lists so ``__getitem__`` can index by position even
        # when the caller passes a pandas Series with a non-default index
        # (e.g. the output of a shuffled train/validation split).
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return its tensors together with its label."""
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop only the leading axis added by return_tensors='pt'; a bare
        # squeeze() would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long),
        }

# Wrap each split in a TextDataset; every sequence is padded/truncated to 64 tokens
train_dataset = TextDataset(
    texts=train_texts.tolist(),
    labels=train_labels.tolist(),
    tokenizer=tokenizer,
    max_length=64,
)
val_dataset = TextDataset(
    texts=val_texts.tolist(),
    labels=val_labels.tolist(),
    tokenizer=tokenizer,
    max_length=64,
)

# Batch the datasets: training batches are reshuffled each epoch, validation keeps its order
train_dataloader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=4, shuffle=False)


In [None]:
#Fine-tune BERT
# Seed torch's global RNG before the model is built and before batches are
# shuffled, so repeated runs start from the same random state.
torch.manual_seed(42)

# Load pre-trained BERT model with a 2-class classification head
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer and scheduler
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW;
# eps and weight_decay are set explicitly to mirror that class's defaults
# (NOTE(review): confirm against the transformers version in use).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        # Passing labels makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Training loss: {avg_train_loss:.4f}")

    # Validation loop: eval mode and no gradient tracking
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation loss: {avg_val_loss:.4f}")

# Persist the fine-tuned model and its tokenizer so the prediction cell can
# reload them from the 'fine-tuned-bert' directory on a fresh run.
model.save_pretrained('fine-tuned-bert')
tokenizer.save_pretrained('fine-tuned-bert')


In [None]:
# making predictions
# Reload the model and tokenizer from the 'fine-tuned-bert' directory and
# move the model onto the device used for inference.
fine_tuned_model = BertForSequenceClassification.from_pretrained('fine-tuned-bert').to(device)
fine_tuned_tokenizer = BertTokenizer.from_pretrained('fine-tuned-bert')

# Predict on new data
def predict(text):
    """Return the predicted class index for a single piece of text.

    The text is tokenized to a fixed length of 64 tokens, passed through the
    fine-tuned model without gradient tracking, and the index of the most
    probable class is returned as a Python int.
    """
    encoded = fine_tuned_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    ids_on_device = encoded['input_ids'].to(device)
    mask_on_device = encoded['attention_mask'].to(device)

    with torch.no_grad():
        model_output = fine_tuned_model(ids_on_device, attention_mask=mask_on_device)

    class_probs = torch.nn.functional.softmax(model_output.logits, dim=-1)
    return torch.argmax(class_probs).item()

# Example predictions: classify each sample sentence in turn
texts = ["I really enjoyed this movie.", "The plot was boring and predictable."]
predictions = list(map(predict, texts))
print(predictions)  # Hoped-for output: [1, 0] (depends on how well the model trained)


**The result is a list of predicted labels: `1` means positive, `0` means negative.**