In [None]:
pip install transformers torch

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset

# Sample documents (sports vs technology)
documents = [
    "The football match was thrilling and intense!",
    "A new football player just broke a world record for goals.",
    "The basketball team celebrated after winning the final match.",
    "Baseball season is starting soon, and fans are excited.",
    "The soccer game was a nail-biting experience.",
    "Athletes are training hard for the upcoming tournament.",
    "The tennis match was fast-paced with many exciting rallies.",
    "The volleyball championship will be held next month.",
    "Cycling competitions are becoming more popular worldwide.",
    "Olympic athletes are preparing for the next games.",
    "The football team is aiming for the championship.",
    "Basketball players are working on their shooting skills.",
    "Tennis stars are competing in major tournaments.",
    "Soccer players are training in warm-weather camps.",
    "The sports equipment industry is booming.",
    
    "The new smartphone features amazing AI capabilities.",
    "Artificial intelligence is shaping the future of technology.",
    "Quantum computing is revolutionizing the tech industry.",
    "The tech conference introduced several new innovations.",
    "New advancements in 5G technology will change the world.",
    "Electric vehicles are becoming a popular choice among consumers.",
    "The tech startup just raised millions in funding for their app.",
    "Blockchain technology is being integrated into various industries.",
    "Virtual reality is becoming mainstream in entertainment.",
    "Wearable tech like smartwatches is growing rapidly.",
    "Cloud computing is transforming the IT industry.",
    "Artificial intelligence is used in self-driving cars.",
    "Blockchain is disrupting industries like finance and supply chain.",
    "Smart cities are using technology to improve infrastructure.",
    "The tech world is excited about the potential of quantum computing."
]

# Labels (1 for Sports, 0 for Technology)
labels = [1] * 15 + [0] * 15  # 15 Sports and 15 Technology labels

# Load BERT tokenizer and model
#Tokenization with Padding: We used tokenizer() with return_tensors='pt' to directly return PyTorch tensors, which is essential for working with BERT.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize the documents
encodings = tokenizer(documents, truncation=True, padding=True, max_length=64)

# Convert labels to tensor
labels_tensor = torch.tensor(labels)

# Create DataLoader for batching
dataset = TensorDataset(torch.tensor(encodings['input_ids']), torch.tensor(encodings['attention_mask']), labels_tensor)
train_dataloader = DataLoader(dataset, batch_size=4)

# Fine-tune the BERT model
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
#Training and Fine-Tuning: 
#The model is trained for 3 epochs using AdamW optimizer. 
# Each batch is processed, and the loss is backpropagated to fine-tune the model on your text classification task.

model.train()
for epoch in range(3):  # 3 epochs for fine-tuning
    for batch in train_dataloader:
        input_ids, attention_mask, label = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Test the model
#Evaluation Mode: After training, we switch the model to evaluation mode (model.eval()) so that it can make predictions without updating weights.
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in train_dataloader:
        input_ids, attention_mask, label = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.append(torch.argmax(logits, dim=-1).cpu().numpy())
        true_labels.append(label.cpu().numpy())

# Flatten predictions and true labels
predictions = [item for sublist in predictions for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"BERT Model Accuracy: {accuracy * 100:.2f}%")

# Example prediction
sample_text = "AI technology is transforming the way we live and work."
inputs = tokenizer(sample_text, return_tensors='pt', padding=True, truncation=True, max_length=64)
outputs = model(**inputs)
predicted_label = torch.argmax(outputs.logits, dim=-1).item()

category = "Sports" if predicted_label == 1 else "Technology"
print(f"\nThe document is categorized as: {category}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BERT Model Accuracy: 96.67%

The document is categorized as: Technology
