In [1]:
!pip install transformers torch pandas scikit-learn



to build BERT model:


1.   tokenization
2.   the  DataLoader
3.   model initization (pre-trained)
4.   optimizer
5.   the train loop
6.   evaluation



In [2]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Dataa Preparation
  1. encodes the categories: converts "storage_issue" -> 0, etc.
  2. tokenizes: converts text -> numbers (input IDs)
  3. creates dataLoaders: packages the data into batches of 16 for the model

In [3]:
df = pd.read_csv('text_data_cleaned.csv')

In [4]:
# 1. encode labels (text -> numbers)
# the model needs numbers (0, 1, 2...), not strings ("storage_issue")
le = LabelEncoder()
df['label'] = le.fit_transform(df['category'])

In [5]:
# Save the mapping to check later
# we will need it as a translator
label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:", label_map)

Label Mapping: {'contract_dispute': np.int64(0), 'currency_issue': np.int64(1), 'demand_spike': np.int64(2), 'distribution_center_issue': np.int64(3), 'forecasting_error': np.int64(4), 'import_delay': np.int64(5), 'it_system_failure': np.int64(6), 'labor_shortage': np.int64(7), 'manufacturing_issue': np.int64(8), 'neutral_report': np.int64(9), 'quality_control_issue': np.int64(10), 'regulation_issue': np.int64(11), 'seasonal_variation': np.int64(12), 'storage_issue': np.int64(13), 'supplier_problem': np.int64(14), 'transport_issue': np.int64(15)}


In [6]:
#split the data (train 80% - validation 20%)
# we use 'stratify' to ensure all categories are represented in both sets
X_train_text, X_val_text, y_train, y_val = train_test_split(
    df['text_clean'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

In [7]:
# 2. tokenizes
# initialize BERT tokenizer

#bert-base --> this is the standard version (12 Layers)
#uncased --> ignore capitalized "wez"and "Wez"are the same

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# tokenization function
def tokenize_data(text_list, labels):
    encoded_batch = tokenizer.batch_encode_plus(
        text_list.tolist(),
        add_special_tokens=True,      # add [CLS] and [SEP]
        padding='max_length',         # pad to max length
        truncation=True,              # truncate if too long
        max_length=64,                # fixed length for all sentences
        return_attention_mask=True,   # generate masks
        return_tensors='pt'           # return PyTorch tensors
    )

    return TensorDataset(
        encoded_batch['input_ids'],
        encoded_batch['attention_mask'],
        torch.tensor(labels.tolist())
    )

In [9]:
#create datasets
train_dataset = tokenize_data(X_train_text, y_train)
val_dataset = tokenize_data(X_val_text, y_val)

In [10]:
print("train_dataset:", train_dataset)

train_dataset: <torch.utils.data.dataset.TensorDataset object at 0x7a3778ff9e20>


In [11]:
input_ids, attention_masks, labels = train_dataset[0]

print("1. Input IDs (The Numbers):", input_ids)
print("2. Label (The Answer):", labels)

1. Input IDs (The Numbers): tensor([  101,  6887, 27292, 20499,  1999,  2047,  3923,  4279,  2988,  3314,
         2007,  3424,  3597,  8490,  7068,  7666,  1012,  2027,  6563,  4026,
         1998,  3665,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
2. Label (The Answer): tensor(15)


In [12]:
#3. dataloader
batch_size = 16

In [13]:
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset), # shuffle training data
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset), # don't shuffle validation
    batch_size=batch_size
)

print(":D! Data is tokenized and packed into DataLoaders.")
print(f"Training Batches: {len(train_dataloader)}")
print(f"Validation Batches: {len(validation_dataloader)}")

:D! Data is tokenized and packed into DataLoaders.
Training Batches: 20
Validation Batches: 5


part 2: model initializing

In [14]:
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW

In [15]:
#initizalize the mode
print("initializing BERT model")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_map),  # Automatically set number of categories
    output_attentions=False,
    output_hidden_states=False,
)

initializing BERT model


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# move to GPU and GPU are built for the massive math required by neural networks
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"   Model loaded on: {device}")

   Model loaded on: cuda


part 3: optimizer

In [17]:
# Setup the Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [24]:
epochs = 10
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

part 4: training looop

In [25]:
print("\n2. Starting Training...")

for epoch in range(epochs):
    print(f"\n   Epoch {epoch + 1} / {epochs}")

    # --- TRAINING PHASE ---
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Move batch to GPU
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, labels = batch

        # The Learning Steps
        optimizer.zero_grad()
        output = model(input_ids, attention_mask=input_mask, labels=labels)

        # Print Loss every 5 batches so we can watch it learn
        loss = output.loss
        if step % 5 == 0 and step > 0:
             print(f"      Batch {step}: Loss = {loss.item():.4f}")

        loss.backward()
        optimizer.step()
        scheduler.step()

    # --- VALIDATION PHASE ---
    model.eval()
    total_correct = 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, labels = batch

        with torch.no_grad():
            output = model(input_ids, attention_mask=input_mask)

        # Calculate Accuracy
        predictions = torch.argmax(output.logits, dim=1)
        total_correct += (predictions == labels).sum().item()

    # Print Score
    accuracy = total_correct / len(val_dataset)
    print(f"   Validation Accuracy: {accuracy:.2f}")


2. Starting Training...

   Epoch 1 / 10
      Batch 5: Loss = 2.4921
      Batch 10: Loss = 1.8643
      Batch 15: Loss = 1.8200
   Validation Accuracy: 0.38

   Epoch 2 / 10
      Batch 5: Loss = 2.3156
      Batch 10: Loss = 2.0465
      Batch 15: Loss = 2.2461
   Validation Accuracy: 0.40

   Epoch 3 / 10
      Batch 5: Loss = 2.0417
      Batch 10: Loss = 1.9814
      Batch 15: Loss = 1.7841
   Validation Accuracy: 0.45

   Epoch 4 / 10
      Batch 5: Loss = 1.9750
      Batch 10: Loss = 1.5148
      Batch 15: Loss = 1.5577
   Validation Accuracy: 0.59

   Epoch 5 / 10
      Batch 5: Loss = 1.5785
      Batch 10: Loss = 1.3087
      Batch 15: Loss = 1.5314
   Validation Accuracy: 0.75

   Epoch 6 / 10
      Batch 5: Loss = 1.6424
      Batch 10: Loss = 1.4791
      Batch 15: Loss = 1.3148
   Validation Accuracy: 0.88

   Epoch 7 / 10
      Batch 5: Loss = 1.4475
      Batch 10: Loss = 1.4292
      Batch 15: Loss = 1.1313
   Validation Accuracy: 0.90

   Epoch 8 / 10
      Batch 5

In [None]:
#model is now understanding 94/100 pharmacy reports correctly.
