In [22]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import DistilBertTokenizer, DistilBertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import time
import seaborn as sns
import matplotlib.pyplot as plt

# Check device availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [23]:
df = pd.read_csv("/content/cleaned_dataset.csv")

# --- STRATEGY A: HIERARCHICAL LABEL GROUPING ---
# We map specific, granular events to broader "Root Cause" categories.
def simplify_label(text):
    t = str(text).lower()

    # Falls
    if 'fall' in t:
        if 'same level' in t: return 'Fall - Same Level'
        if 'lower level' in t: return 'Fall - Lower Level'
        return 'Fall - Other'

    # Impact / Compression
    if 'struck' in t: return 'Struck By Object'
    if 'caught' in t or 'compressed' in t or 'pinch' in t or 'crushed' in t: return 'Caught In/Compressed'

    # Environmental / Chemical
    if 'exposure' in t or 'temperature' in t or 'burn' in t: return 'Exposure (Heat/Chemical)'
    if 'electric' in t or 'shock' in t: return 'Electrical'

    # Operational
    if 'transport' in t or 'vehicle' in t: return 'Transportation Incident'
    if 'fire' in t or 'explosion' in t: return 'Fire/Explosion'
    if 'assault' in t or 'violence' in t: return 'Violence/Assault'

    # Default
    return 'Other/Misc'

# Apply grouping
df['Broad_Event'] = df['EventTitle'].apply(simplify_label)

print("Label Grouping Complete.")
print(df['Broad_Event'].value_counts())

# Data Selection
data = df[['Final Narrative', 'Broad_Event']].dropna()

# Label Encoding
label_encoder = LabelEncoder()
data['label_id'] = label_encoder.fit_transform(data['Broad_Event'])
num_classes = len(label_encoder.classes_)

# Split Data (Stratified to maintain distribution)
X_train, X_val, y_train, y_val = train_test_split(
    data['Final Narrative'].values,
    data['label_id'].values,
    test_size=0.15,
    random_state=42,
    stratify=data['label_id'].values
)

print(f"\nTraining Samples: {len(X_train)}")
print(f"Validation Samples: {len(X_val)}")
print(f"Number of Target Classes: {num_classes}")

# --- STRATEGY B: COMPUTE CLASS WEIGHTS ---
# This prevents the model from ignoring rare classes (like Fire/Explosion)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
print(f"Class Weights Computed: {weights_tensor}")

Label Grouping Complete.
Broad_Event
Caught In/Compressed        24263
Fall - Lower Level          15275
Struck By Object            15208
Other/Misc                  14466
Fall - Same Level           12276
Fall - Other                 9943
Exposure (Heat/Chemical)     5468
Transportation Incident      1759
Fire/Explosion                904
Violence/Assault               10
Name: count, dtype: int64

Training Samples: 84636
Validation Samples: 14936
Number of Target Classes: 10
Class Weights Computed: tensor([4.1040e-01, 1.8209e+00, 6.5185e-01, 1.0014e+00, 8.1108e-01, 1.1020e+01,
        6.8832e-01, 6.5472e-01, 5.6613e+00, 1.0579e+03], device='cuda:0')


In [24]:
# Initialize Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class SafetyNetDataset(Dataset):
    def __init__(self, narratives, labels, tokenizer, max_len):
        self.narratives = narratives
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.narratives)

    def __getitem__(self, item):
        narrative = str(self.narratives[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            narrative,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'narrative_text': narrative,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long)
        }

# Data Loaders
MAX_LEN = 128
BATCH_SIZE = 32

train_dataset = SafetyNetDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = SafetyNetDataset(X_val, y_val, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [25]:
class AccidentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(AccidentClassifier, self).__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden_state = output.last_hidden_state[:, 0, :]
        val = self.drop(hidden_state)
        return self.out(val)

model = AccidentClassifier(num_classes)
model = model.to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [26]:
EPOCHS = 11
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Loss Function with Class Weights (Strategy B)
loss_fn = nn.CrossEntropyLoss(weight=weights_tensor).to(device)

# Updated Scaler for Mixed Precision (Fixes deprecation warning)
scaler = torch.amp.GradScaler('cuda')

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    start_time = time.time()

    for i, d in enumerate(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        # Mixed Precision Context
        with torch.amp.autocast('cuda'):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # Backpropagation
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        optimizer.zero_grad()

        if i % 100 == 0:
            print(f"  Batch {i}/{len(data_loader)} | Loss: {loss.item():.4f}")

    print(f"Epoch finished in {(time.time() - start_time)/60:.2f} min")
    return correct_predictions.double() / n_examples, np.mean(losses)

In [27]:
print("Starting Training with Optimized Strategies...")

history = {'train_acc': [], 'train_loss': []}

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_dataset)
    )
    print(f"Train Accuracy: {train_acc:.4f} | Train Loss: {train_loss:.4f}")
    history['train_acc'].append(train_acc.item())
    history['train_loss'].append(train_loss)

Starting Training with Optimized Strategies...

Epoch 1/11
  Batch 0/2645 | Loss: 2.3398
  Batch 100/2645 | Loss: 1.0224
  Batch 200/2645 | Loss: 1.0652
  Batch 300/2645 | Loss: 0.5778
  Batch 400/2645 | Loss: 0.4568
  Batch 500/2645 | Loss: 1.1620
  Batch 600/2645 | Loss: 1.1846
  Batch 700/2645 | Loss: 0.7914
  Batch 800/2645 | Loss: 0.4631
  Batch 900/2645 | Loss: 0.4476
  Batch 1000/2645 | Loss: 0.5761
  Batch 1100/2645 | Loss: 0.6293
  Batch 1200/2645 | Loss: 0.4394
  Batch 1300/2645 | Loss: 0.5745
  Batch 1400/2645 | Loss: 0.4309
  Batch 1500/2645 | Loss: 0.3633
  Batch 1600/2645 | Loss: 0.5122
  Batch 1700/2645 | Loss: 0.9019
  Batch 1800/2645 | Loss: 0.3978
  Batch 1900/2645 | Loss: 0.3207
  Batch 2000/2645 | Loss: 0.9870
  Batch 2100/2645 | Loss: 0.4994
  Batch 2200/2645 | Loss: 0.6950
  Batch 2300/2645 | Loss: 0.6299
  Batch 2400/2645 | Loss: 0.3934
  Batch 2500/2645 | Loss: 0.2111
  Batch 2600/2645 | Loss: 0.3880
Epoch finished in 4.39 min
Train Accuracy: 0.7387 | Train Loss

In [28]:
def get_predictions(model, data_loader):
    model = model.eval()
    narratives = []
    predictions = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["narrative_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            narratives.extend(texts)
            predictions.extend(preds)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    real_values = torch.stack(real_values).cpu()
    return narratives, predictions, real_values

# Get Predictions
narratives, y_pred, y_test = get_predictions(model, val_loader)

# --- FRACTURE CONTEXT CHECK ---
# We verify if the model can distinguish "Fracture from Fall" vs "Fracture from Machinery"
pred_labels = label_encoder.inverse_transform(y_pred)
true_labels = label_encoder.inverse_transform(y_test)

results = pd.DataFrame({
    'Narrative': narratives,
    'True_Label': true_labels,
    'Predicted_Label': pred_labels
})

# Filter for "fracture" keyword
fracture_subset = results[results['Narrative'].str.contains('fracture', case=False)]

print("\n--- FRACTURE SUBSET ANALYSIS ---")
print(classification_report(fracture_subset['True_Label'], fracture_subset['Predicted_Label']))

# Show specific examples where it might still fail (if any)
errors = fracture_subset[fracture_subset['True_Label'] != fracture_subset['Predicted_Label']]
if not errors.empty:
    print("\nRemaining Fracture Errors (for manual review):")
    for i, row in errors.head(3).iterrows():
        print(f"Text: {row['Narrative'][:100]}...")
        print(f"True: {row['True_Label']} | Pred: {row['Predicted_Label']}")
        print("-" * 30)

# Full Report
print("\n--- OVERALL CLASSIFICATION REPORT ---")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


--- FRACTURE SUBSET ANALYSIS ---
                          precision    recall  f1-score   support

    Caught In/Compressed       0.79      0.79      0.79       325
Exposure (Heat/Chemical)       0.50      0.50      0.50         6
      Fall - Lower Level       0.91      0.91      0.91       818
            Fall - Other       0.73      0.75      0.74       412
       Fall - Same Level       0.92      0.93      0.92       509
          Fire/Explosion       0.75      0.69      0.72        13
              Other/Misc       0.64      0.54      0.59       233
        Struck By Object       0.73      0.73      0.73       386
 Transportation Incident       0.52      0.70      0.60        64

                accuracy                           0.81      2766
               macro avg       0.72      0.73      0.72      2766
            weighted avg       0.81      0.81      0.81      2766


Remaining Fracture Errors (for manual review):
Text: An employee slipped and fell while carrying product

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
import pickle
import torch

# 1. Save the PyTorch Model Weights
torch.save(model.state_dict(), 'safety_model.bin')

# 2. Save the Label Encoder (to decode 0,1,2... back to "Fall", "Fire", etc.)
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Download 'safety_model.bin' and 'label_encoder.pkl' to your computer.