### Import Libraries

In [54]:
# To read and manipulate the data
import pandas as pd
pd.set_option('max_colwidth', None)
import numpy as np
# To assess the model performance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout, BatchNormalization, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Suppressing the warnings.
import warnings
warnings.filterwarnings("ignore")

### Step 1: Import the data

In [55]:
# Toggle: when True, mount Google Drive (Colab only) so the dataset can be read
# from /content/drive; when False, skip mounting.
use_mount_drive = True
# Empty placeholder frame; it is not read by any of the cells shown in this notebook.
df = pd.DataFrame()
if use_mount_drive:
    # Imported lazily so the notebook does not require google.colab when the flag is False.
    from google.colab import drive
    drive.mount('/content/drive')
else:
    # This branch runs when mounting was NOT requested, so the message must say so.
    print("Mount Drive option is not selected. Proceeding with runtime drive.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
# Read the preprocessed, encoded accident dataset from the mounted Drive folder.
encoded_cleaned_df = pd.read_csv(
    '/content/drive/MyDrive/data_files/encoded_cleaned.csv'
)

##  Design, Train and Test LSTM classifiers



Let's implement the three LSTM classifiers for accident classification:

1. Text-only model: Uses accident descriptions as input.
2. Categorical-only model: Uses structured categorical data.
3. Multi-input model: Combines text features (LSTM) and categorical features (Dense layers).

## Preprocessing: Tokenization, padding for text, and standardization for categorical data

In [57]:
# Extract features
# Free-text accident description used by the text and multi-input models.
text_data = encoded_cleaned_df['Processed Description']
# Every non-object column is treated as a structured feature. The target column is
# dropped explicitly so that, if 'Accident Level' is ever stored as a number, it can
# not leak into the features; errors='ignore' makes this a no-op when the column is
# already excluded by the dtype filter.
categorical_data = (
    encoded_cleaned_df
    .select_dtypes(exclude=['object'])
    .drop(columns=['Accident Level'], errors='ignore')
)
# Classification target.
target = encoded_cleaned_df['Accident Level']

In [58]:
# Turn the accident-level labels into integer class ids; the fitted encoder is kept
# so the model builders can read the number of classes from `label_encoder.classes_`.
label_encoder = LabelEncoder()
label_encoder.fit(target)
target_encoded = label_encoder.transform(target)

In [59]:
# Hold out 20% of the rows for testing. Text, structured features and labels are
# split in a single call so the three stay row-aligned.
(X_train_text, X_test_text,
 X_train_cat, X_test_cat,
 y_train, y_test) = train_test_split(
    text_data,
    categorical_data,
    target_encoded,
    test_size=0.2,
    random_state=42,
)

In [60]:
# Tokenization & padding for the text data.
# The vocabulary is learned from the training texts only; words not seen there
# are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

# Convert each description to a list of word ids ...
train_sequences = tokenizer.texts_to_sequences(X_train_text)
test_sequences = tokenizer.texts_to_sequences(X_test_text)

# ... then pad at the end ('post') to a fixed length of 100 ids per description.
X_train_text_seq = pad_sequences(train_sequences, maxlen=100, padding='post')
X_test_text_seq = pad_sequences(test_sequences, maxlen=100, padding='post')

In [61]:
# Standardize the structured features. The scaler is fitted on the training split
# only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train_cat)
X_train_cat_scaled = scaler.transform(X_train_cat)
X_test_cat_scaled = scaler.transform(X_test_cat)

## Creating the following three models:

## Text-only: Uses Bidirectional LSTM.


In [62]:
# Define text-only model
def create_text_model(vocab_size=10000, max_len=100, num_classes=None):
    """Build the text-only classifier: Embedding -> BiLSTM -> LSTM -> Dense -> softmax.

    Args:
        vocab_size: Embedding vocabulary size. The default (10000) matches the
            ``num_words`` given to the Keras tokenizer in this notebook.
        max_len: Length of each padded input sequence. The default (100) matches
            the ``maxlen`` used with ``pad_sequences`` in this notebook.
        num_classes: Number of output classes. When None, it is taken from the
            notebook-level ``label_encoder``.

    Returns:
        An uncompiled Keras ``Model`` mapping a ``(max_len,)`` sequence of word ids
        to a softmax distribution over the classes.
    """
    if num_classes is None:
        num_classes = len(label_encoder.classes_)

    input_text = Input(shape=(max_len,))
    # `input_length` is deliberately not passed: it is deprecated in Keras 3, and the
    # sequence length is already fixed by the Input layer above.
    embedding = Embedding(input_dim=vocab_size, output_dim=128)(input_text)
    # The first recurrent layer returns the full sequence so the second LSTM can consume it.
    x = Bidirectional(LSTM(128, return_sequences=True))(embedding)
    x = LSTM(64)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=input_text, outputs=output)
    return model

## Categorical-only: Uses Dense layers with Batch Normalization.

In [63]:
def train_model(model, X_train, X_test, y_train, y_test, epochs=20, batch_size=32):
    """Compile, fit and evaluate a Keras classifier, printing test-set metrics.

    Args:
        model: Uncompiled Keras model with a softmax output.
        X_train: Training inputs (a single array, or a list of arrays for a
            multi-input model).
        X_test: Test inputs, structured like ``X_train``.
        y_train: Integer class ids for the training rows.
        y_test: Integer class ids for the test rows.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during fitting.

    Returns:
        The same ``model`` instance, now trained.
    """
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # NOTE: the test split doubles as validation data here, so the per-epoch
    # val_* numbers are not independent of the final test metrics below.
    model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
    )

    # Predict on the test set and convert class probabilities to class ids.
    y_pred_probs = model.predict(X_test)
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Weighted averages account for class imbalance. zero_division=0 makes the
    # handling of never-predicted classes explicit instead of relying on a warning
    # (which this notebook suppresses globally); the resulting scores are the same.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Print metrics
    separator = 60 * '-'
    print("Model Evaluation Metrics on Test data")
    print(separator)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(separator)

    return model


## Multi-input: Combines LSTM for text and Dense layers for categorical data

In [64]:
# Build the text-only model and show its architecture.
text_model = create_text_model()
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
text_model.summary()

None


In [65]:
# Train and evaluate the text-only model on the padded token sequences.
# The call returns the trained model, which the notebook shows as the cell result.
train_model(
    text_model,
    X_train_text_seq,
    X_test_text_seq,
    y_train,
    y_test,
)

Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 553ms/step - accuracy: 0.5252 - loss: 1.4061 - val_accuracy: 0.8118 - val_loss: 0.7503
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 710ms/step - accuracy: 0.7699 - loss: 0.9195 - val_accuracy: 0.8118 - val_loss: 0.8181
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 491ms/step - accuracy: 0.7165 - loss: 1.0486 - val_accuracy: 0.8118 - val_loss: 0.7589
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 484ms/step - accuracy: 0.7526 - loss: 0.9646 - val_accuracy: 0.8118 - val_loss: 0.7711
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 475ms/step - accuracy: 0.7254 - loss: 1.0046 - val_accuracy: 0.8118 - val_loss: 0.8023
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 492ms/step - accuracy: 0.7479 - loss: 0.9679 - val_accuracy: 0.8118 - val_loss: 0.7677
Epoch 7/20
[1m11/11[0m 

<Functional name=functional_6, built=True>

In [66]:
# Define categorical-only model
def create_categorical_model():
    """Build the structured-features classifier: Dense -> BatchNorm -> Dropout -> Dense -> softmax.

    The input width is read from the notebook-level ``X_train_cat_scaled`` and the
    number of output classes from ``label_encoder``.

    Returns:
        An uncompiled Keras ``Model``.
    """
    n_features = X_train_cat_scaled.shape[1]
    n_classes = len(label_encoder.classes_)

    cat_in = Input(shape=(n_features,))
    hidden = Dense(64, activation='relu')(cat_in)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(32, activation='relu')(hidden)
    class_probs = Dense(n_classes, activation='softmax')(hidden)

    return Model(inputs=cat_in, outputs=class_probs)

###

In [67]:
# Build the categorical-only model and show its architecture.
cat_model = create_categorical_model()
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
cat_model.summary()

None


In [68]:
# Train and evaluate the categorical-only model on the standardized features.
# The call returns the trained model, which the notebook shows as the cell result.
train_model(
    cat_model,
    X_train_cat_scaled,
    X_test_cat_scaled,
    y_train,
    y_test,
)


Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.0731 - loss: 2.7695 - val_accuracy: 0.1294 - val_loss: 1.6783
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1146 - loss: 2.2122 - val_accuracy: 0.2118 - val_loss: 1.5363
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1845 - loss: 1.8663 - val_accuracy: 0.4588 - val_loss: 1.4090
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2829 - loss: 1.6589 - val_accuracy: 0.5765 - val_loss: 1.2973
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.4069 - loss: 1.4044 - val_accuracy: 0.7412 - val_loss: 1.1985
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5221 - loss: 1.3844 - val_accuracy: 0.7882 - val_loss: 1.1125
Epoch 7/20
[1m11/11[0m [32m━━━━



[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 71ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Model Evaluation Metrics on Test data
------------------------------------------------------------
Accuracy: 0.7765
Precision: 0.7032
Recall: 0.7765
F1 Score: 0.7307
------------------------------------------------------------


<Functional name=functional_7, built=True>

###

In [69]:
# Define multi-input model
def create_multi_input_model(vocab_size=10000, max_len=100, num_cat_features=None, num_classes=None):
    """Build the two-branch classifier combining text and structured features.

    The text branch is Embedding -> BiLSTM -> LSTM -> Dense -> Dropout; the
    structured branch is Dense -> BatchNorm -> Dropout -> Dense. Both branches are
    concatenated and passed through a Dense/Dropout head with a softmax output.

    Args:
        vocab_size: Embedding vocabulary size. The default (10000) matches the
            ``num_words`` given to the Keras tokenizer in this notebook.
        max_len: Length of each padded input sequence. The default (100) matches
            the ``maxlen`` used with ``pad_sequences`` in this notebook.
        num_cat_features: Width of the structured-feature input. When None, it is
            read from the notebook-level ``X_train_cat_scaled``.
        num_classes: Number of output classes. When None, it is taken from the
            notebook-level ``label_encoder``.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are ``[text_ids, cat_features]``.
    """
    if num_cat_features is None:
        num_cat_features = X_train_cat_scaled.shape[1]
    if num_classes is None:
        num_classes = len(label_encoder.classes_)

    # --- Text branch ---
    input_text = Input(shape=(max_len,))
    # `input_length` is deliberately not passed: it is deprecated in Keras 3, and the
    # sequence length is already fixed by the Input layer above.
    embedding = Embedding(input_dim=vocab_size, output_dim=128)(input_text)
    x_text = Bidirectional(LSTM(128, return_sequences=True))(embedding)
    x_text = LSTM(64)(x_text)
    x_text = Dense(64, activation='relu')(x_text)
    x_text = Dropout(0.5)(x_text)

    # --- Structured-feature branch ---
    input_cat = Input(shape=(num_cat_features,))
    x_cat = Dense(64, activation='relu')(input_cat)
    x_cat = BatchNormalization()(x_cat)
    x_cat = Dropout(0.5)(x_cat)
    x_cat = Dense(32, activation='relu')(x_cat)

    # --- Fusion head ---
    concatenated = Concatenate()([x_text, x_cat])
    x = Dense(64, activation='relu')(concatenated)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=[input_text, input_cat], outputs=output)
    return model

###

In [70]:
# Build the multi-input model and show its architecture.
multi_input_model = create_multi_input_model()
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
multi_input_model.summary()


None


In [71]:
# Train and evaluate the multi-input model. Inputs are passed as
# [text sequences, structured features], matching the order of the model's inputs.
train_model(
    multi_input_model,
    [X_train_text_seq, X_train_cat_scaled],
    [X_test_text_seq, X_test_cat_scaled],
    y_train,
    y_test,
)


Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 546ms/step - accuracy: 0.4013 - loss: 1.4664 - val_accuracy: 0.8118 - val_loss: 0.8140
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 482ms/step - accuracy: 0.7062 - loss: 1.2005 - val_accuracy: 0.8118 - val_loss: 0.9264
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 635ms/step - accuracy: 0.6947 - loss: 1.1454 - val_accuracy: 0.8118 - val_loss: 0.9302
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 511ms/step - accuracy: 0.7026 - loss: 1.0721 - val_accuracy: 0.8118 - val_loss: 0.8848
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 707ms/step - accuracy: 0.7092 - loss: 1.0600 - val_accuracy: 0.8118 - val_loss: 0.8557
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 487ms/step - accuracy: 0.7265 - loss: 0.9979 - val_accuracy: 0.8118 - val_loss: 0.8696
Epoch 7/20
[1m11/11[0m 

<Functional name=functional_8, built=True>

### **Observations**

## Implement a **Hugging Face BERT model** for accident classification

### 1. Install Dependencies and Import Required Libraries

In [72]:
!pip install transformers datasets torch scikit-learn

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

## Preprocessing data

In [74]:
# Work on a copy so the label remapping below does not touch encoded_cleaned_df.
bert_df = encoded_cleaned_df.copy()
textdata = bert_df['Processed Description']
# Captured before the remapping, so this still holds the original labels.
target_data = bert_df['Accident Level']

# Assign each distinct accident level an integer id, in order of first appearance,
# and replace the column with those ids.
unique_levels = bert_df['Accident Level'].unique()
label_map = dict(zip(unique_levels, range(len(unique_levels))))
bert_df['Accident Level'] = bert_df['Accident Level'].map(label_map)

# 80/20 train/test split of descriptions and integer labels.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    textdata,
    bert_df['Accident Level'],
    test_size=0.2,
    random_state=42,
)

### Tokenization using BERT Tokenizer

In [76]:
# Load the pretrained BERT tokenizer.
# NOTE: this rebinds `tokenizer`, which earlier in the notebook referred to the
# Keras Tokenizer used for the LSTM models.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padding/truncating to 512 tokens."""
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    return encoded


# Wrap the splits as Hugging Face datasets with "text" and "label" columns.
train_dataset = Dataset.from_dict(
    {"text": train_texts.tolist(), "label": train_labels.tolist()}
)
test_dataset = Dataset.from_dict(
    {"text": test_texts.tolist(), "label": test_labels.tolist()}
)

# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Return the model inputs and label as PyTorch tensors.
torch_columns = ["input_ids", "attention_mask", "label"]
train_dataset.set_format(type="torch", columns=torch_columns)
test_dataset.set_format(type="torch", columns=torch_columns)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/339 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

###  Load Pretrained BERT Model

In [77]:
# Load pretrained BERT with a classification head sized to the number of
# distinct accident levels in label_map.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_map),
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Function to compute classification metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, where ``predictions`` holds
            per-class scores and ``labels`` the integer class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predictions, labels = eval_pred
    # Highest-scoring class per example.
    preds = np.argmax(predictions, axis=1)

    acc = accuracy_score(labels, preds)
    # zero_division=0 makes the handling of never-predicted classes explicit rather
    # than relying on a warning (which this notebook suppresses globally).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


### Define Training Arguments & Trainer

In [80]:
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy` and
# deprecated the old name. Because transformers is installed unpinned in this
# notebook, pick whichever keyword the installed version accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; train for 3 epochs with batches of 8.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",
    **{_eval_strategy_kwarg: "epoch"},
)

# The test split is used as the evaluation set; compute_metrics supplies
# accuracy / precision / recall / F1.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

###  Train the Model

In [None]:
# Fine-tune the BERT classifier with the `trainer` configured earlier in the notebook.
trainer.train()

### Evaluate the Model

In [None]:
# Evaluate with the trainer (its eval_dataset is the held-out test split and its
# compute_metrics adds accuracy/precision/recall/F1), then print what it returns.
results = trainer.evaluate()
print(results)

### Make Predictions

In [None]:
def predict(texts):
    """Predict the accident level for one or more descriptions with the BERT model.

    Args:
        texts: A string or a list of strings to classify.

    Returns:
        list: One original 'Accident Level' label (a key of ``label_map``) per input.
    """
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Tokenizer output lives on the CPU; move it to wherever the model is (e.g. the
    # GPU after training) to avoid a device-mismatch error.
    inputs = inputs.to(model.device)

    # Run inference with dropout disabled and without tracking gradients, then
    # restore whatever train/eval mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    # Softmax is monotonic, so the arg-max of the logits is the arg-max of the probabilities.
    predicted_ids = torch.argmax(outputs.logits, dim=-1).tolist()

    # label_map maps label -> id in insertion order, so position i holds the label for id i.
    id_to_label = list(label_map.keys())
    return [id_to_label[class_id] for class_id in predicted_ids]

# Example prediction on a single hand-written description.
example_texts = [
    "Worker slipped and fell from a height of 3 meters.",
]
example_predictions = predict(example_texts)
print(example_predictions)

In [None]:
# Destination file for the fine-tuned weights (relative to the working directory).
model_path = "bert_accident_classifier.pt"

# Persist only the state dict (the parameters), not the whole model object.
bert_state = model.state_dict()
torch.save(bert_state, model_path)

print(f"Model saved at {model_path}")