## Design, Train and Test LSTM classifiers



### Import Libraries

In [None]:
# To read and manipulate the data
import pandas as pd
pd.set_option('max_colwidth', None)
import numpy as np
# To assess the model performance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout, BatchNormalization, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Suppressing the warnings.
import warnings
warnings.filterwarnings("ignore")

### Step 1: Import the data

In [None]:
# Toggle for Google Colab: when True, mount Google Drive so the dataset path used below resolves.
use_mount_drive = True
df = pd.DataFrame()  # empty placeholder frame; not referenced again in the visible notebook
if use_mount_drive:
    from google.colab import drive
    drive.mount('/content/drive')
else:
    # This branch runs when mounting is NOT requested, so the message must say so.
    print("Mount Drive option is not selected. Proceeding with runtime drive.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the preprocessed (cleaned and encoded) accident records from the mounted Drive folder.
encoded_cleaned_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/data_files/encoded_cleaned.csv'
)

Let's implement the three LSTM classifiers for accident classification:

1. Text-only model: Uses accident descriptions as input.
2. Categorical-only model: Uses structured categorical data.
3. Multi-input model: Combines text features (LSTM) and categorical features (Dense layers).

## Preprocessing: Tokenization, padding for text, and standardization for categorical data

In [None]:
# Pull out the three pieces the models consume: the target labels, the free-text
# descriptions, and every non-object column as the structured feature matrix.
# NOTE(review): if 'Accident Level' were stored with a non-object dtype it would also
# land in `categorical_data` and leak the target — confirm against the CSV schema.
target = encoded_cleaned_df['Accident Level']
text_data = encoded_cleaned_df['Processed Description']
categorical_data = encoded_cleaned_df.select_dtypes(exclude='object')

In [None]:
# Turn the accident-level labels into integer class ids; the fitted encoder is kept
# so the models can size their output layer from `label_encoder.classes_`.
label_encoder = LabelEncoder().fit(target)
target_encoded = label_encoder.transform(target)

In [None]:
# Hold out 20% of the rows for testing. Splitting text, structured features and
# labels in a single call keeps the three views row-aligned.
(
    X_train_text, X_test_text,
    X_train_cat, X_test_cat,
    y_train, y_test,
) = train_test_split(
    text_data,
    categorical_data,
    target_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenization & padding for text data.
# The vocabulary is fitted on the training text only; both splits are then converted
# to id sequences and post-padded/truncated to 100 tokens.
# NOTE: the BERT section further down rebinds the name `tokenizer`, so this cell must
# be re-run before reusing the Keras tokenizer after that point.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

train_sequences = tokenizer.texts_to_sequences(X_train_text)
test_sequences = tokenizer.texts_to_sequences(X_test_text)

X_train_text_seq = pad_sequences(train_sequences, maxlen=100, padding='post')
X_test_text_seq = pad_sequences(test_sequences, maxlen=100, padding='post')

In [None]:
# Standardize the structured features. The scaler is fitted on the training split
# only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train_cat)
X_train_cat_scaled = scaler.transform(X_train_cat)
X_test_cat_scaled = scaler.transform(X_test_cat)

## Creating the following three models:

## Text-only: Uses Bidirectional LSTM.


In [None]:
# Define text-only model
def create_text_model(vocab_size=10000, max_len=100, embedding_dim=128, num_classes=None):
    """Build the text-only classifier: Embedding -> BiLSTM -> LSTM -> Dense head.

    Args:
        vocab_size: Size of the embedding table. Should match the ``num_words``
            given to the Keras tokenizer (10000 in this notebook).
        max_len: Length of the padded input sequences, i.e. the ``maxlen``
            passed to ``pad_sequences`` (100 in this notebook).
        embedding_dim: Dimensionality of each token embedding.
        num_classes: Number of output classes. Defaults to the number of
            classes held by the notebook-level ``label_encoder``.

    Returns:
        An uncompiled ``Model`` that maps ``(batch, max_len)`` token ids to a
        softmax distribution over ``num_classes`` classes.
    """
    if num_classes is None:
        num_classes = len(label_encoder.classes_)

    input_text = Input(shape=(max_len,))
    # `input_length` is deprecated in current Keras and is redundant here because the
    # Input layer already fixes the sequence length. `mask_zero=True` makes the
    # recurrent layers skip the zero ids that `pad_sequences(..., padding='post')`
    # appends, so the final LSTM state is not computed over padding.
    embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(input_text)
    x = Bidirectional(LSTM(128, return_sequences=True))(embedding)
    x = LSTM(64)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=input_text, outputs=output)
    return model

In [None]:
def train_model(model, X_train, X_test, y_train, y_test, epochs=20, batch_size=32, class_weight=None):
    """Compile, fit and evaluate a Keras classifier, printing test-set metrics.

    Args:
        model: Uncompiled Keras model with a softmax output layer.
        X_train: Training inputs (a single array, or a list of arrays for a
            multi-input model).
        X_test: Test inputs, structured like ``X_train``.
        y_train: Integer class ids for the training rows.
        y_test: Integer class ids for the test rows.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used for fitting.
        class_weight: Optional ``{class_id: weight}`` mapping forwarded to
            ``model.fit``; useful to counter class imbalance. ``None`` keeps
            all classes equally weighted.

    Returns:
        The same ``model`` instance, now compiled and trained.
    """
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # NOTE: the test split doubles as validation data, so the per-epoch val_* values
    # are measured on the same rows as the final metrics printed below.
    model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
        class_weight=class_weight,
        verbose=1,
    )

    # Predict on the test set and convert class probabilities to class ids
    y_pred_probs = model.predict(X_test)
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Weighted averages account for class imbalance. zero_division=0 scores classes
    # that are never predicted as 0 explicitly instead of raising an undefined-metric
    # warning (which this notebook silences globally).
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Print metrics
    separator = 60 * '-'
    print("Model Evaluation Metrics on Test data")
    print(separator)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(separator)

    return model


In [None]:
# Build the text-only model and show its architecture.
text_model = create_text_model()
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the cell output.
text_model.summary()

None


In [None]:
# Fit and score the text-only model. The trailing semicolon stops the notebook from
# echoing the returned model's repr underneath the metrics.
train_model(text_model, X_train_text_seq, X_test_text_seq, y_train, y_test);

Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 361ms/step - accuracy: 0.5053 - loss: 1.4053 - val_accuracy: 0.8118 - val_loss: 0.7752
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 264ms/step - accuracy: 0.7513 - loss: 0.9579 - val_accuracy: 0.8118 - val_loss: 0.7923
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 262ms/step - accuracy: 0.7454 - loss: 0.9383 - val_accuracy: 0.8118 - val_loss: 0.7845
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 344ms/step - accuracy: 0.7352 - loss: 0.9589 - val_accuracy: 0.8118 - val_loss: 0.7741
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 251ms/step - accuracy: 0.7129 - loss: 1.0473 - val_accuracy: 0.8118 - val_loss: 0.8044
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 334ms/step - accuracy: 0.7408 - loss: 0.9596 - val_accuracy: 0.8118 - val_loss: 0.7879
Epoch 7/20
[1m11/11[0m [3

<Functional name=functional, built=True>

### Observations:-

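- **Accuracy (0.8118):** About 81% of the test samples are classified correctly using only text descriptions. Validation accuracy is fixed at exactly 0.8118 from the first epoch onward, so this figure should be compared with the share of the most frequent accident level in the test set before it is read as solid performance.  

- **Precision (0.6590) is Lower:** Weighted precision equals 0.8118² ≈ 0.6590, which is exactly the value obtained when every sample is assigned to a single class that makes up 81.18% of the test set. The low precision therefore points to the other accident levels never being predicted (their precision counts as 0), not to the model predicting overly severe levels.  

- **Recall (0.8118):** Weighted recall is by definition equal to accuracy, so it adds no independent evidence; the rarer accident levels are likely being missed entirely.  

- **F1 Score (0.7274):** Combines the precision and recall above and is pulled down by the classes that are never predicted.  

- **Overall Insight:** The metrics are consistent with a model that always predicts the majority accident level. A per-class `classification_report` or confusion matrix is needed to confirm this, and class weighting or resampling should be tried before drawing conclusions about severe accidents.  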


## Categorical-only: Uses Dense layers with Batch Normalization.

In [None]:
# Define categorical-only model
def create_categorical_model():
    """Build the dense classifier that uses only the scaled structured features.

    The input width is taken from ``X_train_cat_scaled`` and the number of
    output classes from ``label_encoder``; both are notebook-level variables.

    Returns:
        An uncompiled ``Model``: Dense(64) -> BatchNorm -> Dropout(0.5) ->
        Dense(32) -> softmax output.
    """
    n_features = X_train_cat_scaled.shape[1]
    n_classes = len(label_encoder.classes_)

    features_in = Input(shape=(n_features,))
    hidden = Dense(64, activation='relu')(features_in)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(32, activation='relu')(hidden)
    class_probs = Dense(n_classes, activation='softmax')(hidden)

    return Model(inputs=features_in, outputs=class_probs)

In [None]:
# Build the categorical-only model and show its architecture.
cat_model = create_categorical_model()
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the cell output.
cat_model.summary()

None


In [None]:
# Fit and score the categorical-only model. The trailing semicolon stops the notebook
# from echoing the returned model's repr underneath the metrics.
train_model(cat_model, X_train_cat_scaled, X_test_cat_scaled, y_train, y_test);


Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.3457 - loss: 1.6408 - val_accuracy: 0.5059 - val_loss: 1.4366
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4974 - loss: 1.3756 - val_accuracy: 0.6471 - val_loss: 1.2793
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6112 - loss: 1.1800 - val_accuracy: 0.7412 - val_loss: 1.1696
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6209 - loss: 1.1133 - val_accuracy: 0.7647 - val_loss: 1.0922
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7005 - loss: 0.9675 - val_accuracy: 0.7765 - val_loss: 1.0379
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7312 - loss: 0.9347 - val_accuracy: 0.7882 - val_loss: 1.0018
Epoch 7/20
[1m11/11[0m [32m━━━━━━━━

<Functional name=functional_1, built=True>

### Observations:

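- **Accuracy (0.8118):** Matches the text-only model exactly. Identical accuracy from two unrelated feature sets suggests that both models are predicting the same (most frequent) accident level, rather than that categorical features alone are strong predictors of accident severity.  

- **Precision (0.6590):** Equal to 0.8118², the weighted precision obtained when only one class, covering 81.18% of the test set, is ever predicted; classes that are never predicted contribute a precision of 0.  

- **Recall (0.8118):** Weighted recall always equals accuracy, so it does not show that the rarer, more severe accident levels are being captured.  

- **F1 Score (0.7274):** Follows from the precision and recall above; it can only improve once minority classes start being predicted correctly.  

- **Insight:** The categorical inputs include features such as **industry sector** and **critical risk** (the accident level itself is the prediction target and must not be used as an input). The performance parity with the text-only model suggests first checking per-class results for majority-class collapse, and then testing whether combining both inputs in a **multi-input model** helps.  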


## Multi-input: Combines LSTM for text and Dense layers for categorical data

In [None]:
# Define multi-input model
def create_multi_input_model(vocab_size=10000, max_len=100, embedding_dim=128,
                             num_cat_features=None, num_classes=None):
    """Build the two-branch classifier combining text and structured features.

    Args:
        vocab_size: Size of the embedding table. Should match the ``num_words``
            given to the Keras tokenizer (10000 in this notebook).
        max_len: Length of the padded text sequences (100 in this notebook).
        embedding_dim: Dimensionality of each token embedding.
        num_cat_features: Width of the structured-feature input. Defaults to
            the column count of the notebook-level ``X_train_cat_scaled``.
        num_classes: Number of output classes. Defaults to the number of
            classes held by the notebook-level ``label_encoder``.

    Returns:
        An uncompiled ``Model`` taking ``[text_ids, structured_features]`` and
        returning a softmax distribution over ``num_classes`` classes.
    """
    if num_cat_features is None:
        num_cat_features = X_train_cat_scaled.shape[1]
    if num_classes is None:
        num_classes = len(label_encoder.classes_)

    # Text branch
    input_text = Input(shape=(max_len,))
    # `input_length` is deprecated in current Keras and redundant with the Input shape.
    # `mask_zero=True` makes the recurrent layers skip the zero ids appended by
    # post-padding instead of treating them as real tokens.
    embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(input_text)
    x_text = Bidirectional(LSTM(128, return_sequences=True))(embedding)
    x_text = LSTM(64)(x_text)
    x_text = Dense(64, activation='relu')(x_text)
    x_text = Dropout(0.5)(x_text)

    # Structured-feature branch
    input_cat = Input(shape=(num_cat_features,))
    x_cat = Dense(64, activation='relu')(input_cat)
    x_cat = BatchNormalization()(x_cat)
    x_cat = Dropout(0.5)(x_cat)
    x_cat = Dense(32, activation='relu')(x_cat)

    # Fuse both branches and classify
    concatenated = Concatenate()([x_text, x_cat])
    x = Dense(64, activation='relu')(concatenated)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=[input_text, input_cat], outputs=output)
    return model

In [None]:
# Build the multi-input model and show its architecture.
multi_input_model = create_multi_input_model()
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the cell output.
multi_input_model.summary()


None


In [None]:
# Fit and score the multi-input model; inputs are passed as [text, structured] lists
# in the same order as the model's inputs. The trailing semicolon stops the notebook
# from echoing the returned model's repr underneath the metrics.
train_model(multi_input_model, [X_train_text_seq, X_train_cat_scaled], [X_test_text_seq, X_test_cat_scaled], y_train, y_test);


Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 396ms/step - accuracy: 0.3986 - loss: 1.5523 - val_accuracy: 0.8118 - val_loss: 0.8142
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 275ms/step - accuracy: 0.6676 - loss: 1.1726 - val_accuracy: 0.8118 - val_loss: 0.9578
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 276ms/step - accuracy: 0.6728 - loss: 1.1041 - val_accuracy: 0.8118 - val_loss: 0.9404
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 277ms/step - accuracy: 0.7014 - loss: 1.1231 - val_accuracy: 0.8118 - val_loss: 0.9827
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 356ms/step - accuracy: 0.6891 - loss: 0.9971 - val_accuracy: 0.8118 - val_loss: 0.9604
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 277ms/step - accuracy: 0.7347 - loss: 0.9634 - val_accuracy: 0.8118 - val_loss: 0.8726
Epoch 7/20
[1m11/11[0m [3



[1m2/3[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 67ms/step 



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 222ms/step
Model Evaluation Metrics on Test data
------------------------------------------------------------
Accuracy: 0.8118
Precision: 0.6590
Recall: 0.8118
F1 Score: 0.7274
------------------------------------------------------------


<Functional name=functional_2, built=True>

### Observations:

- **Accuracy (0.8118):** Remains the same as the text-only and categorical-only models, indicating no gain from combining inputs.  

- **Precision (0.6590):** Still equal to 0.8118², the value produced when only the most frequent class is ever predicted; the issue is minority classes that are never predicted rather than an excess of severe predictions.  

- **Recall (0.8118):** Weighted recall equals accuracy by definition, so it does not demonstrate that severe accidents are being identified.  

- **F1 Score (0.7274):** Unchanged, showing the combined model does not yet improve over individual models.  


### **Insights:**  
1. **Textual Ambiguity:** Certain accident descriptions might be vague, leading to misclassification.  
2. **Class Imbalance:** If accident levels are imbalanced in the dataset, the model might struggle with minority classes.  
3. **Preprocessing Needs:** Advanced text preprocessing (e.g., stemming, lemmatization, or better handling of stopwords) could improve precision.  
4. **Model Complexity:** A more complex architecture (e.g., BERT-based model) might help capture deeper semantic meanings in accident descriptions.  


## Implement a **Hugging Face BERT model** for accident classification

### 1. Install Dependencies and Import Required Libraries

In [None]:
!pip install transformers datasets torch scikit-learn

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

## Preprocessing data

In [None]:
# Extract features (work on a copy so the frame used by the LSTM models stays untouched)
bert_df = encoded_cleaned_df.copy()
textdata = bert_df['Processed Description']
target_data = bert_df['Accident Level']  # original (unmapped) labels

# Convert categorical target_data labels to numeric.
# Sorting the unique labels makes the ids independent of row order in the CSV, and
# matches the sorted class order LabelEncoder uses for the LSTM models above.
label_map = {label: idx for idx, label in enumerate(sorted(bert_df['Accident Level'].unique()))}
bert_df['Accident Level'] = bert_df['Accident Level'].map(label_map)

# Split into train and test sets (same test_size and random_state as the LSTM split)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    textdata, bert_df['Accident Level'], test_size=0.2, random_state=42
)

### Tokenization using BERT Tokenizer

In [None]:
# Load BERT tokenizer
# NOTE: this rebinds `tokenizer`, which earlier in the notebook held the Keras Tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of examples for BERT.

    Args:
        examples: Batch dict with a "text" key holding a list of strings.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens and
        padded to the longest sequence in the batch (not to 512), so no compute
        is spent on padding beyond what the data needs.
    """
    return tokenizer(examples["text"], padding="longest", truncation=True, max_length=512)

# Convert to Hugging Face dataset format
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})

# Apply tokenization. batch_size=None hands the whole split to tokenize_function as a
# single batch, so every row of a split is padded to the same length and can still be
# stacked into tensors by the Trainer's default collator.
train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=None)
test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=None)

# Convert to PyTorch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/339 [00:00<?, ? examples/s]

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

###  Load Pretrained BERT Model

In [None]:
# Load pretrained BERT with a fresh classification head sized to the number of
# distinct accident levels in `label_map`.
num_labels = len(label_map)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Function to compute classification metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            per-class scores with classes on axis 1 and ``labels`` holds the
            true class ids.

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "f1".
    """
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)

    acc = accuracy_score(labels, preds)
    # zero_division=0 scores classes that are never predicted as 0 explicitly instead
    # of emitting an undefined-metric warning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


### Define Training Arguments & Trainer

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, 3 epochs, batch size 8.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    # Log once per epoch. The previous logging_steps=500 exceeded the total number of
    # optimisation steps of the run (129), so the training loss was never logged and
    # the results table showed "No log".
    logging_strategy="epoch",
    report_to="none"
)

# The held-out test split is used as the evaluation set during and after training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

###  Train the Model

In [None]:
# Fine-tune BERT with the Trainer configured above; the value returned by train()
# is shown as this cell's output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.793968,0.811765,0.658962,0.811765,0.727426
2,No log,0.77454,0.811765,0.658962,0.811765,0.727426
3,No log,0.773424,0.811765,0.658962,0.811765,0.727426


TrainOutput(global_step=129, training_loss=0.9844439602637476, metrics={'train_runtime': 4658.3958, 'train_samples_per_second': 0.218, 'train_steps_per_second': 0.028, 'total_flos': 267591150885888.0, 'train_loss': 0.9844439602637476, 'epoch': 3.0})

### Evaluate the Model

In [None]:
# Evaluate on the eval_dataset passed to the Trainer (the held-out test split) and
# print the resulting metric dictionary.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.7734237909317017, 'eval_accuracy': 0.8117647058823529, 'eval_precision': 0.658961937716263, 'eval_recall': 0.8117647058823529, 'eval_f1': 0.7274255156608098, 'eval_runtime': 100.0974, 'eval_samples_per_second': 0.849, 'eval_steps_per_second': 0.11, 'epoch': 3.0}


### Make Predictions

In [None]:
def predict(texts):
    """Predict the accident level for each input text with the fine-tuned BERT model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``label_map``.

    Args:
        texts: A list of description strings.

    Returns:
        A list with one original accident-level label (a key of ``label_map``)
        per input text. An empty list yields an empty list.
    """
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # The model may live on a GPU after training; inputs must be on the same device.
    inputs = inputs.to(model.device)

    # Inference must run with dropout disabled and without building autograd graphs.
    # The previous train/eval mode is restored so calling predict() has no side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # argmax over logits equals argmax over softmax probabilities.
    predicted_ids = torch.argmax(logits, dim=-1).tolist()

    # Look labels up by their id rather than by position in the dict.
    id_to_label = {idx: label for label, idx in label_map.items()}
    return [id_to_label[class_id] for class_id in predicted_ids]

# Smoke-test predict() on a single hand-written description.
example_texts = ["Worker slipped and fell from a height of 3 meters."]
predicted_levels = predict(example_texts)
print(predicted_levels)

['I']


In [None]:
# Destination on the mounted Drive for the fine-tuned weights.
model_path = "/content/drive/MyDrive/data_files/bert_accident_classifier.pt"

# Persist only the state dictionary (weights), not the whole model object.
# NOTE: `label_map` is not written here, so it has to be recreated identically to
# decode predictions from a reloaded model.
state_dict = model.state_dict()
torch.save(state_dict, model_path)

print(f"Model saved at {model_path}")

Model saved at /content/drive/MyDrive/data_files/bert_accident_classifier.pt


## Observations and Insights:

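1. **Performance Consistency:** The BERT model's performance is identical to the previous LSTM and categorical models, showing no improvement across all metrics. Values that agree to the last digit across four different models (with weighted precision equal to accuracy squared, 0.6590 = 0.8118²) indicate that every model is predicting only the most frequent accident level.  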

2. **Stable Validation Loss:** The slight decrease in validation loss (from 0.79 to 0.77) indicates learning stabilization, but it does not reflect positively in performance metrics.  

3. **Learning Plateau:** The stagnant metrics across epochs suggest the model may have reached its capacity with the current configuration and dataset.  

4. **Dataset Limitation:** The dataset might not provide enough distinct or informative features for the BERT model to leverage, possibly due to limited variability or insufficient labeled data.  

5. **Feature Representation Limitation:** While BERT excels at understanding textual data, accident descriptions alone may not contain sufficient signal to improve classification accuracy.  
