In [None]:
# Mount Google Drive at /content/drive so the dataset and model folders are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the triage CSV on the mounted Drive.
dataset_path = "/content/drive/MyDrive/Colab_Notebooks/NLP/Project/datasets/triage_dataset.csv"

In [None]:
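# Optional: copy the dataset from Drive into the working directory.
# Needed only when a later cell reads "triage_dataset.csv" by its bare filename.
# !cp $dataset_path .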

In [None]:
import torch
from torch.utils.data import DataLoader, random_split, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
# NOTE: the next line rebinds AdamW to the PyTorch optimizer, shadowing the
# AdamW imported from transformers on the line above.
from torch.optim import AdamW
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import random

In [None]:
# Read the dataset directly from its Drive location so this cell works on a
# fresh runtime without first copying the CSV into the working directory.
df = pd.read_csv(dataset_path)

In [None]:
# Lower-case the triage labels so differently-cased spellings collapse into one class.
df['triage'] = df['triage'].str.lower()

In [None]:
# Inspect the distinct label values present in the triage column.
df['triage'].unique()

array(['non-urgent', 'urgent'], dtype=object)

In [None]:
# Map the string triage labels to integer class ids stored in a new 'label' column.
# NOTE: "traige" is a misspelling of "triage"; the name is left unchanged
# because later cells reference it.
traige_encoder = LabelEncoder()
df['label'] = traige_encoder.fit_transform(df['triage'])

In [None]:
# The position of a label in classes_ is the integer id the encoder assigned to it.
traige_encoder.classes_

array(['non-urgent', 'urgent'], dtype=object)

In [None]:
# Optional: uncomment to keep only the first 1000 rows for a quick trial run.
# df = df.iloc[:1000]

In [None]:
# Preview the first rows, including the integer 'label' column.
df.head()

Unnamed: 0,question,triage,label
0,"I am 35 years old unmarried , i was diagonized...",non-urgent,0
1,I have been having abdominal pain and burning ...,non-urgent,0
2,"sir, Day before yesterday i had an oil fried i...",urgent,1
3,"friend has a lump where their coccyx is, has b...",urgent,1
4,Which demographic should raise suspicion of a ...,non-urgent,0


In [None]:
def load_data_from_csv(df, text_column, label_column):
    """Pull the text and label columns out of a DataFrame as plain lists.

    Despite the name, no file is read here: ``df`` must already be loaded.

    Args:
        df: DataFrame holding the examples.
        text_column: Name of the column containing the input text.
        label_column: Name of the column containing the labels.

    Returns:
        A ``(texts, labels)`` tuple of Python lists in row order.
    """
    return df[text_column].tolist(), df[label_column].tolist()

In [None]:
# Inputs come from the 'question' column, targets from the encoded 'label' column.
texts, labels = load_data_from_csv(df, 'question', 'label')

In [None]:
# Dataset class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of raw inputs (each is passed through ``str``).
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` that returns tensors under
            the ``'input_ids'`` and ``'attention_mask'`` keys.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, flattened token ids, attention mask and label tensor."""
        raw_text = str(self.texts[item])
        target = self.labels[item]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # The tokenizer returns tensors with a leading batch axis; flatten it
        # away so the DataLoader can stack individual samples.
        sample = {'text': raw_text}
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
# Load the 'bert-base-uncased' tokenizer and wrap the texts and labels in a dataset.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 128  # Maximum length of tokens; TextDataset pads/truncates every example to this length
dataset = TextDataset(texts, labels, tokenizer, max_len)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def create_data_loaders(dataset, batch_size, seed=42):
    """Split ``dataset`` 80/10/10 and wrap each part in a DataLoader.

    The split is drawn from a dedicated generator so that the same ``seed``
    always yields the same train/validation/test membership, keeping reported
    metrics comparable across runs.

    Args:
        dataset: Any object accepted by ``torch.utils.data.random_split``.
        batch_size: Batch size used for all three loaders.
        seed: Integer seed for the split. Pass ``None`` to draw the split
            from PyTorch's global random state instead.

    Returns:
        ``(train_loader, val_loader, test_loader)``. Only the training loader
        shuffles its samples.
    """
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))
    # The test split absorbs the rounding remainder so the sizes sum to len(dataset).
    test_size = len(dataset) - train_size - val_size

    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, [train_size, val_size, test_size], **split_kwargs
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, test_loader

In [None]:
# NOTE(review): 380 is a large batch for BERT fine-tuning; presumably sized to the available GPU memory — confirm.
batch_size = 380
train_data_loader, val_data_loader, test_data_loader = create_data_loaders(dataset, batch_size)

In [None]:
def initialize_model(device, model_name='bert-base-uncased', num_labels=2):
    """Load a BERT sequence classifier and move it to ``device``.

    Args:
        device: Target device passed to ``model.to``.
        model_name: Checkpoint name or path handed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.

    Returns:
        The loaded ``BertForSequenceClassification`` model on ``device``.
    """
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    return model.to(device)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = initialize_model(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def format_time(seconds):
    """Render a duration given in seconds as ``"<H>h <M>m <S>s"``.

    Each component is truncated to an integer, so fractional seconds are dropped.
    """
    whole_hours, within_hour = divmod(seconds, 3600)
    whole_minutes = within_hour // 60
    leftover_seconds = seconds % 60
    parts = (
        f"{int(whole_hours)}h",
        f"{int(whole_minutes)}m",
        f"{int(leftover_seconds)}s",
    )
    return " ".join(parts)

In [None]:
import time

model_directory = "./triage_bert"

def train_model(model, train_loader, val_loader, device, n_epochs=3):
    """Fine-tune ``model`` and checkpoint it whenever validation accuracy improves.

    Each epoch runs one optimisation pass over ``train_loader`` and then
    measures accuracy over ``val_loader``. When that accuracy beats the best
    value seen so far, the model and the module-level ``tokenizer`` are saved
    to the module-level ``model_directory``.

    Args:
        model: Classifier that returns the loss first when called with
            ``labels`` and the logits first when called without them.
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        val_loader: Loader with the same batch layout; its ``dataset`` length
            is the denominator of the accuracy.
        device: Device the batch tensors are moved to.
        n_epochs: Number of training epochs.

    Returns:
        The same ``model`` object, carrying the weights of the final epoch.
        These are not necessarily the best-scoring weights; the best ones are
        the checkpoint stored in ``model_directory``.
    """
    optimizer = AdamW(model.parameters(), lr=2e-5)

    best_val_accuracy = 0
    for epoch in range(n_epochs):

        start_time = time.time()  # Record the start time of the epoch

        # ---- training pass ----
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch_input_ids = batch['input_ids'].to(device)
            batch_attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)
            optimizer.zero_grad()
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            loss = outputs[0]  # with labels supplied, the first output is the loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # ---- validation pass ----
        model.eval()
        total_correct = 0
        for batch in val_loader:
            batch_input_ids = batch['input_ids'].to(device)
            batch_attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)
            with torch.no_grad():
                outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
                logits = outputs[0]  # without labels, the first output is the logits
                preds = torch.argmax(logits, dim=1)
                total_correct += (preds == batch_labels).sum().item()

        # Calculate the duration of the epoch (training + validation)
        epoch_duration = time.time() - start_time
        formatted_time = format_time(epoch_duration)

        avg_val_accuracy = total_correct / len(val_loader.dataset)
        # This is the mean per-batch training loss, so it is reported as a loss.
        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}, Train loss: {avg_train_loss:.2f} | Validation acc: {avg_val_accuracy:.2f} | Duration: {formatted_time}")

        # Save model and tokenizer whenever validation accuracy improves.
        if avg_val_accuracy > best_val_accuracy:
            print(f"{avg_val_accuracy:.4f} > {best_val_accuracy:.4f}. Saving the model.")
            model.save_pretrained(model_directory)
            tokenizer.save_pretrained(model_directory)
            best_val_accuracy = avg_val_accuracy

    return model

In [None]:
# Train model
n_epochs = 20  # Define the number of epochs
# train_model returns the model it was given, so trained_model and model are the same object.
trained_model = train_model(model, train_data_loader, val_data_loader, device, n_epochs)

Epoch 1, Acc: 0.53 | Validation acc: 0.78 | Duration: 0h 3m 50s
0.7772 > 0.0000. Saving the model.
Epoch 2, Acc: 0.47 | Validation acc: 0.78 | Duration: 0h 3m 49s
0.7829 > 0.7772. Saving the model.
Epoch 3, Acc: 0.42 | Validation acc: 0.78 | Duration: 0h 3m 49s
Epoch 4, Acc: 0.37 | Validation acc: 0.76 | Duration: 0h 3m 49s
Epoch 5, Acc: 0.30 | Validation acc: 0.75 | Duration: 0h 3m 49s
Epoch 6, Acc: 0.22 | Validation acc: 0.75 | Duration: 0h 3m 49s
Epoch 7, Acc: 0.16 | Validation acc: 0.73 | Duration: 0h 3m 49s
Epoch 8, Acc: 0.11 | Validation acc: 0.76 | Duration: 0h 3m 49s
Epoch 9, Acc: 0.10 | Validation acc: 0.76 | Duration: 0h 3m 49s
Epoch 10, Acc: 0.08 | Validation acc: 0.77 | Duration: 0h 3m 49s
Epoch 11, Acc: 0.07 | Validation acc: 0.76 | Duration: 0h 3m 49s
Epoch 12, Acc: 0.06 | Validation acc: 0.75 | Duration: 0h 3m 49s
Epoch 13, Acc: 0.05 | Validation acc: 0.76 | Duration: 0h 3m 49s
Epoch 14, Acc: 0.05 | Validation acc: 0.76 | Duration: 0h 3m 49s
Epoch 15, Acc: 0.04 | Validat

In [None]:
# Drive folder that receives a copy of the saved checkpoint directory.
save_path = "/content/drive/MyDrive/Colab_Notebooks/NLP/Project/models/"

In [None]:
# Copy the local checkpoint directory (recursively) into the Drive folder given by save_path.
!cp -r $model_directory $save_path

In [None]:
def evaluate_model(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and collect predictions next to the gold labels.

    Args:
        model: Callable returning a sequence whose first element is the logits.
        test_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(predictions, true_labels)`` as two flat Python lists of class ids.
    """
    model.eval()
    predicted_ids = []
    gold_ids = []
    for batch in test_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        gold = batch['labels'].to(device)
        with torch.no_grad():
            scores = model(ids, attention_mask=mask)[0]
            winners = scores.argmax(dim=1)
        predicted_ids += winners.tolist()
        gold_ids += gold.tolist()
    return predicted_ids, gold_ids

In [None]:
def generate_classification_report(predictions, true_labels):
    """Build the scikit-learn classification report for the given predictions.

    Rows are named after the classes of the notebook-level ``traige_encoder``.

    Args:
        predictions: Predicted class ids.
        true_labels: Gold class ids, aligned with ``predictions``.

    Returns:
        The value returned by ``classification_report``.
    """
    return classification_report(
        y_true=true_labels,
        y_pred=predictions,
        target_names=traige_encoder.classes_,
    )

In [None]:
# Evaluate model
# NOTE: trained_model carries the weights from the final training epoch, which
# may differ from the best-validation checkpoint saved in model_directory.
predictions, true_labels = evaluate_model(trained_model, test_data_loader, device)
report = generate_classification_report(predictions, true_labels)
print(report)

              precision    recall  f1-score   support

  non-urgent       0.82      0.88      0.85      3114
      urgent       0.58      0.46      0.51      1138

    accuracy                           0.77      4252
   macro avg       0.70      0.67      0.68      4252
weighted avg       0.75      0.77      0.76      4252



In [None]:
# Single-text inference helper.

def predict_text(model, text, tokenizer, max_len, device):
    """Classify one piece of text and return the winning class id.

    Args:
        model: Callable returning a sequence whose first element is the logits.
        text: The input to classify.
        tokenizer: Object exposing ``encode_plus`` that returns tensors under
            the ``'input_ids'`` and ``'attention_mask'`` keys.
        max_len: Length the encoded sequence is padded or truncated to.
        device: Device the encoded tensors are moved to.

    Returns:
        The index of the highest-scoring class, as a NumPy integer scalar.
    """
    model.eval()
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        scores = model(ids, attention_mask=mask)[0]
        winner = scores.argmax(dim=1)
        class_id = winner.cpu().numpy()[0]

    return class_id

# predict_text passes the tokenizer the same options as TextDataset, so inference inputs are encoded the same way as training inputs.

In [None]:
# Reload the model and tokenizer from the checkpoint directory that train_model
# writes to whenever validation accuracy improves.
device = "cuda" if torch.cuda.is_available() else "cpu"  # NOTE: rebinds `device` as a plain string (it was a torch.device earlier)

model_directory = "./triage_bert"  # Adjust the path to match where you've saved the model

load_model = BertForSequenceClassification.from_pretrained(model_directory)
load_model.to(device)
load_tokenizer = BertTokenizer.from_pretrained(model_directory)
load_model = load_model.eval()  # switch to inference mode

In [None]:
# Example prediction
example_text = "Your example text here"  # placeholder input; replace with a real question
prediction = predict_text(load_model, example_text, load_tokenizer, max_len, device)
# Convert the integer class id back to its string label.
predicted_label = traige_encoder.inverse_transform([prediction])[0]
print(f"Predicted class for example text: {predicted_label}")

Predicted class for example text: non-urgent
