<a href="https://colab.research.google.com/github/mhernandezlordui/Final-Project-LLM/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
import random
import time
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from sklearn.metrics import f1_score, roc_auc_score
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.model_selection import train_test_split
from google.colab import drive
# Mount Google Drive; the dataset and checkpoint paths used in this notebook live under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# -----------------------------------------------------------
# --- PARAMETER CONFIGURATION ---
# -----------------------------------------------------------

DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/project 2 LLM/generated_narratives_batch_save.csv'
MAX_LEN = 512 # Maximum length for the BERT sequence
BATCH_SIZE = 10 # Batch size for training
NUM_LABELS = 36 # Total number of labels (ASD + ADHD)
TEST_SIZE = 0.2 # 20% for validation
ENGLISH_MODEL_NAME = 'emilyalsentzer/Bio_ClinicalBERT'
SEED = 42 # Seed for Python, NumPy and PyTorch random number generators

# Seed every global RNG up front so that batch shuffling and any randomly
# initialised weights are as repeatable as the hardware allows on a fresh
# "Restart & Run All".
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# 1. Load the DataFrame
df = pd.read_csv(DATA_PATH)

# 2. Define Label Columns
label_cols = [col for col in df.columns if col.startswith(('ASD_', 'ADHD_'))]
assert len(label_cols) == NUM_LABELS, (
    f"The number of label columns ({len(label_cols)}) does not match NUM_LABELS ({NUM_LABELS})."
)

# Validate the raw data here, where the message can still name the real cause:
# a missing narrative would otherwise surface as a tokenizer error, and a
# missing / non-binary label as a NaN loss or a metric failure much later.
n_missing_text = int(df['text'].isna().sum())
assert n_missing_text == 0, f"{n_missing_text} rows have no value in the 'text' column."
assert df[label_cols].isin([0, 1]).to_numpy().all(), "Label columns must contain only 0/1 values."

# 3. Prepare Data and Labels for PyTorch
texts = df['text'].values
labels = df[label_cols].values

# 4. Load the Tokenizer
print(f"-> Loading Tokenizer for: {ENGLISH_MODEL_NAME}")
try:
    tokenizer = AutoTokenizer.from_pretrained(ENGLISH_MODEL_NAME)
except Exception:
    # Print the hint, then re-raise: the traceback carries the real cause and
    # stops "Run All" at this cell without shutting the kernel down.
    print("‚ùå ERROR: Ensure the 'transformers' library is installed or use a valid model name.")
    raise
print("‚úÖ Bio_ClinicalBERT English Tokenizer loaded successfully.")

# 5. Tokenization

# Tokenize every document in one batched call. With return_tensors='pt' and
# padding='max_length' the tokenizer returns already-stacked tensors, one row
# per document, each padded or truncated to MAX_LEN tokens.
encoded_batch = tokenizer(
    list(texts),                 # Documents to encode
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,             # Guarantees no row exceeds MAX_LEN
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']

# Float targets are required by BCEWithLogitsLoss; shape is (num_samples, NUM_LABELS)
labels = torch.tensor(labels, dtype=torch.float32)

print(f"Final Input IDs Length (samples): {input_ids.shape[0]}")
print(f"Final Labels Length (samples): {labels.shape[0]}")
print(f"Input Tensor Shape (Input IDs): {input_ids.shape}") # Should be (num_samples, MAX_LEN)

assert input_ids.shape == attention_masks.shape == (len(texts), MAX_LEN), \
    "Error: tokenized tensors do not have shape (num_samples, MAX_LEN)."
assert input_ids.shape[0] == labels.shape[0], "Error: Input IDs and Labels lengths do not match."


# 6. Split into Training and Validation Sets
all_data = (input_ids, attention_masks, labels)

# train_test_split yields one (train, validation) pair per input array, in the
# same order the arrays were passed, all sharing a single shuffled row split.
split_parts = train_test_split(
    all_data[0],
    all_data[1],
    all_data[2],
    test_size=TEST_SIZE,
    shuffle=True,
    random_state=42,
)
train_inputs, validation_inputs = split_parts[0], split_parts[1]
train_masks, validation_masks = split_parts[2], split_parts[3]
train_labels, validation_labels = split_parts[4], split_parts[5]

# 7. Create PyTorch Dataloaders
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, sampler=train_sampler)
validation_dataloader = DataLoader(dataset=validation_data, batch_size=BATCH_SIZE)

n_train_batches = len(train_dataloader)
n_validation_batches = len(validation_dataloader)
print(f"‚úÖ Data tokenized and split into {n_train_batches} training batches and {n_validation_batches} validation batches.")

-> Loading Tokenizer for: emilyalsentzer/Bio_ClinicalBERT
‚úÖ Bio_ClinicalBERT English Tokenizer loaded successfully.
‚úÖ Tensor lists reset for tokenization.
Final Input IDs Length (samples): 1000
Final Labels Length (samples): 1000
Input Tensor Shape (Input IDs): torch.Size([1000, 512])
‚úÖ Data tokenized and split into 80 training batches and 20 validation batches.


In [14]:
# --- REQUIRED DEFINITIONS FROM PREVIOUS CELLS ---
# NUM_LABELS, ENGLISH_MODEL_NAME, train_dataloader, validation_dataloader
# -------------------------------------------------
EPOCHS = 4  # Number of full passes over the training set
LEARNING_RATE = 5e-6  # Step size handed to the AdamW optimizer
FINAL_MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/project 2 LLM/clinicalbert_final_epoch_model.pt'

## 1. Load the BERT Model

try:
    # Loads the pre-trained encoder and attaches a classification layer with
    # NUM_LABELS outputs, configured for multi-label classification.
    model = AutoModelForSequenceClassification.from_pretrained(
        ENGLISH_MODEL_NAME,
        num_labels=NUM_LABELS,
        problem_type="multi_label_classification"
    )
except Exception as e:
    # Report, then re-raise so the traceback is visible and "Run All" stops
    # here instead of the kernel being shut down.
    print(f"‚ùå Error loading the model: {e}")
    raise
print(f"‚úÖ base model: {ENGLISH_MODEL_NAME} with {NUM_LABELS}-output classification layer loaded successfully.")

# --- 2. DEVICE AND OPTIMIZER CONFIGURATION ---

# 2a. Prefer the GPU when CUDA is available, otherwise run on the CPU
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
print(f'Using GPU: {torch.cuda.get_device_name(0)}' if use_gpu else 'Using CPU.')

# 2b. Place the model's parameters on the selected device
model = model.to(device)

# AdamW over all model parameters, stepping at LEARNING_RATE
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# Multi-label objective: sigmoid + binary cross-entropy applied to raw logits
loss_fn = torch.nn.BCEWithLogitsLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ base model: emilyalsentzer/Bio_ClinicalBERT with 36-output classification layer loaded successfully.
Using GPU: Tesla T4


In [15]:
# --- PARAMETERS ---
# The required input parameters are: NUM_LABELS, ENGLISH_MODEL_NAME, EPOCHS, model, device,
# train_dataloader, validation_dataloader, optimizer, and loss_fn.
# --------------------

## 1. EVALUATION FUNCTION for Clinical Metrics

def evaluate_model(model, dataloader, device, loss_fn):
    """Evaluate a multi-label classifier on a dataloader.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``.logits`` tensor. It is switched to eval mode
            and left in that mode.
        dataloader: Iterable of batches indexed as ``(input_ids,
            attention_mask, labels)``; labels are expected to be a 2-D
            ``(batch, num_labels)`` tensor of 0/1 values.
        device: Device every batch tensor is moved to before the forward pass.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar
            tensor. The per-sample weighting below assumes it returns the
            batch *mean* (the default reduction of ``BCEWithLogitsLoss``).

    Returns:
        Tuple ``(avg_eval_loss, f1_macro, f1_micro, auc_roc)``:
        the sample-weighted mean loss, macro and micro F1 at a 0.5 probability
        threshold, and the macro ROC-AUC averaged over the labels that have
        both classes present (``0.0`` when no label qualifies).

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()
    weighted_loss_sum = 0.0
    n_samples = 0
    prob_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in dataloader:
            # Move data to the target device
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            logits = model(b_input_ids, attention_mask=b_input_mask).logits

            # Weight each batch by its size so a short final batch does not
            # count as much as a full one in the reported average.
            batch_size = b_labels.shape[0]
            weighted_loss_sum += loss_fn(logits, b_labels).item() * batch_size
            n_samples += batch_size

            # torch.sigmoid is numerically stable for extreme logits, unlike
            # a hand-written 1 / (1 + exp(-x)) which overflows in exp().
            prob_chunks.append(torch.sigmoid(logits.float()).cpu().numpy())
            label_chunks.append(b_labels.cpu().numpy())

    if n_samples == 0:
        raise ValueError("evaluate_model received an empty dataloader; nothing to evaluate.")

    probabilities = np.concatenate(prob_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    avg_eval_loss = weighted_loss_sum / n_samples

    # Binary predictions using a 0.5 probability threshold
    predictions = (probabilities > 0.5).astype(int)

    f1_macro = f1_score(all_labels, predictions, average='macro', zero_division=0)
    f1_micro = f1_score(all_labels, predictions, average='micro', zero_division=0)

    # ROC-AUC is undefined for a label whose ground truth holds a single
    # class. Skip those labels and macro-average the rest, rather than
    # letting one degenerate column zero out the whole metric.
    per_label_auc = []
    for label_idx in range(all_labels.shape[1]):
        column = all_labels[:, label_idx]
        if column.min() == column.max():
            continue
        per_label_auc.append(roc_auc_score(column, probabilities[:, label_idx]))
    auc_roc = float(np.mean(per_label_auc)) if per_label_auc else 0.0

    return avg_eval_loss, f1_macro, f1_micro, auc_roc


## 2. FINE-TUNING LOOP

print("\n--- STARTING BERT FINE-TUNING AND EVALUATION ---")

for epoch_i in range(EPOCHS):
    print(f'\n======== EPOCH {epoch_i + 1} / {EPOCHS} ========')

    # 2a. TRAINING MODE
    model.train()
    total_train_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Each batch holds (input ids, attention mask, labels); place all
        # three on the training device.
        b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch[:3])

        # Fresh gradients, forward pass, loss
        optimizer.zero_grad()
        logits = model(b_input_ids, attention_mask=b_input_mask).logits
        loss = loss_fn(logits, b_labels)

        # Backward pass; cap the gradient norm at 1.0 before the update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_train_loss += loss.item()

    # Mean of the per-batch training losses for this epoch
    avg_train_loss = total_train_loss / len(train_dataloader)

    # 2b. EVALUATION MODE
    avg_val_loss, f1_macro, f1_micro, auc_roc = evaluate_model(
        model, validation_dataloader, device, loss_fn
    )

    # 2c. PRINT RESULTS AND MONITOR OVERFITTING
    epoch_report = (
        f"  Training Loss (Average): {avg_train_loss:.4f}",
        f"  Validation Loss (Average): {avg_val_loss:.4f} üî•",
        f"  F1-Macro (Validation): {f1_macro:.4f} üî¨",
        f"  AUC-ROC (Validation): {auc_roc:.4f}",
    )
    for report_line in epoch_report:
        print(report_line)

print("\n--- FINE-TUNING COMPLETE ---")


--- STARTING BERT FINE-TUNING AND EVALUATION ---

  Training Loss (Average): 0.7012
  Validation Loss (Average): 0.6898 üî•
  F1-Macro (Validation): 0.2769 üî¨
  AUC-ROC (Validation): 0.5354

  Training Loss (Average): 0.6718
  Validation Loss (Average): 0.6646 üî•
  F1-Macro (Validation): 0.4057 üî¨
  AUC-ROC (Validation): 0.6261

  Training Loss (Average): 0.6477
  Validation Loss (Average): 0.6401 üî•
  F1-Macro (Validation): 0.5315 üî¨
  AUC-ROC (Validation): 0.7028

  Training Loss (Average): 0.6257
  Validation Loss (Average): 0.6280 üî•
  F1-Macro (Validation): 0.6107 üî¨
  AUC-ROC (Validation): 0.7170

--- FINE-TUNING COMPLETE ---


# Saving the model

In [16]:
# --- Saving the model ---
# Persist only the learned parameters (the model's state_dict) to FINAL_MODEL_PATH.
# A failure is reported on stdout rather than raised.
# ------------------------
try:
    learned_weights = model.state_dict()
    torch.save(learned_weights, FINAL_MODEL_PATH)
except Exception as e:
    print(f"\n‚ùå Error saving the model: {e}")
else:
    print(f"\n‚úÖ Model weights from the latest epoch saved in: {FINAL_MODEL_PATH}")


‚úÖ Model weights from the latest epoch saved in: /content/drive/MyDrive/Colab Notebooks/project 2 LLM/clinicalbert_final_epoch_model.pt
