# Notebook For TASK 4 with Data G4

## To run this notebook, Follow these steps:

1. Keep 'utils.py' file in the same folder
2. Update the data path for new data (G4)
3. Keep these files in the same folder 'subset_data', 'g1_test', 'g2_test', 'g3_test'
4. Choose the model from downloaded model from hugging face
5. Change or update parameters-- According to your need. I have used num_epochs = 20

### Import all the libraries and functions (from utils)

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
from sklearn.model_selection import train_test_split
from huggingface_hub import hf_hub_download

import warnings
warnings.filterwarnings("ignore")

from utils import*

## Function to load and preprocess data

In [2]:
def load_and_preprocess_data(train_data, test_data, tokenizer):
    # Mapping from BIO tags to numeric IDs for model training
    label_map = {
        "O": 0,
        "B-treatment": 1, "I-treatment": 2,
        "B-chronic_disease": 3, "I-chronic_disease": 4,
        "B-cancer": 5, "I-cancer": 6,
        "B-allergy_name": 7, "I-allergy_name": 8
    }

    # Print status
    print("Preprocessing Function")
    
    # Adjust range of tags and parse tags into structured format
    train_data['new_tags'] = train_data['tags'].apply(adjust_ranges)
    test_data['new_tags'] = test_data['tags'].apply(adjust_ranges)
    
    train_data['parsed_tags'] = train_data['new_tags'].apply(parse_tags)
    test_data['parsed_tags'] = test_data['new_tags'].apply(parse_tags)
    
    # Convert parsed tags to BIO format
    train_data['bio_tags'] = train_data['parsed_tags'].apply(to_bio)
    test_data['bio_tags'] = test_data['parsed_tags'].apply(to_bio)
    
    # Ensure 'text' is of type string
    train_data['text'] = train_data['text'].astype(str)
    test_data['text'] = test_data['text'].astype(str)
    
    # Tokenize text and align tokens with labels
    train_encodings = tokenize_and_align_labels(train_data['text'].tolist(), tokenizer)
    test_encodings = tokenize_and_align_labels(test_data['text'].tolist(), tokenizer)
    
    # Process labels for NER
    train_labels = process_for_ner(train_data, tokenizer, label_map)
    test_labels = process_for_ner(test_data, tokenizer, label_map)
    
    return train_encodings, train_labels, test_encodings, test_labels

## Function to train and evaluate the model

In [3]:
def train_and_evaluate(train_encodings, train_labels, test_encodings, test_labels, model, num_epochs, learning_rate):
    # Create datasets
    train_dataset = ClinicalDataset(train_encodings, train_labels)
    test_dataset = ClinicalDataset(test_encodings, test_labels)
    
    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
    
    # Setup optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    
    print("Training Started...")
    
    # List to keep track of average losses per epoch
    epoch_losses = []
    
    # Training loop
    for epoch in range(num_epochs):
        loss = train_model(train_loader, model, optimizer, device)
        avg_epoch_loss = loss / len(train_loader)
        epoch_losses.append(avg_epoch_loss)
        print(f"Epoch {epoch+1}, Loss: {loss}")
        
    # Evaluate the model
    print("Evaluating the model...")
    report = get_classification_report(test_loader, model)

    # Save model
    print("Saving model")
    model.save_pretrained('./model_output')
    tokenizer.save_pretrained('./model_output')

## Main function to data loading, training, and evaluation

In [4]:
def main(data_path, subset_data, g1_test, g2_test, g3_test, model, tokenizer, num_epochs, learning_rate):
    print("New task data loaded")
    
    # Split data into train and test sets
    train_data, test_data = full_process_and_split_with_stratify(data_path, save_path=None)
    print("Train test split completed!!")
    
    print("Combining subset data from G1, G2, and G3 and G4 for model training...")
    train_data_with_subset = pd.concat([train_data, subset_data]).reset_index(drop=True)
    test_data_combined = pd.concat([test_data, g1_test, g2_test, g3_test]).reset_index(drop=True)
    
    # Load and preprocess data
    train_encodings, train_labels, test_encodings, test_labels = load_and_preprocess_data(
        train_data_with_subset, test_data_combined, tokenizer)
    
    print("Encoding Done!!")
    
    # Train and evaluate the model
    train_and_evaluate(train_encodings, train_labels, test_encodings, test_labels, model, num_epochs, learning_rate)

## Code to execute

In [5]:
# Setup paths and parameters
new_data_path = '../G3.xlsx'  # Path for G4 data, task T4

## These datasets are present in the repo
subset_data = pd.read_csv('./subset_data_combined_g1_g2_g3.csv')
g1_test = pd.read_csv('./test_dataset_for_t1_task_using_G1.csv')
g2_test = pd.read_csv('./test_dataset_for_t2_task_using_G2.csv')
g3_test = pd.read_csv('./test_dataset_for_t3_task_using_G3.csv')


# subset_data = pd.read_csv('../task_dataset/subset_data_combined_g1_g2_g3.csv') ## This data is present in the repo
# g1_test = pd.read_csv('../task_dataset/test_dataset_for_t1_task_using_G1.csv') ## This data is present in repo
# g2_test = pd.read_csv('../task_dataset/test_dataset_for_t2_task_using_G2.csv') ## This data is present in repo
# g3_test = pd.read_csv('../task_dataset/test_dataset_for_t3_task_using_G3.csv') ## This data is present in repo

# Training parameters
num_epochs = 20
learning_rate = 1e-5

### You can choose model 'FILENAME' from here:

##### Model for Task 1
-- FILENAME = "biobert_complete_model_G1_trained.pth"

##### Model for Task 2
-- FILENAME = "biobert_complete_model_G1+G2_trained.pth"

##### Model for Task 3
-- FILENAME = "biobert_complete_model_G1+G2+G3_trained.pth"


##### Combined model using G1+G2+G3
-- FILENAME = "biobert_combined_trained_model.pth"

In [6]:
# Load models from Hugging Face Hub
FILENAME = "biobert_complete_model_G1+G2+G3_trained.pth"


model = torch.load(hf_hub_download(repo_id="madhu147/miimansa_ner_project", filename=FILENAME), encoding='latin1')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokenizer = BertTokenizerFast.from_pretrained("dmis-lab/biobert-v1.1")

In [7]:
# Run main function
main(new_data_path, subset_data, g1_test, g2_test, g3_test, model, tokenizer, num_epochs, learning_rate)

New task data loaded
Train test split completed!!
Combining subset data from G1, G2, and G3 for model training...
Preprocessing Function
Encoding Done!!
Training Started...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training: 100%|███████████████████████████████| 332/332 [10:57<00:00,  1.98s/it]


Epoch 1, Loss: 0.043861489267690186
Evaluating the model...
                 precision    recall  f1-score   support

         cancer      0.884     0.911     0.898      7343
              O      0.952     0.920     0.936     50255
   allergy_name      0.783     0.829     0.805      1413
chronic_disease      0.887     0.925     0.905     19204
      treatment      0.902     0.926     0.914     20554

       accuracy                          0.920     98769
      macro avg      0.882     0.902     0.891     98769
   weighted avg      0.921     0.920     0.920     98769

Saving model
