# Notebook to Reproduce the Results for Each Task

## This notebook takes a model, a task number, and dataset paths as its main inputs.

Please provide the data files required for the task you want to run:

1. For Task 1 -- only the G1 data path is required.
2. For Task 2 -- both the G1 and G2 data paths are required.
3. For Task 3 -- the G1, G2, and G3 data paths are all required.

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
from sklearn.model_selection import train_test_split
from utils import*

import warnings
warnings.filterwarnings("ignore")

## Function to load and preprocess data

In [2]:
def load_and_preprocess_data(test_data, tokenizer):
    """Build token encodings and label ids for an evaluation frame.

    Args:
        test_data: pandas DataFrame with at least a ``tags`` and a ``text``
            column. The caller's frame is left untouched; the derived
            ``new_tags``, ``parsed_tags`` and ``bio_tags`` columns are added
            to a private copy.
        tokenizer: Tokenizer object, forwarded unchanged to
            ``tokenize_and_align_labels`` and ``process_for_ner``.

    Returns:
        A ``(test_encodings, test_labels)`` tuple: the return values of
        ``tokenize_and_align_labels`` and ``process_for_ner`` respectively.
    """
    # Define label mappings (tag name -> integer class id)
    label_map = {
        "O": 0,
        "B-treatment": 1, "I-treatment": 2,
        "B-chronic_disease": 3, "I-chronic_disease": 4,
        "B-cancer": 5, "I-cancer": 6,
        "B-allergy_name": 7, "I-allergy_name": 8
    }

    # Work on a copy: assigning new columns directly on the argument would
    # silently alter the caller's DataFrame.
    prepared = test_data.copy()

    # Adjust ranges and parse tags for NER. Each step reads the column written
    # by the previous one, so the order matters.
    # NOTE(review): adjust_ranges / parse_tags / to_bio come from `utils`;
    # their exact input/output formats are not visible here -- confirm there.
    prepared['new_tags'] = prepared['tags'].apply(adjust_ranges)
    prepared['parsed_tags'] = prepared['new_tags'].apply(parse_tags)
    prepared['bio_tags'] = prepared['parsed_tags'].apply(to_bio)
    prepared['text'] = prepared['text'].astype(str)

    # Tokenize text and align labels with tokens
    test_encodings = tokenize_and_align_labels(prepared['text'].tolist(), tokenizer)
    test_labels = process_for_ner(prepared, tokenizer, label_map)

    return test_encodings, test_labels

## Function to evaluate the model (named `train_and_evaluate`; it runs evaluation only)

In [3]:
def train_and_evaluate(test_encodings, test_labels, model, batch_size=16):
    """Evaluate ``model`` on the encoded test set and return the report.

    Despite its name, this function performs no training: it wraps the
    encodings in a dataset, batches them, and delegates to
    ``get_classification_report``.

    Args:
        test_encodings: Encodings passed as the first argument to
            ``ClinicalDataset``.
        test_labels: Labels passed as the second argument to
            ``ClinicalDataset``.
        model: Model handed to ``get_classification_report``.
        batch_size: Number of samples per evaluation batch. Defaults to 16,
            the value that used to be hard-coded.

    Returns:
        Whatever ``get_classification_report`` returns. Earlier versions of
        this function discarded that value and returned ``None``.
    """
    # Create test dataset and dataloader; shuffle=False keeps the samples in
    # dataset order.
    test_dataset = ClinicalDataset(test_encodings, test_labels)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Evaluate the model.
    # NOTE(review): get_classification_report (from `utils`) presumably prints
    # the report itself -- confirm before adding a print here.
    print("Evaluating the model...")
    return get_classification_report(test_loader, model)

## Main function for evaluation

In [4]:
def main(model, tokenizer, task, data_path_g1, data_path_g2=None, data_path_g3=None):
    """Run the evaluation pipeline for the requested task.

    Task 1 evaluates on the G1 test split; task 2 on the G1 and G2 test
    splits combined; task 3 on the G1, G2 and G3 test splits combined. If a
    path needed by the task is ``None``, or ``task`` is not equal to 1, 2 or
    3, a message is printed and nothing else happens.

    Args:
        model: Model forwarded to ``train_and_evaluate``.
        tokenizer: Tokenizer forwarded to ``load_and_preprocess_data``.
        task: Task number; compared with ``==`` against 1, 2 and 3.
        data_path_g1: Path of the G1 data, used by every task.
        data_path_g2: Path of the G2 data, needed for tasks 2 and 3.
        data_path_g3: Path of the G3 data, needed for task 3.

    Returns:
        None.
    """
    # Resolve which datasets the task needs, bailing out early on bad input.
    if task == 1:
        sources = [data_path_g1]
        combine_message = None
    elif task == 2:
        if data_path_g2 is None:
            print("Please provide data G1, G2 data both to Proceed for Task 2 or Try Task 1")
            return
        sources = [data_path_g1, data_path_g2]
        combine_message = "Combining G1_test data and G2_test data"
    elif task == 3:
        if data_path_g2 is None or data_path_g3 is None:
            print("Please provide G1, G2 and G3 datas to Proceed for Task 3 or Try Task 1")
            return
        sources = [data_path_g1, data_path_g2, data_path_g3]
        combine_message = "Combining G1_test data, G2_test data and G3_test data"
    else:
        print("Specify the task number in integers like 1, 2, 3 and TRY AGAIN !!")
        return

    print("Task:", task)

    # Split every dataset in order; only the test half is used for evaluation.
    test_splits = []
    for path in sources:
        _train_split, test_split = full_process_and_split_with_stratify(path, save_path=None)
        test_splits.append(test_split)

    if combine_message is None:
        # Single-dataset task: use the split as returned, without concatenating.
        test_data = test_splits[0]
    else:
        print(combine_message)
        test_data = pd.concat(test_splits).reset_index(drop=True)
    print("Train test split completed!!")

    test_encodings, test_labels = load_and_preprocess_data(test_data, tokenizer)
    print("Encoding Done!!")
    train_and_evaluate(test_encodings, test_labels, model)

## Code to Execute

### You can choose model 'FILENAME' from here:

##### Model for Task 1
-- FILENAME = "biobert_complete_model_G1_trained.pth"

##### Model for Task 2
-- FILENAME = "biobert_complete_model_G1+G2_trained.pth"

##### Model for Task 3
-- FILENAME = "biobert_complete_model_G1+G2+G3_trained.pth"


##### Combined model using G1+G2+G3
-- FILENAME = "biobert_combined_trained_model.pth"

In [5]:
# Download and load models from Hugging Face hub

import torch
from huggingface_hub import hf_hub_download

REPO_ID = "madhu147/miimansa_ner_project"


def load_checkpoint(filename):
    """Download ``filename`` from ``REPO_ID`` and unpickle it onto the CPU.

    Args:
        filename: Name of the ``.pth`` file inside the Hugging Face repo.

    Returns:
        The object stored in the checkpoint, as returned by ``torch.load``.
    """
    checkpoint_path = hf_hub_download(repo_id=REPO_ID, filename=filename)
    # SECURITY: torch.load unpickles the file, which can execute arbitrary
    # code. Only load checkpoints from a repository you trust.
    # map_location="cpu" lets a checkpoint saved on a GPU load on a machine
    # without CUDA; the selected model is moved to `device` further down.
    return torch.load(checkpoint_path, map_location="cpu", encoding='latin1')


# Load models
model1 = load_checkpoint("biobert_complete_model_G1_trained.pth")
model2 = load_checkpoint("biobert_complete_model_G1+G2_trained.pth")
model3 = load_checkpoint("biobert_complete_model_G1+G2+G3_trained.pth")
combined_model = load_checkpoint("biobert_combined_trained_model.pth")

# Set device for model operations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizerFast.from_pretrained("dmis-lab/biobert-v1.1")

model = model3  # Specify the model to use
# Kept as the last expression so the notebook still displays the model summary.
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [9]:
## Update parameters

# Update Data Paths
# main() needs data_path_g1 for every task, data_path_g2 for tasks 2 and 3,
# and data_path_g3 for task 3.
data_path_g1 = '../G1.xlsx'
data_path_g2 = '../G2.xlsx'
data_path_g3 = '../G3.xlsx'

# Specify the task to perform
# main() compares this against the integers 1, 2 and 3; any other value
# (including the string "1") only prints a "TRY AGAIN" message.
task = 1

In [10]:
## Run main function
# Pass every dataset path so that switching `task` to 2 or 3 needs no edit
# here; for task 1, main() only reads data_path_g1 and ignores the others.
main(model, tokenizer, task, data_path_g1, data_path_g2, data_path_g3)

Task: 1
Train test split completed!!
Encoding Done!!
Evaluating the model...
                 precision    recall  f1-score   support

   allergy_name      0.755     0.763     0.759       283
      treatment      0.910     0.832     0.869      5285
         cancer      0.909     0.820     0.862      1820
chronic_disease      0.903     0.817     0.858      4912
              O      0.879     0.948     0.912     14116

       accuracy                          0.889     26416
      macro avg      0.871     0.836     0.852     26416
   weighted avg      0.891     0.889     0.888     26416

