In [1]:
!"C:\Users\Lokeshwar Reddy\AppData\Local\Programs\Python\Python310\python.exe" -m pip install hf_xet


Collecting hf_xet
  Downloading hf_xet-1.2.0-cp37-abi3-win_amd64.whl.metadata (5.0 kB)
Downloading hf_xet-1.2.0-cp37-abi3-win_amd64.whl (2.9 MB)
   ---------------------------------------- 0.0/2.9 MB ? eta -:--:--
   ---------------------------------------- 2.9/2.9 MB 28.1 MB/s  0:00:00
Installing collected packages: hf_xet
Successfully installed hf_xet-1.2.0


In [1]:
import json
from datasets import load_dataset, DatasetDict

# --- 1. Define File Paths ---
# Split name -> JSON file, relative to the notebook's working directory.
data_files = {
    'train': 'dataset/train.json',
    'test': 'dataset/test.json',
    'validation': 'dataset/valid.json'
}

label_file_path = 'dataset/label.json'

# --- 2. Load the Datasets from JSON files ---
# The 'json' builder turns the split -> path mapping into a DatasetDict.
try:
    raw_datasets = load_dataset('json', data_files=data_files)
except FileNotFoundError as e:
    print(f"‚ùå Error loading dataset files: {e}")
    print("Please make sure your file paths are correct.")
    # Nothing below can run without the data, so stop here instead of
    # continuing with `raw_datasets` undefined (or stale from an earlier run).
    raise

print("‚úÖ Datasets loaded successfully!")
print(raw_datasets)


# --- 3. Load Label Mappings ---
try:
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(label_file_path, 'r', encoding='utf-8') as f:
        label_list = json.load(f)
except FileNotFoundError as e:
    print(f"‚ùå Error loading label file: {e}")
    raise

# The recorded run shows label.json is a {label_name: tag_id} object, so the
# ids stored in the file are used directly instead of relying on key order.
# A plain list of label names (position == id) is accepted as well.
if isinstance(label_list, dict):
    label_pairs = sorted(
        ((label, int(tag_id)) for label, tag_id in label_list.items()),
        key=lambda pair: pair[1],
    )
else:
    label_pairs = [(label, tag_id) for tag_id, label in enumerate(label_list)]

# Ids stay strings on both sides: later cells look tags up with `str(tag_id)`.
id2label = {str(tag_id): label for label, tag_id in label_pairs}
label2id = {label: str(tag_id) for label, tag_id in label_pairs}

num_labels = len(label_list)

print("\n‚úÖ Label mappings created successfully!")
print(f"Number of labels: {num_labels}")
print(f"Label List: {label_list}")
print(f"id2label mapping: {id2label}")
print(f"label2id mapping: {label2id}")


# --- 4. Inspect the Data (Optional but Recommended) ---
print("\n--- Inspecting Training Data (First Example) ---")

# Check the features of the dataset
print(f"Features: {raw_datasets['train'].features}")

# Print the first example
# This helps you see the 'tokens' and 'tags' (NER labels)
print(raw_datasets['train'][0])

‚úÖ Datasets loaded successfully!
DatasetDict({
    train: Dataset({
        features: ['tags', 'tokens'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tags', 'tokens'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tags', 'tokens'],
        num_rows: 5865
    })
})

‚úÖ Label mappings created successfully!
Number of labels: 5
Label List: {'O': 0, 'B-Chemical': 1, 'B-Disease': 2, 'I-Disease': 3, 'I-Chemical': 4}
id2label mapping: {'0': 'O', '1': 'B-Chemical', '2': 'B-Disease', '3': 'I-Disease', '4': 'I-Chemical'}
label2id mapping: {'O': '0', 'B-Chemical': '1', 'B-Disease': '2', 'I-Disease': '3', 'I-Chemical': '4'}

--- Inspecting Training Data (First Example) ---
Features: {'tags': List(Value('int64')), 'tokens': List(Value('string'))}
{'tags': [1, 0, 0, 0, 0, 0, 1, 0], 'tokens': ['Naloxone', 'reverses', 'the', 'antihypertensive', 'effect', 'of', 'clonidine', '.']}


In [2]:
import transformers
from transformers import AutoTokenizer

# --- 1. Setup Mappings & Tokenizer ---
# Tag-id -> tag-name table; ids are kept as strings because the alignment
# function looks tags up with `str(tag_id)`.
ID2LABEL = {
    '0': 'O',
    '1': 'B-Chemical',
    '2': 'B-Disease',
    '3': 'I-Disease',
    '4': 'I-Chemical',
}
# Inverse table (tag-name -> string id), derived from the table above.
LABEL2ID = {name: tag_id for tag_id, name in ID2LABEL.items()}
model_checkpoint = "dmis-lab/biobert-base-cased-v1.1"

print("‚è≥ Loading BioBERT tokenizer...")
MAIN_TOKENIZER = AutoTokenizer.from_pretrained(model_checkpoint)
print("‚úÖ Tokenizer loaded!")

# --- 2. Define the Alignment Function (UPDATED to accept ALL needed variables) ---
def tokenize_and_align_labels(examples, tokenizer, id2label_map, label2id_map, max_length=512):
    """Tokenize pre-split words and build one label per produced subword token.

    Args:
        examples: Batch with a "tokens" entry (list of word lists) and a
            "tags" entry (list of per-word integer tag ids).
        tokenizer: Callable invoked on the word lists; its result must offer
            ``word_ids(batch_index=...)`` and item assignment.
        id2label_map: Maps ``str(tag_id)`` to the tag name (e.g. 'B-Chemical').
        label2id_map: Maps the tag name to its id (string or int).
        max_length: Upper bound on tokens per example, forwarded to the
            tokenizer. The recorded run of this notebook shows the tokenizer
            warning that it has no predefined maximum, so ``truncation=True``
            alone truncates nothing. 512 is assumed to be the position limit
            of the BERT-base checkpoints used here. Pass ``None`` to fall
            back to the tokenizer's own default.

    Returns:
        The tokenizer output with an added "labels" entry: -100 for positions
        whose word id is None, the word's own tag for its first subword, and
        for following subwords the matching 'I-' tag when the word's tag is a
        'B-' tag (otherwise the word's tag repeated).
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    # Resolve "B-x -> id of I-x" once per batch instead of once per subword.
    continuation_id = {}
    for tag_key, label_name in id2label_map.items():
        if label_name and label_name.startswith('B-'):
            inside_name = 'I-' + label_name[2:]
            if inside_name in label2id_map:
                continuation_id[tag_key] = int(label2id_map[inside_name])

    labels = []
    for i, label_ids in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        new_label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word (special token): ignored by the loss.
                new_label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First subword of a word keeps the word's tag.
                new_label_ids.append(label_ids[word_idx])
            else:
                # Later subword: continue the entity; tags without an 'I-'
                # counterpart (O, I-*, unknown ids) are repeated unchanged.
                original_tag_id = label_ids[word_idx]
                new_label_ids.append(
                    continuation_id.get(str(original_tag_id), original_tag_id)
                )
            previous_word_idx = word_idx
        labels.append(new_label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# --- 3. Run Processing (Safe Mode with ALL variables passed) ---
# Everything below needs `raw_datasets`. Fail here with a clear message rather
# than printing a warning and then crashing (or silently reusing a stale
# `tokenized_datasets`) in the verification step.
if 'raw_datasets' not in globals():
    raise RuntimeError("‚ùå Error: 'raw_datasets' is missing. Please re-run Phase 1 data loading.")

print("\n‚è≥ Starting token-label alignment...")
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=1,
    load_from_cache_file=False,
    desc="Running tokenizer",
    # The tokenizer and both label tables are passed explicitly instead of
    # being read from notebook globals inside the mapped function.
    fn_kwargs={
        "tokenizer": MAIN_TOKENIZER,
        "id2label_map": ID2LABEL,
        "label2id_map": LABEL2ID
    }
)
print("‚úÖ Done! Data is ready for BioBERT.")

# Optional: Quick check of one example
print(tokenized_datasets['train'][0]['labels'][:10])

# --- FINAL VERIFICATION ---
print("Checking dataset structure...")
print(tokenized_datasets)

# Check if 'input_ids' exists in the train features
if 'input_ids' in tokenized_datasets['train'].features:
    print("\n‚úÖ SUCCESS: Dataset has been tokenized and is ready for training!")
    print("Features:", tokenized_datasets['train'].features.keys())
else:
    print("\n‚ùå SOMETHING FAILED: 'input_ids' column is missing.")

import numpy as np
import evaluate
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# --- 1. Load Evaluation Metric ---
# We use seqeval, the standard for NER tasks
try:
    metric = evaluate.load("seqeval")
    print("‚úÖ Seqeval metric loaded!")
except Exception as e:
    print("‚ö†Ô∏è could not load 'evaluate', trying 'datasets' fallback...")
    # Surface the real cause instead of discarding it.
    print(f"Reason: {e!r}")
    try:
        # Legacy API: recent `datasets` releases no longer ship `load_metric`,
        # so the fallback only helps on old installations.
        from datasets import load_metric
    except ImportError:
        raise RuntimeError(
            "seqeval metric unavailable: `evaluate.load` failed and this "
            "`datasets` version has no `load_metric` fallback."
        ) from e
    metric = load_metric("seqeval")

# --- 2. Define Compute Metrics Function ---
# This function handles the -100 ignore index during evaluation
def compute_metrics(p):
    """Score one evaluation pass with the module-level seqeval `metric`.

    `p` unpacks into (scores, gold label ids). Scores are reduced with an
    argmax over axis 2; every position whose gold id is -100 is dropped from
    both sequences, the remaining ids are turned into tag names through
    `ID2LABEL`, and the overall precision / recall / F1 / accuracy reported
    by the metric are returned.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (-100 marks ignored ones).
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([ID2LABEL[str(predicted_id)] for predicted_id, _ in kept])
        true_labels.append([ID2LABEL[str(gold_id)] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    for key in ("precision", "recall", "f1", "accuracy"):
        summary[key] = results["overall_" + key]
    return summary

# --- 3. Load the Model ---
print("\n‚è≥ Loading BioBERT for Token Classification...")
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(ID2LABEL),
    # The model config documents id2label as int -> str and label2id as
    # str -> int. The notebook tables keep ids as strings for the alignment
    # lookups, so convert here to avoid string ids in the model config.
    id2label={int(tag_id): name for tag_id, name in ID2LABEL.items()},
    label2id={name: int(tag_id) for name, tag_id in LABEL2ID.items()},
)
print("‚úÖ Model loaded!")

# --- 4. Define Training Arguments ---
args = TrainingArguments(
    "biobert-finetuned-ner",
    eval_strategy="epoch",          # Same spelling as the later cells of this notebook
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,             # Start with 3 epochs for a quick first run
    weight_decay=0.01,
    save_strategy="epoch",          # Save model after every epoch
    load_best_model_at_end=True,    # Load the best model when finished
    metric_for_best_model="f1",     # Use F1 score to determine "best"
    push_to_hub=False,
)

# --- 5. Initialize the Trainer ---
# The collator pads inputs and labels per batch.
data_collator = DataCollatorForTokenClassification(tokenizer=MAIN_TOKENIZER)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=MAIN_TOKENIZER,
    compute_metrics=compute_metrics
)

print("\n‚úÖ Trainer initialized! Ready to train.")

‚è≥ Loading BioBERT tokenizer...
‚úÖ Tokenizer loaded!

‚è≥ Starting token-label alignment...


Running tokenizer (num_proc=1): 100%|##########| 5228/5228 [00:00<?, ? examples/s]

Running tokenizer (num_proc=1): 100%|##########| 5330/5330 [00:00<?, ? examples/s]

Running tokenizer (num_proc=1): 100%|##########| 5865/5865 [00:00<?, ? examples/s]

‚úÖ Done! Data is ready for BioBERT.
[-100, 1, 4, 4, 4, 0, 0, 0, 0, 0]
Checking dataset structure...
DatasetDict({
    train: Dataset({
        features: ['tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5865
    })
})

‚úÖ SUCCESS: Dataset has been tokenized and is ready for training!
Features: dict_keys(['tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])
‚úÖ Seqeval metric loaded!

‚è≥ Loading BioBERT for Token Classification...


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


‚úÖ Model loaded!

‚úÖ Trainer initialized! Ready to train.


In [2]:
import transformers
from transformers import AutoTokenizer

# --- 1. Define Mappings (from your Phase 1 output) ---
# These convert between tag ids and tag names. The dataset stores tag ids as
# integers while both tables use string ids, so lookups go through
# `str(tag_id)` / `int(...)`.

id2label = {'0': 'O', '1': 'B-Chemical', '2': 'B-Disease', '3': 'I-Disease', '4': 'I-Chemical'}
label2id = {'O': '0', 'B-Chemical': '1', 'B-Disease': '2', 'I-Disease': '3', 'I-Chemical': '4'}

# --- 2. Define Model Checkpoint & Load Tokenizer ---
model_checkpoint = "dmis-lab/biobert-base-cased-v1.1"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
except Exception as e:
    print(f"‚ùå Error loading tokenizer: {e}")
    # The rest of the cell cannot work without a tokenizer; re-raise so the
    # failure is not masked by a later NameError (or a stale `tokenizer`).
    raise

print("‚úÖ BioBERT tokenizer loaded successfully!")


# --- 3. Define the NEW Token-Label Alignment Function ---
# This function implements the B-I propagation logic

def tokenize_and_align_labels(examples, max_length=512):
    """Tokenize pre-split words and build one label per produced subword token.

    Reads the notebook-level `tokenizer`, `id2label` and `label2id`.

    Args:
        examples: Batch with a "tokens" entry (list of word lists) and a
            "tags" entry (list of per-word integer tag ids).
        max_length: Upper bound on tokens per example, forwarded to the
            tokenizer. The recorded output of this cell shows the tokenizer
            warning that it has no predefined maximum, so ``truncation=True``
            alone truncates nothing. 512 is assumed to be the position limit
            of the BERT-base checkpoint used here. Pass ``None`` to fall
            back to the tokenizer's own default.

    Returns:
        The tokenizer output with an added "labels" entry: -100 for positions
        whose word id is None, the word's own tag for its first subword, and
        for following subwords the matching 'I-' tag when the word's tag is a
        'B-' tag (otherwise the word's tag repeated).
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    # Resolve "B-x -> id of I-x" once per batch instead of once per subword.
    continuation_id = {}
    for tag_key, label_name in id2label.items():
        if label_name and label_name.startswith('B-'):
            # e.g., 'B-Chemical' -> 'I-Chemical'
            i_label_name = 'I-' + label_name[2:]
            if i_label_name in label2id:
                continuation_id[tag_key] = int(label2id[i_label_name])

    labels = []
    for i, label_ids in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        new_label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                # Special token (e.g., [CLS], [SEP]): ignored by the loss.
                new_label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First subword of a new word keeps the word's tag.
                new_label_ids.append(label_ids[word_idx])
            else:
                # Subsequent subword of the same word: 'B-' becomes the
                # matching 'I-'; 'O', 'I-' and tags without an 'I-'
                # counterpart are repeated unchanged.
                original_tag_id = label_ids[word_idx]
                new_label_ids.append(
                    continuation_id.get(str(original_tag_id), original_tag_id)
                )

            previous_word_idx = word_idx

        labels.append(new_label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# --- 4. Apply the Function to the Entire Dataset ---
# `raw_datasets` comes from the Phase 1 cell; without it there is nothing to map.
if 'raw_datasets' not in locals():
    print("‚ùå 'raw_datasets' not found. Please run Phase 1 code first.")
else:
    print("\n--- Applying NEW token-label alignment (B-I propagation) ---")

    # .map() runs the alignment over every split
    tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)

    print("‚úÖ Token-label alignment complete!")
    print("\n--- Processed Dataset ---")
    print(tokenized_datasets)

    # --- 5. Inspect the Result (Recommended) ---
    print("\n--- Inspecting First Processed Example ---")
    example = tokenized_datasets['train'][0]

    # One row per subword: token text, numeric label, readable label
    subword_tokens = tokenizer.convert_ids_to_tokens(example["input_ids"])
    for token, label_id in zip(subword_tokens, example["labels"]):
        # -100 has no entry in id2label, so it falls back to a placeholder
        label_str = id2label.get(str(label_id), "IGNORE_INDEX")
        print(f"{token:<15} {label_id:<5} {label_str}")

‚úÖ BioBERT tokenizer loaded successfully!

--- Applying NEW token-label alignment (B-I propagation) ---


Map:   0%|          | 0/5228 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/5330 [00:00<?, ? examples/s]

Map:   0%|          | 0/5865 [00:00<?, ? examples/s]

‚úÖ Token-label alignment complete!

--- Processed Dataset ---
DatasetDict({
    train: Dataset({
        features: ['tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5865
    })
})

--- Inspecting First Processed Example ---
[CLS]           -100  IGNORE_INDEX
na              1     B-Chemical
##lo            4     I-Chemical
##xon           4     I-Chemical
##e             4     I-Chemical
reverse         0     O
##s             0     O
the             0     O
anti            0     O
##hy            0     O
##pert          0     O
##ens           0     O
##ive           0     O
effect          0     O
of              0     O
c               1    

In [None]:
import numpy as np
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate
import transformers
from transformers import AutoTokenizer

# --- 1. Setup Mappings & Tokenizer ---
# Tag-id -> tag-name table; ids are kept as strings because the alignment
# function looks tags up with `str(tag_id)`.
ID2LABEL = {
    '0': 'O',
    '1': 'B-Chemical',
    '2': 'B-Disease',
    '3': 'I-Disease',
    '4': 'I-Chemical',
}
# Inverse table (tag-name -> string id), derived from the table above.
LABEL2ID = {name: tag_id for tag_id, name in ID2LABEL.items()}
model_checkpoint = "dmis-lab/biobert-base-cased-v1.1"

print("‚è≥ Loading BioBERT tokenizer...")
MAIN_TOKENIZER = AutoTokenizer.from_pretrained(model_checkpoint)
print("‚úÖ Tokenizer loaded!")

# --- 2. Define the Alignment Function (UPDATED to accept ALL needed variables) ---
def tokenize_and_align_labels(examples, tokenizer, id2label_map, label2id_map, max_length=512):
    """Tokenize pre-split words and build one label per produced subword token.

    Args:
        examples: Batch with a "tokens" entry (list of word lists) and a
            "tags" entry (list of per-word integer tag ids).
        tokenizer: Callable invoked on the word lists; its result must offer
            ``word_ids(batch_index=...)`` and item assignment.
        id2label_map: Maps ``str(tag_id)`` to the tag name (e.g. 'B-Chemical').
        label2id_map: Maps the tag name to its id (string or int).
        max_length: Upper bound on tokens per example, forwarded to the
            tokenizer. An earlier recorded run in this notebook shows the
            tokenizer warning that it has no predefined maximum, so
            ``truncation=True`` alone truncates nothing. 512 is assumed to be
            the position limit of the BERT-base checkpoint used here. Pass
            ``None`` to fall back to the tokenizer's own default.

    Returns:
        The tokenizer output with an added "labels" entry: -100 for positions
        whose word id is None, the word's own tag for its first subword, and
        for following subwords the matching 'I-' tag when the word's tag is a
        'B-' tag (otherwise the word's tag repeated).
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    # Resolve "B-x -> id of I-x" once per batch instead of once per subword.
    continuation_id = {}
    for tag_key, label_name in id2label_map.items():
        if label_name and label_name.startswith('B-'):
            inside_name = 'I-' + label_name[2:]
            if inside_name in label2id_map:
                continuation_id[tag_key] = int(label2id_map[inside_name])

    labels = []
    for i, label_ids in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        new_label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word (special token): ignored by the loss.
                new_label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First subword of a word keeps the word's tag.
                new_label_ids.append(label_ids[word_idx])
            else:
                # Later subword: continue the entity; tags without an 'I-'
                # counterpart (O, I-*, unknown ids) are repeated unchanged.
                original_tag_id = label_ids[word_idx]
                new_label_ids.append(
                    continuation_id.get(str(original_tag_id), original_tag_id)
                )
            previous_word_idx = word_idx
        labels.append(new_label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# --- 3. Run Processing (Safe Mode with ALL variables passed) ---
if 'raw_datasets' not in locals():
    print("‚ùå Error: 'raw_datasets' is missing. Please re-run Phase 1 data loading.")
else:
    print("\n‚è≥ Starting token-label alignment...")
    # The tokenizer and both label tables travel as explicit keyword
    # arguments instead of being read from notebook globals.
    alignment_kwargs = dict(
        tokenizer=MAIN_TOKENIZER,
        id2label_map=ID2LABEL,
        label2id_map=LABEL2ID,
    )
    tokenized_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        fn_kwargs=alignment_kwargs,
        batched=True,
        num_proc=1,
        load_from_cache_file=False,
        desc="Running tokenizer",
    )
    print("‚úÖ Done! Data is ready for BioBERT.")

    # Optional: quick look at the first ten labels of the first training example
    first_example_labels = tokenized_datasets['train'][0]['labels']
    print(first_example_labels[:10])


# We will use the 'seqeval' metric, as specified in the project plan
seqeval = evaluate.load("seqeval")

# --- 1. Load the Model ---
# Only names defined in THIS cell are used (`model_checkpoint`, `ID2LABEL`,
# `LABEL2ID`), so the cell no longer depends on `num_labels`, `id2label`,
# `label2id` or `tokenizer` left over from other cells.
try:
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(ID2LABEL),
        # The model config documents id2label as int -> str and label2id as
        # str -> int; the notebook tables keep ids as strings, so convert.
        id2label={int(tag_id): name for tag_id, name in ID2LABEL.items()},
        label2id={name: int(tag_id) for name, tag_id in LABEL2ID.items()},
    )
    print("‚úÖ BioBERT model (AutoModelForTokenClassification) loaded successfully!")

except NameError as e:
    print(f"‚ùå Error: A variable from Phase 1 is missing (e.g., 'model_checkpoint' or 'id2label'). {e}")
except Exception as e:
    # The Trainer section checks for `model` before using it.
    print(f"‚ùå Error loading model: {e}")


# --- 2. Define the Data Collator ---
# This will pad our inputs and labels dynamically
data_collator = DataCollatorForTokenClassification(tokenizer=MAIN_TOKENIZER)
print("‚úÖ Data collator initialized.")


# --- 3. Define the Metrics Calculation Function ---
# This function will be called at the end of each epoch to compute F1, etc.
def compute_metrics(eval_preds):
    """Score one evaluation pass with the module-level `seqeval` metric.

    Args:
        eval_preds: Pair of (scores, gold label ids). Scores are reduced with
            an argmax over axis 2.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by seqeval.
    """
    predictions, labels = eval_preds

    # Get the most likely prediction (argmax)
    predictions = np.argmax(predictions, axis=2)

    # Convert numeric ids back to tag names, skipping the -100 positions.
    # Names are resolved through ID2LABEL (string id -> name) defined in this
    # cell: the recorded Phase 1 output shows `label_list` is a dict keyed by
    # label NAME, so indexing it with an integer id raises KeyError.
    true_predictions = [
        [ID2LABEL[str(p)] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [ID2LABEL[str(l)] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Get the results from seqeval
    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # Only the overall scores are reported
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

print("‚úÖ 'compute_metrics' function defined.")


# --- 4. Set Up Training Arguments ---
training_args = TrainingArguments(
    output_dir="./results/biobert",
    eval_strategy="epoch",        # Evaluate at the end of each epoch (same spelling as the later cells)
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,           # You can adjust this (3 is a good start)
    weight_decay=0.01,
    save_strategy="epoch",        # Save checkpoint every epoch
    load_best_model_at_end=True,  # Load the best model based on loss
    push_to_hub=False,            # Set to True if you want to upload
)
print("‚úÖ TrainingArguments defined.")


# --- 5. Initialize the Trainer ---
if 'model' in locals():
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"], # Use 'validation' split
        # This cell names its tokenizer MAIN_TOKENIZER; a bare `tokenizer`
        # exists only if an older cell happened to run in the same kernel.
        tokenizer=MAIN_TOKENIZER,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    print("\nüéâ Trainer is initialized and ready to go!")
    print("You can now start training by running: trainer.train()")
else:
    print("‚ùå Model was not loaded. Trainer cannot be initialized.")

In [None]:
import json
from datasets import load_dataset, DatasetDict

# --- 1. Global Configuration ---
# Split name -> JSON file, relative to the notebook's working directory.
DATA_FILES = {
    'train': 'dataset/train.json',
    'validation': 'dataset/valid.json',
    'test': 'dataset/test.json'
}
LABEL_FILE = 'dataset/label.json'

# --- 2. Load Raw Data ---
print("‚è≥ Loading dataset from local files...")
raw_datasets = load_dataset('json', data_files=DATA_FILES)
print("‚úÖ Raw dataset loaded successfully!")

# --- 3. Load & Define Master Label Mappings ---
# JSON is UTF-8 by specification; do not depend on the platform default.
with open(LABEL_FILE, 'r', encoding='utf-8') as f:
    label_list = json.load(f)

# An earlier recorded run shows label.json is a {label_name: tag_id} object,
# so the ids stored in the file are used directly instead of relying on key
# order. A plain list of label names (position == id) is accepted as well.
if isinstance(label_list, dict):
    label_pairs = sorted(
        ((label, int(tag_id)) for label, tag_id in label_list.items()),
        key=lambda pair: pair[1],
    )
else:
    label_pairs = [(label, tag_id) for tag_id, label in enumerate(label_list)]

# Create global mappings used by ALL tracks. Ids stay strings on both sides
# because the alignment and metric code look tags up with `str(tag_id)`.
ID2LABEL = {str(tag_id): label for label, tag_id in label_pairs}
LABEL2ID = {label: str(tag_id) for label, tag_id in label_pairs}

print("\n‚úÖ Master label mappings established:")
print(f"Total Labels: {len(label_list)}")
print(f"ID2LABEL: {ID2LABEL}")

import transformers
from transformers import AutoTokenizer

# --- 1. Track A Configuration ---
# Checkpoint identifier handed to `from_pretrained`; the same constant is
# reused further down when the Track A model is loaded.
MODEL_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.1"

print(f"‚è≥ Loading tokenizer for {MODEL_CHECKPOINT}...")
# Tokenizer shared by the Track A preprocessing, data collator and Trainer.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
print("‚úÖ BioBERT Tokenizer loaded.")

# --- 2. The Robust Alignment Function ---
# This function fixes the "subword" problem by propagating labels correctly.
def align_labels_biobert(examples, tokenizer_obj, id2label_map, label2id_map, max_length=512):
    """Tokenize pre-split words and build one label per produced subword token.

    Args:
        examples: Batch with a "tokens" entry (list of word lists) and a
            "tags" entry (list of per-word integer tag ids).
        tokenizer_obj: Callable invoked on the word lists; its result must
            offer ``word_ids(batch_index=...)`` and item assignment.
        id2label_map: Maps ``str(tag_id)`` to the tag name (e.g. 'B-Chemical').
        label2id_map: Maps the tag name to its id (string or int).
        max_length: Upper bound on tokens per example, forwarded to the
            tokenizer. An earlier recorded run in this notebook shows the
            BioBERT tokenizer warning that it has no predefined maximum, so
            ``truncation=True`` alone truncates nothing. 512 is assumed to be
            the position limit of the checkpoints used with this function.
            Pass ``None`` to fall back to the tokenizer's own default.

    Returns:
        The tokenizer output with an added "labels" entry: -100 for positions
        whose word id is None, the word's own tag for its first subword, and
        for following subwords the matching 'I-' tag when the word's tag is a
        'B-' tag (otherwise the word's tag repeated).
    """
    tokenized_inputs = tokenizer_obj(
        examples["tokens"],
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    # Resolve "B-x -> id of I-x" once per batch instead of once per subword.
    continuation_id = {}
    for tag_key, label_name in id2label_map.items():
        if label_name and label_name.startswith('B-'):
            inside_name = 'I-' + label_name[2:]
            if inside_name in label2id_map:
                continuation_id[tag_key] = int(label2id_map[inside_name])

    labels = []
    for i, label_ids in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        new_label_ids = []
        for word_idx in word_ids:
            # 1. Handle Special Tokens ([CLS], [SEP]) -> Ignore (-100)
            if word_idx is None:
                new_label_ids.append(-100)
            # 2. Handle New Words -> Keep original label
            elif word_idx != previous_word_idx:
                new_label_ids.append(label_ids[word_idx])
            # 3. Handle Subsequent Subwords -> Propagate "I-" tag; O, I- and
            #    tags without an "I-" counterpart are repeated unchanged
            else:
                original_tag_id = label_ids[word_idx]
                new_label_ids.append(
                    continuation_id.get(str(original_tag_id), original_tag_id)
                )
            previous_word_idx = word_idx
        labels.append(new_label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# --- 3. Execute Preprocessing ---
print("\n‚è≥ Starting Token-Label Alignment (Track A)...")
# We pass all variables explicitly to avoid multiprocessing errors
track_a_kwargs = dict(tokenizer_obj=TOKENIZER, id2label_map=ID2LABEL, label2id_map=LABEL2ID)
tokenized_datasets_A = raw_datasets.map(
    align_labels_biobert,
    fn_kwargs=track_a_kwargs,
    batched=True,
    num_proc=1,                  # Keep at 1 for safety in notebooks
    load_from_cache_file=False,  # Ensure fresh run
    desc="Aligning labels for BioBERT",
)
print("‚úÖ Track A Data Ready!")

# --- 4. Verification ---
# Print the head and the tail of the first training example so the alignment
# can be checked by eye: subword token | numeric label (readable label).
ex = tokenized_datasets_A['train'][0]
verification_windows = (
    ("\nüìä Verification (First 15 tokens of train[0]):", slice(None, 15)),
    ("üìä Final Verification (Last 5 tokens):", slice(-5, None)),
)
for header, window in verification_windows:
    print(header)
    for token, label in zip(TOKENIZER.convert_ids_to_tokens(ex['input_ids'][window]), ex['labels'][window]):
        print(f"{token:<12} | {label:<4} ({ID2LABEL.get(str(label), 'IGNORE')})")

import torch
import numpy as np
import evaluate
from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

# --- 0. GPU Check ---
# Reports whether PyTorch can see a CUDA device before training starts.
if not torch.cuda.is_available():
    print("‚ö†Ô∏è GPU NOT detected. Training will run SLOWLY on CPU.")
    print("If you have a GPU, ensure you installed the CUDA version of PyTorch.")
else:
    print(f"‚úÖ GPU Detected: {torch.cuda.get_device_name(0)}")
    print("üöÄ Training will automatically use the GPU.")

# --- 1. Metrics Setup ---
# seqeval metric object used by `compute_metrics_biobert`.
METRIC = evaluate.load("seqeval")

def compute_metrics_biobert(p):
    """Score one evaluation pass with the module-level seqeval `METRIC`.

    `p` unpacks into (scores, gold label ids). Scores are reduced with an
    argmax over axis 2; positions whose gold id is -100 are skipped, the
    remaining ids are mapped to tag names via `ID2LABEL`, and the overall
    precision / recall / F1 / accuracy from the metric are returned.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        predicted_names, gold_names = [], []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # -100 marks positions that carry no label
            if gold_id == -100:
                continue
            predicted_names.append(ID2LABEL[str(predicted_id)])
            gold_names.append(ID2LABEL[str(gold_id)])
        true_predictions.append(predicted_names)
        true_labels.append(gold_names)

    scores = METRIC.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=scores["overall_precision"],
        recall=scores["overall_recall"],
        f1=scores["overall_f1"],
        accuracy=scores["overall_accuracy"],
    )



# --- 2. Model Loading (with ERROR FIX) ---
print(f"\n‚è≥ Loading BioBERT Model: {MODEL_CHECKPOINT}...")
model_A = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(ID2LABEL),
    # The model config documents id2label as int -> str and label2id as
    # str -> int. The notebook tables keep ids as strings, so convert here;
    # otherwise the string ids end up in the config written by save_model().
    id2label={int(tag_id): name for tag_id, name in ID2LABEL.items()},
    label2id={name: int(tag_id) for name, tag_id in LABEL2ID.items()},
    # CRITICAL FIX for older PyTorch versions.
    # NOTE(review): presumably this turns off torch's restricted (safe)
    # checkpoint unpickling - keep it only for checkpoints you trust; confirm.
    weights_only=False
)
print("‚úÖ Model loaded successfully.")

training_args = TrainingArguments(
    output_dir="biobert_track_a_improved_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,             # Increased epochs to 10
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    warmup_ratio=0.1,                # Add warmup for learning rate
    lr_scheduler_type="linear",      # Use linear decay for learning rate
)
data_collator = DataCollatorForTokenClassification(tokenizer=TOKENIZER)
# --- Initialize Trainer with Early Stopping ---
trainer_A = Trainer(
    model=model_A,
    args=training_args,
    train_dataset=tokenized_datasets_A["train"],
    eval_dataset=tokenized_datasets_A["validation"],
    tokenizer=TOKENIZER,
    data_collator=data_collator,
    compute_metrics=compute_metrics_biobert,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # Stop after 3 epochs of no improvement
)

print("\nüöÄ Improved Track A (BioBERT) Trainer is ready! Run 'trainer_A.train()' to begin.")


trainer_A.train()

# --- Save Track A (BioBERT) Model ---
# This is CRITICAL to avoid re-training later.
save_path_a = "./saved_models/biobert_track_a"
trainer_A.save_model(save_path_a)
TOKENIZER.save_pretrained(save_path_a)

print(f"‚úÖ Track A (BioBERT) model successfully saved to: {save_path_a}")


from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorForTokenClassification
import torch
# --- 1. Track B Configuration ---
MODEL_CHECKPOINT_B = "microsoft/deberta-v3-base"

print(f"‚è≥ Loading DeBERTa tokenizer: {MODEL_CHECKPOINT_B}...")
TOKENIZER_B = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT_B)
print("‚úÖ DeBERTa Tokenizer loaded.")

# --- 2. Data Processing for DeBERTa ---
# We reuse the EXACT SAME alignment function 'align_labels_biobert' from Track A
# but we pass it the NEW tokenizer. This ensures perfectly fair comparison.
print("\n‚è≥ Aligning data for DeBERTa (Track B)...")
tokenized_datasets_B = raw_datasets.map(
    align_labels_biobert,
    batched=True,
    num_proc=1,
    load_from_cache_file=False,
    desc="Aligning for DeBERTa",
    fn_kwargs={"tokenizer_obj": TOKENIZER_B, "id2label_map": ID2LABEL, "label2id_map": LABEL2ID}
)

# --- 3. Load DeBERTa Model ---
print(f"\n‚è≥ Loading DeBERTa Model: {MODEL_CHECKPOINT_B}...")
model_B = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT_B,
    num_labels=len(ID2LABEL),
    # The model config documents id2label as int -> str and label2id as
    # str -> int. The notebook tables keep ids as strings, so convert here;
    # otherwise the string ids end up in the config written by save_model().
    id2label={int(tag_id): name for tag_id, name in ID2LABEL.items()},
    label2id={name: int(tag_id) for name, tag_id in LABEL2ID.items()},
    # Safety bypass for older PyTorch.
    # NOTE(review): presumably this turns off torch's restricted (safe)
    # checkpoint unpickling - keep it only for checkpoints you trust; confirm.
    weights_only=False
)

# --- 4. Define Track B Trainer ---
# Using comparable settings to Track A for fairness
args_B = TrainingArguments(
    output_dir="deberta_track_b_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16, # If you get CUDA out of memory, change this to 8
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
    fp16=torch.cuda.is_available(),
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
)


trainer_B = Trainer(
    model=model_B,
    args=args_B,
    train_dataset=tokenized_datasets_B["train"],
    eval_dataset=tokenized_datasets_B["validation"],
    tokenizer=TOKENIZER_B,
    data_collator=DataCollatorForTokenClassification(tokenizer=TOKENIZER_B),
    compute_metrics=compute_metrics_biobert, # We can reuse the same metric function
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


print("üöÄ Starting Track B (DeBERTa) training...")
trainer_B.train()

# --- Save Track B (DeBERTa) Model ---
save_path_b = "./saved_models/deberta_track_b"
trainer_B.save_model(save_path_b)
TOKENIZER_B.save_pretrained(save_path_b)

print(f"‚úÖ Track B (DeBERTa) model successfully saved to: {save_path_b}")

from collections import Counter

# --- 1. Build Vocabulary (case-insensitive: every token is lower-cased) ---
print("‚è≥ Building lowercase vocabulary from training data...")

PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

# Frequency table over the lower-cased tokens of the training split.
word_counts = Counter(
    token.lower()
    for sentence in raw_datasets['train']
    for token in sentence['tokens']
)

# IDs 0 and 1 are reserved for padding / unknown words; real words follow
# in descending frequency order.
word2id = {PAD_TOKEN: 0, UNK_TOKEN: 1}
word2id.update(
    (word, index)
    for index, (word, _count) in enumerate(word_counts.most_common(), start=len(word2id))
)

# Reverse lookup: ID -> word.
id2word = dict((index, word) for word, index in word2id.items())

VOCAB_SIZE = len(word2id)
print(f"‚úÖ Lowercase vocabulary built! Size: {VOCAB_SIZE} unique words.")
print(f"Example mapping: 'the' -> {word2id.get('the')}")

import numpy as np
import os
from tqdm import tqdm 
import numpy as np
import torch
from gensim.models import KeyedVectors
import os

# --- Embedding file locations ---
# Relative paths to the pre-trained word-vector files; adjust them to your local layout.
FASTTEXT_PATH = "./embeddings/wiki-news-300d-1M.vec"  # loaded below with binary=False (text format)
BIOWORDVEC_PATH = "./embeddings/bio_embedding_intrinsic"  # loaded below with binary=True (binary format)

# --- Superior Loading Function (using Gensim) ---
def load_embeddings_gensim(path, word2id, embedding_dim, binary=False, limit=500000):
    """Build an embedding matrix for ``word2id`` from a word2vec-format file.

    Every row starts as a uniform random vector in [-0.25, 0.25); the ``<PAD>``
    row is zeroed. Rows whose word is present in the loaded vectors are then
    overwritten with the pre-trained vector.

    Args:
        path: Location of the word2vec-format file.
        word2id: Mapping word -> row index; must contain ``"<PAD>"``.
        embedding_dim: Expected vector width of the file.
        binary: Passed to gensim; True for binary files, False for text files.
        limit: Maximum number of vectors to read from the file (``None`` reads
            all of them). Defaults to the previously hard-coded 500000.

    Returns:
        A ``(len(word2id), embedding_dim)`` numpy array, or ``None`` when the
        file is missing, cannot be loaded, or has a different vector width.
        The reason is printed in every failure case.
    """
    print(f"‚è≥ Loading embeddings from {path} (Binary={binary})...")

    # 1. Random initialisation; the padding row stays all-zero.
    vocab_size = len(word2id)
    embedding_matrix = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_dim))
    embedding_matrix[word2id["<PAD>"]] = np.zeros((embedding_dim,))

    try:
        # 2. Let gensim parse the file; `limit` caps how many vectors are read.
        kv = KeyedVectors.load_word2vec_format(path, binary=binary, limit=limit)

        # Fail with a readable message instead of a broadcast error further down.
        if kv.vector_size != embedding_dim:
            print(f"‚ùå ERROR loading file: embedding_dim={embedding_dim} does not match "
                  f"the file's vector size {kv.vector_size}")
            return None

        # 3. Copy every vector we have a vocabulary entry for.
        hit_count = 0
        for word, idx in word2id.items():
            if word in kv:
                embedding_matrix[idx] = kv[word]
                hit_count += 1

        print(f"‚úÖ Loaded! Coverage: {hit_count / vocab_size:.2%}")
        return embedding_matrix

    except FileNotFoundError:
        print(f"‚ùå ERROR: File not found at {path}")
        return None
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller decide.
        print(f"‚ùå ERROR loading file: {e}")
        return None

# --- Build one embedding matrix per Track C variant ---
# `word2id` must already exist (it is built in the vocabulary step).

# Track C2: FastText vectors are stored as text, hence binary=False.
print("\n--- Loading Track C2 (FastText) ---")
embedding_matrix_fasttext = load_embeddings_gensim(
    path=FASTTEXT_PATH,
    word2id=word2id,
    embedding_dim=300,
    binary=False,
)

# Track C1: BioWordVec is a binary file, hence binary=True.
print("\n--- Loading Track C1 (BioWordVec) ---")
embedding_matrix_bio = load_embeddings_gensim(
    path=BIOWORDVEC_PATH,
    word2id=word2id,
    embedding_dim=200,
    binary=True,
)

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# --- 1. Define Custom Dataset Class (FIXED: all lowercase) ---
class NERDataset(Dataset):
    """Map-style dataset that converts one sentence into ID tensors.

    Each underlying record must provide ``'tokens'`` (list of words) and
    ``'tags'`` (list of integer label IDs). Words are lower-cased before the
    vocabulary lookup; words missing from ``word2id`` map to the ``<UNK>`` ID.
    ``label2id`` is stored but not used by ``__getitem__``.
    """

    def __init__(self, hf_dataset, word2id, label2id):
        self.dataset, self.word2id, self.label2id = hf_dataset, word2id, label2id
        self.unk_id = word2id["<UNK>"]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``{'token_ids': LongTensor, 'label_ids': LongTensor}`` for record ``idx``."""
        record = self.dataset[idx]
        lookup = self.word2id.get
        # Case-insensitive lookup with <UNK> as the fallback.
        token_ids = [lookup(word.lower(), self.unk_id) for word in record['tokens']]
        return {
            'token_ids': torch.tensor(token_ids, dtype=torch.long),
            'label_ids': torch.tensor(record['tags'], dtype=torch.long),
        }

# --- 2. Define Collate Function (No changes needed) ---
def collate_fn(batch):
    """Pad a list of dataset items into rectangular batch tensors.

    Token sequences are right-padded with 0, label sequences with -100.

    Returns:
        ``(padded_tokens, padded_labels, attention_masks)`` where the mask is a
        long tensor holding 1 wherever the padded token ID is non-zero.
    """
    token_seqs, label_seqs = [], []
    for sample in batch:
        token_seqs.append(sample['token_ids'])
        label_seqs.append(sample['label_ids'])

    padded_tokens = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    padded_labels = pad_sequence(label_seqs, batch_first=True, padding_value=-100)
    # The mask is derived from the padding value: any non-zero token counts as real.
    return padded_tokens, padded_labels, padded_tokens.ne(0).long()

# --- 3. Create DataLoaders ---
BATCH_SIZE = 32

# Wrap the train / validation splits with the shared vocabulary and label map.
train_dataset, valid_dataset = (
    NERDataset(raw_datasets[split_name], word2id, LABEL2ID)
    for split_name in ('train', 'validation')
)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, collate_fn=collate_fn, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(dataset=valid_dataset, collate_fn=collate_fn, shuffle=False, batch_size=BATCH_SIZE)

print("‚úÖ PyTorch DataLoaders created!")
# Smoke test: pull a single batch and show its padded shape.
sample_tokens, sample_labels, sample_masks = next(iter(train_loader))
print(f"Sample batch shape: {sample_tokens.shape}")


import torch.nn as nn
from torchcrf import CRF

# --- 1. Install CRF Library (if needed) ---
# If you already ran this, you can comment it out, but it's safe to leave.
# !pip install pytorch-crf

# --- 2. Define the Model Architecture ---
class BiLSTM_CRF(nn.Module):
    """Embedding -> bidirectional LSTM -> linear tag projection -> CRF.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each word vector.
        hidden_dim: Total BiLSTM output width; each direction gets
            ``hidden_dim // 2`` units.
        num_labels: Number of tags scored by the projection and the CRF.
        embedding_matrix: Optional numpy array of shape
            ``(vocab_size, embedding_dim)`` copied into the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_labels, embedding_matrix=None):
        super(BiLSTM_CRF, self).__init__()

        # 1. Embedding layer; index 0 is the padding row.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Optionally start from pre-trained vectors.
        if embedding_matrix is not None:
            print("üîß Loading pre-trained embedding weights into model...")
            self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))

        # 2. BiLSTM layer (two directions of hidden_dim // 2 each).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True, batch_first=True)

        # 3. Linear mapping (hidden state -> tag scores).
        self.hidden2tag = nn.Linear(hidden_dim, num_labels)

        # 4. CRF layer over the tag scores.
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, mask):
        """Return per-token emission scores of shape ``(batch, seq_len, num_labels)``.

        When ``mask`` is given, each sequence is packed to its true length so
        the LSTM never reads padding. Without this, the backward direction
        starts on pad tokens and the scores of real tokens change with the
        amount of padding in the batch. ``mask`` is assumed to mark a
        contiguous prefix of real tokens (right padding). Passing ``None``
        runs the LSTM over the full padded tensor.
        """
        # Local import keeps this class self-contained within the notebook.
        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

        embeds = self.embedding(input_ids)
        if mask is None:
            lstm_out, _ = self.lstm(embeds)
        else:
            # Lengths must live on the CPU and be >= 1 for packing.
            lengths = mask.sum(dim=1).clamp(min=1).cpu()
            packed = pack_padded_sequence(embeds, lengths, batch_first=True, enforce_sorted=False)
            packed_out, _ = self.lstm(packed)
            # total_length restores the original padded width.
            lstm_out, _ = pad_packed_sequence(
                packed_out, batch_first=True, total_length=input_ids.size(1)
            )
        emissions = self.hidden2tag(lstm_out)
        return emissions

    def compute_loss(self, emissions, tags, mask):
        """Return the negated CRF log-likelihood of ``tags`` given ``emissions``."""
        log_likelihood = self.crf(emissions, tags, mask=mask.bool())
        return -log_likelihood

    def decode(self, emissions, mask):
        """Return the CRF's decoded tag sequences for the unmasked positions."""
        return self.crf.decode(emissions, mask=mask.bool())

print("‚úÖ BiLSTM_CRF model class defined!")


# --- Shared settings for both Track C models ---
HIDDEN_DIM = 256
NUM_LABELS = len(LABEL2ID)
VOCAB_SIZE = len(word2id)
# Prefer CUDA when present, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"üöÄ Using device: {device}")

# --- Track C1: BioWordVec (200-dimensional vectors) ---
print("\nüèóÔ∏è Initializing Track C1 (BioWordVec)...")
model_c1 = BiLSTM_CRF(VOCAB_SIZE, 200, HIDDEN_DIM, NUM_LABELS, embedding_matrix_bio)
model_c1.to(device)

# --- Track C2: FastText (300-dimensional vectors) ---
print("\nüèóÔ∏è Initializing Track C2 (FastText)...")
model_c2 = BiLSTM_CRF(VOCAB_SIZE, 300, HIDDEN_DIM, NUM_LABELS, embedding_matrix_fasttext)
model_c2.to(device)

print("\n‚úÖ Both custom models initialized and moved to GPU!")


import torch.optim as optim
from tqdm import tqdm
import numpy as np

# Ensure METRIC is loaded globally from previous steps

# --- UPDATED Training Loop with CRF Padding Fix ---
def train_bilstm(model, train_loader, valid_loader, epochs=10, learning_rate=0.01, patience=3, name="Model"):
    """Train a BiLSTM-CRF with Adam, validation F1 tracking and early stopping.

    After every epoch the model is scored on ``valid_loader`` via the global
    ``METRIC`` (its ``"overall_f1"`` entry). The best-scoring weights are saved
    to ``f"{name}_best.pth"`` and reloaded into ``model`` before returning.

    The first scored epoch always writes a checkpoint. Previously a run whose
    F1 never rose above 0 saved nothing, and the final reload either failed or
    silently picked up a stale file from an earlier run.

    Args:
        model: Module exposing ``forward(tokens, mask)``, ``compute_loss`` and ``decode``.
        train_loader / valid_loader: Iterables yielding ``(tokens, labels, mask)`` batches.
        epochs: Maximum number of epochs.
        learning_rate: Adam learning rate.
        patience: Stop after this many consecutive epochs without an F1 improvement.
        name: Prefix of the checkpoint file and label used in log lines.

    Returns:
        ``model`` with the best checkpoint of this run loaded (unchanged if no
        epoch was run).
    """
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    checkpoint_path = f"{name}_best.pth"
    best_f1 = None  # stays None until the first epoch has been scored
    epochs_no_improve = 0

    print(f"\nüöÄ Starting training for {name}...")

    for epoch in range(epochs):
        # --- TRAINING PHASE ---
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]"):
            tokens, labels, mask = [b.to(device) for b in batch]

            # The CRF needs valid tag indices everywhere, so the -100 label
            # padding is replaced by tag 0; the mask still excludes those positions.
            safe_labels = labels.masked_fill(~mask.bool(), 0)

            model.zero_grad()
            emissions = model(tokens, mask)
            loss = model.compute_loss(emissions, safe_labels, mask)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()
        # max(..., 1) avoids a ZeroDivisionError for an empty loader.
        avg_train_loss = total_loss / max(len(train_loader), 1)

        # --- VALIDATION PHASE ---
        model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in tqdm(valid_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]"):
                tokens, labels, mask = [b.to(device) for b in batch]
                emissions = model(tokens, mask)
                pred_tags = model.decode(emissions, mask)

                for i, sent_preds in enumerate(pred_tags):
                    # Number of unmasked positions = true sentence length.
                    real_len = mask[i].sum().item()
                    sent_labels = labels[i][:real_len].cpu().numpy()
                    sent_preds = sent_preds[:real_len]
                    all_preds.append([ID2LABEL[str(p)] for p in sent_preds])
                    all_labels.append([ID2LABEL[str(l)] for l in sent_labels])

        # --- METRICS ---
        results = METRIC.compute(predictions=all_preds, references=all_labels)
        f1 = results["overall_f1"]
        print(f"üìä Epoch {epoch+1}: Train Loss={avg_train_loss:.4f} | Val F1={f1:.4%}")

        if best_f1 is None or f1 > best_f1:
            best_f1 = f1
            epochs_no_improve = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"\nüõë Early stopping triggered at epoch {epoch+1}.")
            break

    reported_f1 = best_f1 if best_f1 is not None else 0.0
    print(f"\nüèÅ Training complete for {name}! Best Val F1: {reported_f1:.4%}")
    # Only reload when this run actually produced a checkpoint.
    if best_f1 is not None:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    return model


# --- Train Track C1 (BioWordVec) ---
# At most 10 epochs; stops early after 3 epochs without a validation-F1 gain.
model_c1_trained = train_bilstm(
    model_c1,
    train_loader,
    valid_loader,
    name="Track_C1_BioWordVec",
    learning_rate=0.01,
    patience=3,
    epochs=10,
)



# --- Train Track C2 (FastText) with the same settings ---
model_c2_trained = train_bilstm(
    model_c2,
    train_loader,
    valid_loader,
    name="Track_C2_FastText",
    learning_rate=0.01,
    patience=3,
    epochs=10,
)


# --- Test loader for Track C ---
# Same Dataset class and collate function as for train / validation; no shuffling.
test_dataset = NERDataset(raw_datasets['test'], word2id, LABEL2ID)
test_loader = DataLoader(dataset=test_dataset, collate_fn=collate_fn, shuffle=False, batch_size=BATCH_SIZE)

print("‚úÖ Track C Test DataLoader is ready.")

# --- 1. Define function to add word_ids ---
# We need to re-pass the tokenizers to this function
def add_word_ids(example, tokenizer_obj):
    """Return ``{"word_ids": [...]}`` for one example.

    ``example["tokens"]`` is tokenized as pre-split words with truncation
    enabled, and the encoding's ``word_ids()`` result is returned.
    """
    encoding = tokenizer_obj(example["tokens"], truncation=True, is_split_into_words=True)
    return dict(word_ids=encoding.word_ids())

# Track A: attach word_ids computed with the BioBERT tokenizer.
print("‚è≥ Adding word_ids to Track A (BioBERT) test set...")
tokenized_datasets_A["test"] = tokenized_datasets_A["test"].map(
    function=add_word_ids,
    num_proc=1,
    fn_kwargs=dict(tokenizer_obj=TOKENIZER),
)

# Track B: same step with the DeBERTa tokenizer.
print("‚è≥ Adding word_ids to Track B (DeBERTa) test set...")
tokenized_datasets_B["test"] = tokenized_datasets_B["test"].map(
    function=add_word_ids,
    num_proc=1,
    fn_kwargs=dict(tokenizer_obj=TOKENIZER_B),
)

print("\n‚úÖ Test sets are now ready for span-level evaluation.")
# Show the schema so the new column can be verified.
print(tokenized_datasets_A["test"].features)


import numpy as np
from seqeval.metrics import classification_report


# --- REVISED Evaluation Function for Trainers (A & B) ---
def evaluate_transformer(trainer, test_dataset):
    """Print a word-level seqeval report using the first-subword rule.

    For every word only the prediction and the label at its first subword
    position are kept; positions whose word ID is ``None`` are skipped.

    Args:
        trainer: Object exposing ``predict(dataset)`` and ``model.name_or_path``.
        test_dataset: Dataset with a ``'word_ids'`` column aligned to the
            tokenized inputs.
    """
    def _first_subword_labels(word_ids, pred_row, gold_row):
        # Collect (prediction, label) at each position where a new word begins.
        kept_preds, kept_gold = [], []
        last_word = None
        for pos, word_idx in enumerate(word_ids):
            if word_idx is not None:
                if word_idx != last_word:
                    kept_preds.append(ID2LABEL[str(pred_row[pos])])
                    kept_gold.append(ID2LABEL[str(gold_row[pos])])
                last_word = word_idx
        return kept_preds, kept_gold

    print(f"\n‚è≥ Evaluating {trainer.model.name_or_path} (Span-Level)...")

    # Highest-scoring tag per subword position.
    logits, gold_ids, _ = trainer.predict(test_dataset)
    pred_ids = np.argmax(logits, axis=2)

    word_id_rows = test_dataset['word_ids']

    reconciled_preds = []
    reconciled_labels = []
    for row in range(len(word_id_rows)):
        sentence_preds, sentence_labels = _first_subword_labels(
            word_id_rows[row], pred_ids[row], gold_ids[row]
        )
        reconciled_preds.append(sentence_preds)
        reconciled_labels.append(sentence_labels)

    # Score the word-level sequences.
    print("‚úÖ Span-level reconciliation complete. Final Results:")
    print(classification_report(reconciled_labels, reconciled_preds, digits=4))


# --- Evaluation Function for BiLSTM-CRF (FIXED) ---
def evaluate_bilstm(model_path, model_instance, test_loader):
    """Load saved weights into ``model_instance`` and print a seqeval report.

    Args:
        model_path: Path of a state-dict file written with ``torch.save``.
        model_instance: Model exposing ``forward(tokens, mask)`` and ``decode``.
        test_loader: Iterable yielding ``(tokens, labels, mask)`` batches.
    """
    print(f"\n‚è≥ Evaluating {model_path} on test set...")

    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint written on a GPU can still be opened in a CPU-only session.
    model_instance.load_state_dict(torch.load(model_path, map_location=device))
    model_instance.to(device)
    model_instance.eval()

    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            tokens, labels, mask = [b.to(device) for b in batch]
            emissions = model_instance(tokens, mask)
            pred_tags = model_instance.decode(emissions, mask)

            for i, sent_preds in enumerate(pred_tags):
                # Number of unmasked positions = true sentence length.
                real_len = mask[i].sum().item()
                sent_labels = labels[i][:real_len].cpu().numpy()
                sent_preds = sent_preds[:real_len]
                all_preds.append([ID2LABEL[str(p)] for p in sent_preds])
                all_labels.append([ID2LABEL[str(l)] for l in sent_labels])

    print("‚úÖ Evaluation complete. Results:")
    print(classification_report(all_labels, all_preds, digits=4))

print("‚úÖ Fixed `evaluate_bilstm` function is defined.")


# --- Final test-set evaluation of all four tracks ---

# Tracks A (BioBERT) and B (DeBERTa): subword predictions reduced to word level.
evaluate_transformer(trainer=trainer_A, test_dataset=tokenized_datasets_A["test"])
evaluate_transformer(trainer=trainer_B, test_dataset=tokenized_datasets_B["test"])

# Tracks C1 (BioWordVec) and C2 (FastText): these models already predict per word.
evaluate_bilstm(
    model_path="Track_C1_BioWordVec_best.pth",
    model_instance=model_c1,
    test_loader=test_loader,
)
evaluate_bilstm(
    model_path="Track_C2_FastText_best.pth",
    model_instance=model_c2,
    test_loader=test_loader,
)


import numpy as np
from seqeval.metrics import classification_report

# --- NEW Evaluation Function (Strategy 2b: Prioritized Vote) ---
def evaluate_transformer_prioritized_vote(trainer, test_dataset):
    """Print a word-level seqeval report using the prioritized-vote rule.

    All subword predictions of a word are collected. The word receives the
    first ``B-`` tag among them (in subword order); if there is none, the
    first ``I-`` tag; otherwise ``'O'``. The gold label of a word is the label
    at its first subword.

    Votes are kept in subword order. Storing them in a ``set`` made "the first
    tag" depend on hash ordering, so results could differ between runs when a
    word received several distinct ``B-``/``I-`` predictions.

    Args:
        trainer: Object exposing ``predict(dataset)`` and ``model.name_or_path``.
        test_dataset: Dataset with a ``'word_ids'`` column aligned to the
            tokenized inputs.
    """
    def _pick_winner(votes):
        # Priority: first B- tag, then first I- tag, else 'O'.
        for prefix in ('B-', 'I-'):
            for label in votes:
                if label.startswith(prefix):
                    return label
        return 'O'

    print(f"\n‚è≥ Evaluating {trainer.model.name_or_path} (Strategy 2b: Prioritized Vote)...")

    predictions_output, label_ids, _ = trainer.predict(test_dataset)
    predictions = np.argmax(predictions_output, axis=2)
    all_word_ids = test_dataset['word_ids']

    reconciled_preds = []
    reconciled_labels = []

    for i in range(len(all_word_ids)):
        word_ids = all_word_ids[i]
        preds = predictions[i]
        labels = label_ids[i]

        reconciled_sentence_preds = []
        reconciled_sentence_labels = []

        current_word_idx = None
        current_word_votes = []  # subword predictions of the current word, in order
        current_word_true_label = 'O'

        for j, word_idx in enumerate(word_ids):
            if word_idx != current_word_idx:
                # A new word (or a special token) begins: close the previous word.
                if current_word_idx is not None:
                    reconciled_sentence_preds.append(_pick_winner(current_word_votes))
                    reconciled_sentence_labels.append(current_word_true_label)

                current_word_votes = []
                current_word_idx = word_idx

                if word_idx is not None:
                    # First subword: it supplies the gold label and the first vote.
                    current_word_true_label = ID2LABEL[str(labels[j])]
                    current_word_votes.append(ID2LABEL[str(preds[j])])
                else:
                    current_word_true_label = 'O'
            elif word_idx is not None:
                # Continuation subword: add its vote.
                current_word_votes.append(ID2LABEL[str(preds[j])])

        # Close the last word when the sequence does not end on a special token.
        if current_word_idx is not None:
            reconciled_sentence_preds.append(_pick_winner(current_word_votes))
            reconciled_sentence_labels.append(current_word_true_label)

        reconciled_preds.append(reconciled_sentence_preds)
        reconciled_labels.append(reconciled_sentence_labels)

    print("‚úÖ Prioritized vote reconciliation complete. Final Results:")
    print(classification_report(reconciled_labels, reconciled_preds, digits=4))


# --- Prioritized-vote evaluation (Strategy 2b) for both transformer tracks ---

print("--- 1. Evaluating Track A (BioBERT) with Prioritized Vote ---")
evaluate_transformer_prioritized_vote(
    trainer=trainer_A,
    test_dataset=tokenized_datasets_A["test"],
)

print("\n--- 2. Evaluating Track B (DeBERTa) with Prioritized Vote ---")
evaluate_transformer_prioritized_vote(
    trainer=trainer_B,
    test_dataset=tokenized_datasets_B["test"],
)

import numpy as np
from collections import Counter

# --- NEW Trace Function (Strategy 2b: Prioritized Vote) ---
def trace_prioritized_vote(trainer, tokenized_test_dataset, raw_test_dataset, indices_to_trace):
    """Print a subword-by-subword trace of the prioritized-vote reconciliation.

    For each selected example the table shows every subword with its predicted
    label, gold label and word ID, followed by the vote tally per word and the
    final word-level prediction / label lists.

    A word receives the first ``B-`` tag among its subword predictions (in
    subword order), else the first ``I-`` tag, else ``'O'``. Distinct votes are
    kept in an ordered list: a ``set`` made both the winner and the printed
    tally depend on hash ordering. The tally is therefore printed as a list.

    Args:
        trainer: Object exposing ``predict(dataset)`` and a tokenizer, either
            as ``processing_class`` or as ``tokenizer``.
        tokenized_test_dataset: Dataset with ``'input_ids'`` and ``'word_ids'``.
        raw_test_dataset: Dataset with the original ``'tokens'``.
        indices_to_trace: Example indices to select from both datasets.
    """
    rule = "-------------------------------------------------------------------------------------"

    def _pick_winner(votes):
        # Priority 1: first B- tag; priority 2: first I- tag; default 'O'.
        for prefix in ('B-', 'I-'):
            for label in votes:
                if label.startswith(prefix):
                    return label
        return 'O'

    def _add_vote(votes, label):
        # Keep distinct votes only, in first-seen order.
        if label not in votes:
            votes.append(label)

    examples_to_trace = tokenized_test_dataset.select(indices_to_trace)
    raw_examples_to_trace = raw_test_dataset.select(indices_to_trace)

    predictions_output, label_ids, _ = trainer.predict(examples_to_trace)
    predictions = np.argmax(predictions_output, axis=2)
    all_word_ids = examples_to_trace['word_ids']

    # NOTE(review): newer transformers releases expose the tokenizer as
    # `processing_class` and deprecate `Trainer.tokenizer`; prefer the new
    # attribute when it is present — confirm against the installed version.
    tokenizer = getattr(trainer, "processing_class", None)
    if tokenizer is None:
        tokenizer = trainer.tokenizer

    for i, idx in enumerate(indices_to_trace):
        print(f"\n--- üåä TRACING EXAMPLE {idx} (Prioritized Vote) ---")

        original_tokens = raw_examples_to_trace[i]['tokens']
        subword_ids = examples_to_trace[i]['input_ids']
        subwords = tokenizer.convert_ids_to_tokens(subword_ids)
        word_ids = all_word_ids[i]
        pred_labels_ids = predictions[i]
        true_labels_ids = label_ids[i]

        print(f"Original Sentence:\n{' '.join(original_tokens)}\n")
        print("Reconciliation Trace (Strategy 2b: Prioritized Vote):")
        print(rule)
        print(f"{'Subword':<15} | {'Pred Label':<12} | {'True Label':<12} | {'Word ID':<5} | {'Vote Status'}")
        print(rule)

        reconciled_preds = []
        reconciled_labels = []

        current_word_idx = None
        current_word_preds = []  # distinct subword predictions, in order
        current_word_true_label = 'O'

        for j, word_idx in enumerate(word_ids):

            # IDs without a mapping (e.g. the ignore index) are displayed as "PAD".
            pred_label_str = ID2LABEL.get(str(pred_labels_ids[j]), "PAD")
            true_label_str = ID2LABEL.get(str(true_labels_ids[j]), "PAD")

            if word_idx != current_word_idx:
                # A new word (or special token) begins: close the previous word.
                if current_word_idx is not None:
                    winner_label = _pick_winner(current_word_preds)
                    reconciled_preds.append(winner_label)
                    reconciled_labels.append(current_word_true_label)

                    print(rule)
                    print(f"TALLY VOTES for Word {current_word_idx}: {current_word_preds} -> WINNER: {winner_label} (B-tag priority)")
                    print(rule)

                current_word_preds = []
                current_word_idx = word_idx

                if word_idx is not None:
                    # First subword: supplies the gold label and the first vote.
                    current_word_true_label = ID2LABEL[str(true_labels_ids[j])]
                    _add_vote(current_word_preds, pred_label_str)
                    print(f"{subwords[j]:<15} | {pred_label_str:<12} | {true_label_str:<12} | {word_idx:<5} | Add 1st vote: {pred_label_str}")
                else:
                    current_word_true_label = 'O'
                    print(f"{subwords[j]:<15} | {pred_label_str:<12} | {true_label_str:<12} | {'None':<5} | Skipping [CLS]/[SEP]")

            elif word_idx is not None:
                # Continuation subword: add its vote.
                _add_vote(current_word_preds, pred_label_str)
                print(f"{subwords[j]:<15} | {pred_label_str:<12} | {true_label_str:<12} | {word_idx:<5} | Add vote: {pred_label_str}")

        # Close the last word when the sequence does not end on a special token.
        if current_word_idx is not None:
            winner_label = _pick_winner(current_word_preds)
            reconciled_preds.append(winner_label)
            reconciled_labels.append(current_word_true_label)
            print(rule)
            print(f"TALLY VOTES for Word {current_word_idx} (FINAL): {current_word_preds} -> WINNER: {winner_label} (B-tag priority)")
            print(rule)

        print("\n--- FINAL WORD-LEVEL RECONCILIATION ---")
        print(f"Reconciled Preds: {reconciled_preds}")
        print(f"Reconciled Labels: {reconciled_labels}")

# --- Trace Track B (DeBERTa) on three test examples ---
# Arguments: trainer, its tokenized test split, the raw test split, example indices.
indices_to_run = [2, 8, 20]
trace_prioritized_vote(
    trainer_B,
    tokenized_datasets_B["test"],
    raw_datasets["test"],
    indices_to_run,
)

# --- Trace Track A (BioBERT) on three test examples ---
indices_to_run = [2, 5, 20]
trace_prioritized_vote(
    trainer_A,
    tokenized_datasets_A["test"],
    raw_datasets["test"],
    indices_to_run,
)

import torch

# --- NEW Trace Function (BiLSTM-CRF) ---
def trace_bilstm(model_instance, model_name, raw_test_dataset, indices_to_trace):
    """Print a word-by-word comparison of BiLSTM-CRF predictions and gold labels.

    Weights are loaded from ``f"{model_name}_best.pth"``. If loading fails the
    error is printed and the function returns without tracing.

    Tokens are lower-cased before the vocabulary lookup, matching how the
    vocabulary and ``NERDataset`` are built. Previously the raw casing was
    used, so every capitalised word was fed to the model as ``<UNK>``.

    Args:
        model_instance: Model exposing ``forward(tokens, mask)`` and ``decode``.
        model_name: Checkpoint prefix and label used in the printed header.
        raw_test_dataset: Dataset with ``'tokens'`` and integer ``'tags'``.
        indices_to_trace: Example indices to select and trace.
    """
    # Load the best weights saved during training.
    model_path = f"{model_name}_best.pth"
    try:
        # map_location lets a GPU-saved checkpoint load on the active device.
        model_instance.load_state_dict(
            torch.load(model_path, weights_only=True, map_location=device)
        )
        model_instance.to(device)
        model_instance.eval()
    except Exception as e:
        print(f"Error loading model weights for {model_name}: {e}")
        return

    raw_examples_to_trace = raw_test_dataset.select(indices_to_trace)

    print(f"\n--- üåä TRACING MODEL: {model_name} ---")

    unk_id = word2id["<UNK>"]

    for i, idx in enumerate(indices_to_trace):
        print(f"\n--- Example {idx} ---")

        # 1. Raw tokens and gold labels.
        original_tokens = raw_examples_to_trace[i]['tokens']
        true_label_ids = raw_examples_to_trace[i]['tags']
        true_labels_str = [ID2LABEL[str(lid)] for lid in true_label_ids]

        # 2. Vocabulary lookup; the vocabulary holds lower-cased words only.
        token_ids = [word2id.get(w.lower(), unk_id) for w in original_tokens]

        # 3. Batch of one sentence, so every position is unmasked.
        token_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
        mask_tensor = torch.ones_like(token_tensor, dtype=torch.long).to(device)

        # 4. Decode the tag sequence.
        with torch.no_grad():
            emissions = model_instance(token_tensor, mask_tensor)
            decoded_ids = model_instance.decode(emissions, mask_tensor)[0]

        pred_labels_str = [ID2LABEL[str(pid)] for pid in decoded_ids]

        # 5. Comparison table.
        print(f"Original Sentence:\n{' '.join(original_tokens)}\n")
        print("Word-Level Prediction Trace (Strategy 1: CRF):")
        print("-------------------------------------------------------")
        print(f"{'Word':<20} | {'Predicted Label':<15} | {'True Label':<15}")
        print("-------------------------------------------------------")

        for k in range(len(original_tokens)):
            # Mark each row as correct or incorrect.
            is_correct = pred_labels_str[k] == true_labels_str[k]
            mark = "‚úÖ" if is_correct else "‚ùå"
            print(f"{original_tokens[k]:<20} | {pred_labels_str[k]:<15} | {true_labels_str[k]:<15} | {mark}")
        print("-------------------------------------------------------")


# --- Trace both BiLSTM-CRF models on the same three test examples ---
indices_to_run = [2, 5, 20]

# Track C1 (BioWordVec)
trace_bilstm(model_c1, "Track_C1_BioWordVec", raw_datasets["test"], indices_to_run)

# Track C2 (FastText)
trace_bilstm(model_c2, "Track_C2_FastText", raw_datasets["test"], indices_to_run)