In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets transformers seqeval -q

import os
import json
import glob
import random
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict, ClassLabel, load_metric
from seqeval.metrics import classification_report
import re




In [None]:
folder_path = '/content/drive/MyDrive/openave_jsons'

# Sort the paths so the DataFrame row order is the same on every run.
json_paths = sorted(glob.glob(f'{folder_path}/*.json'))
if not json_paths:
    # Fail here with a clear message instead of building an empty DataFrame
    # that only breaks in a later cell, far from the real cause.
    raise FileNotFoundError(f"No .json files found in {folder_path}")

data = []
for file in json_paths:
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        item = json.load(f)
    data.append(item)

# One row per JSON file, one column per key.
df = pd.DataFrame(data)
df.head()


Unnamed: 0,ReportText,findings,clinicaldata,ExamName,impression
0,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Cough. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,IMPRESSION: Normal 2-view chest radiography.
1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,IMPRESSION: No acute cardiopulmonary abnormali...
2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 03/24/2020 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Cough. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 03/24/2020 ...,IMPRESSION: Normal 2-view chest radiography.
3,CHEST TWO VIEWS CLINICAL INFORMATION: Chest pa...,"FINDINGS: Heart, lungs and vessels normal. No ...",CLINICAL INFORMATION: Chest pain. \n\n,CHEST TWO VIEWS\n\nCOMPARISON: XR CHEST PA AND...,IMPRESSION: Negative chest. Dictated by: [[PER...
4,Exam: - CHEST-PORTABLE History: Chest pain Com...,Findings: AP portable examination of the chest...,History: Chest pain \n\n,Exam: - CHEST-PORTABLE\n\nComparison: None,Impression: Normal portable chest.


In [None]:
# Per-column count of missing values.
print(df.isna().sum())


ReportText      0
findings        0
clinicaldata    0
ExamName        0
impression      0
dtype: int64


### 🔹 Text Cleaning and Token Tagging

- `clean_text(text)`: replaces `\n` with space and collapses multiple spaces.
- `tag_tokens(row)`:
  - Cleans `ReportText` and tokenizes it.
  - Initializes all tokens as `"O"`.
  - For each label field (`findings`, `impression`, `clinicaldata`, `ExamName`):
    - Cleans and tokenizes the span.
    - If a match is found in the report tokens:
      - Tags as `B-`, `I-`, `E-` (or `S-` for single-token spans).
    - Advances index to skip already-tagged spans.
- Output: `df["labels"]` contains BIOES tags aligned to `ReportText` tokens.


In [None]:


def clean_text(text):
    """Flatten ``text`` onto one line with single-space separators.

    Newlines become spaces and every run of whitespace collapses to one space.
    Leading/trailing whitespace is reduced to a single space, not stripped.
    Any value that is not a ``str`` yields ``""``.
    """
    if isinstance(text, str):
        return re.sub(r"\s+", " ", text.replace("\n", " "))
    return ""


In [None]:
def tag_tokens(row):
    """Return one BIOES tag per whitespace token of ``row["ReportText"]``.

    The cleaned text of each label field (``findings``, ``impression``,
    ``clinicaldata``, ``ExamName``) is searched for as a contiguous token
    sequence inside the cleaned report. A match is tagged ``B-``/``I-``/``E-``
    (``S-`` when the field is a single token); everything else stays ``"O"``.
    When several fields match at the same position, the one listed first wins.
    """
    report_tokens = clean_text(row["ReportText"]).split()
    tags = ["O"] * len(report_tokens)

    # Tokenised field values to search for, in priority order; empty fields are dropped.
    candidates = []
    for field in ("findings", "impression", "clinicaldata", "ExamName"):
        field_tokens = clean_text(row.get(field, "")).split()
        if field_tokens:
            candidates.append((field_tokens, field))

    pos = 0
    while pos < len(report_tokens):
        # First candidate whose tokens line up with the report at `pos`, if any.
        hit = next(
            (
                (field_tokens, field)
                for field_tokens, field in candidates
                if report_tokens[pos:pos + len(field_tokens)] == field_tokens
            ),
            None,
        )
        if hit is None:
            pos += 1
            continue

        field_tokens, field = hit
        width = len(field_tokens)
        if width == 1:
            tags[pos] = f"S-{field}"
        else:
            tags[pos] = f"B-{field}"
            # Interior tokens (none when width == 2); same-length slice assignment.
            tags[pos + 1:pos + width - 1] = [f"I-{field}"] * (width - 2)
            tags[pos + width - 1] = f"E-{field}"
        # Jump past the span so its tokens are never re-examined.
        pos += width
    return tags


df["labels"] = df.apply(tag_tokens, axis=1)


### 🔹 Dataset Splitting and HuggingFace Wrapping

- Split the full DataFrame `df`:
  - 80% for `train_df`
  - 10% for `val_df`
  - 10% for `test_df`
- Wrap them into a Hugging Face `DatasetDict` (keys `train`, `validation`, `test`) for compatibility with Transformers.



In [None]:
# 80/10/10 split: hold out 20% first, then halve that hold-out into validation and test.
SPLIT_SEED = 42
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SPLIT_SEED)


In [None]:
# Convert each pandas split into a Hugging Face Dataset, keyed by split name.
split_frames = {"train": train_df, "validation": val_df, "test": test_df}
dataset_dict = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in split_frames.items()}
)



### 🔹 Tokenization and Label Alignment for the Model

- Used `emilyalsentzer/Bio_ClinicalBERT` tokenizer.
- Created `label2id` and `id2label` mappings from `df["labels"]`.
- Defined a `tokenize_and_align` function to:
  - Tokenize the `ReportText` using `is_split_into_words=True` for word-level alignment.
  - Align each token with its corresponding label ID using `word_ids()`.
  - Assign `-100` to special tokens (ignored during loss computation).
- Applied the function to the dataset:
  ```python
  tokenized_datasets = dataset_dict.map(tokenize_and_align)
  ```


In [None]:
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Collect every distinct tag that occurs in the corpus; sorting keeps the
# tag -> id assignment stable from run to run.
unique_labels = set()
for row_labels in df["labels"]:
    unique_labels.update(row_labels)
label_list = sorted(unique_labels)

# Forward and reverse lookups between tag strings and integer class ids.
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = dict(enumerate(label_list))

def tokenize_and_align(examples, max_length=512):
    """Tokenize one report and project its word-level tags onto sub-word tokens.

    Used with ``Dataset.map`` in non-batched mode, so ``examples`` is a single
    row: ``examples["ReportText"]`` is one string and ``examples["labels"]`` is
    the list of tags for its whitespace-separated words.

    Args:
        examples: One dataset row with ``ReportText`` and ``labels``.
        max_length: Maximum number of tokens kept (special tokens included).
            Defaults to 512, the position-embedding limit of BERT-style
            checkpoints such as Bio_ClinicalBERT.

    Returns:
        The tokenizer output with an added ``labels`` list aligned to the
        tokens; special tokens carry ``-100`` so the loss ignores them.
    """
    # str.split() splits on any whitespace run, the same word boundaries that
    # were used when the word-level tags were built.
    words = examples["ReportText"].split()

    # `truncation=True` alone is a no-op for this tokenizer because it declares
    # no model max length (the library warns and disables truncation). Passing
    # `max_length` explicitly makes truncation real, so an over-long report
    # cannot exceed the model's position embeddings.
    tokenized = tokenizer(
        words,
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )

    # word_ids() maps each token to the index of its source word (None for
    # special tokens). Every sub-word piece inherits the tag of its word.
    word_labels = examples["labels"]
    tokenized["labels"] = [
        -100 if word_id is None else label2id[word_labels[word_id]]
        for word_id in tokenized.word_ids()
    ]
    return tokenized

tokenized_datasets = dataset_dict.map(tokenize_and_align)


Map:   0%|          | 0/789 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

### Model Setup and Training

- **Model**: Loads `Bio_ClinicalBERT` for token classification with a custom number of labels.
- **Arguments**: Sets training configuration using `TrainingArguments`:
  - 5 epochs after initially considering 3
  - Batch size of 8
  - Learning rate of 2e-5
  - Logging every 50 steps
- **Collator**: Uses `DataCollatorForTokenClassification` for dynamic padding.
- **Trainer**: Defines a `Trainer` object with model, args, datasets, tokenizer, and collator.
- **Training**: Launches the fine-tuning process via `trainer.train()`.


In [None]:


# Bio_ClinicalBERT with a token-classification head sized to the tag set.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    label2id=label2id,
    id2label=id2label,
)

args = TrainingArguments(
    output_dir="ner_model",
    # Optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Checkpointing and logging
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,
    report_to="none",
)

# Collator that pads each batch dynamically (inputs and labels together).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# NOTE(review): a validation set is attached but no evaluation strategy is
# passed in `args` — confirm whether validation is expected to run during training.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.9889
100,0.2203
150,0.0907
200,0.0574
250,0.0481
300,0.0311
350,0.0385
400,0.0156
450,0.0146


TrainOutput(global_step=495, training_loss=0.15487127075291643, metrics={'train_runtime': 240.41, 'train_samples_per_second': 16.409, 'train_steps_per_second': 2.059, 'total_flos': 338769718696698.0, 'train_loss': 0.15487127075291643, 'epoch': 5.0})

###  Model Evaluation

- **Prediction**: Runs inference on the test set using `trainer.predict`.
- **Decoding**:
  - Converts predicted label IDs to label strings using `id2label`.
  - Filters out ignored indices (`-100`) to align predictions and ground truth.
- **Report**: Uses `classification_report` to display precision, recall, and F1-score for each entity class (BIOES format), as well as macro/micro/weighted averages.


In [None]:
# Run inference on the held-out split; the third element of the result is not used.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
preds = predictions.argmax(-1)

# Decode ids back to tag strings, dropping every position whose gold label is
# -100 (ignored positions) so predictions and gold tags stay aligned.
true_labels = []
true_preds = []
for pred_row, label_row in zip(preds, labels):
    kept_pairs = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
    true_labels.append([id2label[gold] for _pred, gold in kept_pairs])
    true_preds.append([id2label[pred] for pred, _gold in kept_pairs])

print(classification_report(true_labels, true_preds))


              precision    recall  f1-score   support

    ExamName       0.98      0.92      0.95        52
clinicaldata       0.94      0.96      0.95       223
    findings       0.99      1.00      0.99       380
  impression       0.99      0.99      0.99       370

   micro avg       0.98      0.98      0.98      1025
   macro avg       0.98      0.97      0.97      1025
weighted avg       0.98      0.98      0.98      1025



### Evaluation Summary

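- **Micro avg**: `0.98` (F1) — computed over all entities pooled together, so every entity counts equally
- **Macro avg**: `0.97` (F1) — unweighted mean over the four classes, so every class counts equally
- **Weighted avg**: `0.98` (F1) — mean over the classes weighted by support, reflecting the true class distribution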

**High performance across all categories.** The only comparatively lower score is the recall on `ExamName` (`0.92`), which is also the class with the smallest support (52 entities) — the likely reason for the gap.

