In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install huggingface_hub
!pip install datasets transformers seqeval -q



In [3]:

from huggingface_hub import notebook_login
import os
import json
import glob
import random
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict, ClassLabel
from seqeval.metrics import classification_report
import re
from sklearn.metrics import classification_report
from transformers import AutoTokenizer





In [4]:
import glob, json, os, pandas as pd

folder_path = '/content/drive/MyDrive/openave_jsons'
data = []

# 1) Grab all .json files
files = glob.glob(os.path.join(folder_path, '*.json'))

# 2) Sort by the integer part of the filename (so "10.json" comes after "2.json")
files = sorted(
    files,
    key=lambda path: int(os.path.splitext(os.path.basename(path))[0])
)

# 3) Load them in that order, keeping track of the index
for path in files:
    idx = int(os.path.splitext(os.path.basename(path))[0])
    item = json.load(open(path))
    item['_file_idx'] = idx
    data.append(item)

# 4) Build your DataFrame and set its index from the filename number
df = pd.DataFrame(data).set_index('_file_idx').sort_index()

# Now df.loc[0] comes from 0.json, df.loc[1] from 1.json, etc.
df.head(10)


Unnamed: 0_level_0,ReportText,findings,clinicaldata,ExamName,impression
_file_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Cough. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,IMPRESSION: Normal 2-view chest radiography.
1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,IMPRESSION: No acute cardiopulmonary abnormali...
2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,IMPRESSION: No acute cardiopulmonary process.
3,Exam: - CHEST-PORTABLE History: Chest pain Com...,Findings: Heart size appears normal. Lungs cle...,History: Chest pain \n\n,Exam: - CHEST-PORTABLE\n\nComparison: None,Impression: Lungs clear
4,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,"CLINICAL HISTORY: CHEST PAIN, SHORTNESS OF BRE...",EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,IMPRESSION: Normal single view chest.
5,EXAM: CHEST RADIOGRAPHY EXAM DATE: 08/11/2021 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Chest pain. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 08/11/2021 ...,IMPRESSION: Normal single view chest.
6,Exam: XR CHEST AP OR PA ONLY INDICATION: Respi...,FINDINGS: Stable support devices. Stable heart...,INDICATION: Respiratory distress or failure re...,Exam: XR CHEST AP OR PA ONLY\n\nTECHNIQUE: AP ...,IMPRESSION: Slightly decreased lung volumes.
7,Exam: CR CHEST 2 VIEWS History: ACUTE BRONCHIT...,Findings: 2 views. Heart size appears normal. ...,History: ACUTE BRONCHITIS Views of the chest \n\n,Exam: CR CHEST 2 VIEWS\n\nComparison: None,Impression: Lungs clear. Electronically Signed...
8,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/27/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Chest pain. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/27/2019 ...,IMPRESSION: Normal 2-view chest radiography.
9,EXAM: CHEST RADIOGRAPHY EXAM DATE: 10/22/2020 ...,FINDINGS: The mediastinal and cardiac silhouet...,CLINICAL HISTORY: Chest pain. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 10/22/2020 ...,IMPRESSION: Clear lungs.


In [5]:
print(df.isnull().sum())


ReportText      0
findings        0
clinicaldata    0
ExamName        0
impression      0
dtype: int64


### 🔹 Section Extraction (`extract_sections`)

- **Purpose**: Break each row’s raw text fields into the full report plus clean, per-section blocks.  
- **Input**: A DataFrame row with keys:
  - `"ReportText"` (full report string)  
  - One or more label columns (e.g. `"findings"`, `"clinicaldata"`, `"ExamName"`, `"impression"`) containing raw text separated by blank lines.  
- **Process**:  
  1. Copy the full report under `"ReportText"`.  
  2. For each label column, split on `\n\n`, strip whitespace, and collect non-empty blocks.  
- **Output**: A dict with  
  - `"ReportText"` → the untouched full report  
  - Each label column → a **list** of its cleaned text blocks


In [6]:
def extract_sections(row,
                     text_col="ReportText",
                     label_cols=("findings","clinicaldata","ExamName","impression"),
                     sep="\n\n"):
    """
    For a given DataFrame row, returns a dict:
      - "ReportText"   -> full report text
      - each label_col -> list of text blocks split on sep
    """
    out = {}
    # 1) full report
    out[text_col] = row[text_col]

    # 2) each other column → list of non-empty strips
    for col in label_cols:
        raw = row.get(col, "") or ""
        parts = [blk.strip()
                 for blk in raw.split(sep)
                 if blk.strip()]
        out[col] = parts

    return out


### 🔹 Chunk Labeling with Priority (`chunk_labels_for_row`)

- **Purpose**: Assign each report token to at most one section, preferring longer blocks first.  
- **Input**:  
  - `report_tokens`: the list of tokens from `ReportText`  
  - `sections`: the dict from `extract_sections`  
  - `label_cols`: the tuple of section names in desired priority order  
- **Process**:  
  1. Initialize an array `chunk_labels` of length N (tokens), all `None`.  
  2. For each section in `label_cols`:  
     - Sort its blocks by **descending token length**.  
     - Slide a window of that block’s length over `report_tokens`.  
     - If the window exactly matches the block and none of its positions have been tagged, assign those token indices to the section name and mark them used.  
- **Output**: A list of length N where each index is either the section name (if matched) or `None`.  


In [7]:
def chunk_labels_for_row(report_tokens, sections, label_cols):
    """
    Given:
      - report_tokens: list of normalized tokens from ReportText
      - sections: dict mapping each col in label_cols to its list of text blocks
      - label_cols: tuple of section names, e.g. ("clinicaldata", "ExamName", ...)
    Returns:
      - chunk_labels: list of length len(report_tokens), where each position is
        either the section name (col) it belongs to, or None.
    """
    N = len(report_tokens)
    chunk_labels = [None] * N
    used = set()

    for sec in label_cols:
        for block in sorted(sections[sec], key=lambda b: -len(b.split())):
            blk_toks = (block).split()
            L = len(blk_toks)
            if L == 0:
                continue
            for i in range(0, N - L + 1):
                # skip if any already tagged
                if any((i + k) in used for k in range(L)):
                    continue
                # exact match of contiguous block
                if report_tokens[i:i+L] == blk_toks:
                    for k in range(L):
                        chunk_labels[i + k] = sec
                    used.update(range(i, i+L))
    return chunk_labels


### 🔹 BIOES Tag Generation (`bioes_from_chunks`)

- **Purpose**: Convert contiguous runs of section assignments into BIOES tags.  
- **Input**: `chunk_labels`, the per-token section list from the previous step.  
- **Process**:  
  1. Scan left→right over `chunk_labels`.  
  2. Whenever you hit a non-`None` section, find how many consecutive tokens share that same section.  
  3. If the span length = 1 → tag `S-Section`;  
     = 2 → `B-Section`, `E-Section`;  
     ≥3 → `B-Section`, `I-Section`…`E-Section`.  
  4. Any `None` positions remain `O`.  
- **Output**: A parallel list of BIOES tags aligned to the report tokens.  


In [8]:
def bioes_from_chunks(chunk_labels):
    """
    Given:
      - chunk_labels: list where each position is either a section name (str)
        or None
    Returns:
      - bioes_tags: list of BIOES tags of the same length, e.g.
        ["B-clinicaldata","I-clinicaldata","E-clinicaldata","O",...]
    """
    N = len(chunk_labels)
    tags = ["O"] * N
    i = 0

    while i < N:
        sec = chunk_labels[i]
        if sec is None:
            tags[i] = "O"
            i += 1
        else:
            # find the end of this contiguous chunk
            j = i + 1
            while j < N and chunk_labels[j] == sec:
                j += 1
            length = j - i

            if length == 1:
                tags[i] = f"S-{sec}"
            elif length == 2:
                tags[i]   = f"B-{sec}"
                tags[i+1] = f"E-{sec}"
            else:
                tags[i] = f"B-{sec}"
                for k in range(i+1, j-1):
                    tags[k] = f"I-{sec}"
                tags[j-1] = f"E-{sec}"

            i = j

    return tags


### 🔹 Full Pipeline Overview

In [9]:
LABEL_COLS = ("findings","clinicaldata","ExamName","impression")

# Extract sections dict into its own column
df["sections"] = df.apply(extract_sections, axis=1)

# Tokenize the full report
df["tokens"] = df["sections"].apply(lambda s:(s["ReportText"]).split())

# Chunk each token to a section name or None
df["chunk_labels"] = df.apply(
    lambda row: chunk_labels_for_row(
        report_tokens = row["tokens"],
        sections      = row["sections"],
        label_cols    = LABEL_COLS
    ),
    axis=1
)

# Convert those chunks into BIOES tags
df["labels"] = df["chunk_labels"].apply(bioes_from_chunks)




### 🔹 Filtering Out Incomplete Taggings

- **Identify incomplete rows**  
  Create a boolean mask that marks any row whose BIOES tag list still contains an `"O"`, indicating at least one token was left untagged.

- **Inspect the culprits**  
  From that mask, extract the DataFrame indices (and/or zero-based positions) of all rows needing attention, so you can review their content and understand why some tokens weren’t matched.

- **Prune the dataset**  
  Drop every row flagged by the mask. The resulting `df_clean` contains only those reports whose tokens were **fully** covered by one of your defined sections, ensuring no stray `"O"` remains before training your model.


In [10]:

# Boolean mask: True for rows containing an “O”
mask = df["labels"].apply(lambda tags: "O" in tags)

#list of index values
rows_with_O = df.index[mask].tolist()
print("Row indices containing an “O” tag:", rows_with_O)

# Keep only rows that do NOT contain an “O” in their labels
df_clean = df[~mask].reset_index(drop=True)



Row indices containing an “O” tag: [32, 315, 438, 938]


### 🔹 Dataset Splitting and HuggingFace Wrapping

- Split the full DataFrame `df`:
  - 80% for `train_df`
  - 10% for `val_df`
  - 10% for `test_df`
- Wrap them into a Hugging Face `DatasetDict` for compatibility with Transformers:



In [20]:
train_df, temp_df = train_test_split(df_clean, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)




In [21]:
from google.colab import files
# Save just the ReportText column of the test set to CSV for a demo
test_df[['ReportText']].to_csv('/content/drive/MyDrive/demo_reports.csv', index=False)
files.download('demo_reports.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
})



### 🔹 Tokenization and Label Alignment for the Model

- Used `emilyalsentzer/Bio_ClinicalBERT` tokenizer.
- Created `label2id` and `id2label` mappings from `df["labels"]`.
- Defined a `tokenize_and_align` function to:
  - Tokenize the `ReportText` using `is_split_into_words=True` for word-level alignment.
  - Align each token with its corresponding label ID using `word_ids()`.
  - Assign `-100` to special tokens (ignored during loss computation).
- Applied the function to the dataset:
  ```python
  tokenized_datasets = dataset_dict.map(tokenize_and_align)


In [13]:


# Load tokenizer and set a max length for truncation
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 512

# Build label ↔ id maps
label_list = sorted({lab for row in df["labels"] for lab in row})
print(label_list)
label2id   = {l: i for i, l in enumerate(label_list)}
id2label   = {i: l for l, i in label2id.items()}

# Adapted tokenize_and_align for single examples
def tokenize_and_align(examples):
    # examples["tokens"] is a list of words, examples["labels"] is list of BIOES strings
    tokenized = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
    )
    word_ids = tokenized.word_ids()
    labels   = []
    for wid in word_ids:
        if wid is None:
            labels.append(-100)
        else:
            # now examples["labels"] is a single list of strings
            labels.append(label2id[ examples["labels"][wid] ])
    tokenized["labels"] = labels
    return tokenized

# Map *without* batching
tokenized_datasets = dataset_dict.map(
    tokenize_and_align,
    batched=False,
    remove_columns=dataset_dict["train"].column_names
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


['B-ExamName', 'B-clinicaldata', 'B-findings', 'B-impression', 'E-ExamName', 'E-clinicaldata', 'E-findings', 'E-impression', 'I-ExamName', 'I-clinicaldata', 'I-findings', 'I-impression', 'O', 'S-impression']


Map:   0%|          | 0/786 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

### Model Setup and Training

- **Model**: Loads `Bio_ClinicalBERT` for token classification with a custom number of labels.
- **Arguments**: Sets training configuration using `TrainingArguments`:
  - 5 epochs after initially considering 3
  - Batch size of 8
  - Learning rate of 2e-5
  - Logging every 50 steps
- **Collator**: Uses `DataCollatorForTokenClassification` for dynamic padding.
- **Trainer**: Defines a `Trainer` object with model, args, datasets, tokenizer, and collator.
- **Training**: Launches the fine-tuning process via `trainer.train()`.


In [14]:


model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

args = TrainingArguments(
    output_dir="ner_model",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    report_to="none"
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss
50,1.0112
100,0.1555
150,0.0426
200,0.0309
250,0.0159
300,0.0172
350,0.0117
400,0.0093
450,0.0072


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=495, training_loss=0.132336922306003, metrics={'train_runtime': 44.8098, 'train_samples_per_second': 87.704, 'train_steps_per_second': 11.047, 'total_flos': 327419940731616.0, 'train_loss': 0.132336922306003, 'epoch': 5.0})

### 🔹 Publishing the Model for Future Use

- **Authenticate with Hugging Face Hub**  
  The notebook uses an interactive login to connect Colab session with Hugging Face account, so subsequent operations can securely upload artifacts.

- **Upload the fine-tuned model**  
  Once training is complete, the model weights, configuration, and tokenizer files are pushed to private repository on the Hub, making it accessible for future loading or sharing.


In [15]:
notebook_login()
trainer.push_to_hub("mo191919/ner-model")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mo191919/ner_model/commit/9af4440f92cc7c11fb7959a87a60bc09596cb889', commit_message='mo191919/ner-model', commit_description='', oid='9af4440f92cc7c11fb7959a87a60bc09596cb889', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mo191919/ner_model', endpoint='https://huggingface.co', repo_type='model', repo_id='mo191919/ner_model'), pr_revision=None, pr_num=None)

###  Model Evaluation

- **Prediction**: Runs inference on the test set using `trainer.predict`.
- **Decoding**:
  - Converts predicted label IDs to label strings using `id2label`.
  - Filters out ignored indices (`-100`) to align predictions and ground truth.
- **Report**: Uses `classification_report` to display precision, recall, and F1-score for each entity class (BIOES format), as well as macro/micro/weighted averages.


In [16]:
# Run inference on the test split
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
preds = predictions.argmax(axis=-1)  # shape (num_examples, seq_len)

# Reconstruct per-example label sequences (drop -100)
true_labels = [
    [id2label[label_id] for label_id in label_row if label_id != -100]
    for label_row in labels
]
true_preds = [
    [id2label[pred_id] for pred_id, label_id in zip(pred_row, label_row) if label_id != -100]
    for pred_row, label_row in zip(preds, labels)
]


# Flatten the lists of lists
flat_true = [lab for example in true_labels for lab in example]
flat_pred = [pr  for example, preds in zip(true_labels, true_preds)
                for pr, lab in zip(preds, example)]

print(classification_report(
    flat_true,
    flat_pred,
    digits=4
))


  return forward_call(*args, **kwargs)


                precision    recall  f1-score   support

    B-ExamName     0.9973    0.9973    0.9973       366
B-clinicaldata     0.9866    1.0000    0.9932       147
    B-findings     0.9851    0.9950    0.9900       199
  B-impression     1.0000    1.0000    1.0000       198
    E-ExamName     0.9858    0.9971    0.9914       347
E-clinicaldata     0.9896    1.0000    0.9948       191
    E-findings     0.9962    1.0000    0.9981       262
  E-impression     0.9969    0.9969    0.9969       320
    I-ExamName     0.9954    0.9931    0.9942      1737
I-clinicaldata     0.9885    0.9977    0.9931       430
    I-findings     0.9998    0.9991    0.9995      4598
  I-impression     0.9995    0.9951    0.9973      1850

      accuracy                         0.9972     10645
     macro avg     0.9934    0.9976    0.9955     10645
  weighted avg     0.9972    0.9972    0.9972     10645



### Evaluation Summary

-- **Accuracy**: `0.9972` — fraction of all tokens correctly classified  
- **Micro avg** (token-level, strict): `0.9972` — treats every prediction equally  
- **Macro avg** (class balance): `0.9955` — unweighted mean F1 across all 12 BIOES tags  
- **Weighted avg** (label frequency): `0.9972` — F1 averaged by support size, reflecting true class distribution  

**Key observations**  
- **Exceptional overall performance**, with nearly 99.8 % of tokens correctly labeled.  
- The rarest tags (`B-clinicaldata`, `E-ExamName`) dip slightly (F1 ≃ 0.9932 and 0.9914 respectively) due to lower support and higher variability in how those sections are written.  
- Common tags like `I-findings` (support = 4598) achieve F1 ≥ 0.999, driving the high micro- and weighted averages.  

