# Train BERT NER

This notebook was written to run on Google Colaboratory.

Acknowledgements:
- [The base model used](https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT)
- [A Hugging Face notebook about NER](https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb)

In [2]:
# Mount Google Drive at /content/drive so that later cells can read the files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# NOTE(review): this cell's execution count (5) is out of order and its recorded output is a
# local /Users/... path rather than a Colab one — it looks like a leftover from a local run;
# confirm whether it is still needed.
%cd ..

/Users/antoinestutz/Documents/Cours/IA challenge/health_data_challenge


In [3]:
%cd /content/drive/MyDrive/Synchronisé/Cours/Illuin/
!pip install transformers
!pip install datasets
!pip install seqeval

/content/drive/MyDrive/Synchronisé/Cours/Illuin


In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_metric
from typing import Optional
import re
from dataclasses import dataclass

In [10]:
# Index CSVs read by the "Load data" section (paths are relative to the working directory).
TRAIN_CSV = './data/train.csv'
VALIDATION_CSV = './data/val.csv'
# Training hyper-parameters.
NB_EPOCHS = 5
BATCH_SIZE = 16
LEARNING_RATE = 2e-5

## Load data

In [11]:
# Load the index tables, using the first CSV column as the row index. The recorded output
# shows `txt`, `concept`, `rel` and `ast` columns holding file paths.
df_train = pd.read_csv(TRAIN_CSV, index_col=0)
df_valid = pd.read_csv(VALIDATION_CSV, index_col=0)
df_train

Unnamed: 0_level_0,txt,concept,rel,ast
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
289811204,./data/train_data/partners/txt/289811204.txt,./data/train_data/partners/concept/289811204.con,./data/train_data/partners/rel/289811204.rel,./data/train_data/partners/ast/289811204.ast
record-69,./data/train_data/beth/txt/record-69.txt,./data/train_data/beth/concept/record-69.con,./data/train_data/beth/rel/record-69.rel,./data/train_data/beth/ast/record-69.ast
433651389,./data/train_data/partners/txt/433651389.txt,./data/train_data/partners/concept/433651389.con,./data/train_data/partners/rel/433651389.rel,./data/train_data/partners/ast/433651389.ast
555509347_PUMC,./data/train_data/partners/txt/555509347_PUMC.txt,./data/train_data/partners/concept/555509347_P...,./data/train_data/partners/rel/555509347_PUMC.rel,./data/train_data/partners/ast/555509347_PUMC.ast
708739405_DH,./data/train_data/partners/txt/708739405_DH.txt,./data/train_data/partners/concept/708739405_D...,./data/train_data/partners/rel/708739405_DH.rel,./data/train_data/partners/ast/708739405_DH.ast
...,...,...,...,...
record-105,./data/train_data/beth/txt/record-105.txt,./data/train_data/beth/concept/record-105.con,./data/train_data/beth/rel/record-105.rel,./data/train_data/beth/ast/record-105.ast
498710998,./data/train_data/partners/txt/498710998.txt,./data/train_data/partners/concept/498710998.con,./data/train_data/partners/rel/498710998.rel,./data/train_data/partners/ast/498710998.ast
record-80,./data/train_data/beth/txt/record-80.txt,./data/train_data/beth/concept/record-80.con,./data/train_data/beth/rel/record-80.rel,./data/train_data/beth/ast/record-80.ast
record-47,./data/train_data/beth/txt/record-47.txt,./data/train_data/beth/concept/record-47.con,./data/train_data/beth/rel/record-47.rel,./data/train_data/beth/ast/record-47.ast


In [13]:
# Read every training document together with its raw concept-annotation file.
# Both files are opened explicitly as UTF-8 so that decoding does not depend on the
# platform's default encoding.
data = []
for _, row in df_train.iterrows():
    with open(row["txt"], encoding="utf-8") as file:
        text = file.read()
    with open(row["concept"], encoding="utf-8") as file:
        concepts = file.read()
    data.append({"text": text, "concept": concepts})

In [15]:
# Read every validation document together with its raw concept-annotation file.
# Both files are opened explicitly as UTF-8 so that decoding does not depend on the
# platform's default encoding.
data_valid = []
for _, row in df_valid.iterrows():
    with open(row["txt"], encoding="utf-8") as file:
        text = file.read()
    with open(row["concept"], encoding="utf-8") as file:
        concepts = file.read()
    data_valid.append({"text": text, "concept": concepts})

## Format data

In [9]:
@dataclass
class EntityAnnotation:
    """One entity parsed from a single line of a concept annotation file.

    `format_data` treats line numbers as 1-based and word positions as 0-based
    indices into the space-split words of a text line, with the end word included.
    """

    label: str  # entity type, e.g. 'problem'
    text: str  # annotated text as written in the annotation line
    start_line: int  # line holding the first word of the entity
    end_line: int  # line holding the last word of the entity
    start_word: int  # index of the first word within start_line
    end_word: int  # index of the last word within end_line (inclusive)

In [10]:
def parse_concept_annotation(text: str) -> Optional[EntityAnnotation]:
    """Parse one line of a concept annotation file.

    A line is expected to have two ``||``-separated fields of the form
    ``<key>="<entity text>" <line>:<word> <line>:<word>||<key>="<label>"``.

    Args:
        text: A single annotation line.

    Returns:
        The parsed `EntityAnnotation`, or ``None`` when the line does not follow the
        expected layout (for instance an empty line). The returned ``text`` keeps the
        whitespace that separates the quoted entity from its position span.
    """
    try:
        fields = text.split("||")
        concept_field, label_field = fields[0], fields[1]
        # Splitting on a capturing group yields [before, span, after]. The pattern is a raw
        # string so that `\d` reaches the regex engine instead of being an invalid str escape.
        pieces = re.split(r"(\d{1,6}:\d{1,6} \d{1,6}:\d{1,6})", concept_field)
        start, end = pieces[1].split(" ")
        start_line, start_word = start.split(":")
        end_line, end_word = end.split(":")
        return EntityAnnotation(
            label=label_field.split("=", 1)[1].replace('"', "").replace("\n", ""),
            # maxsplit=1: an "=" inside the entity text must not truncate it.
            text=pieces[0].split("=", 1)[1].replace('"', ""),
            start_line=int(start_line),
            start_word=int(start_word),
            end_line=int(end_line),
            end_word=int(end_word),
        )
    except (ValueError, IndexError):
        return None

In [11]:
# Sanity check: parse the second line of the second training document's concept file.
parse_concept_annotation(data[1]["concept"].split("\n")[1])

EntityAnnotation(label='problem', text='definite pulmonary embolism ', start_line=75, end_line=75, start_word=2, end_word=4)

In [12]:
def format_data(data):
    """Convert raw documents into per-line word and label sequences.

    Each document's text is split into lines and each line into words on single
    spaces. Every word starts as "O"; for each annotation that
    `parse_concept_annotation` can parse, the first covered word is tagged
    ``B_<label>`` and every following covered word ``I_<label>``. Annotation line
    numbers are 1-based, word indices 0-based, and the end word is included.

    Args:
        data: Iterable of dicts with a "text" entry and a "concept" entry (the raw
            content of the annotation file).

    Returns:
        One list per document, holding one ``{"words": ..., "labels": ...}`` dict per
        text line. The number of annotations spanning several lines is printed.
    """
    processed_data = []
    nb_tokens_on_multiple_lines = 0
    for document in data:
        lines = [line.split(" ") for line in document["text"].split("\n")]
        tags = [["O" for _ in line] for line in lines]
        annotations = [parse_concept_annotation(raw) for raw in document["concept"].split("\n")]
        for entity in annotations:
            if entity is None:
                # Lines that could not be parsed carry no annotation.
                continue
            first_line = entity.start_line - 1
            last_line = entity.end_line - 1
            if first_line != last_line:
                nb_tokens_on_multiple_lines += 1
            prefix = "B"
            for line_idx in range(first_line, last_line + 1):
                # Only the first/last line of the entity is partially covered.
                first_word = entity.start_word if line_idx == first_line else 0
                stop_word = entity.end_word + 1 if line_idx == last_line else len(tags[line_idx])
                for word_idx in range(first_word, stop_word):
                    tags[line_idx][word_idx] = f"{prefix}_{entity.label}"
                    prefix = "I"
        processed_data.append(
            [{"words": line_words, "labels": line_tags} for line_words, line_tags in zip(lines, tags)]
        )
    print(f"{nb_tokens_on_multiple_lines} tokens were on multiple lines")
    return processed_data

# Label the training and validation documents; each call prints how many annotations
# span more than one line.
formatted_data = format_data(data)
formatted_data_valid = format_data(data_valid)

0 tokens were on multiple lines
0 tokens were on multiple lines


In [13]:
# For every line of the first training document, show the words that received a non-"O" label.
[[(w,c) for w,c in zip(s["words"], s["labels"]) if c != "O"] for s in formatted_data[0]]

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('painless', 'B_problem'), ('jaundice', 'I_problem')],
 [('noninsulin', 'B_problem'),
  ('dependent', 'I_problem'),
  ('diabetes', 'I_problem'),
  ('mellitus', 'I_problem'),
  ('painless', 'B_problem'),
  ('jaundice', 'I_problem')],
 [('a', 'B_problem'),
  ('23', 'I_problem'),
  ('pound', 'I_problem'),
  ('weight', 'I_problem'),
  ('loss', 'I_problem'),
  ('an', 'B_problem'),
  ('increased', 'I_problem'),
  ('appetite', 'I_problem')],
 [('fatigue', 'B_problem'),
  ('&quot;', 'B_problem'),
  ('feeling', 'I_problem'),
  ('down', 'I_problem'),
  ('&quot;', 'I_problem')],
 [('personality', 'B_problem'),
  ('changes', 'I_problem'),
  ('increased', 'B_problem'),
  ('irritability', 'I_problem')],
 [('night', 'B_problem'), ('sweats', 'I_problem')],
 [('melena', 'B_problem'),
  ('hematochezia', 'B_problem'),
  ('nausea', 'B_problem'),
  ('abdominal', 'B_problem'),
  ('pain', 'I_problem')],
 [('occasionally', 'B_problem

As every new line seems to be a new sentence, we can consider every line independently.

In [14]:
# Flatten the per-document lists into a single list of sentences (one entry per text line).
# NOTE(review): both names are rebound to a differently shaped structure, so this cell cannot
# be re-run without first re-running the cell that calls `format_data`.
formatted_data = [{"words": sentence["words"], "labels": sentence["labels"]} for doc in formatted_data for sentence in doc]
formatted_data_valid = [{"words": sentence["words"], "labels": sentence["labels"]} for doc in formatted_data_valid for sentence in doc]

## Tokenize data

In [15]:
# Pick the GPU when one is available; ending the cell with `device` displays the choice.
# NOTE(review): neither `device` nor `n_gpu` is referenced by the later cells visible here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
device

device(type='cuda')

In [16]:
# Tokenizer of the Bio_ClinicalBERT checkpoint; the same checkpoint name is used for the model.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

The labels used in the model are:
- **O** if the word is not part of any entity and should not be tagged by the model
- **\*_problem** if the word is (part of) a health problem
- **\*_test** if the word is (part of) a medical test
- **\*_treatment** if the word is (part of) a medical treatment

Labels beginning with **B** mark the first word of an entity.\
Labels beginning with **I** mark the subsequent words of the same entity.

In [17]:
# Label vocabulary. The position of a label in this list is its integer class id:
# `tokenize_and_align_labels_list` encodes with `label_list.index(...)` and
# `compute_metrics` decodes with `label_list[...]`, so the order must stay the same
# between training and evaluation.
label_list = [
    'O',
    'B_problem',
    'I_problem',
    'B_test',
    'I_test',
    'B_treatment',
    'I_treatment',
]

In [18]:
def tokenize_and_align_labels_list(examples, label_all_tokens: bool = True, max_length: Optional[int] = None):
    """Tokenize pre-split sentences and align their word labels with the sub-word tokens.

    Args:
        examples: Iterable of dicts with a "words" list and a parallel "labels" list whose
            entries are members of `label_list`.
        label_all_tokens: When True (the default, which keeps the previous behavior) every
            sub-token of a word receives that word's label id. When False only the first
            sub-token of each word is labelled and the following ones get -100.
        max_length: Forwarded to the tokenizer together with ``truncation=True``. With the
            default ``None`` the tokenizer uses its own configured limit; the recorded run
            warns that this checkpoint defines none, in which case nothing is truncated.

    Returns:
        A list with one tokenizer output per example, each extended with a "labels" entry
        holding one label id per token.
    """
    tokenized_inputs = []
    for row in examples:
        tokenized_words = tokenizer(
            row["words"], truncation=True, max_length=max_length, is_split_into_words=True
        )
        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_words.word_ids():
            if word_idx is None:
                # Special tokens have a word id that is None. We set the label to -100 so
                # they are automatically ignored in the loss function.
                label_ids.append(-100)
            elif label_all_tokens or word_idx != previous_word_idx:
                label_ids.append(label_list.index(row["labels"][word_idx]))
            else:
                # Continuation sub-token of a word whose first sub-token is already labelled.
                label_ids.append(-100)
            previous_word_idx = word_idx
        tokenized_words["labels"] = label_ids
        tokenized_inputs.append(tokenized_words)
    return tokenized_inputs

In [19]:
# Tokenize the flattened training and validation sentences and attach per-token label ids.
tokenized_data = tokenize_and_align_labels_list(formatted_data)
tokenized_data_valid = tokenize_and_align_labels_list(formatted_data_valid)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Build and train model

In [20]:
# Token-classification head with one output per entry of `label_list`. The id<->label maps
# are stored in the model config so that the checkpoint saved after training carries the
# label names instead of generic ones.
model = AutoModelForTokenClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint 

In [21]:
model_name = "bio_clinical_bert"
# Kept in case other cells read `batch_size`; it mirrors the configuration constant.
batch_size = BATCH_SIZE
# Every tunable value comes from the configuration cell so that it is changed in one place.
args = TrainingArguments(
    f"{model_name}-finetuned-ner",
    evaluation_strategy = "epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NB_EPOCHS,
    weight_decay=0.01,
    push_to_hub=False,
)

# Collator handed to the Trainer to assemble batches of tokenized examples.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [22]:
# Metric object whose `compute` method is called by `compute_metrics`.
# NOTE(review): recent `datasets` releases deprecate `load_metric` in favour of the separate
# `evaluate` package — confirm against the installed version.
metric = load_metric('seqeval')

def compute_metrics(p):
    """Turn raw model outputs into overall precision, recall, F1 and accuracy.

    Args:
        p: Pair ``(predictions, labels)``. The arg-max over axis 2 of ``predictions``
            gives the predicted class id of every token; ``labels`` holds the
            reference ids, with -100 marking positions to ignore.

    Returns:
        Dict with the "precision", "recall", "f1" and "accuracy" values taken from
        the ``overall_*`` entries returned by ``metric.compute``.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Drop ignored positions (special tokens) from both sequences at once.
        kept = [
            (predicted, reference)
            for predicted, reference in zip(predicted_row, reference_row)
            if reference != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[reference] for _, reference in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [23]:
# Bring together the model, the training arguments, the tokenized train/validation sets,
# the batch collator and the metric function.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_data,
    eval_dataset=tokenized_data_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [24]:
# Fine-tune the model, then save it to ./model.
trainer.train()
trainer.save_model('./model')

***** Running training *****
  Num examples = 12631
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3950


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3613,0.166903,0.841313,0.895311,0.867473,0.949006
2,0.1016,0.16806,0.871095,0.899605,0.88512,0.953493
3,0.0602,0.185334,0.868451,0.896472,0.882239,0.952238
4,0.0412,0.207816,0.875807,0.897864,0.886698,0.952903
5,0.0261,0.214812,0.878349,0.89415,0.886179,0.95317


Saving model checkpoint to bio_clinical_bert-finetuned-ner/checkpoint-500
Configuration saved in bio_clinical_bert-finetuned-ner/checkpoint-500/config.json
Model weights saved in bio_clinical_bert-finetuned-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in bio_clinical_bert-finetuned-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in bio_clinical_bert-finetuned-ner/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3783
  Batch size = 16
Saving model checkpoint to bio_clinical_bert-finetuned-ner/checkpoint-1000
Configuration saved in bio_clinical_bert-finetuned-ner/checkpoint-1000/config.json
Model weights saved in bio_clinical_bert-finetuned-ner/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in bio_clinical_bert-finetuned-ner/checkpoint-1000/tokenizer_config.json
Special tokens file saved in bio_clinical_bert-finetuned-ner/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to bio_clini