This notebook is for beginners.

Thanks to [Nicholas's work](https://www.kaggle.com/nbroad/qa-ner-hybrid-train-nbme).

# Import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from ast import literal_eval
from itertools import chain
from sklearn.metrics import precision_recall_fscore_support
from tqdm.notebook import tqdm, trange
from sklearn.model_selection import StratifiedKFold

import torch
!pip install transformers
from transformers import AutoModel, AutoTokenizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 21.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 2.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 5.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unins

# Create df

In [None]:

# Mount Google Drive into the Colab runtime (the cell output reports the mount at /content/drive).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Project folder on Drive and the CSV files read by create_df further down.
# NOTE(review): ROOT is a relative path, so it presumably relies on the working
# directory being /content (what `!pwd` prints below) — confirm before running elsewhere.
ROOT = "drive/My Drive/tar-project"
TRAIN_PATH = ROOT + "/train.csv"
VAL_PATH = ROOT + "/valid.csv"
FEATURES_PATH = ROOT + "/features.csv"
PN_NOTES_PATH = ROOT + "/patient_notes.csv"
# Print the current working directory so the relative ROOT above can be checked.
!pwd

Mounted at /content/drive
/content


In [None]:
from google.colab import drive

def process_feature_text(text):
    """Turn a hyphen-joined feature description into plain words.

    "-OR-" separators become "; " and every remaining hyphen becomes a space.
    """
    # First mark the alternatives with ";-" so the trailing hyphen is turned
    # into the space after the semicolon by the second replacement.
    with_separators = text.replace("-OR-", ";-")
    return with_separators.replace("-", " ")

def create_df(data_path, features_path, pn_notes_path ,debug = False):
    """Load the annotation CSV, join notes and features, and assign CV folds.

    Args:
        data_path: CSV with the annotations (needs "annotation", "location",
            "id", "case_num" and "feature_num" columns).
        features_path: CSV providing "feature_text".
        pn_notes_path: CSV providing "pn_history".
        debug: when True, keep a random 50% sample of the rows.

    Returns:
        A DataFrame with parsed "annotation_list" / "location_list" columns,
        lower-cased "feature_text" and "pn_history", a "stratify_on" key and a
        "fold" column in 0..4.
    """
    feats = pd.read_csv(features_path)
    notes = pd.read_csv(pn_notes_path)
    data = pd.read_csv(data_path)

    # The CSV stores Python list literals as strings; parse them back to lists.
    for source_col, parsed_col in (("annotation", "annotation_list"), ("location", "location_list")):
        data[parsed_col] = [literal_eval(raw) for raw in data[source_col]]

    # Both joins rely on pandas picking the columns the two frames share.
    merged = data.merge(notes, how = "left").merge(feats, how = "left")

    # Drop rows without any annotation (remove this filter to train on all samples).
    has_annotation = merged["annotation"] != "[]"
    merged = merged.loc[has_annotation].copy().reset_index(drop = True)

    # Normalise the feature description, then lower-case both text columns.
    merged["feature_text"] = [process_feature_text(raw).lower() for raw in merged["feature_text"]]
    merged["pn_history"] = [note.lower() for note in merged["pn_history"]]

    if debug:
        merged = merged.sample(frac = 0.5).reset_index(drop = True)

    # Stratify on the (case, feature) pair so each fold sees every combination.
    merged["stratify_on"] = merged["case_num"].astype(str) + merged["feature_num"].astype(str)
    merged["fold"] = -1
    splitter = StratifiedKFold(n_splits = 5)
    fold_splits = splitter.split(merged["id"], y = merged["stratify_on"])
    for fold_number, (_, valid_rows) in enumerate(fold_splits):
        merged.loc[valid_rows, "fold"] = fold_number

    return merged


# Build the training and validation frames from their own annotation CSVs;
# both share the same features and patient-notes files.
df_train = create_df(TRAIN_PATH, FEATURES_PATH, PN_NOTES_PATH)
df_val = create_df(VAL_PATH, FEATURES_PATH, PN_NOTES_PATH)



In [None]:
# Preview the first rows of the merged training frame.
df_train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,annotation_list,location_list,pn_history,feature_text,stratify_on,fold
0,10034827_300,3,10034827,300,['uncle with ulcer'],['470 486'],[uncle with ulcer],[470 486],patient is a 35 year old male presenting with ...,fhx of pud; family history of peptic ulcer dis...,3300,0
1,10034827_301,3,10034827,301,['epigastric pain'],['62 77'],[epigastric pain],[62 77],patient is a 35 year old male presenting with ...,epigastric discomfort,3301,0
2,10034827_302,3,10034827,302,['stools dark'],['308 319'],[stools dark],[308 319],patient is a 35 year old male presenting with ...,darker bowel movements,3302,0
3,10034827_303,3,10034827,303,['motrin'],['436 442'],[motrin],[436 442],patient is a 35 year old male presenting with ...,nsaid use; nonsteroidal anti inflammatory drug...,3303,0
4,10034827_306,3,10034827,306,['occurs day 2-3x a'],['120 137'],[occurs day 2-3x a],[120 137],patient is a 35 year old male presenting with ...,getting worse; progressive; symptoms now daily,3306,0


In [None]:
# Take the first training row and keep only the fields the tokenizer step needs.
first = df_train.loc[0]
example = {
    field: first[field]
    for field in ("feature_text", "pn_history", "location_list", "annotation_list")
}
# Show each field followed by a separator line.
for key, value in example.items():
    print(key)
    print(value)
    print("=" * 100)

feature_text
fhx of pud; family history of peptic ulcer disease
pn_history
patient is a 35 year old male presenting with a 2 month hx of epigastric pain
pain is sharp and stabbing in nature and occurs day 2-3x a 
described as 5/10 in severity, no radiation
tums pain originally made but better the no help longer at eating, related to sometimes endorses not occurs pain night somestools dark, no constipation, diarrhea, no vomiting
endorses occasional nausea
pmh: back pain and spasms, no surgeries
meds: motrin 2 pills/week for pain
fh: uncle with ulcer
sh: works as construction worker, not sexually active, smokes 0.5-1 pack pr day
no allergies
location_list
['470 486']
annotation_list
['uncle with ulcer']


In [None]:
def loc_list_to_ints(loc_list):
    """Convert location strings into a flat list of (start, end) int tuples.

    Each entry of ``loc_list`` is a string such as "470 486"; several spans
    may be packed into one entry separated by ";" (e.g. "10 20;30 40").
    """
    return [
        (int(start), int(end))
        for loc_str in loc_list
        for start, end in (span.split() for span in loc_str.split(";"))
    ]

# Sanity check: slicing the note with the parsed span should give back the annotation text.
print(example["location_list"])
example_loc_ints = loc_list_to_ints(example["location_list"])[0]
print(example_loc_ints)
span_start, span_end = example_loc_ints
print(example["pn_history"][span_start:span_end])

['470 486']
(470, 486)
uncle with ulcer


nice.

# Tokenizer

In [None]:
MODEL_NAME = "bert-base-uncased" # we can't connect to the internet at submission time — TODO(review): presumably a local checkpoint path is needed there; confirm
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, )

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
def tokenize_and_add_labels_test(tokenizer, example, max_length = 416):
    """Tokenize a (feature_text, pn_history) pair and attach per-token labels.

    Args:
        tokenizer: a tokenizer that supports ``return_offsets_mapping`` and
            whose output exposes ``sequence_ids()``.
        example: mapping with "feature_text", "pn_history" and "location_list".
        max_length: padded/truncated sequence length. Defaults to 416, the
            value previously hard-coded here (the original comment noted
            "max length is 406" — presumably the longest input; TODO confirm).

    Returns:
        The tokenizer output with three extra keys:
        "location_int" (list of (start, end) character spans),
        "sequence_ids", and "labels" — -100 for special/padding tokens and
        feature-text tokens, 1.0 for note tokens lying entirely inside an
        annotated span, 0.0 otherwise.
    """
    tokenized_inputs = tokenizer(
        example['feature_text'],
        example["pn_history"],
        truncation = "only_second",  # only the patient note may be cut
        max_length = max_length,
        padding = "max_length",
        return_offsets_mapping = True
    )
    location_spans = loc_list_to_ints(example["location_list"])
    tokenized_inputs["location_int"] = location_spans
    sequence_ids = tokenized_inputs.sequence_ids()
    tokenized_inputs["sequence_ids"] = sequence_ids

    labels = []
    for seq_id, (token_start, token_end) in zip(sequence_ids, tokenized_inputs["offset_mapping"]):
        # Only tokens of the second sequence (the note) are scored; everything
        # else is masked out with -100.
        if seq_id is None or seq_id == 0:
            labels.append(-100)
            continue
        inside_span = any(
            token_start >= feature_start and token_end <= feature_end
            for feature_start, feature_end in location_spans
        )
        labels.append(1.0 if inside_span else 0.0)
    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [None]:
# Tokenize the example row and dump every field of the result for inspection.
tokenized_inputs = tokenize_and_add_labels_test(tokenizer, example)
for key, value in tokenized_inputs.items():
    print(key)
    print(value)
    print("=" * 100)

input_ids
[101, 1042, 2232, 2595, 1997, 16405, 2094, 1025, 2155, 2381, 1997, 27233, 4588, 17359, 17119, 4295, 102, 5776, 2003, 1037, 3486, 2095, 2214, 3287, 10886, 2007, 1037, 1016, 3204, 1044, 2595, 1997, 4958, 13340, 3367, 7277, 3255, 3255, 2003, 4629, 1998, 21690, 1999, 3267, 1998, 5158, 2154, 1016, 1011, 1017, 2595, 1037, 2649, 2004, 1019, 1013, 2184, 1999, 18976, 1010, 2053, 8249, 10722, 5244, 3255, 2761, 2081, 2021, 2488, 1996, 2053, 2393, 2936, 2012, 5983, 1010, 3141, 2000, 2823, 2203, 5668, 2229, 2025, 5158, 3255, 2305, 2070, 16033, 27896, 2601, 1010, 2053, 9530, 16643, 24952, 2239, 1010, 22939, 12171, 20192, 1010, 2053, 24780, 2203, 5668, 2229, 8138, 19029, 7610, 2232, 1024, 2067, 3255, 1998, 12403, 19230, 1010, 2053, 12058, 5134, 19960, 2015, 1024, 9587, 18886, 2078, 1016, 15345, 1013, 2733, 2005, 3255, 1042, 2232, 1024, 4470, 2007, 17359, 17119, 14021, 1024, 2573, 2004, 2810, 7309, 1010, 2025, 12581, 3161, 1010, 5610, 2015, 1014, 1012, 1019, 1011, 1015, 5308, 10975, 2154, 20

In [None]:
def tokenize_and_add_labels(tokenizer, example, max_length = 416):
    """Tokenize a (feature_text, pn_history) pair and attach per-token labels.

    Args:
        tokenizer: a tokenizer that supports ``return_offsets_mapping`` and
            whose output exposes ``sequence_ids()``.
        example: mapping with "feature_text", "pn_history" and "location_list".
        max_length: padded/truncated sequence length. Defaults to 416, the
            value previously hard-coded here (the original comment noted
            "max length is 406" — presumably the longest input; TODO confirm).

    Returns:
        The tokenizer output with three extra keys:
        "location_int" (list of (start, end) character spans),
        "sequence_ids", and "labels" — -100 for special/padding tokens and
        feature-text tokens, 1.0 for note tokens lying entirely inside an
        annotated span, 0.0 otherwise.
    """
    tokenized_inputs = tokenizer(
        example["feature_text"],
        example["pn_history"],
        truncation = "only_second",  # only the patient note may be cut
        max_length = max_length,
        padding = "max_length",
        return_offsets_mapping = True
    )
    location_spans = loc_list_to_ints(example["location_list"])
    tokenized_inputs["location_int"] = location_spans
    sequence_ids = tokenized_inputs.sequence_ids()
    tokenized_inputs["sequence_ids"] = sequence_ids

    labels = []
    for seq_id, (token_start, token_end) in zip(sequence_ids, tokenized_inputs["offset_mapping"]):
        # Only tokens of the second sequence (the note) are scored; everything
        # else is masked out with -100.
        if seq_id is None or seq_id == 0:
            labels.append(-100)
            continue
        inside_span = any(
            token_start >= feature_start and token_end <= feature_end
            for feature_start, feature_end in location_spans
        )
        labels.append(1.0 if inside_span else 0.0)
    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [None]:
# Tokenize the example row and dump every field of the result for inspection.
tokenized_inputs = tokenize_and_add_labels(tokenizer, example)
for key, value in tokenized_inputs.items():
    print(key)
    print(value)
    print("=" * 100)

input_ids
[101, 1042, 2232, 2595, 1997, 16405, 2094, 1025, 2155, 2381, 1997, 27233, 4588, 17359, 17119, 4295, 102, 5776, 2003, 1037, 3486, 2095, 2214, 3287, 10886, 2007, 1037, 1016, 3204, 1044, 2595, 1997, 4958, 13340, 3367, 7277, 3255, 3255, 2003, 4629, 1998, 21690, 1999, 3267, 1998, 5158, 2154, 1016, 1011, 1017, 2595, 1037, 2649, 2004, 1019, 1013, 2184, 1999, 18976, 1010, 2053, 8249, 10722, 5244, 3255, 2761, 2081, 2021, 2488, 1996, 2053, 2393, 2936, 2012, 5983, 1010, 3141, 2000, 2823, 2203, 5668, 2229, 2025, 5158, 3255, 2305, 2070, 16033, 27896, 2601, 1010, 2053, 9530, 16643, 24952, 2239, 1010, 22939, 12171, 20192, 1010, 2053, 24780, 2203, 5668, 2229, 8138, 19029, 7610, 2232, 1024, 2067, 3255, 1998, 12403, 19230, 1010, 2053, 12058, 5134, 19960, 2015, 1024, 9587, 18886, 2078, 1016, 15345, 1013, 2733, 2005, 3255, 1042, 2232, 1024, 4470, 2007, 17359, 17119, 14021, 1024, 2573, 2004, 2810, 7309, 1010, 2025, 12581, 3161, 1010, 5610, 2015, 1014, 1012, 1019, 1011, 1015, 5308, 10975, 2154, 20

We need "input_ids" and "attention_mask" as the inputs to BERT.

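Labels are 1.0 for tokens that lie inside an annotated span, 0.0 for the other note tokens, and -100 for tokens that are ignored (special tokens, padding and the feature text).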

So we can train this as a binary classification task per token: does this token belong to the feature? -> 1 or 0

# Dataset

In [None]:
class NBMEData(torch.utils.data.Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Args:
        data: DataFrame whose rows provide "feature_text", "pn_history" and
            "location_list".
        tokenizer: tokenizer handed to ``tokenize_and_add_labels``.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask, labels, offset_mapping, sequence_ids) as numpy arrays."""
        # Positional lookup: idx ranges over 0..len-1, so it must not depend on
        # the frame's index labels (identical to .loc on a default RangeIndex).
        example = self.data.iloc[idx]
        tokenized = tokenize_and_add_labels(self.tokenizer, example)

        input_ids = np.array(tokenized["input_ids"]) # model input
        attention_mask = np.array(tokenized["attention_mask"]) # model input
        labels = np.array(tokenized["labels"]) # for the loss and the CV score

        offset_mapping = np.array(tokenized["offset_mapping"]) # for the CV score
        # The float cast turns the None entries (special/padding tokens) into NaN
        # so the array can be batched; downstream code must not test for None.
        sequence_ids = np.array(tokenized["sequence_ids"]).astype("float16") # for the CV score

        return input_ids, attention_mask, labels, offset_mapping, sequence_ids

# Model

In [None]:
class NBMEModel(torch.nn.Module):
    """Transformer backbone followed by dropout and a per-token linear head."""

    def __init__(self):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(MODEL_NAME) # BERT model
        self.dropout = torch.nn.Dropout(p = 0.2)
        # Read the width from the backbone config instead of hard-coding 768,
        # so a different MODEL_NAME keeps working; last_hidden_state has size
        # (sequence_length, hidden_size) per example.
        self.classifier = torch.nn.Linear(self.backbone.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per token (the trailing size-1 dimension is squeezed away)."""
        # Index 0 of the backbone output is last_hidden_state.
        last_hidden_state = self.backbone(input_ids = input_ids, attention_mask = attention_mask)[0]
        logits = self.classifier(self.dropout(last_hidden_state)).squeeze(-1)
        return logits

# Training

In [None]:
# Training configuration.
fold = 0  # only referenced by the commented-out fold split below
BATCH_SIZE = 16
EPOCHS = 2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = NBMEModel().to(DEVICE)
# NOTE(review): the training loop further down builds its own BCEWithLogitsLoss
# with reduction="none"; this criterion does not appear to be used there.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-5)

# Fold-based split kept for reference; the separate train/valid CSVs are used instead.
#train = df.loc[df["fold"] != fold].reset_index(drop = True)
#valid = df.loc[df["fold"] == fold].reset_index(drop = True)
train_ds = NBMEData(df_train, tokenizer)
valid_ds = NBMEData(df_val, tokenizer)

#reducing size, for debugging

#train_part_ds = torch.utils.data.random_split(train_ds, [100, len(train_ds) - 100])[0]
#valid_part_ds = torch.utils.data.random_split(valid_ds, [100, len(valid_ds) - 100])[0]
#train_ds = train_part_ds
#valid_ds = valid_part_ds


# Training batches are shuffled and the last partial batch is dropped;
# validation keeps every sample in order and uses a doubled batch size.
train_dl = torch.utils.data.DataLoader(train_ds, batch_size = BATCH_SIZE, pin_memory = True, shuffle = True, drop_last = True)
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size = BATCH_SIZE * 2, pin_memory = True, shuffle = False, drop_last = False)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
class AverageMeter:
    """Keeps the latest value and a weighted running average.

    Attributes: ``val`` (last value), ``sum`` (weighted total),
    ``count`` (total weight) and ``avg`` (sum / count).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n = 1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def sigmoid(z):
    """Logistic function 1 / (1 + e^-z); works on scalars and numpy arrays."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def get_location_predictions(preds, offset_mapping, sequence_ids, test = False):
    """Turn per-token logits into predicted character spans.

    Args:
        preds: iterable of per-example logit sequences.
        offset_mapping: per-example (start, end) character offsets per token.
        sequence_ids: per-example sequence ids per token. Only tokens whose id
            equals 1 (the patient note) are considered; ids of 0, None or NaN
            are skipped.
        test: when True each example is returned as one string of
            "start end" pairs joined with "; ", otherwise as a list of
            (start, end) tuples.

    Returns:
        One entry per example, in input order.
    """
    all_predictions = []
    for pred, offsets, seq_ids in zip(preds, offset_mapping, sequence_ids):
        # Logistic function applied to the logits.
        probs = 1 / (1 + np.exp(-np.asarray(pred)))
        spans = []
        start_idx = None
        end_idx = None
        for p, o, s_id in zip(probs, offsets, seq_ids):
            # `!= 1` rather than `is None or == 0`: once sequence ids have been
            # cast to a float array, None becomes NaN and an `is None` test
            # would let special/padding tokens through.
            if s_id != 1:
                continue
            if p > 0.5:
                if start_idx is None:
                    start_idx = o[0]
                end_idx = o[1]
            elif start_idx is not None:
                spans.append((start_idx, end_idx))
                start_idx = None
        # Emit a span that is still open when the note tokens run out.
        if start_idx is not None:
            spans.append((start_idx, end_idx))

        if test:
            all_predictions.append("; ".join(f"{span_start} {span_end}" for span_start, span_end in spans))
        else:
            all_predictions.append(spans)
    return all_predictions

def calculate_char_CV(predictions, offset_mapping, sequence_ids, labels):
    """Character-level precision / recall / F1 of predicted spans.

    Args:
        predictions: per-example lists of (start, end) predicted spans.
        offset_mapping: per-example (start, end) character offsets per token.
        sequence_ids: per-example sequence ids per token; only tokens whose id
            equals 1 contribute to the reference labels.
        labels: per-example token labels; a token counts as positive when
            ``int(label) == 1``.

    Returns:
        dict with "precision", "recall" and "f1".
    """
    all_labels = []
    all_preds = []
    # Distinct loop names so the `labels` argument is not rebound mid-iteration.
    for spans, offsets, seq_ids, token_labels in zip(predictions, offset_mapping, sequence_ids, labels):
        # The largest offset seen bounds the number of characters to score.
        num_chars = max(list(chain(*offsets)))
        char_labels = np.zeros((num_chars))
        for o, s_id, label in zip(offsets, seq_ids, token_labels):
            # `!= 1` also skips NaN ids (None after a float cast), which an
            # `is None` test would not catch.
            if s_id != 1:
                continue
            if int(label) == 1:
                char_labels[o[0]:o[1]] = 1
        char_preds = np.zeros((num_chars))
        for start_idx, end_idx in spans:
            char_preds[start_idx:end_idx] = 1
        all_labels.extend(char_labels)
        all_preds.extend(char_preds)
    results = precision_recall_fscore_support(all_labels, all_preds, average = "binary")
    return {
        "precision": results[0],
        "recall": results[1],
        "f1": results[2]
    }

def compute_metrics(p):
    """Token-level precision / recall / F1, ignoring positions labelled -100.

    ``p`` is a (predictions, labels) pair; a prediction counts as positive
    when its value is greater than 0.5.
    """
    predictions, y_true = p
    y_true = y_true.astype(int)

    # Thresholded predictions at the positions that carry a real label.
    kept_preds = []
    for pred_row, label_row in zip(predictions, y_true):
        kept_preds.append(
            [int(score > 0.5) for score, lab in zip(pred_row, label_row) if lab != -100]
        )
    # Matching reference labels with the ignored positions removed.
    kept_labels = [[lab for lab in label_row if lab != -100] for label_row in y_true]

    flat_true = list(chain(*kept_labels))
    flat_pred = list(chain(*kept_preds))
    results = precision_recall_fscore_support(flat_true, flat_pred, average = "binary")
    precision, recall, f1 = results[0], results[1], results[2]
    return {
        "token_precision": precision,
        "token_recall": recall,
        "token_f1": f1
    }

In [None]:
history = {"train": [], "valid": []}
best_loss = np.inf
# Unreduced loss so positions labelled -100 can be masked out before averaging;
# built once instead of on every batch.
loss_fct = torch.nn.BCEWithLogitsLoss(reduction = "none")

for epoch in range(EPOCHS):
    #training
    model.train()
    train_loss = AverageMeter()
    pbar = tqdm(train_dl)
    for batch in pbar:
        optimizer.zero_grad()
        input_ids = batch[0].to(DEVICE)
        attention_mask = batch[1].to(DEVICE)
        labels = batch[2].to(DEVICE)
        logits = model(input_ids, attention_mask)
        loss = loss_fct(logits, labels)
        # Only "pn_history" tokens count: labels elsewhere are -100 (< -1).
        loss = torch.masked_select(loss, labels > -1).mean()
        loss.backward()
        optimizer.step()
        train_loss.update(val = loss.item(), n = len(input_ids))
        pbar.set_postfix(Loss = train_loss.avg)
    print(epoch, train_loss.avg)
    history["train"].append(train_loss.avg)

    #evaluation
    model.eval()
    valid_loss = AverageMeter()
    with torch.no_grad():
        # Own progress bar for validation, so the running loss is shown on it
        # rather than on the already-finished training bar.
        valid_pbar = tqdm(valid_dl)
        for batch in valid_pbar:
            input_ids = batch[0].to(DEVICE)
            attention_mask = batch[1].to(DEVICE)
            labels = batch[2].to(DEVICE)
            logits = model(input_ids, attention_mask)
            loss = loss_fct(logits, labels)
            loss = torch.masked_select(loss, labels > -1).mean()
            valid_loss.update(val = loss.item(), n = len(input_ids))
            valid_pbar.set_postfix(Loss = valid_loss.avg)
    print(epoch, valid_loss.avg)
    history["valid"].append(valid_loss.avg)

    # save model whenever the validation loss improves
    if valid_loss.avg < best_loss:
        best_loss = valid_loss.avg
        torch.save(model.state_dict(), f"{ROOT}/nbme.pth")

  0%|          | 0/237 [00:00<?, ?it/s]

# Evaluation

In [None]:
# Reload the checkpoint saved by the training loop and score it on the validation set.
model.load_state_dict(torch.load(f"{ROOT}/nbme.pth", map_location = DEVICE))

model.eval()
preds = []
offsets = []
seq_ids = []
lbls = []
with torch.no_grad():
    for batch in tqdm(valid_dl):
        input_ids = batch[0].to(DEVICE)
        attention_mask = batch[1].to(DEVICE)
        labels = batch[2].to(DEVICE)
        offset_mapping = batch[3]
        sequence_ids = batch[4]
        logits = model(input_ids, attention_mask)
        preds.append(logits.cpu().numpy())
        offsets.append(offset_mapping.numpy())
        seq_ids.append(sequence_ids.numpy())
        lbls.append(labels.cpu().numpy())
# Stack the per-batch arrays into one array per field.
preds = np.concatenate(preds, axis = 0)
offsets = np.concatenate(offsets, axis = 0)
seq_ids = np.concatenate(seq_ids, axis = 0)
lbls = np.concatenate(lbls, axis = 0)
location_preds = get_location_predictions(preds, offsets, seq_ids, test = False)
score = calculate_char_CV(location_preds, offsets, seq_ids, lbls)
print(score)
# Append the score to the running results log; the context manager closes the
# file even if the write fails.
with open(ROOT +'/results.txt', 'a') as file:
    file.write('\n' + str(score))

# Inference