In [None]:
""" USER INPUTS """
# Model group ids to run inference with. Each id is resolved by list_model_folds
# to the directory '<input>/nbme-<id>'.
MODEL_GROUPS = ['17y1th7h'] # Models to use for inference
# Selects which set of run directories the path-setup cell uses; that cell
# handles the values 'Local' and 'Kaggle'.
LOCATION = 'Local'  # Local vs. Kaggle Run Directories
# Reference: https://arthought.com/transformer-model-fine-tuning-for-text-classification-with-pytorch-lightning/

In [None]:
from pathlib import Path
import os
import torch

# Resolve environment-specific directories and the TestDataset import.
# Both branches set TOKENIZERS_PARALLELISM through os.environ (instead of mixing
# os.environ with the IPython-only %env magic) before the dataset module is imported.
if LOCATION == 'Local':
    transformer_path = Path('./venv/lib/python3.9/site-packages/transformers')
    input_dir = Path('./input')
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    from nlp.datasets import TestDataset
elif LOCATION == 'Kaggle':
    transformer_path = Path('/opt/conda/lib/python3.7/site-packages/transformers')
    input_dir = Path('../input')
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    from nlp_datasets import TestDataset
else:
    # Fail here with a clear message rather than with a NameError on
    # `transformer_path` a few lines further down.
    raise ValueError(f"Unknown LOCATION {LOCATION!r}; expected 'Local' or 'Kaggle'")

# Shared lookup of base directories used by the later cells.
PATHS = {'transformer': transformer_path,
         'input': input_dir }
print(PATHS)

# Directory holding the competition CSV files, and the device used for inference.
CSV_DATA_PATH = os.path.join(PATHS['input'], 'nbme-score-clinical-patient-notes')
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
#
# What this cell does: it copies three files from the
# '<input>/deberta-v2-3-fast-tokenizer' directory into the installed
# `transformers` package directory (PATHS['transformer']), replacing any
# existing copies, and then imports DebertaV2TokenizerFast from that package.
import shutil

transformers_path = PATHS['transformer']
# NOTE: this rebinds the global `input_dir` (set in the path-setup cell to the
# base input directory) to the tokenizer-files directory.
input_dir = PATHS['input'] / 'deberta-v2-3-fast-tokenizer'
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name
# Remove the packaged converter (if present) before copying the replacement in.
if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"
# Same remove-then-copy step for the two DeBERTa v2 tokenizer modules.
for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path / filename
    if filepath.exists():
        filepath.unlink()
    shutil.copy(input_dir / filename, filepath)
# Imported only after the files above are in place.
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast



In [None]:
import torch.nn as nn
from torch.optim import Adam, SGD, AdamW
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
)
import pytorch_lightning as pl
from sklearn.metrics import f1_score
import ast
from torch.utils.data import DataLoader
import torch
from transformers import AutoTokenizer
import os
import gc
import pandas as pd
import yaml
from tqdm.auto import tqdm
import numpy as np
import itertools

# Raise pandas' display limits (rows, columns, line width) for printed frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [None]:
def micro_f1(preds, truths):
    """
    F1 score computed over all instances pooled together.
    Args:
        preds (list of lists of ints): Binary predictions, one sequence per instance.
        truths (list of lists of ints): Binary ground truths, one sequence per instance.
    Returns:
        float: f1 score of the flattened predictions against the flattened truths.
    """
    # Pool every instance into one flat vector so the score is "micro" averaged.
    flat_preds, flat_truths = np.concatenate(preds), np.concatenate(truths)
    return f1_score(flat_truths, flat_preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each position is in a span.
    Args:
        spans (list of lists of two ints): Spans as [start, end) pairs.
        length (int, optional): Size of the output array. When None, the largest
            value found in `spans` is used; with no spans at all the result is an
            empty array.
    Returns:
        np array [length]: Binarized spans (1 inside a span, 0 elsewhere).
    """
    if length is None:
        # np.max raises on an empty sequence, so fall back to length 0 explicitly.
        length = np.max(spans) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1

    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 computed from span lists.
    Each (prediction, truth) pair is binarized to a common length and the pooled
    binary vectors are scored with `micro_f1`. Pairs in which both span lists are
    empty are skipped.
    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.
    Returns:
        float: f1 score.
    """
    bin_preds, bin_truths = [], []
    for pred, truth in zip(preds, truths):
        if len(pred) or len(truth):
            # Size both binary vectors by the largest offset seen on either side.
            pred_extent = np.max(pred) if len(pred) else 0
            truth_extent = np.max(truth) if len(truth) else 0
            length = max(pred_extent, truth_extent)
            bin_preds.append(spans_to_binary(pred, length))
            bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


def create_labels_for_scoring(df):
    """
    Build ground-truth span lists from the 'location' column of `df`.

    Side effect: adds (or overwrites) the column 'location_for_create_labels'
    on `df`.

    Args:
        df (pd.DataFrame): Must have a 'location' column whose cells are lists of
            "start end" strings. Rows are addressed with `df.loc[i, ...]` for
            i in range(len(df)), so the index labels must be 0..len(df)-1.
    Returns:
        list of lists of [int, int]: One list of [start, end] pairs per row;
        empty for rows with no locations.
    """
    # example: ['0 1', '3 4'] -> ['0 1;3 4']
    # Every row starts out with an empty list in the helper column.
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            # NOTE(review): a nested list [["..."]] is assigned here, yet the loop
            # below treats element [0] of the cell as a string — presumably pandas
            # unwraps the outer list on assignment; confirm on the pandas version in use.
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels: parse "start end;start end" back into integer pairs
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    """
    Spread per-token predictions over the characters each token covers.
    Args:
        texts (sequence of str): Input texts.
        predictions (sequence of sequences of floats): Per-token values, one
            sequence per text, aligned with the tokenizer's offset mapping.
        tokenizer (callable): Called as tokenizer(text, add_special_tokens=True,
            return_offsets_mapping=True); its result is indexed with 'offset_mapping'.
    Returns:
        list of np arrays: One array per text, of length len(text); characters not
        covered by any offset stay at 0.
    """
    char_probs = [np.zeros(len(text)) for text in texts]
    for probs, text, token_preds in zip(char_probs, texts, predictions):
        offsets = tokenizer(text,
                            add_special_tokens=True,
                            return_offsets_mapping=True)['offset_mapping']
        for span, token_pred in zip(offsets, token_preds):
            # Later tokens overwrite earlier ones where offsets overlap.
            probs[span[0]:span[1]] = token_pred
    return char_probs


def get_results(char_probs, th=0.5):
    """
    Turn per-character probabilities into "first last" span strings.
    Args:
        char_probs (iterable of np arrays): Per-character values, one array per text.
        th (float): Characters with value >= th are selected.
    Returns:
        list of str: One string per array. Selected indices are shifted by +1,
        grouped into runs of consecutive values, and each run is written as
        "<first> <last>"; runs are joined with ';'. Empty string when nothing
        is selected.
    """
    span_strings = []
    for char_prob in char_probs:
        positions = np.where(char_prob >= th)[0] + 1
        runs = []
        for position in positions:
            # Extend the current run only when this value directly follows its last one.
            if runs and position == runs[-1][-1] + 1:
                runs[-1].append(position)
            else:
                runs.append([position])
        span_strings.append(";".join(f"{min(run)} {max(run)}" for run in runs))
    return span_strings


def get_predictions(results):
    """
    Parse span strings back into integer pairs.
    Args:
        results (iterable of str): Strings of the form "a b;c d"; "" means no spans.
    Returns:
        list of lists of [int, int]: One list of pairs per input string.
    """
    def _parse(result):
        # An empty string carries no spans at all.
        if result == "":
            return []
        fields = (chunk.split() for chunk in result.split(';'))
        return [[int(field[0]), int(field[1])] for field in fields]

    return [_parse(result) for result in results]


def get_score(y_true, y_pred):
    """
    Span-level micro f1 between two collections of span lists.
    The arguments are forwarded positionally to `span_micro_f1`
    (`y_true` first, `y_pred` second).
    Returns:
        float: f1 score.
    """
    return span_micro_f1(y_true, y_pred)

In [None]:
def dict2obj(d):
    """
    Recursively convert nested dicts into attribute-style objects.
    Lists are converted element by element and returned as lists. Any value that
    is neither a list nor a dict is returned unchanged. Each dict becomes an
    instance of a locally defined empty class whose attributes mirror the keys.
    """
    if isinstance(d, list):
        return [dict2obj(item) for item in d]
    if not isinstance(d, dict):
        return d

    # Plain attribute container; a new class is created on every call.
    class C:
        pass

    obj = C()
    vars(obj).update((key, dict2obj(value)) for key, value in d.items())
    return obj


# NOTE: a function with this same name is also defined in an earlier cell; this
# later definition is the one in effect once the cell has run.
def get_char_probs(texts, predictions, tokenizer):
    """ Probability for each Character

    Tokenizes every text with offsets and writes each token's prediction onto the
    character range that token covers. Returns one np array of length len(text)
    per text; characters not covered by any offset stay at 0.
    """
    per_text_probs = [np.zeros(len(text)) for text in texts]
    for row, (text, token_preds) in enumerate(zip(texts, predictions)):
        encoding = tokenizer(text,
                             add_special_tokens=True,
                             return_offsets_mapping=True)
        target = per_text_probs[row]
        for offsets, token_pred in zip(encoding['offset_mapping'], token_preds):
            char_start, char_end = offsets[0], offsets[1]
            target[char_start:char_end] = token_pred
    return per_text_probs


# NOTE: a function with this same name is also defined in an earlier cell; this
# later definition is the one in effect once the cell has run.
def get_results(char_probs, th=0.5):
    """
    Convert per-character values into ';'-joined "first last" span strings.
    Indices whose value is >= `th` are shifted by +1 and grouped into runs of
    consecutive numbers; each run contributes "<first> <last>". An array with no
    selected index yields an empty string.
    """
    def _format_spans(char_prob):
        hits = np.where(char_prob >= th)[0] + 1
        # Consecutive values share the same (value - position) offset, so grouping
        # on that offset yields one group per unbroken run.
        grouped = itertools.groupby(enumerate(hits), key=lambda pair: pair[1] - pair[0])
        runs = [[value for _, value in members] for _, members in grouped]
        return ";".join(f"{min(run)} {max(run)}" for run in runs)

    return [_format_spans(char_prob) for char_prob in char_probs]


In [None]:
def list_model_folds(group_id):
    """ List all models and their files (i.e., *.ckpt and *.config)

    Walks '<input>/nbme-<group_id>' and returns one [config_path, ckpt_path]
    pair per distinct fold id found in the file paths.

    The fold id of a path is the text after its last '_' up to the next '/'.
    A file belongs to a fold when the fold id occurs anywhere in its path; the
    config is a matching path containing '.yaml', the checkpoint one containing
    '.ckpt' (the last match in walk order wins).

    NOTE(review): config/ckpt are initialised once before the fold loop, so a
    fold with no matching file of one kind keeps the previous fold's path.
    NOTE(review): fold membership is a substring test, so a short fold id can
    also match unrelated parts of a path (for example the group id).
    """
    group_dir = os.path.join(PATHS['input'], 'nbme-' + group_id)

    # Every file below the group directory, as a full path.
    all_files = [os.path.join(parent, name)
                 for parent, _subdirs, names in os.walk(group_dir)
                 for name in names]

    # Distinct fold ids in first-seen order.
    fold_ids = list(dict.fromkeys(path.split('_')[-1].split('/')[0] for path in all_files))

    config_path, ckpt_path = None, None
    fold_files = []
    for fold_id in fold_ids:
        for path in all_files:
            if fold_id not in path:
                continue
            if '.yaml' in path:
                config_path = path
            if '.ckpt' in path:
                ckpt_path = path
        fold_files.append([config_path, ckpt_path])
    return fold_files

""" List all models and files for prediction """
# Flat list of [config_path, ckpt_path] pairs across every group in MODEL_GROUPS;
# the inference cell iterates over it.
all_model_files = []
# NOTE: the loop variable MODEL_GROUP stays bound after the loop and is read
# again by the inference cell.
for MODEL_GROUP in MODEL_GROUPS:
    model_folds = list_model_folds(group_id=MODEL_GROUP)
    for model_fold in model_folds:
        all_model_files.append(model_fold)

# Echo what was found so that missing config/checkpoint paths (None) are visible.
print(all_model_files)
for all_model_file in all_model_files:
    print(f'Model Files: {all_model_file}')

# NOTE(review): looks like a leftover debugging marker.
print('check here')

In [None]:

class BaseLineModel(pl.LightningModule):
    """Transformer backbone with a single-logit linear head applied to its first output.

    The backbone is created with ``AutoModel.from_pretrained``; its first output is
    passed through dropout and ``nn.Linear(hidden_size, 1)``. Training and validation
    use element-wise BCE-with-logits in which entries labelled ``-1`` are ignored.

    Args:
        model_repo: Directory name under ``PATHS['input']`` used to derive the default
            config/model paths when those are not given.
        autoconfig_path: Passed to ``AutoConfig.from_pretrained``. Defaults to
            ``PATHS['input'] / model_repo / 'config.json'``.
        automodel_path: Passed to ``AutoModel.from_pretrained``. Defaults to
            ``PATHS['input'] / model_repo``.
        learning_rate: Learning rate given to AdamW.
        adam_epsilon: Epsilon given to AdamW.
        warmup_steps: Saved as a hyperparameter; not read anywhere in this class.
        weight_decay: Weight decay for parameters outside the no-decay group.
        dropout_rate: Dropout probability applied before the linear head.
        th: Threshold handed to ``get_results`` when scoring validation predictions.
    """

    def __init__(self, model_repo, *,
                 autoconfig_path=None,
                 automodel_path=None,
                 learning_rate: float = 2e-5,
                 adam_epsilon: float = 1e-8,
                 warmup_steps: int = 0,
                 weight_decay: float = 0.0,
                 dropout_rate=0.1,
                 th: float = 0.5,
                 ):
        super().__init__()
        # Record the init arguments once. The stored values are printed from
        # self.hparams, because save_hyperparameters() itself returns nothing.
        self.save_hyperparameters()
        print(f'Save Hyperparameters: {self.hparams}')
        self.th = th
        if autoconfig_path is None:
            autoconfig_path = PATHS['input'] / self.hparams.model_repo / 'config.json'
        if automodel_path is None:
            automodel_path = PATHS['input'] / self.hparams.model_repo

        print(f'Loading AutoConfig inside Model Class: {autoconfig_path}')
        self.config = AutoConfig.from_pretrained(autoconfig_path, output_hidden_states=True)
        print('\tLoaded AutoConfig inside Model Class')
        print(f'Loading AutoModel inside Model Class: {automodel_path}')
        self.base = AutoModel.from_pretrained(automodel_path, config=self.config)
        print('\tLoaded AutoModel inside Model Class')
        self.fc_dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(self.config.hidden_size, 1)

    def feature(self, inputs):
        """Run the backbone on ``inputs`` (a dict of keyword tensors) and return its first output."""
        outputs = self.base(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        """Return the linear head's logits (last dimension 1) for ``inputs``."""
        feature = self.feature(inputs)
        outs = self.fc(self.fc_dropout(feature))
        return outs

    def _masked_bce_loss(self, logits, labels):
        """Mean BCE-with-logits over the entries whose label is not -1."""
        flat_labels = labels.view(-1, 1)
        per_entry = nn.BCEWithLogitsLoss(reduction="none")(logits.view(-1, 1), flat_labels)
        return torch.masked_select(per_entry, flat_labels != -1).mean()

    def training_step(self, batch, batch_idx):
        """Compute the masked loss for one ``(inputs, labels, idx)`` batch."""
        inputs, labels, idx = batch
        y_preds = self(inputs)
        loss = self._masked_bce_loss(y_preds, labels)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and return the loss, raw logits, labels and sample indices."""
        inputs, labels, idx = batch
        preds = self(inputs)
        val_loss = self._masked_bce_loss(preds, labels)
        self.log("val_loss", val_loss, prog_bar=True)
        return {"loss": val_loss, "preds": preds, "labels": labels, "idxs": idx}

    def configure_optimizers(self):
        """Build AdamW with weight decay disabled for biases and LayerNorm weights.

        No learning-rate scheduler is created here.
        """
        no_decay = ["bias", "LayerNorm.weight"]
        named_params = list(self.named_parameters())
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in named_params if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in named_params if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.learning_rate,
                          eps=self.hparams.adam_epsilon)
        return [optimizer]

    def training_epoch_end(self, train_step_outputs):
        """Log the mean of the per-step training losses as ``train_avg_loss``."""
        if not self.trainer.sanity_checking:
            # Average Training Loss
            train_avg_loss = float(torch.tensor([x["loss"] for x in train_step_outputs]).mean().cpu().numpy())
            self.log('train_avg_loss', train_avg_loss, on_epoch=True, prog_bar=True)

    def validation_epoch_end(self, val_step_outputs):
        """Score the epoch's validation predictions and log ``val_avg_f1`` / ``val_avg_loss``.

        Texts, labels and the tokenizer are read from ``self.trainer.datamodule``.
        """
        if not self.trainer.sanity_checking:
            # Scoring: look up the texts/labels of the samples seen, convert the
            # sigmoid outputs to character-level spans, then compute span f1.
            idxs = torch.cat([x["idxs"] for x in val_step_outputs]).detach().cpu().numpy()
            val_texts = [self.trainer.datamodule.val_texts[idx] for idx in idxs]
            val_labels = [self.trainer.datamodule.val_labels[idx] for idx in idxs]
            predictions = np.squeeze(torch.cat([x["preds"].sigmoid() for x in val_step_outputs]).detach().cpu().numpy())
            char_probs = get_char_probs(val_texts, predictions, self.trainer.datamodule.cfg.tokenizer)
            results = get_results(char_probs, th=self.th)
            preds = get_predictions(results)
            score = get_score(val_labels, preds)
            self.log('val_avg_f1', score, on_epoch=True, prog_bar=True)

            # Average Validation Loss
            val_avg_loss = float(torch.tensor([x["loss"] for x in val_step_outputs]).mean().cpu().numpy())
            self.log('val_avg_loss', val_avg_loss, on_epoch=True, prog_bar=True)

In [None]:
def inference_fn(test_loader, model, device):
    """
    Run `model` over every batch of `test_loader` and collect sigmoid outputs.
    Args:
        test_loader: Iterable of batches; each batch is a dict of tensors.
        model: Callable taking the batch dict and returning logits.
        device: Device the batch tensors are moved to before the forward pass.
    Returns:
        np array: Per-batch sigmoid outputs concatenated along the first axis.
    """
    model.eval()
    batch_probs = []

    for inputs in tqdm(test_loader, total=len(test_loader)):
        # Move the batch onto the target device, updating the dict in place.
        inputs.update({name: tensor.to(device) for name, tensor in inputs.items()})
        with torch.no_grad():
            logits = model(inputs)
        batch_probs.append(logits.sigmoid().to('cpu').numpy())

    return np.concatenate(batch_probs)

In [None]:
""" Data Loading """
# Read the competition CSV files from CSV_DATA_PATH.
test = pd.read_csv(os.path.join(CSV_DATA_PATH, 'test.csv'))
submission = pd.read_csv(os.path.join(CSV_DATA_PATH, 'sample_submission.csv'))
features = pd.read_csv(os.path.join(CSV_DATA_PATH, 'features.csv'))
patient_notes = pd.read_csv(os.path.join(CSV_DATA_PATH, 'patient_notes.csv'))
# Left-join the feature columns (keyed on feature_num + case_num) and the note
# columns (keyed on pn_num + case_num) onto every test row. The inference cell
# later reads the 'pn_history' column from the merged frame.
test = test.merge(features, on=['feature_num', 'case_num'], how='left')
test = test.merge(patient_notes, on=['pn_num', 'case_num'], how='left')

""" Loop through each model and make inference """
# Fail early with a clear message instead of a confusing error further down.
if not all_model_files:
    raise RuntimeError('No model files were found; nothing to run inference with.')

# One entry per model: a list holding one per-character probability array per test row.
predictions = []
for model_files in all_model_files:
    config_path, ckpt_path = model_files[0], model_files[1]

    # Config file: parse the YAML and expose its keys as attributes.
    # The `with` block closes the file; no explicit close() is needed.
    with open(config_path, 'r') as file:
        CFG = dict2obj(yaml.safe_load(file))

    # Tokenizer
    print(f'About to load tokenizer: {CFG.model}')
    if "large" in CFG.model:
        CFG.tokenizer = DebertaV2TokenizerFast.from_pretrained(PATHS['input'] / 'get-token/tokenizer')
    else:
        CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_repo)
    print('LOADED TOKENIZER')

    # Test dataset
    test_dataset = TestDataset(CFG, test)
    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             pin_memory=True,
                             drop_last=False,
                             )
    print("LOADED TEST DATASETS")

    # Load the inference model. The checkpoint is loaded a single time, with the
    # config/model paths overridden for this environment.
    autoconfig_path = PATHS['input'] / CFG.model
    automodel_path = ckpt_path
    print(f'LOAD MODEL: {ckpt_path}\n'
          f'autoconfig_path: {autoconfig_path}\n'
          f'automodel_path: {automodel_path}\n')
    nbme_model = BaseLineModel.load_from_checkpoint(ckpt_path, autoconfig_path=autoconfig_path, automodel_path=automodel_path)
    nbme_model.to(DEVICE)
    nbme_model.freeze()
    nbme_model.eval()
    print(f'LOADED BASE MODEL: {ckpt_path}')

    # Token-level probabilities -> one row per test example -> per-character probabilities.
    prediction = inference_fn(test_loader, nbme_model, DEVICE)
    prediction = prediction.reshape((len(test), CFG.max_len))
    char_probs = get_char_probs(test['pn_history'].values, prediction, CFG.tokenizer)
    predictions.append(char_probs)

    # Release the model before the next one is loaded.
    del nbme_model, prediction, char_probs
    gc.collect()
    torch.cuda.empty_cache()
    print(f'Complted Inference: {ckpt_path}\n')

# Average across models one test row at a time. Each row's array has that note's
# character count as its length, so the nested list is ragged across rows and
# cannot be averaged as a single array on recent NumPy versions.
predictions = [np.mean(per_row, axis=0) for per_row in zip(*predictions)]

""" Submission """
print('Started Submission Section')
# Convert the averaged per-character probabilities into span strings.
# NOTE(review): the threshold 0.1 is hard-coded here and differs from the 0.5
# default of get_results — confirm it is the intended value.
results = get_results(predictions, th=0.1)
submission['location'] = results
print(submission.head())
# Only the 'id' and 'location' columns are written, without the index.
submission[['id', 'location']].to_csv('submission.csv', index=False)
print('Completed Submission')
print('End of Inference')
