In [1]:
import typing
import os
import os
import glob
import pandas as pd
import numpy as np
import pickle
import torch
import transformers

from transformers import BertForTokenClassification, BertTokenizer
from sqlalchemy import create_engine
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_device():
    device = torch.device('cpu')
    if (torch.cuda.is_available()):
        device = torch.device("cuda")

    return device


DEVICE = get_device()


tag2idx = {
    'B': 2,
    'I': 0,
    'O': 1,
    'PAD': 3
}
idx2tag = {v: k for k,v in tag2idx.items()}

In [3]:
def get_ner_from_string(string, model, tokenizer):
    tokenized_sentence = tokenizer.encode(string)
    print(len(tokenized_sentence))
    print(string)
    input_ids = torch.tensor([tokenized_sentence]).to(DEVICE)

    with torch.no_grad():
        output = model(input_ids)

    label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
    tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

    # Join the split tokens
    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, label_indices[0]):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]

        else:
            new_labels.append(idx2tag[label_idx])
            new_tokens.append(token)

    return new_tokens, new_labels


def get_unique_treatments(string, model, tokenizer):
    tokens, labels = get_ner_from_string(string, model, tokenizer)
    treatments = []

    for i in range(len(tokens)):
        token = tokens[i]
        label = labels[i]

    if label == 'B':
        treatments.append(token)

    if label == 'I' and treatments:
        treatments[-1] += token
  
    return list(set(treatments))

In [4]:
def get_groups(connection):
    print("Reading groups...")
    groups = pd.read_sql("select id, title, description from temp_schema.groups", connection)
    groups['text'] = 'Title: ' + groups['title'] + ' Description: ' + groups['description']

    return groups


def get_treatments(connection):
    print("Reading treatments...")
    treats = pd.read_sql('select id as treat_id, name as treatments from temp_schema.treatments', connection)

    return treats 


def get_effect_groups(connection):
    print("Reading effects groups...")
    effect_groups = pd.read_sql("select id, title, description from temp_schema.effectsgroups", connection)
    effect_groups['text'] = 'Title: ' + effect_groups['title'] + ' Description: ' + effect_groups['description']

    return effect_groups

In [5]:
def load_tokenizer():
    print("Loading tokenizer...")
    return BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT', do_lower_case=True)


In [6]:
def load_ner_model():
    print("Loading NER model...")
    model_path = f'{MODEL_PATH}/ClinicalBertNERModel.pt'
    if (not os.path.exists(model_path)):
        download_model()

    model = BertForTokenClassification.from_pretrained(
        "emilyalsentzer/Bio_ClinicalBERT",
        num_labels=len(tag2idx),
        output_attentions = False,
        output_hidden_states = False)

    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    return model

In [7]:
def parse_treatments(groups: pd.DataFrame, effect_groups: pd.DataFrame):
    # TODO - this should be batched
    model = load_ner_model()
    tokenizer = load_tokenizer()

    tqdm.pandas()

    groups['treatments'] = groups['text'].progress_apply(lambda x: get_unique_treatments(x, model, tokenizer))
    print(groups['treatments'])
    effect_groups['treatments'] = effect_groups['text'].progress_apply(lambda x: get_unique_treatments(x, model, tokenizer))

    unique_treatments = pd.concat([groups['treatments'], effect_groups['treatments']], axis=0)\
        .explode('treatments')\
        .str.lower()\
        .drop_duplicates()\
        .to_frame()

    unique_treatments = unique_treatments.rename(columns={ 'treatments': 'name'})
    unique_treatments['id'] = range(1, len(unique_treatments) + 1)
    unique_treatments = unique_treatments.reset_index(drop=True)

    return unique_treatments, groups, effect_groups


In [8]:
DATABASE_URL = os.environ.get('DATABASE_URL', default="postgresql://meditreats@localhost:5432")
MODEL_PATH = os.environ.get('MODEL_PATH', default="/Users/porterhunley/models")

In [9]:
from transformers import Trainer, TrainingArguments
from torch.nn.utils.rnn import pad_sequence

BATCH_SIZE=100

def get_ner_from_batch(strings, model, tokenizer):
    tokenized_sentences = [tokenizer.encode(s) for s in tqdm(strings)]
    input_ids = pad_sequence([torch.tensor(tokens) for tokens in tokenized_sentences], batch_first=True, padding_value=tokenizer.pad_token_id).to(DEVICE)

    with torch.no_grad():
        output = model(input_ids)
    
    label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
    batch_size, seq_len = label_indices.shape

    tokens_batch = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy())

    new_tokens_batch, new_labels_batch = [], []
    for i in range(batch_size):
        new_tokens, new_labels = [], []
        for j in range(seq_len):
            token, label_idx = tokens_batch[i][j], label_indices[i][j]
            if token.startswith("##") and new_tokens:
                new_tokens[-1] = new_tokens[-1] + token[2:]
            else:
                new_labels.append(idx2tag[label_idx])
                new_tokens.append(token)
        new_tokens_batch.append(new_tokens)
        new_labels_batch.append(new_labels)

    return new_tokens_batch, new_labels_batch


In [10]:
def parse_treatments_b(groups: pd.DataFrame, effect_groups: pd.DataFrame):
    # TODO - this should be batched
    tqdm.pandas()

    text_batches = [groups['text'][:500].values, effect_groups['text'][:500].values]
    model = load_ner_model()
    tokenizer = load_tokenizer()

    tqdm.pandas()

    treatments_batches = []
    for text_batch in text_batches:
        tokens_batch, _ = get_ner_from_batch(text_batch, model, tokenizer) #25000 text values
        treatments_batch = [set(t) for t in tokens_batch]
        treatments_batches.append(treatments_batch)

    groups['treatments'] = [list(t) for t in treatments_batches[0]]
    effect_groups['treatments'] = [list(t) for t in treatments_batches[1]]

    unique_treatments = pd.concat([groups['treatments'], effect_groups['treatments']], axis=0)\
        .explode('treatments')\
        .str.lower()\
        .drop_duplicates()\
        .to_frame()

    unique_treatments = unique_treatments.rename(columns={ 'treatments': 'name'})
    unique_treatments['id'] = range(1, len(unique_treatments) + 1)
    unique_treatments = unique_treatments.reset_index(drop=True)

    return unique_treatments, groups, effect_groups


In [11]:
engine = create_engine("postgresql://meditreats:meditreats@localhost:5432")


In [12]:
connection = engine.connect()

groups = get_groups(connection)
effect_groups = get_effect_groups(connection)

treats, treat_groups, treat_effect_groups = parse_treatments_b(groups, effect_groups)

Reading groups...
Reading effects groups...
Loading NER model...


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint 

Loading tokenizer...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 1924.13it/s]


TypeError: only size-1 arrays can be converted to Python scalars