In [1]:
import pickle
import pandas as pd
import os
import openai
import numpy as np
import ipdb
import re
from tqdm import tqdm

from transformers import GPT2Tokenizer
# GPT-2 tokenizer loaded from the pretrained "gpt2" checkpoint.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
import spacy
import scipy

# spaCy English pipeline ("en_core_web_sm" must be installed separately).
nlp = spacy.load("en_core_web_sm")
# The API key is read from the environment rather than hard-coded;
# this raises KeyError if OPENAI_KEY is not set.
openai.api_key= os.environ['OPENAI_KEY']

from data_utils import *
from gpt3_utils import *
from eval_utils import *

# Show up to 500 rows and up to 1000 characters per cell when displaying frames.
# Fully-qualified option names are required: short patterns such as 'max_rows'
# are matched as regexes against every option name and raise OptionError when
# more than one option matches (as happens in newer pandas releases).
pd.set_option('display.max_rows', 500, 'display.max_colwidth', 1000)

In [3]:
# Load the BC5CDR chemical and disease training sets (tab-separated files).
chemical_train, disease_train = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../data/bc5cdr_chemical.train.processed.tsv',
                     '../data/bc5cdr_disease.train.processed.tsv')
]

In [4]:
def get_entity_samples(df, num_entities):
    """Return a reproducible random sample of entity mentions from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``entities`` column whose cells are strings holding a
        Python list literal, e.g. ``"['aspirin', 'morphine']"``.
    num_entities : int
        Maximum number of entities to return.

    Returns
    -------
    numpy.ndarray
        The first ``num_entities`` items of a permutation (fixed seed 42) of
        all entities found in the column, duplicates included. Fewer items are
        returned when the column holds fewer entities.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    """
    import ast

    ent_samples = []

    for ents in df.entities:
        # The cells come from a data file, so they are parsed with
        # ast.literal_eval, which accepts literals only and never executes
        # code (unlike the built-in eval).
        ent_samples.extend(ast.literal_eval(ents))

    return np.random.RandomState(42).permutation(ent_samples)[:num_entities]

In [5]:
# Draw five entity mentions from each training set.
chemical_entity_samples, disease_entity_samples = (
    get_entity_samples(train_df, 5) for train_df in (chemical_train, disease_train)
)

In [6]:
# Display both samples as the cell output.
chemical_entity_samples, disease_entity_samples

(array(['bradykinin', 'cisapride', 'papaverine', 'glycosaminoglycan',
        'morphine'], dtype='<U124'),
 array(['dyskinesia', 'poisoning', 'migraine', 'cerebral haemorrhage',
        'gallbladder disease'], dtype='<U93'))

In [7]:
# OpenAI engine name used for the completion requests in this notebook.
engine = 'davinci'

In [8]:
def get_verbalization_candidates(engine, example_ents, example_ent_type, query_ents, num_candidates=5):
    """Ask a completion model for category names that describe ``query_ents``.

    A one-shot prompt is built: ``example_ents`` are listed and labelled with
    ``example_ent_type``, then ``query_ents`` are listed and the model is left
    to complete the label. The prompt is printed before the request is sent.

    Parameters
    ----------
    engine : str
        Engine name forwarded to ``openai.Completion.create``.
    example_ents : iterable
        Phrases used for the worked example.
    example_ent_type : str
        Label given to the worked example (e.g. ``"drugs"``).
    query_ents : iterable
        Phrases whose label the model should produce.
    num_candidates : int, default 5
        Passed as both ``n`` and ``best_of`` to the completion request.

    Returns
    -------
    list of str
        Lower-cased completions with every character other than ``a-z`` and
        space removed and surrounding whitespace stripped. Completions that
        are empty after cleaning are omitted, so the list may be shorter than
        ``num_candidates``.
    """

    # NOTE(review): the indentation of the continuation lines is part of the
    # string literal, so the prompt sent to the model contains those leading
    # spaces, and the phrase lists are rendered with Python's list repr.
    # Both are left untouched so previously recorded results stay comparable.
    verbalizing_prompt_template = """Phrases: {}

    The previous phrases are all examples of {}.

    Phrases: {}

    The previous phrases are all examples of"""

    quoted_examples = ['"{}"'.format(e) for e in example_ents]
    quoted_queries = ['"{}"'.format(e) for e in query_ents]

    verbalizing_prompt = verbalizing_prompt_template.format(quoted_examples, example_ent_type, quoted_queries)
    print(verbalizing_prompt)

    filter_sample = openai.Completion.create(engine=engine,
                                             prompt=verbalizing_prompt,
                                             max_tokens=3,
                                             top_p=0.8,
                                             n=num_candidates,
                                             best_of=num_candidates,
                                             stop=[".", "\n","<|endoftext|>"])

    candidates = []
    for choice in filter_sample['choices']:
        # Strip *after* removing non-letters: stripping first left stray
        # spaces behind whenever punctuation surrounded the text.
        cleaned = re.sub('[^a-z ]', '', choice['text'].lower()).strip()
        # An empty label is useless downstream, so it is dropped here.
        if cleaned:
            candidates.append(cleaned)

    return candidates

In [21]:
# Collect the distinct labels the model proposes for the disease phrases,
# using the chemical phrases as the worked example under three different names.
disease_candidates = set()

for drug_name in ['drugs','chemicals','medications']:
    proposed = get_verbalization_candidates(engine, chemical_entity_samples, drug_name, disease_entity_samples)
    disease_candidates.update(proposed)

disease_candidates

Phrases: ['"bradykinin"', '"cisapride"', '"papaverine"', '"glycosaminoglycan"', '"morphine"']

    The previous phrases are all examples of drugs.

    Phrases: ['"dyskinesia"', '"poisoning"', '"migraine"', '"cerebral haemorrhage"', '"gallbladder disease"']

    The previous phrases are all examples of
Phrases: ['"bradykinin"', '"cisapride"', '"papaverine"', '"glycosaminoglycan"', '"morphine"']

    The previous phrases are all examples of chemicals.

    Phrases: ['"dyskinesia"', '"poisoning"', '"migraine"', '"cerebral haemorrhage"', '"gallbladder disease"']

    The previous phrases are all examples of
Phrases: ['"bradykinin"', '"cisapride"', '"papaverine"', '"glycosaminoglycan"', '"morphine"']

    The previous phrases are all examples of medications.

    Phrases: ['"dyskinesia"', '"poisoning"', '"migraine"', '"cerebral haemorrhage"', '"gallbladder disease"']

    The previous phrases are all examples of


{'conditions',
 'diseases',
 'disorders',
 'health conditions',
 'medical conditions',
 'symptoms'}

In [30]:
# Curated list of best candidates retrieved.
# Wider pool that was considered: conditions, diseases, health conditions,
# health issues, health problems, medical conditions, medical problems,
# disorders, symptoms. The assignment of that full pool was overwritten on the
# very next statement, so only the subset that is actually used is assigned.
disease_candidates = {
    'diseases',
    'health conditions',
    'health issues',
    'disorders',
    'symptoms',
}

In [None]:
# chemical_candidates = []

# for disease_name in ['diseases','symptoms','medical conditions']:
#     chemical_candidates.extend(get_verbalization_candidates(engine, disease_entity_samples, disease_name, chemical_entity_samples))
    
# chemical_candidates = set(chemical_candidates)
# chemical_candidates

In [None]:
# Curated set of candidate labels for the chemical entity type.
chemical_candidates = {
    'chemicals', 'drugs', 'medications', 'medicines', 'pharmacological agents',
}

In [31]:
# Split the disease training set at its midpoint. The second half is shuffled
# with a fixed seed before both halves are passed to create_prompt_dataset.
split_at = len(disease_train) // 2

train_half1 = disease_train[:split_at]
train_half2 = disease_train[split_at:].sample(frac=1, random_state=np.random.RandomState(1))

dev_data = create_prompt_dataset(train_half1, train_half2, 42, 5, 50, 'random')

In [32]:
def select_verbalized_from_candidates(engine, dev_data, candidates, default_entity_name):
    """Score each candidate entity label by swapping it into the dev prompts.

    For every candidate, each occurrence of ``"<default_entity_name>:"`` in the
    prompts is replaced by the candidate with its first letter upper-cased
    (followed by a colon). The altered prompts are run through
    ``run_gpt3_on_df``, and the predictions are scored with ``conlleval_eval``.

    Parameters
    ----------
    engine : str
        Engine name forwarded to ``run_gpt3_on_df``.
    dev_data : mapping
        Must provide ``'test_df'`` (a frame with a ``test_ready_prompt``
        column) and ``'sep'``.
    candidates : iterable of str
        Candidate labels; printed once before the loop starts.
    default_entity_name : str
        Label currently used in the prompts (without the trailing colon).

    Returns
    -------
    pandas.DataFrame
        One row per candidate with positional columns 0-4: candidate, F1,
        precision, recall, and the first altered prompt.
    """
    dev_df = dev_data['test_df']
    base_prompts = dev_df.test_ready_prompt.values
    default_label = '{}:'.format(default_entity_name)

    print(candidates)

    rows = []
    for candidate in candidates:
        # Only the first character is upper-cased; the rest is kept as given.
        candidate_label = candidate[0].upper() + candidate[1:] + ':'
        altered_prompts = [prompt.replace(default_label, candidate_label) for prompt in base_prompts]

        result_df = run_gpt3_on_df(
            engine,
            dev_df,
            altered_prompts,
            max_tokens=30,
            sep=dev_data['sep'],
            logit_bias=10,
            sep_logit_bias=10,
            new_line_logit_bias=10,
        )

        scored_df = create_bio_preds(result_df, "predictions")
        f1, precision, recall = conlleval_eval(scored_df.ner_seq, scored_df.bio_preds)

        rows.append((candidate, f1, precision, recall, altered_prompts[0]))

    return pd.DataFrame(rows)

In [33]:
# Print the first evaluation prompt to check its layout before running the experiment.
print(dev_data['test_df'].test_ready_prompt.values[0])

Sentence: Lithium remains a first - line treatment for the acute and maintenance treatment of bipolar disorder .
Diseases: bipolar disorder

Sentence: The other rats showed a strong decrease in the rigidity and the occurrence of stereotyped ( S ) licking and / or gnawing in presence of akinetic or hyperkinetic ( K ) behaviour ( AS / KS group ) , suggesting signs of dopaminergic activation .
Diseases: rigidity, akinetic, hyperkinetic

Sentence: Blood pressure response to chronic low - dose intrarenal noradrenaline infusion in conscious rats .
Diseases: 

Sentence: Male Wistar rats were challenged intragastrically once daily for 9 days with 1 . 0 ml / kg of corn oil containing vitamin D2 and cholesterol to induce atherosclerosis .
Diseases: atherosclerosis

Sentence: At diagnosis there was no significant difference in OD between HIT patients with thrombosis and those with isolated - HIT .
Diseases: hit, thrombosis

Sentence: METHODS : We present the first case report of a woman with hype

In [34]:
# Display the candidate labels that will be evaluated.
disease_candidates

{'diseases', 'disorders', 'health conditions', 'health issues', 'symptoms'}

In [35]:
# Score every candidate label on the dev prompts. The notebook-level `engine`
# setting is reused instead of repeating the engine name as a literal, so that
# changing the engine in one place affects every request.
disease_experiment_results = select_verbalized_from_candidates(engine, dev_data, disease_candidates, 'Diseases')

0it [00:00, ?it/s]

{'health issues', 'health conditions', 'diseases', 'disorders', 'symptoms'}


50it [00:27,  1.79it/s]
0it [00:00, ?it/s]

processed 1470 tokens with 57 phrases; found: 110 phrases; correct: 29.
accuracy:  90.14%; (non-O)
accuracy:  90.14%; precision:  26.36%; recall:  50.88%; FB1:  34.73%
                X: precision:  26.36%; recall:  50.88%; FB1:  34.73%  110


50it [00:26,  1.87it/s]
0it [00:00, ?it/s]

processed 1470 tokens with 57 phrases; found: 101 phrases; correct: 32.
accuracy:  90.68%; (non-O)
accuracy:  90.68%; precision:  31.68%; recall:  56.14%; FB1:  40.51%
                X: precision:  31.68%; recall:  56.14%; FB1:  40.51%  101


50it [00:26,  1.87it/s]
0it [00:00, ?it/s]

processed 1470 tokens with 57 phrases; found: 91 phrases; correct: 31.
accuracy:  92.04%; (non-O)
accuracy:  92.04%; precision:  34.07%; recall:  54.39%; FB1:  41.89%
                X: precision:  34.07%; recall:  54.39%; FB1:  41.89%  91


50it [00:26,  1.86it/s]
0it [00:00, ?it/s]

processed 1470 tokens with 57 phrases; found: 105 phrases; correct: 34.
accuracy:  91.84%; (non-O)
accuracy:  91.84%; precision:  32.38%; recall:  59.65%; FB1:  41.98%
                X: precision:  32.38%; recall:  59.65%; FB1:  41.98%  105


50it [00:27,  1.85it/s]

processed 1470 tokens with 57 phrases; found: 109 phrases; correct: 35.
accuracy:  90.34%; (non-O)
accuracy:  90.34%; precision:  32.11%; recall:  61.40%; FB1:  42.17%
                X: precision:  32.11%; recall:  61.40%; FB1:  42.17%  109





In [36]:
# Positional columns 0-3 are: candidate label, F1, precision, recall.
# Column 4 (the first altered prompt) is left out to keep the table readable.
disease_experiment_results[[0,1,2,3]]

Unnamed: 0,0,1,2,3
0,health issues,34.730539,26.363636,50.877193
1,health conditions,40.506329,31.683168,56.140351
2,diseases,41.891892,34.065934,54.385965
3,disorders,41.975309,32.380952,59.649123
4,symptoms,42.168675,32.110092,61.403509


In [29]:
# Column 4 holds the first altered prompt for each candidate.
# NOTE(review): rows follow the iteration order of the candidate set, so row 0
# is not a fixed candidate; re-run this cell after the experiment cell.
print(disease_experiment_results[4].values[0])

Sentence: Lithium remains a first - line treatment for the acute and maintenance treatment of bipolar disorder .
Symptoms: bipolar disorder

Sentence: The other rats showed a strong decrease in the rigidity and the occurrence of stereotyped ( S ) licking and / or gnawing in presence of akinetic or hyperkinetic ( K ) behaviour ( AS / KS group ) , suggesting signs of dopaminergic activation .
Symptoms: rigidity, akinetic, hyperkinetic

Sentence: Blood pressure response to chronic low - dose intrarenal noradrenaline infusion in conscious rats .
Symptoms: 

Sentence: Male Wistar rats were challenged intragastrically once daily for 9 days with 1 . 0 ml / kg of corn oil containing vitamin D2 and cholesterol to induce atherosclerosis .
Symptoms: atherosclerosis

Sentence: At diagnosis there was no significant difference in OD between HIT patients with thrombosis and those with isolated - HIT .
Symptoms: hit, thrombosis

Sentence: The site of common side effects of sumatriptan .
Symptoms:
