## Coverting NER datasets to GPT-3 Datasets

### Inputs:
- BIO NER Dataset.
- Columns: ['tokens','ner_tags']
- Structure: "Word\tNER-Tag"
- Examples Separated by empty line

### Outputs:
- GPT-3 Ready Dataset. 
- Structure: "Prompt\tTestExample"

#### Parameters:
- Training data file
- Test data file
- Random seed
- \# of training samples per prompt
- Selection Strategy
- Prompt Structure

*For now only works with one entity type at a time  

In [4]:
import pandas as pd
import numpy as np
import ipdb
import pickle
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [23]:
data_name = 'bc5cdr_disease'
entity_type = 'Diseases'
train_name = '../data/{}.train.csv'.format(data_name)
dev_name = '../data/{}.dev.csv'.format(data_name)
prompt_sample_structure = 'Sentence: {}\n'+entity_type+': {}'
empty_prompt_sample_structure = 'Sentence: {}\n'+entity_type+':'
seed = 42
few_shot_num = 5
number_of_test_samples = 50
selection_strategy = 'random'
sep = ', '

In [24]:
pd.set_option('max_rows',500,'max_colwidth',500)

In [25]:
def aggregate_to_sequence(df, token_col_name, seq_col_name):
    #Adding Sentence to a BIO dataframe with only tokens and ner_tags as columns.
    
    sents = []
    sent = []

    for i, row in df.iterrows():

        if pd.isna(row['tokens']):
            sents.append(' '.join(sent))
            sent = []
        else:
            sent.append(row[token_col_name])

    sent_col = []
    i = 0

    for j, row in df.iterrows():

        sent_col.append(sents[i])

        if pd.isna(row['tokens']):
            i+=1        

    df[seq_col_name] = sent_col
    
    return df

def extract_entities(df, sep='/'):
    #Extracting entities based on BIO Tags with columns 'sents','ner_seq'
    entities = []
    
    for i, row in df.iterrows():
        sent = row['sents'].split()
        bio_tags = row['ner_seq'].split()

        sent_entities = []
        current_ent = []
        
        num_bi_tags = 0
        
        for token, bio in zip(sent, bio_tags):
            if bio.startswith('B') or bio.startswith('O'):
                if len(current_ent) > 0:
                    current_ent = ' '.join(current_ent).lower()
                    if sep in current_ent:
                        print('woah, separator found in example {}'.format(current_ent))
                    sent_entities.append(current_ent)
                    
                    current_ent = []
                
                if bio.startswith('B'):
                    current_ent.append(token)
                    num_bi_tags += 1
                    
            elif bio.startswith('I'):
                current_ent.append(token)
                num_bi_tags += 1
        
        #Add Entity at end of sentence
        if len(current_ent) > 0:
            current_ent = ' '.join(current_ent).lower()
            if sep in current_ent:
                print('woah, separator found in example {}'.format(current_ent))
            sent_entities.append(current_ent)
                    
        assert num_bi_tags == len(' '.join(sent_entities).split()), ipdb.set_trace()
        entities.append(list(set(sent_entities)))
    
    df['entities'] = entities
    df['num_entities'] = [len(e) for e in entities]
    df['num_tokens'] = [len(tokenizer.encode(sep.join(set(ents)))) for ents in df.entities]
    
    return df


def add_prompts(df,sep='/'):
    """
    Combining sentences and entities to create prompts in Dataframe with 'sents' and 'entities' columns.
    Adds 'prompts' and 'empty_prompts' (prompt without answer) columns to DataFrame
    """
    
    prompts = []
    empty_prompts = []
    
    for i, row in df.iterrows():
        sent = row['sents']
        entities = sep.join(row['entities'])
        
        prompt = prompt_sample_structure.format(sent, entities)
        empty_prompt = empty_prompt_sample_structure.format(sent)
        
        prompts.append(prompt)
        empty_prompts.append(empty_prompt)

    df['prompts'] = prompts
    df['empty_prompts'] = empty_prompts
    
    return df

def add_preds(df, pred_filename):
    preds = pd.read_csv(pred_filename,header=None)
    
    bio_preds = []
    
    for pred_seq in preds[0].values:
        pred_list = pred_seq.split()
    
        for pred_tag in pred_list:
            bio_preds.append(pred_tag)
            
        bio_preds.append(None)
    
    assert len(df) == len(bio_preds)
    df['bio_preds'] = bio_preds
    
    return df

def augment_bio_dataframe(df,pred_filename=None,sep='/'):
    df = aggregate_to_sequence(df, 'tokens', 'sents')
    df = aggregate_to_sequence(df, 'ner_tags', 'ner_seq')
  
    if pred_filename is not None:
        df = add_preds(df, pred_filename)
        df = aggregate_to_sequence(df, 'bio_preds', 'bio_preds')
        df = df[['sents','ner_seq','bio_preds']].drop_duplicates()
    else:
        df = df[['sents','ner_seq']].drop_duplicates()
    df = df.reset_index(drop=True)
    df = extract_entities(df, sep)
    df = add_prompts(df,sep)
    
    return df

In [26]:
train = pd.read_csv(train_name,sep='\t',skip_blank_lines=False)
dev = pd.read_csv(dev_name,sep='\t',skip_blank_lines=False)

In [27]:
train = augment_bio_dataframe(train, sep=sep)
dev = augment_bio_dataframe(dev, '../data/bc5cdr_chemical_pubmed_preds.txt',sep=sep)

woah, separator found in example cancer of the renal pelvis , ureter or bladder
woah, separator found in example colorectal , breast and head and neck cancers
woah, separator found in example colorectal , breast , pancreaticobiliary , gastric , renal cell and head and neck cancers
woah, separator found in example cognitive , motivational , or creative deficits
woah, separator found in example opistothonus , sensory and motor dysfunction
woah, separator found in example impaired word fluency , psychomotor speed and working memory
woah, separator found in example a decrease in map , hr , sv , and co
woah, separator found in example abnormal audiograms with deficits mostly in the high frequency range of 4 , 000 to 8 , 000 hz
woah, separator found in example nausea , vomiting
woah, separator found in example nausea , vomiting
woah, separator found in example declines in simple and sustained attention , working memory , and verbal memory
woah, separator found in example cardiovascular , and

In [28]:
train.head(50)

Unnamed: 0,sents,ner_seq,entities,num_entities,num_tokens,prompts,empty_prompts
0,Naloxone reverses the antihypertensive effect of clonidine .,O O O O O O O O,[],0,0,Sentence: Naloxone reverses the antihypertensive effect of clonidine .\nDiseases:,Sentence: Naloxone reverses the antihypertensive effect of clonidine .\nDiseases:
1,"In unanesthetized , spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine , 5 to 20 micrograms / kg , was inhibited or reversed by nalozone , 0 . 2 to 2 mg / kg .",O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,[hypertensive],1,3,"Sentence: In unanesthetized , spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine , 5 to 20 micrograms / kg , was inhibited or reversed by nalozone , 0 . 2 to 2 mg / kg .\nDiseases: hypertensive","Sentence: In unanesthetized , spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine , 5 to 20 micrograms / kg , was inhibited or reversed by nalozone , 0 . 2 to 2 mg / kg .\nDiseases:"
2,The hypotensive effect of 100 mg / kg alpha - methyldopa was also partially reversed by naloxone .,O B O O O O O O O O O O O O O O O O,[hypotensive],1,3,Sentence: The hypotensive effect of 100 mg / kg alpha - methyldopa was also partially reversed by naloxone .\nDiseases: hypotensive,Sentence: The hypotensive effect of 100 mg / kg alpha - methyldopa was also partially reversed by naloxone .\nDiseases:
3,Naloxone alone did not affect either blood pressure or heart rate .,O O O O O O O O O O O O,[],0,0,Sentence: Naloxone alone did not affect either blood pressure or heart rate .\nDiseases:,Sentence: Naloxone alone did not affect either blood pressure or heart rate .\nDiseases:
4,"In brain membranes from spontaneously hypertensive rats clonidine , 10 ( - 8 ) to 10 ( - 5 ) M , did not influence stereoselective binding of [ 3H ] - naloxone ( 8 nM ) , and naloxone , 10 ( - 8 ) to 10 ( - 4 )",O O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,[hypertensive],1,3,"Sentence: In brain membranes from spontaneously hypertensive rats clonidine , 10 ( - 8 ) to 10 ( - 5 ) M , did not influence stereoselective binding of [ 3H ] - naloxone ( 8 nM ) , and naloxone , 10 ( - 8 ) to 10 ( - 4 )\nDiseases: hypertensive","Sentence: In brain membranes from spontaneously hypertensive rats clonidine , 10 ( - 8 ) to 10 ( - 5 ) M , did not influence stereoselective binding of [ 3H ] - naloxone ( 8 nM ) , and naloxone , 10 ( - 8 ) to 10 ( - 4 )\nDiseases:"
5,"M , did not influence clonidine - suppressible binding of [ 3H ] - dihydroergocryptine ( 1 nM ) .",O O O O O O O O O O O O O O O O O O O O,[],0,0,"Sentence: M , did not influence clonidine - suppressible binding of [ 3H ] - dihydroergocryptine ( 1 nM ) .\nDiseases:","Sentence: M , did not influence clonidine - suppressible binding of [ 3H ] - dihydroergocryptine ( 1 nM ) .\nDiseases:"
6,These findings indicate that in spontaneously hypertensive rats the effects of central alpha - adrenoceptor stimulation involve activation of opiate receptors .,O O O O O O B O O O O O O O O O O O O O O O,[hypertensive],1,3,Sentence: These findings indicate that in spontaneously hypertensive rats the effects of central alpha - adrenoceptor stimulation involve activation of opiate receptors .\nDiseases: hypertensive,Sentence: These findings indicate that in spontaneously hypertensive rats the effects of central alpha - adrenoceptor stimulation involve activation of opiate receptors .\nDiseases:
7,"As naloxone and clonidine do not appear to interact with the same receptor site , the observed functional antagonism suggests the release of an endogenous opiate by clonidine or alpha - methyldopa and the possible role of the opiate in the central control of sympathetic tone .",O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,[],0,0,"Sentence: As naloxone and clonidine do not appear to interact with the same receptor site , the observed functional antagonism suggests the release of an endogenous opiate by clonidine or alpha - methyldopa and the possible role of the opiate in the central control of sympathetic tone .\nDiseases:","Sentence: As naloxone and clonidine do not appear to interact with the same receptor site , the observed functional antagonism suggests the release of an endogenous opiate by clonidine or alpha - methyldopa and the possible role of the opiate in the central control of sympathetic tone .\nDiseases:"
8,Lidocaine - induced cardiac asystole .,O O O B I O,[cardiac asystole],1,6,Sentence: Lidocaine - induced cardiac asystole .\nDiseases: cardiac asystole,Sentence: Lidocaine - induced cardiac asystole .\nDiseases:
9,Intravenous administration of a single 50 - mg bolus of lidocaine in a 67 - year - old man resulted in profound depression of the activity of the sinoatrial and atrioventricular nodal pacemakers .,O O O O O O O O O O O O O O O O O O O O O O B O O O O O O O O O O O,[depression],1,2,Sentence: Intravenous administration of a single 50 - mg bolus of lidocaine in a 67 - year - old man resulted in profound depression of the activity of the sinoatrial and atrioventricular nodal pacemakers .\nDiseases: depression,Sentence: Intravenous administration of a single 50 - mg bolus of lidocaine in a 67 - year - old man resulted in profound depression of the activity of the sinoatrial and atrioventricular nodal pacemakers .\nDiseases:


In [29]:
train.to_csv('../data/{}.train.processed.tsv'.format(data_name),sep='\t')
dev.to_csv('../data/{}.dev.processed.tsv'.format(data_name),sep='\t')

In [234]:
len(dev)

4746

In [235]:
len(dev[dev.num_tokens < 20])/len(dev), len(train[train.num_tokens < 20])/len(train)

(0.9907290349768226, 0.9856540084388186)

In [237]:
display(train.groupby('num_entities').count()[['sents']]/len(train))
display(dev.groupby('num_entities').count()[['sents']]/len(dev))

Unnamed: 0_level_0,sents
num_entities,Unnamed: 1_level_1
0,0.438186
1,0.372574
2,0.130802
3,0.035232
4,0.01308
5,0.006962
6,0.002321
7,0.000422
8,0.000211
11,0.000211


Unnamed: 0_level_0,sents
num_entities,Unnamed: 1_level_1
0,0.424989
1,0.38032
2,0.136325
3,0.039612
4,0.010957
5,0.0059
6,0.001054
7,0.000632
10,0.000211


In [238]:
def create_prompt_dataset(train_df, test_df, seed, few_shot_num, number_of_test_samples, selection_strategy):
    
    random = np.random.RandomState(seed)
    
    few_shot_prompt, chosen_prompt_ids = create_few_shot_prompt(train_df, random, few_shot_num, selection_strategy)
    
    test_df['test_ready_prompt'] = [few_shot_prompt+'\n\n'+empty_prompt for empty_prompt in test_df['empty_prompts']]
    
    if number_of_test_samples != 'all':
        #Making sure the samples are the same as the first batch
        random = np.random.RandomState(42)
        random.permutation(train_df.index)
        chosen_test_ids = random.permutation(test_df.index)[:number_of_test_samples]
        
        chosen_test_df = test_df.loc[chosen_test_ids]
    else:
        chosen_test_df = test_df
    
    return {'seed':seed,'few_shot_prompt': few_shot_prompt, 'chosen_prompt_ids':chosen_prompt_ids,'sep': sep,'test_df': chosen_test_df}

def create_few_shot_prompt(train_df, random, few_shot_num, selection_strategy):
    
    if selection_strategy == 'random':
        chosen_inds = random.permutation(train_df.index)[:few_shot_num]
    
    few_shot_prompt_list = train_df.loc[chosen_inds, 'prompts'].values
    few_shot_prompt = '\n\n'.join(few_shot_prompt_list)
    
    return few_shot_prompt, chosen_inds

In [239]:
dev

Unnamed: 0,sents,ner_seq,bio_preds,entities,num_entities,num_tokens,prompts,empty_prompts
0,Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .,B I I O O O B O O O O O,O O O O B I O O O O O O,"[toxicity, tricuspid valve regurgitation]",2,11,"Sentence: Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .\nDiseases: toxicity, tricuspid valve regurgitation",Sentence: Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .\nDiseases:
1,"A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .",O O O O B I O B I O B I I O O O O O O O O O O,O O O O O O O O O O O O O O O O O O B O O O O,"[congestive heart failure, tricuspid regurgitation, atrial flutter]",3,18,"Sentence: A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .\nDiseases: congestive heart failure, tricuspid regurgitation, atrial flutter","Sentence: A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .\nDiseases:"
2,"This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .",O O O O O O O O B I O B I O O O O O O O B I O O O O O O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O O O O O O B O O O O O O O O,"[tricuspid regurgitation, cardiac disease, atrial flutter]",3,15,"Sentence: This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .\nDiseases: tricuspid regurgitation, cardiac disease, atrial flutter","Sentence: This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .\nDiseases:"
3,Sixty - three percent of these infants had tricuspid valve involvement .,O O O O O O O O O O O O,O O O O O O O O O O O O,[],0,0,Sentence: Sixty - three percent of these infants had tricuspid valve involvement .\nDiseases:,Sentence: Sixty - three percent of these infants had tricuspid valve involvement .\nDiseases:
4,Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .,O O O O O O O O O O O B I I O O O O O O,B I O O O O O O O O O O O O O O O O O O,[congenital heart disease],1,5,Sentence: Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .\nDiseases: congenital heart disease,Sentence: Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .\nDiseases:
...,...,...,...,...,...,...,...,...
4741,"Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .",O O O O O O B O O O O O O O O O O O O O O O O O O O O O O O B I O O O O O O O O O O O O O O O B O O B O O B O,B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,"[water intoxication, asthenia, headaches, abortions, irritability]",5,13,"Sentence: Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .\nDiseases: water intoxication, asthenia, headaches, abortions, irritability","Sentence: Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .\nDiseases:"
4742,"The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .",O O O O O O O O O O O O O O O O O O O O O O O O O O,O B O O O O O O I O O O O O O I O O O O O B O O O O,[],0,0,"Sentence: The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .\nDiseases:","Sentence: The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .\nDiseases:"
4743,The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .,O O O O O O O O O O O O O O O O O O O O O O,O O O O O O O O B O O O O O O O O O O O O O,[],0,0,Sentence: The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .\nDiseases:,Sentence: The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .\nDiseases:
4744,The oxytocin should not be administered in excess of 36 hours .,O O O O O O O O O O O O,O B O O O O O O O O O O,[],0,0,Sentence: The oxytocin should not be administered in excess of 36 hours .\nDiseases:,Sentence: The oxytocin should not be administered in excess of 36 hours .\nDiseases:


In [240]:
test_ready_prompt_dataset = create_prompt_dataset(train, dev, seed, few_shot_num, number_of_test_samples, selection_strategy)

In [241]:
print(test_ready_prompt_dataset['few_shot_prompt'])

Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .
Diseases: vasospasm

Sentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .
Diseases: arthrogryposis

Sentence: Her QT interval returned to normal upon withdrawal of ketoconazole .
Diseases: 

Sentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erectile dysfunction .
Diseases: penile pain, erectile dysfunction

Sentence: We report a patient in whom hypersensitivity to carbamazepine presented with generalized erythroderma , a severe leukemoid reaction , eosinophilia , hyponatremia , and renal failure .
Diseases: eosinophilia, renal failure, hypersensitivity, erythroderma, leukemoid reaction, hyponatremia


In [248]:
test_ready_prompt_dataset['test_df']

Unnamed: 0,sents,ner_seq,bio_preds,entities,num_entities,num_tokens,prompts,empty_prompts,test_ready_prompt
1078,Plasma was tested for amphetamine and the cocaine metabolite benzoylecgonine using enzyme - mediated immunoassay methodology .,O O O O O O O O O O O O O O O O O,O O O O B O O B O B O O O O O O O,[],0,0,Sentence: Plasma was tested for amphetamine and the cocaine metabolite benzoylecgonine using enzyme - mediated immunoassay methodology .\nDiseases:,Sentence: Plasma was tested for amphetamine and the cocaine metabolite benzoylecgonine using enzyme - mediated immunoassay methodology .\nDiseases:,"Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erec..."
2098,4 .,O O,O O,[],0,0,Sentence: 4 .\nDiseases:,Sentence: 4 .\nDiseases:,"Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erec..."
1648,Animals were tested for four consecutive days ( 4 trial / day ) in MWM during which the position of hidden platform was unchanged .,O O O O O O O O O O O O O O O O O O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O O O O O,[],0,0,Sentence: Animals were tested for four consecutive days ( 4 trial / day ) in MWM during which the position of hidden platform was unchanged .\nDiseases:,Sentence: Animals were tested for four consecutive days ( 4 trial / day ) in MWM during which the position of hidden platform was unchanged .\nDiseases:,"Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erec..."
3489,A 50 mg higher methadone dose was associated with a 1 . 2 ( 95 % CI 1 . 1 to 1 . 4 ) times higher odds for syncope .,O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B O,O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O,[syncope],1,2,Sentence: A 50 mg higher methadone dose was associated with a 1 . 2 ( 95 % CI 1 . 1 to 1 . 4 ) times higher odds for syncope .\nDiseases: syncope,Sentence: A 50 mg higher methadone dose was associated with a 1 . 2 ( 95 % CI 1 . 1 to 1 . 4 ) times higher odds for syncope .\nDiseases:,"Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erec..."
4146,"The incidence of drug - related dependent edema was somewhat higher in the amlodipine group , particularly at a dose of 10 mg per day ( 2 . 4 % for 80 mg valsartan ; 3 . 6 % for 5 mg amlodipine ; 0 % for valsartan plus 5 mg amlodipine ; 14 . 3 % for 10 mg amlodipine ) .",O O O O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,O O O O O O O O O O O O O B O O O O O O O O O O O O O O O O O O O B O O O O O O O O B O O O O B O O O B O O O O O O O O B O O,[edema],1,2,"Sentence: The incidence of drug - related dependent edema was somewhat higher in the amlodipine group , particularly at a dose of 10 mg per day ( 2 . 4 % for 80 mg valsartan ; 3 . 6 % for 5 mg amlodipine ; 0 % for valsartan plus 5 mg amlodipine ; 14 . 3 % for 10 mg amlodipine ) .\nDiseases: edema","Sentence: The incidence of drug - related dependent edema was somewhat higher in the amlodipine group , particularly at a dose of 10 mg per day ( 2 . 4 % for 80 mg valsartan ; 3 . 6 % for 5 mg amlodipine ; 0 % for valsartan plus 5 mg amlodipine ; 14 . 3 % for 10 mg amlodipine ) .\nDiseases:","Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erec..."
369,"Radiological studies were mostly angiographies performed with both ionic and non - ionic contrast material , at an average dose of 245 ml .",O O O O O O O O O O O O O O O O O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O O O O,[],0,0,"Sentence: Radiological studies were mostly angiographies performed with both ionic and non - ionic contrast material , at an average dose of 245 ml .\nDiseases:","Sentence: Radiological studies were mostly angiographies performed with both ionic and non - ionic contrast material , at an average dose of 245 ml .\nDiseases:","Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erec..."
3117,alpha - TC and DFO attenuated the MA - induced hyperthermia as well as the alterations in the locomotor activity .,O O O O O O O O O O B O O O O O O O O O O,B I I O B O O B O O O O O O O O O O O O O,[hyperthermia],1,3,Sentence: alpha - TC and DFO attenuated the MA - induced hyperthermia as well as the alterations in the locomotor activity .\nDiseases: hyperthermia,Sentence: alpha - TC and DFO attenuated the MA - induced hyperthermia as well as the alterations in the locomotor activity .\nDiseases:,"Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erec..."
2100,Both the pressor and bradycardia effects of cirazoline were abolished in chronic prazosin treated SHR ( n = 4 ) as compared to the untreated SHR ( n = 4 ) .,O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O O,O O O O O O O B O O O O B O O O O O O O O O O O O O O O O O O O,[bradycardia],1,4,Sentence: Both the pressor and bradycardia effects of cirazoline were abolished in chronic prazosin treated SHR ( n = 4 ) as compared to the untreated SHR ( n = 4 ) .\nDiseases: bradycardia,Sentence: Both the pressor and bradycardia effects of cirazoline were abolished in chronic prazosin treated SHR ( n = 4 ) as compared to the untreated SHR ( n = 4 ) .\nDiseases:,"Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erec..."
94,Arterial thromboembolism is a recognized complication of systemic heparin therapy .,O B O O O O O O O O O,O O O O O O O O B O O,[thromboembolism],1,6,Sentence: Arterial thromboembolism is a recognized complication of systemic heparin therapy .\nDiseases: thromboembolism,Sentence: Arterial thromboembolism is a recognized complication of systemic heparin therapy .\nDiseases:,"Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erec..."
457,"Following administration of these agents , the presence , and degree of fasciculation were assessed visually on a four point scale by one investigator who was blinded to the drug administered .",O O O O O O O O O O O O B O O O O O O O O O O O O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,[fasciculation],1,4,"Sentence: Following administration of these agents , the presence , and degree of fasciculation were assessed visually on a four point scale by one investigator who was blinded to the drug administered .\nDiseases: fasciculation","Sentence: Following administration of these agents , the presence , and degree of fasciculation were assessed visually on a four point scale by one investigator who was blinded to the drug administered .\nDiseases:","Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erec..."


In [249]:
prompt_filename = '../data/{}_test_prompts_{}_{}.{}.{}.{}'.format(data_name, selection_strategy,seed,few_shot_num,number_of_test_samples,sep)
pickle.dump(test_ready_prompt_dataset,open(prompt_filename,'wb'))

In [201]:
# full_dev_prompt_filename = '../data/{}_test_prompts_full_dev_{}_{}.{}.{}'.format(data_name, selection_strategy,seed,few_shot_num,sep)
# pickle.dump(dev,open(full_dev_prompt_filename,'wb'))

In [202]:
dev

Unnamed: 0,sents,ner_seq,bio_preds,entities,num_entities,prompts,empty_prompts,test_ready_prompt
0,Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .,O O O O B I O O O O O O,O O O O B I O O O O O O,[lithium carbonate],1,Sentence: Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .\nDrugs: lithium carbonate,Sentence: Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
1,"A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .",O O O O O O O O O O O O O O O O O O B O O O O,O O O O O O O O O O O O O O O O O O B O O O O,[lithium],1,"Sentence: A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .\nDrugs: lithium","Sentence: A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
2,"This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .",O O O O O O O O O O O O O O O O O O O O O O O O O O B O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O O O O O O B O O O O O O O O,[lithium],1,"Sentence: This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .\nDrugs: lithium","Sentence: This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
3,Sixty - three percent of these infants had tricuspid valve involvement .,O O O O O O O O O O O O,O O O O O O O O O O O O,[],0,Sentence: Sixty - three percent of these infants had tricuspid valve involvement .\nDrugs:,Sentence: Sixty - three percent of these infants had tricuspid valve involvement .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
4,Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .,B I O O O O O O O O O O O O O O O O O O,B I O O O O O O O O O O O O O O O O O O,[lithium carbonate],1,Sentence: Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .\nDrugs: lithium carbonate,Sentence: Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
...,...,...,...,...,...,...,...,...
4741,"Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .",B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,[oxytocin],1,"Sentence: Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .\nDrugs: oxytocin","Sentence: Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
4742,"The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .",O B O O O O O O B O O O O O O B O O O O O B O O O O,O B O O O O O O I O O O O O O I O O O O O B O O O O,"[lactate, oxytocin, dextrose]",3,"Sentence: The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .\nDrugs: lactate, oxytocin, dextrose","Sentence: The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
4743,The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .,O O O O O O O O B O O O O O O O O O O O O O,O O O O O O O O B O O O O O O O O O O O O O,[oxytocin],1,Sentence: The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .\nDrugs: oxytocin,Sentence: The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
4744,The oxytocin should not be administered in excess of 36 hours .,O B O O O O O O O O O O,O B O O O O O O O O O O,[oxytocin],1,Sentence: The oxytocin should not be administered in excess of 36 hours .\nDrugs: oxytocin,Sentence: The oxytocin should not be administered in excess of 36 hours .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."


In [271]:
test_ready_prompt_training_dataset = create_prompt_dataset(train, train, seed, few_shot_num, number_of_test_samples, selection_strategy)

In [272]:
test_ready_prompt_training_dataset

{'seed': 42,
 'few_shot_prompt': 'Sentence: Topical papaverine was used as a direct therapeutic action to manage vasospasm in a total of 11 patients .\nDiseases: vasospasm\n\nSentence: In summary , the chick embryo provides a reliable and simple experimental animal model of coniine - induced arthrogryposis .\nDiseases: arthrogryposis\n\nSentence: Her QT interval returned to normal upon withdrawal of ketoconazole .\nDiseases: \n\nSentence: Sodium bicarbonate alleviates penile pain induced by intracavernous injections for erectile dysfunction .\nDiseases: penile pain, erectile dysfunction\n\nSentence: We report a patient in whom hypersensitivity to carbamazepine presented with generalized erythroderma , a severe leukemoid reaction , eosinophilia , hyponatremia , and renal failure .\nDiseases: eosinophilia, renal failure, hypersensitivity, erythroderma, leukemoid reaction, hyponatremia',
 'chosen_prompt_ids': array([2313,  354, 1397, 1846, 2830]),
 'sep': ', ',
 'test_df':                

In [273]:
prompt_filename = '../data/{}_test_prompts_{}_{}.{}.{}.{}'.format(data_name+'_train', selection_strategy,seed,few_shot_num,number_of_test_samples,sep)
pickle.dump(test_ready_prompt_training_dataset,open(prompt_filename,'wb'))