## Coverting NER datasets to GPT-3 Datasets

### Inputs:
- BIO NER Dataset.
- Columns: ['tokens','ner_tags']
- Structure: "Word\tNER-Tag"
- Examples Separated by empty line

### Outputs:
- GPT-3 Ready Dataset. 
- Structure: "Prompt\tTestExample"

#### Parameters:
- Training data file
- Test data file
- Random seed
- \# of training samples per prompt
- Selection Strategy
- Prompt Structure

*For now only works with one entity type at a time  

In [187]:
import pandas as pd
import numpy as np
import ipdb
import pickle

data_name = 'bc5cdr_chemical'
entity_type = 'Drugs'
train_name = '../data/{}.train.csv'.format(data_name)
dev_name = '../data/{}.dev.csv'.format(data_name)
prompt_sample_structure = 'Sentence: {}\n'+entity_type+': {}'
empty_prompt_sample_structure = 'Sentence: {}\n'+entity_type+':'
seed = 33
few_shot_num = 5
number_of_test_samples = 200
selection_strategy = 'random'
sep = ', '

In [188]:
pd.set_option('max_rows',500,'max_colwidth',500)

In [189]:
def aggregate_to_sequence(df, token_col_name, seq_col_name):
    #Adding Sentence to a BIO dataframe with only tokens and ner_tags as columns.
    
    sents = []
    sent = []

    for i, row in df.iterrows():

        if pd.isna(row['tokens']):
            sents.append(' '.join(sent))
            sent = []
        else:
            sent.append(row[token_col_name])

    sent_col = []
    i = 0

    for j, row in df.iterrows():

        sent_col.append(sents[i])

        if pd.isna(row['tokens']):
            i+=1        

    df[seq_col_name] = sent_col
    
    return df

def extract_entities(df, sep='/'):
    #Extracting entities based on BIO Tags with columns 'sents','ner_seq'
    entities = []
    
    for i, row in df.iterrows():
        sent = row['sents'].split()
        bio_tags = row['ner_seq'].split()

        sent_entities = []
        current_ent = []
        
        num_bi_tags = 0
        
        for token, bio in zip(sent, bio_tags):
            if bio.startswith('B') or bio.startswith('O'):
                if len(current_ent) > 0:
                    current_ent = ' '.join(current_ent).lower()
                    if sep in current_ent:
                        print('woah, separator found in example {}'.format(current_ent))
                    sent_entities.append(current_ent)
                    
                    current_ent = []
                
                if bio.startswith('B'):
                    current_ent.append(token)
                    num_bi_tags += 1
                    
            elif bio.startswith('I'):
                current_ent.append(token)
                num_bi_tags += 1
        
        #Add Entity at end of sentence
        if len(current_ent) > 0:
            current_ent = ' '.join(current_ent).lower()
            if sep in current_ent:
                print('woah, separator found in example {}'.format(current_ent))
            sent_entities.append(current_ent)
                    
        assert num_bi_tags == len(' '.join(sent_entities).split()), ipdb.set_trace()
        entities.append(list(set(sent_entities)))
    
    df['entities'] = entities
    df['num_entities'] = [len(e) for e in entities]
    return df


def add_prompts(df,sep='/'):
    """
    Combining sentences and entities to create prompts in Dataframe with 'sents' and 'entities' columns.
    Adds 'prompts' and 'empty_prompts' (prompt without answer) columns to DataFrame
    """
    
    prompts = []
    empty_prompts = []
    
    for i, row in df.iterrows():
        sent = row['sents']
        entities = sep.join(row['entities'])
        
        prompt = prompt_sample_structure.format(sent, entities)
        empty_prompt = empty_prompt_sample_structure.format(sent)
        
        prompts.append(prompt)
        empty_prompts.append(empty_prompt)

    df['prompts'] = prompts
    df['empty_prompts'] = empty_prompts
    
    return df

def add_preds(df, pred_filename):
    preds = pd.read_csv(pred_filename,header=None)
    
    bio_preds = []
    
    for pred_seq in preds[0].values:
        pred_list = pred_seq.split()
    
        for pred_tag in pred_list:
            bio_preds.append(pred_tag)
            
        bio_preds.append(None)
    
    assert len(df) == len(bio_preds)
    df['bio_preds'] = bio_preds
    
    return df

def augment_bio_dataframe(df,pred_filename=None,sep='/'):
    df = aggregate_to_sequence(df, 'tokens', 'sents')
    df = aggregate_to_sequence(df, 'ner_tags', 'ner_seq')
  
    if pred_filename is not None:
        df = add_preds(df, pred_filename)
        df = aggregate_to_sequence(df, 'bio_preds', 'bio_preds')
        df = df[['sents','ner_seq','bio_preds']].drop_duplicates()
    else:
        df = df[['sents','ner_seq']].drop_duplicates()
    df = df.reset_index(drop=True)
    df = extract_entities(df, sep)
    df = add_prompts(df,sep)
    
    return df

In [190]:
train = pd.read_csv(train_name,sep='\t',skip_blank_lines=False)
dev = pd.read_csv(dev_name,sep='\t',skip_blank_lines=False)

In [192]:
train = augment_bio_dataframe(train, sep=sep)
dev = augment_bio_dataframe(dev, '../data/bc5cdr_chemical_pubmed_preds.txt',sep=sep)

woah, separator found in example 3 , 4 - methylenedioxymethamphetamine
woah, separator found in example alpha , beta - methylene adenosine - 5' - triphosphate
woah, separator found in example alpha , beta - meatp
woah, separator found in example 3s - 1 - [ 2 - ( 3 , 4 - dichlorophenyl ) ethyl ] - 1 , 4 - diazabicyclo [ 4 . 3 . 0 ] nonane
woah, separator found in example 1 - [ 2 - ( 3 , 4 - dichlorophenyl ) ethyl ] - 4 - methylpiperazine
woah, separator found in example 3r - 1 - [ 2 - ( 3 , 4 - dichlorophenyl ) ethyl ] - 1 , 4 - diazabicyclo [ 4 . 3 . 0 ] nonane
woah, separator found in example ifosfamide , vincristine , and dactinomycin
woah, separator found in example n , n' - diisopropylphosphorodiamidofluoridate
woah, separator found in example 1 , 3 - bis - ( 2 - chloroethyl ) - 1 - nitrosourea
woah, separator found in example 1 , 1 - dichloro - 2 , 2 , 2 - trifluoroethane
woah, separator found in example 1 - chloro - 1 , 2 , 2 , 2 - tetrafluoroethane
woah, separator found in examp

In [193]:
len(dev)

4746

In [194]:
display(train.groupby('num_entities').count()[['sents']]/len(train))
display(dev.groupby('num_entities').count()[['sents']]/len(dev))

Unnamed: 0_level_0,sents
num_entities,Unnamed: 1_level_1
0,0.369409
1,0.383755
2,0.162658
3,0.052532
4,0.01962
5,0.007173
6,0.00211
7,0.001266
8,0.000633
9,0.000211


Unnamed: 0_level_0,sents
num_entities,Unnamed: 1_level_1
0,0.358828
1,0.375474
2,0.180573
3,0.055415
4,0.017278
5,0.00611
6,0.004425
7,0.001054
8,0.000632
9,0.000211


In [195]:
def create_prompt_dataset(train_df, test_df, seed, few_shot_num, number_of_test_samples, selection_strategy):
    
    random = np.random.RandomState(seed)
    
    few_shot_prompt, chosen_prompt_ids = create_few_shot_prompt(train_df, random, few_shot_num, selection_strategy)
    
    test_df['test_ready_prompt'] = [few_shot_prompt+'\n\n'+empty_prompt for empty_prompt in test_df['empty_prompts']]
    
    if number_of_test_samples != 'all':
        #Making sure the samples are the same as the first batch
        random = np.random.RandomState(42)
        random.permutation(train_df.index)
        chosen_test_ids = random.permutation(test_df.index)[:number_of_test_samples]
        
        chosen_test_df = test_df.loc[chosen_test_ids]
    else:
        chosen_test_df = test_df
    
    return {'seed':seed,'few_shot_prompt': few_shot_prompt, 'chosen_prompt_ids':chosen_prompt_ids,'sep': sep,'test_df': chosen_test_df}

def create_few_shot_prompt(train_df, random, few_shot_num, selection_strategy):
    
    if selection_strategy == 'random':
        chosen_inds = random.permutation(train_df.index)[:few_shot_num]
    
    few_shot_prompt_list = train_df.loc[chosen_inds, 'prompts'].values
    few_shot_prompt = '\n\n'.join(few_shot_prompt_list)
    
    return few_shot_prompt, chosen_inds

In [196]:
dev

Unnamed: 0,sents,ner_seq,bio_preds,entities,num_entities,prompts,empty_prompts
0,Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .,O O O O B I O O O O O O,O O O O B I O O O O O O,[lithium carbonate],1,Sentence: Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .\nDrugs: lithium carbonate,Sentence: Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .\nDrugs:
1,"A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .",O O O O O O O O O O O O O O O O O O B O O O O,O O O O O O O O O O O O O O O O O O B O O O O,[lithium],1,"Sentence: A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .\nDrugs: lithium","Sentence: A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .\nDrugs:"
2,"This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .",O O O O O O O O O O O O O O O O O O O O O O O O O O B O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O O O O O O B O O O O O O O O,[lithium],1,"Sentence: This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .\nDrugs: lithium","Sentence: This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .\nDrugs:"
3,Sixty - three percent of these infants had tricuspid valve involvement .,O O O O O O O O O O O O,O O O O O O O O O O O O,[],0,Sentence: Sixty - three percent of these infants had tricuspid valve involvement .\nDrugs:,Sentence: Sixty - three percent of these infants had tricuspid valve involvement .\nDrugs:
4,Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .,B I O O O O O O O O O O O O O O O O O O,B I O O O O O O O O O O O O O O O O O O,[lithium carbonate],1,Sentence: Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .\nDrugs: lithium carbonate,Sentence: Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .\nDrugs:
...,...,...,...,...,...,...,...
4741,"Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .",B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,[oxytocin],1,"Sentence: Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .\nDrugs: oxytocin","Sentence: Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .\nDrugs:"
4742,"The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .",O B O O O O O O B O O O O O O B O O O O O B O O O O,O B O O O O O O I O O O O O O I O O O O O B O O O O,"[lactate, oxytocin, dextrose]",3,"Sentence: The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .\nDrugs: lactate, oxytocin, dextrose","Sentence: The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .\nDrugs:"
4743,The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .,O O O O O O O O B O O O O O O O O O O O O O,O O O O O O O O B O O O O O O O O O O O O O,[oxytocin],1,Sentence: The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .\nDrugs: oxytocin,Sentence: The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .\nDrugs:
4744,The oxytocin should not be administered in excess of 36 hours .,O B O O O O O O O O O O,O B O O O O O O O O O O,[oxytocin],1,Sentence: The oxytocin should not be administered in excess of 36 hours .\nDrugs: oxytocin,Sentence: The oxytocin should not be administered in excess of 36 hours .\nDrugs:


In [197]:
test_ready_prompt_dataset = create_prompt_dataset(train, dev, seed, few_shot_num, number_of_test_samples, selection_strategy)

In [198]:
print(test_ready_prompt_dataset['few_shot_prompt'])

Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inhibited young rats .
Drugs: warfarin, phosphate

Sentence: , we selected the cases of confusion reported since 1985 with valproic acid .
Drugs: valproic acid

Sentence: No histopathological alterations or differences in bone formation were seen in the limbs or toes of any chicks from any group ; however , extensive cranial hemorrhage occurred in all nicotine sulfate - treated chicks .
Drugs: nicotine

Sentence: Two subsets of patients were identified from this latter group : the first included four p

In [199]:
test_ready_prompt_dataset['test_df']

Unnamed: 0,sents,ner_seq,bio_preds,entities,num_entities,prompts,empty_prompts,test_ready_prompt
1078,Plasma was tested for amphetamine and the cocaine metabolite benzoylecgonine using enzyme - mediated immunoassay methodology .,O O O O B O O B O B O O O O O O O,O O O O B O O B O B O O O O O O O,"[cocaine, amphetamine, benzoylecgonine]",3,"Sentence: Plasma was tested for amphetamine and the cocaine metabolite benzoylecgonine using enzyme - mediated immunoassay methodology .\nDrugs: cocaine, amphetamine, benzoylecgonine",Sentence: Plasma was tested for amphetamine and the cocaine metabolite benzoylecgonine using enzyme - mediated immunoassay methodology .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
2098,4 .,O O,O O,[],0,Sentence: 4 .\nDrugs:,Sentence: 4 .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
1648,Animals were tested for four consecutive days ( 4 trial / day ) in MWM during which the position of hidden platform was unchanged .,O O O O O O O O O O O O O O O O O O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O O O O O,[],0,Sentence: Animals were tested for four consecutive days ( 4 trial / day ) in MWM during which the position of hidden platform was unchanged .\nDrugs:,Sentence: Animals were tested for four consecutive days ( 4 trial / day ) in MWM during which the position of hidden platform was unchanged .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
3489,A 50 mg higher methadone dose was associated with a 1 . 2 ( 95 % CI 1 . 1 to 1 . 4 ) times higher odds for syncope .,O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O,O O O O B O O O O O O O O O O O O O O O O O O O O O O O O O O,[methadone],1,Sentence: A 50 mg higher methadone dose was associated with a 1 . 2 ( 95 % CI 1 . 1 to 1 . 4 ) times higher odds for syncope .\nDrugs: methadone,Sentence: A 50 mg higher methadone dose was associated with a 1 . 2 ( 95 % CI 1 . 1 to 1 . 4 ) times higher odds for syncope .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
4146,"The incidence of drug - related dependent edema was somewhat higher in the amlodipine group , particularly at a dose of 10 mg per day ( 2 . 4 % for 80 mg valsartan ; 3 . 6 % for 5 mg amlodipine ; 0 % for valsartan plus 5 mg amlodipine ; 14 . 3 % for 10 mg amlodipine ) .",O O O O O O O O O O O O O B O O O O O O O O O O O O O O O O O O O B O O O O O O O O B O O O O B O O O B O O O O O O O O B O O,O O O O O O O O O O O O O B O O O O O O O O O O O O O O O O O O O B O O O O O O O O B O O O O B O O O B O O O O O O O O B O O,"[amlodipine, valsartan]",2,"Sentence: The incidence of drug - related dependent edema was somewhat higher in the amlodipine group , particularly at a dose of 10 mg per day ( 2 . 4 % for 80 mg valsartan ; 3 . 6 % for 5 mg amlodipine ; 0 % for valsartan plus 5 mg amlodipine ; 14 . 3 % for 10 mg amlodipine ) .\nDrugs: amlodipine, valsartan","Sentence: The incidence of drug - related dependent edema was somewhat higher in the amlodipine group , particularly at a dose of 10 mg per day ( 2 . 4 % for 80 mg valsartan ; 3 . 6 % for 5 mg amlodipine ; 0 % for valsartan plus 5 mg amlodipine ; 14 . 3 % for 10 mg amlodipine ) .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
369,"Radiological studies were mostly angiographies performed with both ionic and non - ionic contrast material , at an average dose of 245 ml .",O O O O O O O O O O O O O O O O O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O O O O,[],0,"Sentence: Radiological studies were mostly angiographies performed with both ionic and non - ionic contrast material , at an average dose of 245 ml .\nDrugs:","Sentence: Radiological studies were mostly angiographies performed with both ionic and non - ionic contrast material , at an average dose of 245 ml .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
3117,alpha - TC and DFO attenuated the MA - induced hyperthermia as well as the alterations in the locomotor activity .,B I I O B O O B O O O O O O O O O O O O O,B I I O B O O B O O O O O O O O O O O O O,"[ma, alpha - tc, dfo]",3,"Sentence: alpha - TC and DFO attenuated the MA - induced hyperthermia as well as the alterations in the locomotor activity .\nDrugs: ma, alpha - tc, dfo",Sentence: alpha - TC and DFO attenuated the MA - induced hyperthermia as well as the alterations in the locomotor activity .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
2100,Both the pressor and bradycardia effects of cirazoline were abolished in chronic prazosin treated SHR ( n = 4 ) as compared to the untreated SHR ( n = 4 ) .,O O O O O O O B O O O O B O O O O O O O O O O O O O O O O O O O,O O O O O O O B O O O O B O O O O O O O O O O O O O O O O O O O,"[cirazoline, prazosin]",2,"Sentence: Both the pressor and bradycardia effects of cirazoline were abolished in chronic prazosin treated SHR ( n = 4 ) as compared to the untreated SHR ( n = 4 ) .\nDrugs: cirazoline, prazosin",Sentence: Both the pressor and bradycardia effects of cirazoline were abolished in chronic prazosin treated SHR ( n = 4 ) as compared to the untreated SHR ( n = 4 ) .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
94,Arterial thromboembolism is a recognized complication of systemic heparin therapy .,O O O O O O O O B O O,O O O O O O O O B O O,[heparin],1,Sentence: Arterial thromboembolism is a recognized complication of systemic heparin therapy .\nDrugs: heparin,Sentence: Arterial thromboembolism is a recognized complication of systemic heparin therapy .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
457,"Following administration of these agents , the presence , and degree of fasciculation were assessed visually on a four point scale by one investigator who was blinded to the drug administered .",O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,[],0,"Sentence: Following administration of these agents , the presence , and degree of fasciculation were assessed visually on a four point scale by one investigator who was blinded to the drug administered .\nDrugs:","Sentence: Following administration of these agents , the presence , and degree of fasciculation were assessed visually on a four point scale by one investigator who was blinded to the drug administered .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."


In [200]:
prompt_filename = '../data/{}_test_prompts_{}_{}.{}.{}.{}'.format(data_name, selection_strategy,seed,few_shot_num,number_of_test_samples,sep)
pickle.dump(test_ready_prompt_dataset,open(prompt_filename,'wb'))

In [201]:
full_dev_prompt_filename = '../data/{}_test_prompts_full_dev_{}_{}.{}.{}'.format(data_name, selection_strategy,seed,few_shot_num,sep)
pickle.dump(dev,open(full_dev_prompt_filename,'wb'))

In [202]:
dev

Unnamed: 0,sents,ner_seq,bio_preds,entities,num_entities,prompts,empty_prompts,test_ready_prompt
0,Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .,O O O O B I O O O O O O,O O O O B I O O O O O O,[lithium carbonate],1,Sentence: Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .\nDrugs: lithium carbonate,Sentence: Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
1,"A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .",O O O O O O O O O O O O O O O O O O B O O O O,O O O O O O O O O O O O O O O O O O B O O O O,[lithium],1,"Sentence: A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .\nDrugs: lithium","Sentence: A newborn with massive tricuspid regurgitation , atrial flutter , congestive heart failure , and a high serum lithium level is described .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
2,"This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .",O O O O O O O O O O O O O O O O O O O O O O O O O O B O O O O O O O O,O O O O O O O O O O O O O O O O O O O O O O O O O O B O O O O O O O O,[lithium],1,"Sentence: This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .\nDrugs: lithium","Sentence: This is the first patient to initially manifest tricuspid regurgitation and atrial flutter , and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
3,Sixty - three percent of these infants had tricuspid valve involvement .,O O O O O O O O O O O O,O O O O O O O O O O O O,[],0,Sentence: Sixty - three percent of these infants had tricuspid valve involvement .\nDrugs:,Sentence: Sixty - three percent of these infants had tricuspid valve involvement .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
4,Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .,B I O O O O O O O O O O O O O O O O O O,B I O O O O O O O O O O O O O O O O O O,[lithium carbonate],1,Sentence: Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .\nDrugs: lithium carbonate,Sentence: Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
...,...,...,...,...,...,...,...,...
4741,"Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .",B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,B O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O,[oxytocin],1,"Sentence: Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .\nDrugs: oxytocin","Sentence: Oxytocin administration during midtrimester - induced abortions is advocated only if it can be carried out under careful observations of an alert nursing staff , aware of the symptoms of water intoxication and instructed to watch the diuresis and report such early signs of the syndrome as asthenia , muscular irritability , or headaches .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
4742,"The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .",O B O O O O O O B O O O O O O B O O O O O B O O O O,O B O O O O O O I O O O O O O I O O O O O B O O O O,"[lactate, oxytocin, dextrose]",3,"Sentence: The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .\nDrugs: lactate, oxytocin, dextrose","Sentence: The oxytocin should be given only in Ringers lactate or , alternately , in Ringers lactate and a 5 per cent dextrose and water solutions .\nDrugs:","Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
4743,The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .,O O O O O O O O B O O O O O O O O O O O O O,O O O O O O O O B O O O O O O O O O O O O O,[oxytocin],1,Sentence: The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .\nDrugs: oxytocin,Sentence: The urinary output should be monitored and the oxytocin administration discontinued and the serum electrolytes checked if the urinary output decreases .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
4744,The oxytocin should not be administered in excess of 36 hours .,O B O O O O O O O O O O,O B O O O O O O O O O O,[oxytocin],1,Sentence: The oxytocin should not be administered in excess of 36 hours .\nDrugs: oxytocin,Sentence: The oxytocin should not be administered in excess of 36 hours .\nDrugs:,"Sentence: Although the explanation for the association between artery calcification and growth status can not be determined from the present study , there was a relationship between higher serum phosphate and susceptibility to artery calcification , with 30 % higher levels of serum phosphate in young , ad libitum - fed rats compared with either of the groups that was resistant to Warfarin - induced artery calcification , ie , the 10 - month - old rats and the restricted - diet , growth - inh..."
