# Treatment NER

GOAL: Get treatments out of all descriptions accurately 

### Rough Steps
- Get new training set with real data from mediBoard
- Train BERT off of training seet
- SoftMatch matches to existing treatments from studies
- Pack it all together

In [106]:
import pandas as pd

In [2]:
from sqlalchemy import create_engine

db = create_engine("postgresql://meditreats:meditreats@df-treats-db.cs6hxh6ocizm.us-west-2.rds.amazonaws.com:5432/meditreats")

connection = db.connect()

labels = pd.read_sql_query("select * from administrations join groups on administrations.group = groups.id join treatments on administrations.treatment = treatments.id where groups.annotated=true", connection)

labels[['title', 'description', 'name', 'study_id', 'study']].sort_values('study').to_csv('admin_labels.csv')

## Treatments

In [31]:
labeled_admins = pd.read_csv('admin_labels_edited.csv')

In [32]:
labeled_admins = labeled_admins[['title', 'description', 'name', 'study_id', 'study']]

In [33]:
labeled_admins = labeled_admins.groupby(['title', 'description', 'study_id', 'study']).agg({
    'name': lambda x: ' | '.join(x),
}).reset_index()

In [37]:
labeled_admins['count'] = labeled_admins['name'].str.split('|').apply(lambda x: len(x))

In [43]:
training_admins = pd.concat([labeled_admins[labeled_admins['count'] > 1], labeled_admins[labeled_admins['count'] == 1][:110]])

In [44]:
training_admins

Unnamed: 0,title,description,study_id,study,name,count
8,10 mg IR in Study 326,Donepezil SR 23 mg once daily orally for 12 mo...,OG001,NCT00566501,Donepezil SR | Donepezil,2
11,23 mg SR in Study 326,Donepezil SR 23 mg once daily orally for 12 mo...,OG000,NCT00566501,Donepezil SR | Donepezil,2
12,"A - Tacrolimus, Methotrexate, Sirolimus (TMS) Arm","Rituximab: 375 mg/m2 intravenous (IV), day 1 f...",OG000,NCT00520130,Methotrexate | Mesna | Rituximab | Filgrastim ...,9
13,"A - Tacrolimus, Methotrexate, Sirolimus (TMS) Arm","Rituximab:375 mg/m2 IV, day 1 for patients wit...",OG000,NCT00520130,Rituximab | Filgrastim | Cytarabine | Fludarab...,9
14,"A - Tacrolimus, Methotrexate, Sirolimus (TMS) Arm","TMS Arm Rituximab: 375 mg/m2 IV, day 1 for pat...",OG000,NCT00520130,Filgrastim | Cytarabine | Tacrolimus | Cycloph...,9
...,...,...,...,...,...,...
180,Part B: E2006 Matched Placebo or Zolpidem Matc...,Otherwise healthy participants with primary in...,OG000,NCT01463098,Placebo,1
181,Part B: Zolpidem 10 mg,Otherwise healthy participants with primary in...,OG001,NCT01463098,Zolpidem,1
184,Pen (Period 1) / Vial & Syringe (Period 2),Crossover study group of insulin glargine via ...,OG001,NCT01240200,Insulin Glargine,1
185,Pen (Period 1) / Vial & Syringe (Period 2),Crossover study group where participants were ...,OG001,NCT01240200,Insulin Glargine,1


In [51]:
training_admins['title_and_desc'] = 'Title: '\
    + training_admins['title']\
    + ' Description: '\
    + training_admins['description']\
    + '\n\n$$$\n\n'

In [52]:
train = training_admins.rename(columns = {
    'title_and_desc': 'prompt',
    'name': 'completion'
})[['prompt', 'completion']]

In [54]:
train['completion'] = ' ' + train['completion'] + ' ###'

In [55]:
train.to_json('treat_data_2.jsonl', orient='records', lines=True)

Unnamed: 0,prompt,completion
8,Title: 10 mg IR in Study 326 Description: Done...,Donepezil SR | Donepezil ###
11,Title: 23 mg SR in Study 326 Description: Done...,Donepezil SR | Donepezil ###
12,"Title: A - Tacrolimus, Methotrexate, Sirolimus...",Methotrexate | Mesna | Rituximab | Filgrastim...
13,"Title: A - Tacrolimus, Methotrexate, Sirolimus...",Rituximab | Filgrastim | Cytarabine | Fludara...
14,"Title: A - Tacrolimus, Methotrexate, Sirolimus...",Filgrastim | Cytarabine | Tacrolimus | Cyclop...
...,...,...
180,Title: Part B: E2006 Matched Placebo or Zolpid...,Placebo ###
181,Title: Part B: Zolpidem 10 mg Description: Oth...,Zolpidem ###
184,Title: Pen (Period 1) / Vial & Syringe (Period...,Insulin Glargine ###
185,Title: Pen (Period 1) / Vial & Syringe (Period...,Insulin Glargine ###


## GPT-3
- Let's see if it can do it

In [11]:
# Let's connect to the API
Secret = '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'
import os
import openai
openai.organization = "org-j6fGVx3OgjgpAbCQFHOmdEUe"
openai.api_key = Secret
len(openai.Model.list())

2

In [114]:
# Load up the labeled data
labeled_admins = pd.read_csv('admin_labels_labeled.csv')
labeled_admins = labeled_admins.drop(labeled_admins.columns[0], axis=1).fillna(value='NaN')
labeled_admins

Unnamed: 0,title,Treatment,Dosage,description,study_id,study
0,Clobazam High Dose,Clobazam,5 to 40 mg,5 to 40 mg/day with doses in the morning and a...,OG001,NCT00162981
1,Clobazam Low Dose,Clobazam,5 to 10 mg,5 to 10 mg/day with doses in the morning and a...,OG000,NCT00162981
2,Intranasal Midazolam,Midazolam,0.2 mg,0.2 mg/kg Intranasal Midazolam for a seizure l...,OG000,NCT00326612
3,Intranasal Midazolam,Midazolam,,Patients who required oxygen at discharge from...,OG000,NCT00326612
4,Rectal Diazepam,Diazepam,,Includes intubation,OG001,NCT00326612
...,...,...,...,...,...,...
273,Pembrolizumab + Epacadostat,Pembrolizumab,200 mg,Participants received pembrolizumab 200 mg as ...,OG000,NCT03322540
274,Celecoxib,Celecoxib,,Patients receiving celebrex preoperative and p...,OG001,NCT03331315
275,Ketorolac,Ketorolac,,Patients receiving scheduled ketorolac postope...,OG000,NCT03331315
276,Celecoxib,Celecoxib,,Participants,OG001,NCT03331315


In [115]:
# pair each treatment with its dose
labeled_admins['treat_and_dose'] = labeled_admins['Treatment'] + ' @ ' + labeled_admins['Dosage']

In [116]:
labeled_admins = labeled_admins.groupby(['title', 'description', 'study_id', 'study']).agg({
    'treat_and_dose': lambda x: ' \n\n '.join(x),
}).reset_index()

In [117]:
labeled_admins

Unnamed: 0,title,description,study_id,study,treat_and_dose
0,0.1 g/kg Octagam 10% Every 2 Weeks,Participants received 0.1 g/kg octagam 10% int...,OG001,NCT00812565,Octagam @ 0.1 g/kg
1,0.2 g/kg Octagam 10% Every 4 Weeks,Participants received 0.2 g/kg octagam 10% int...,OG005,NCT00812565,Octagam @ 0.2 g/kg
2,0.25 g/kg Octagam 10% Every 2 Weeks,Participants received 0.25 g/kg octagam 10% ev...,OG002,NCT00812565,Octagam @ 0.25 g/kg
3,0.4 g/kg Octagam 10% Every 2 Weeks,Participants received of 0.4 g/kg octagam 10% ...,OG003,NCT00812565,Octagam @ 0.4 g/kg
4,0.5 g/kg Octagam 10% Every 4 Weeks,Participants received 0.5 g/kg octagam 10% eve...,OG006,NCT00812565,Octagam @ 0.5 g/kg
...,...,...,...,...,...
186,Warfarin,Subjects assigned to warfarin therapy.\n\nWarf...,OG001,NCT01182441,Warfarin @ NaN
187,Warfarin/Placebo Edoxaban,Warfarin tablets plus placebo Edoxaban tablets...,OG002,NCT00781391,Placebo @ NaN \n\n Warfarin @ NaN
188,ZOMIG 0.5 mg,ZOMIG nasal spray,OG001,NCT01211145,Zomig @ 0.5 mg
189,ZOMIG 2.5 mg,ZOMIG nasal spray,OG002,NCT01211145,Zomig @ 2.5 mg


In [118]:
# prep data for GPT3

labeled_admins['title_and_desc'] = 'title: '\
    + labeled_admins['title']\
    + ' $ description: '\
    + labeled_admins['description']\
    + '\n\n###\n\n'\

labeled_admins['treat_and_dose'] = ' ' + labeled_admins['treat_and_dose'] + ' ###'

In [119]:
train = labeled_admins.rename(columns = {
    'title_and_desc': 'prompt',
    'treat_and_dose': 'completion'
})[['prompt', 'completion']]

In [121]:
train.to_json('data.jsonl', orient='records', lines=True)

In [None]:
train

In [83]:
# Train model
!export default OPENAI_API_KEY='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'
!echo $OPENAI_API_KEY
!openai -k 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 api  fine_tunes.create -t data.jsonl -m davinci 



Found potentially duplicated files with name 'data.jsonl', purpose 'fine-tune' and size 56195 bytes
file-RnFXbe2lnpmxol7r9iG4LOTG
file-56w0yf1jGAZZVuRpSPWCYI9N
file-NQT7heIzs1x5DX43x5SdCQRa
file-BynuKtlSUbE0pGWnfbl1c3ET
Enter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: ^C



### Just Treatments
- Let's look at just treatments

In [208]:
labeled_admins = pd.read_csv('admin_labels_labeled.csv')
labeled_admins = labeled_admins.drop(labeled_admins.columns[0], axis=1).fillna(value='NaN')
labeled_admins

Unnamed: 0,title,Treatment,Dosage,description,study_id,study
0,Clobazam High Dose,Clobazam,5 to 40 mg,5 to 40 mg/day with doses in the morning and a...,OG001,NCT00162981
1,Clobazam Low Dose,Clobazam,5 to 10 mg,5 to 10 mg/day with doses in the morning and a...,OG000,NCT00162981
2,Intranasal Midazolam,Midazolam,0.2 mg,0.2 mg/kg Intranasal Midazolam for a seizure l...,OG000,NCT00326612
3,Intranasal Midazolam,Midazolam,,Patients who required oxygen at discharge from...,OG000,NCT00326612
4,Rectal Diazepam,Diazepam,,Includes intubation,OG001,NCT00326612
...,...,...,...,...,...,...
273,Pembrolizumab + Epacadostat,Pembrolizumab,200 mg,Participants received pembrolizumab 200 mg as ...,OG000,NCT03322540
274,Celecoxib,Celecoxib,,Patients receiving celebrex preoperative and p...,OG001,NCT03331315
275,Ketorolac,Ketorolac,,Patients receiving scheduled ketorolac postope...,OG000,NCT03331315
276,Celecoxib,Celecoxib,,Participants,OG001,NCT03331315


In [209]:
labeled_admins['Treatment'] = labeled_admins['Treatment'].str.lower()

In [210]:
labeled_admins = labeled_admins.groupby(['title', 'description', 'study_id', 'study']).agg({
    'Treatment': lambda x: ', '.join(x),
}).reset_index()

In [211]:
labeled_admins['title_and_desc'] = '*title*: '\
    + labeled_admins['title']\
    + ' *description*: '\
    + labeled_admins['description']\
    + '\n\n$$$\n\n'

In [212]:
train = labeled_admins.rename(columns = {
    'title_and_desc': 'prompt',
    'Treatment': 'completion'
})[['prompt', 'completion']]

In [213]:
train['completion'] = ' *treatments given*: ' + train['completion'] + ' %%%'

In [214]:
train.to_json('treat_data_2.jsonl', orient='records', lines=True)

In [88]:
openai.Completion.create(
    model='curie:ft-the-medical-board-2023-01-17-19-44-57',
    prompt='title: Placebo q2w\n description: 2 subcutaneous injections of Placebo (for Dupilumab) as a loading dose on Day 1, followed by a single injection q2w for 24 weeks in combination with OCS - (prednisone or prednisolone) and stable ICS. OCS dose was reduced according to a predetermined titration schedule every 4 weeks until Week 20.\n\n###\n\n')

<OpenAIObject text_completion id=cmpl-6ZmzyLk0XI8sGUsA1K0yodQMWGuVG at 0x16e69d310> JSON: {
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": " Prednisolone @ NaN \n\n Prednisone @ NaN ###"
    }
  ],
  "created": 1673987850,
  "id": "cmpl-6ZmzyLk0XI8sGUsA1K0yodQMWGuVG",
  "model": "curie:ft-the-medical-board-2023-01-17-19-44-57",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 16,
    "prompt_tokens": 88,
    "total_tokens": 104
  }
}

In [89]:
openai.Completion.create(
    model='curie:ft-the-medical-board-2023-01-17-19-44-57',
    prompt='title: Part A: E2006 Matched Placebo\n description: Healthy participants received E2006-matched placebo, capsule, orally in the morning, one hour after lights-on, on Day 1.\n\n###\n\n')



<OpenAIObject text_completion id=cmpl-6Zn2Dusq9udqH5MKiz34cEohuDPlU at 0x16e69d7f0> JSON: {
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": " Placebo @ NaN ###\n\n E2006 @ NaN ###\n\n Placebo"
    }
  ],
  "created": 1673987989,
  "id": "cmpl-6Zn2Dusq9udqH5MKiz34cEohuDPlU",
  "model": "curie:ft-the-medical-board-2023-01-17-19-44-57",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 16,
    "prompt_tokens": 45,
    "total_tokens": 61
  }
}

# Prod Pipeline

- Pipeline to get the treatments out of groups

In [4]:
from sqlalchemy import create_engine

db = create_engine("postgresql://meditreats:meditreats@df-treats-db.cs6hxh6ocizm.us-west-2.rds.amazonaws.com:5432/meditreats")

connection = db.connect()

In [5]:
# Query groups then do NER
groups = pd.read_sql("select * from groups", connection)

In [6]:
groups['prompt'] = 'Title: ' + groups['title'] + ' Description: ' + groups['description'] + ''

In [171]:
groups['batch_group'] = groups.index % (len(groups.index) // 20)
groups['batch_group'].value_counts()

0       21
5       21
1       21
7       21
6       21
        ..
1640    20
1639    20
1638    20
1637    20
4908    20
Name: batch_group, Length: 4909, dtype: int64

In [178]:
groups[groups['batch_group'] == 2]['hello'] = groups[groups['batch_group'] == 2]['title'].apply(lambda x: x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  groups[groups['batch_group'] == 2]['hello'] = groups[groups['batch_group'] == 2]['title'].apply(lambda x: x[0])


In [179]:
groups[groups['batch_group'] == 2]

Unnamed: 0,id,title,study_id,description,study,annotated,prompt,batch_group,hello
2,2,Arm I,OG000,Patients receive oral vorinostat twice daily o...,NCT00262834,,title: Arm I \n description: Patients receive ...,2,a
4911,4866,Single Arm - Transplant Pre-conditioning Per S...,OG000,"Conditioning with Alemtuzumab, busulfan and fl...",NCT00301834,,title: Single Arm - Transplant Pre-conditionin...,2,a
9820,9752,Part 1: Conversion,OG000,Subjects converted from their previous CD-LD t...,NCT01411137,,title: Part 1: Conversion \n description: Subj...,2,a
14729,14651,Dexamethasone Group,OG000,"0.5% Bupivacaine with 1:200,000epinephrine + D...",NCT01975285,,title: Dexamethasone Group \n description: 0.5...,2,a
19638,19526,OA Participants,OG000,Participants with OA who responded to the WOMA...,NCT01430559,,title: OA Participants \n description: Partici...,2,a
24547,24413,Dose Dense TC + Pegfilgrastim,OG000,Docetaxel + Cyclophosphamide chemotherapy give...,NCT01671319,,title: Dose Dense TC + Pegfilgrastim \n descri...,2,a
29456,29385,Phase Ia: Afatinib 20mg+Trastuzumab 8 mg/kg,OG000,"In each 21 day treatment cycle, patients were ...",NCT01649271,,title: Phase Ia: Afatinib 20mg+Trastuzumab 8 m...,2,a
34365,34252,Asacol,OG001,1.6g/day administered 800 mg BID,NCT00151892,,title: Asacol \n description: 1.6g/day adminis...,2,a
39274,39189,Midazolam,OG001,Subjects who are randomized to be on this grou...,NCT02522377,,title: Midazolam \n description: Subjects who ...,2,a
44183,44116,Dysport NG 50 U,OG002,"Botulinum type A toxin (Dysport RU), Intramusc...",NCT01333397,,title: Dysport NG 50 U \n description: Botulin...,2,a


In [9]:
groups['prompt'][:20].tolist()

['Title: Vorinostat Description: Women in the vorinostat group were scheduled to receive 6 doses of oral vorinostat at 300 mg twice daily (bid), with the last dose administered by study personnel approximately 2 hours before the scheduled breast surgery (or biopsy). \n\n$$$\n\n"',
 'Title: Tissue Only Description: Women who declined vorinostat but agreed to donate tissues for biomarker assessment, signed a separate informed consent and were enrolled as controls. \n\n$$$\n\n"',
 'Title: Arm I Description: Patients receive oral vorinostat twice daily on days -3 to 0. Approximately 2 hours after the final dose of vorinostat, patients undergo conventional surgery of the tumor on day 0. After completion of study treatment, patients are followed for 30 days.\n\nvorinostat: Given orally, conventional surgery to follow.\n\nconventional surgery: Undergo conventional surgery \n\n$$$\n\n"',
 'Title: Ruboxistaurin Description: 32 mg given once daily as oral tablet for 2 years. \n\n$$$\n\n"',
 'Tit

In [31]:
# batch 20 prompts in a requsts
response = openai.Completion.create(
    model="davinci:ft-the-medical-board-2023-01-20-21-50-49",
    prompt=groups['prompt'][:20].tolist(),
    temperature=.5
)

In [29]:
groups['description'][11]

"Placebo SC injections every 4 weeks (wks) from Week (Wk) 0 thru Wk 20 (unless early escape at Wk 16); golimumab - if early escape, 50 mg SC every 4 wks from Wk 16 up to 5 yrs; golimumab - 50 mg SC beginning Wk 24 up to 5 yrs (unless early escape); golimumab- Dr's discretion after unblinding, dose adjust from 50 to 100 mg."

In [33]:
[x['text'] for x in response['choices']]

['vorinostat" ###\n\n Vorinostat ###: ### ###\n',
 'vorinostat" ###\n\n Vorinostat ###\n\n ###\n\n',
 'vorinostat" | "conventional surgery" ### \n\nV',
 'Ruboxistaurin ###" ### ### ### ### ### ### ### ### ###',
 '5-Fluorouracil | Cisplatin | Interfer',
 'Humatrope" ### ### Humatrope ### ### ### ### ###\n',
 'Humatrope" ###\n\n ### ### ### ### ### ### ### ### ### ###',
 'Ibuprofen" ###\n\n Ibuprofen ### ### ### ###',
 'Ibuprofen | Acetaminophen" ###\n\n Acetamin',
 'Ibuprofen | Acetaminophen ###" ### ### ### ###',
 'Zoledronic Acid" ### ### Zoledronic Acid ### ### ### ### ###',
 'Golimumab | Placebo" ### Golimumab | Placebo ###',
 'Golimumab" ### Golimumab ### Golimumab ### Golimum',
 'Golimumab" ### Golimumab ### ### ### ### ### ### ###',
 'Golimumab" ### Golimumab ### Golimumab ### Golimum',
 'Procrit" ### Epoetin Alfa ###: ### ### ### ###',
 'placebo" ### Placebo ### ### ### ### ### ### ### ### ### ###',
 'methylphenidate" ###\n\n Methylphenidate ### ### ### ### ### ###',
 'methylphe

In [190]:
[x['text'] for x in response['choices']]

['vorinostat" ###\n\n vorinostat ###\n\n medication ###',
 'Tissues Only" ###\n\n vorinostat ### controls ### tissue ###',
 'vorinostat"\n\n"conventional surgery"\n\n###\n\n',
 'flumazenil" \n\n"gabapentin" ###',
 'placebo" ###\n\n placebo ### low ### ciwa ###\n\nSal',
 'placebo" ###\n\n saline ###\n\n ###\n\n###\n\n placebo ###',
 'flumazenil" \n\n"gabapentin" ###',
 'ruboxistaurin" ###\n\n ruboxistaurin ###\n\n ###',
 'cisplatin \n\n\nifn-alpha-2b \n\n',
 'humatrope" ### tdros ### \n\nb9r-',
 'Humatrope" ###\n\n b9r-us-gdfg',
 'ibuprofen" ###\n\n ibuprofen ###\n\n ###\n\n ib',
 'ibuprofen" \n\n"acetaminophen" ###\n\n ib',
 'Ibuprofen" \n\n"acetaminophen" ###\n\n',
 'zoledronic acid" ###\n\n zoledronic acid ###\n\n ###\n\n val',
 'golimumab" \n\n"placebo" ###\n\n golimum',
 'golimumab" ###\n\n golimumab ###\n\n diabetes ###\n\n gol',
 'golimumab" ###\n\n golimumab ###\n\n consensus ###\n\n nan',
 'Golimumab" ###\n\n golimumab ###\n\n combine groups ii and',
 'procrit" ###\n\n procrit 

In [154]:
[x['text'] for x in response['choices']]

['Vorinostat @ 300 mg bid" ###\n\n Vorinostat',
 'Vorinostat @ NaN \n\n Tissues @ NaN"',
 'vorinostat @ NaN mg" \n\n\n"conventional surgery',
 'Flumazenil @ 2 mg" \n\n"Gabapentin',
 'Placebo @ 20 mg" \n ###\n\n Saline @ 20 mg ###',
 'Saline @ NaN" \n\n Placebo @ NaN ###\n\n',
 'Flumazenil @ 2 mg \n\n Gabapentin @ 600 mg',
 'Ruboxistaurin" @ 32 mg ###\n\n###\n\n Rubox',
 '5-FU @ 175 mg/m^2" \n"Cis',
 'Humatrope @ B9R-US-GDFG" @',
 'Humatrope @ NaN \n\n Humatrope @ NaN ###',
 'Ibuprofen @ 10mg/kg" ###\n\n Ibupro',
 'Ibuprofen @ 10 mg/kg" \n"Acet',
 'Ibuprofen @ 10 mg/kg" \n "Acet',
 'Zoledronic Acid @ 3.0 to 4.0 mg ###\n\n\n',
 'Golimumab @ 50 mg — 100 mg" \n\n Placebo @',
 'Golimumab" @ 50 mg \n\n Golimumab @ 100 mg',
 'Golimumab @ 100 mg" ### \n\n Golimumab @ 100',
 'golimumab @ 50 mg" \n"golimumab @',
 'Procrit @ 40,000 IU" ###\n\n Epoetin alpha']

## Prompt Engineering
- Fuck fine tuning

In [7]:
# Let's connect to the API
Secret = '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'
import os
import openai
openai.organization = "org-j6fGVx3OgjgpAbCQFHOmdEUe"
openai.api_key = Secret
len(openai.Model.list())

2

In [8]:
example_prompt="""
    "$$$"
    
    Please respond in a JSON list where each element of the list is a dictionary of the form
    
    ```
    {
        "medication": "medication name",
        "dosage": "dosage amount",
        "frequency": "frequency of dosage",
        "start": "start time",
        "end": "end time"
    }
    ```
"""

In [31]:
example = example_prompt.replace("$$$","Title: Elotuzumab 10 mg\/kg Description: Elotuzumab was administered IV at a dose of 10 mg\/kg weekly (Days 1, 8, 15 and 22 of 4-week cycle) for the first 2 cycles and bi-weekly (every 2 weeks) (Day 1 and Day 15) thereafter until disease progression or unacceptable toxicity became apparent. Lenalidomide 25 mg was administered PO once daily for the first 3 weeks of each 4-week cycle. The dose of lenalidomide was administered at least 2-4 hours after completion of elotuzumab infusion. On weeks without elotuzumab administration, dexamethasone was administered PO as a single dose of 40 mg. On weeks of elotuzumab infusion, the weekly dose of dexamethasone was administered as a split dose of: 28 mg PO (between 3 to 24 hours prior to the start of the elotuzumab infusion) and 8 mg IV (At least 45 min prior to the start of the elotuzumab infusion).")


In [32]:
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=example,
    max_tokens=1000,
    temperature=.3)

In [33]:
response

<OpenAIObject text_completion id=cmpl-6cF3aOIgsWAIfd15aVqHRlHJEKCWB at 0x15ccfb4d0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "\n[\n    {\n        \"medication\": \"Elotuzumab\",\n        \"dosage\": \"10 mg/kg\",\n        \"frequency\": \"Weekly (Days 1, 8, 15 and 22 of 4-week cycle) for the first 2 cycles and bi-weekly (every 2 weeks) (Day 1 and Day 15) thereafter\",\n        \"start\": \"Day 1\",\n        \"end\": \"Until disease progression or unacceptable toxicity becomes apparent\"\n    },\n    {\n        \"medication\": \"Lenalidomide\",\n        \"dosage\": \"25 mg\",\n        \"frequency\": \"Once daily for the first 3 weeks of each 4-week cycle\",\n        \"start\": \"Day 1\",\n        \"end\": \"Until disease progression or unacceptable toxicity becomes apparent\"\n    },\n    {\n        \"medication\": \"Dexamethasone\",\n        \"dosage\": \"40 mg\",\n        \"frequency\": \"Weekly\",\n        \"s

In [9]:
### Not a bad result, let's see if we can build a good parser

def parse_unique_treatments(text):
    start = text.index('[')
    end = text.index(']')+1

    sub_text = text[start:end]
    sub_text = sub_text.replace('\n','')
    sub_text = sub_text.replace('  ','')
    
    med_idx = sub_text.index('medication')
    end_idx = sub_text.rindex('medication')
    treats = []
    
    try:
        while(med_idx <= end_idx):
            colon_idx = sub_text.index(':', med_idx)
            comma_idx = sub_text.index(',', med_idx)

            treat_text = sub_text[colon_idx+1:comma_idx]

            # SAFE operations
            treat_text = treat_text.replace('"', '')
            treat_text = treat_text.replace(' ', '', 1)

            treats.append(treat_text)

            med_idx = sub_text.index('medication', med_idx+len('medication')+1)
    except:
        return list(set([x.lower() for x in treats]))

    return list(set([x.lower() for x in treats]))

    
parse_unique_treatments(response['choices'][0]['text'])

NameError: name 'response' is not defined

In [10]:
groups['simple_prompt'] = groups['prompt'].apply(lambda x: example_prompt.replace("$$$", str(x)))

In [11]:
groups

Unnamed: 0,id,title,study_id,description,study,annotated,prompt,simple_prompt
0,0,Vorinostat,OG000,Women in the vorinostat group were scheduled t...,NCT00262834,,Title: Vorinostat Description: Women in the vo...,"\n ""Title: Vorinostat Description: Women in..."
1,1,Tissue Only,OG001,Women who declined vorinostat but agreed to do...,NCT00262834,,Title: Tissue Only Description: Women who decl...,"\n ""Title: Tissue Only Description: Women w..."
2,2,Arm I,OG000,Patients receive oral vorinostat twice daily o...,NCT00262834,,Title: Arm I Description: Patients receive ora...,"\n ""Title: Arm I Description: Patients rece..."
3,7,Ruboxistaurin,OG000,32 mg given once daily as oral tablet for 2 ye...,NCT00266695,,Title: Ruboxistaurin Description: 32 mg given ...,"\n ""Title: Ruboxistaurin Description: 32 mg..."
4,8,Pancreatic Adenocarcinoma Patients,OG000,Pancreatic Adenocarcinoma Patients treated wit...,NCT00262951,,Title: Pancreatic Adenocarcinoma Patients Desc...,"\n ""Title: Pancreatic Adenocarcinoma Patien..."
...,...,...,...,...,...,...,...,...
98184,48184,Randomized: Lenalidomide+Bortezomib+Dexamethas...,OG000,Induction and Consolidation Phase (12-week ind...,NCT02874742,True,Title: Randomized: Lenalidomide+Bortezomib+Dex...,"\n ""Title: Randomized: Lenalidomide+Bortezo..."
98185,49339,Part 2: VMP (Velcade+Melphalan+Prednisone),OG000,Velcade 1.3 mg/m2 was administered as an intra...,NCT00911859,True,Title: Part 2: VMP (Velcade+Melphalan+Predniso...,"\n ""Title: Part 2: VMP (Velcade+Melphalan+P..."
98186,50684,Pomalidomide,OG000,Oral pomalidomide 4 mg on Days 1-21 of 28-day ...,NCT01324947,True,Title: Pomalidomide Description: Oral pomalido...,"\n ""Title: Pomalidomide Description: Oral p..."
98187,57532,Lenalidomide + Dexamethasone,OG001,"Lenalidomide: Capsules, Oral, 25 mg, once dail...",NCT01239797,True,Title: Lenalidomide + Dexamethasone Descriptio...,"\n ""Title: Lenalidomide + Dexamethasone Des..."


In [12]:
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=groups['simple_prompt'][:20].tolist(),
    max_tokens=1000,
    temperature=.3)

In [13]:
[parse_unique_treatments(x['text']) for x in response['choices']]

[['vorinostat'],
 ['vorinostat'],
 ['conventional surgery', 'vorinostat'],
 ['ruboxistaurin'],
 ['radiation therapy', 'cisplatin', 'bolus 5-fu', '5-fu', 'ifn-alpha-2b'],
 ['humatrope'],
 ['humatrope'],
 ['ibuprofen'],
 ['temperature', 'acetaminophen', 'ibuprofen'],
 ['temperature', 'acetaminophen', 'ibuprofen'],
 ['zoledronic acid'],
 ['placebo', 'golimumab'],
 ['golimumab'],
 ['golimumab'],
 ['golimumab'],
 ['procrit'],
 ['placebo'],
 ['oros-mph'],
 ['methylphenidate (oros-mph)'],
 ['rhc1inh']]

In [14]:
groups['batch_group'] = groups.index % (len(groups.index) // 20)

In [19]:
## Response function
import time

def get_open_ai_responses(prompts):
    response = None
    backoff = 3

    while (not response):
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=list(prompts),
                max_tokens=1000,
                temperature=.3)
        except:
            print("Rate limit hit, exponential backoff: ", backoff)
            time.sleep(backoff)
            backoff = backoff * backoff
        
    return response

In [16]:
sub_groups = groups[:1000]

In [17]:
sub_groups

Unnamed: 0,id,title,study_id,description,study,annotated,prompt,simple_prompt,batch_group
0,0,Vorinostat,OG000,Women in the vorinostat group were scheduled t...,NCT00262834,,Title: Vorinostat Description: Women in the vo...,"\n ""Title: Vorinostat Description: Women in...",0
1,1,Tissue Only,OG001,Women who declined vorinostat but agreed to do...,NCT00262834,,Title: Tissue Only Description: Women who decl...,"\n ""Title: Tissue Only Description: Women w...",1
2,2,Arm I,OG000,Patients receive oral vorinostat twice daily o...,NCT00262834,,Title: Arm I Description: Patients receive ora...,"\n ""Title: Arm I Description: Patients rece...",2
3,7,Ruboxistaurin,OG000,32 mg given once daily as oral tablet for 2 ye...,NCT00266695,,Title: Ruboxistaurin Description: 32 mg given ...,"\n ""Title: Ruboxistaurin Description: 32 mg...",3
4,8,Pancreatic Adenocarcinoma Patients,OG000,Pancreatic Adenocarcinoma Patients treated wit...,NCT00262951,,Title: Pancreatic Adenocarcinoma Patients Desc...,"\n ""Title: Pancreatic Adenocarcinoma Patien...",4
...,...,...,...,...,...,...,...,...,...
995,977,Atomoxetine,OG000,Participants in this group received Atomoxetin...,NCT01522404,,Title: Atomoxetine Description: Participants i...,"\n ""Title: Atomoxetine Description: Partici...",995
996,978,Inactive Compound / Placebo,OG001,Participants in this group receive matching pl...,NCT01522404,,Title: Inactive Compound / Placebo Description...,"\n ""Title: Inactive Compound / Placebo Desc...",996
997,979,Atomoxetine,OG000,"Participants in this arm received Atomoxetine,...",NCT01522404,,Title: Atomoxetine Description: Participants i...,"\n ""Title: Atomoxetine Description: Partici...",997
998,980,Inactive Compound (Placebo),OG001,Participants in this arm will receive a matchi...,NCT01522404,,Title: Inactive Compound (Placebo) Description...,"\n ""Title: Inactive Compound (Placebo) Desc...",998


In [18]:
from tqdm import tqdm
import time

results = []
for i in tqdm(range(0,50)):
    try:
        results.append(get_open_ai_responses(list(sub_groups[20*i : 20*(i+1)]['simple_prompt'])))
        backoff=10
    except:
        print("Rate limit hit, exponential backoff: ", backoff)
        time.sleep(backoff)
        backoff = backoff * backoff
    

  6%|██████▍                                                                                                    | 3/50 [00:42<10:04, 12.86s/it]

Rate limit hit, exponential backoff:  10


 10%|██████████▋                                                                                                | 5/50 [01:12<10:25, 13.90s/it]

Rate limit hit, exponential backoff:  10


 12%|████████████▊                                                                                              | 6/50 [01:28<10:36, 14.46s/it]

Rate limit hit, exponential backoff:  10
Rate limit hit, exponential backoff:  100


 20%|█████████████████████▏                                                                                    | 10/50 [03:51<14:39, 21.99s/it]

Rate limit hit, exponential backoff:  10


 22%|███████████████████████▎                                                                                  | 11/50 [04:11<13:53, 21.38s/it]

Rate limit hit, exponential backoff:  10


 24%|█████████████████████████▍                                                                                | 12/50 [04:33<13:41, 21.62s/it]

Rate limit hit, exponential backoff:  10
Rate limit hit, exponential backoff:  100


 30%|███████████████████████████████▊                                                                          | 15/50 [06:58<17:57, 30.78s/it]

Rate limit hit, exponential backoff:  10
Rate limit hit, exponential backoff:  100


 36%|██████████████████████████████████████▏                                                                   | 18/50 [09:17<17:39, 33.12s/it]

Rate limit hit, exponential backoff:  10


 40%|██████████████████████████████████████████▍                                                               | 20/50 [10:00<13:31, 27.04s/it]

Rate limit hit, exponential backoff:  10


 44%|██████████████████████████████████████████████▋                                                           | 22/50 [10:37<10:25, 22.35s/it]

Rate limit hit, exponential backoff:  10


 46%|████████████████████████████████████████████████▊                                                         | 23/50 [10:54<09:17, 20.63s/it]

Rate limit hit, exponential backoff:  10
Rate limit hit, exponential backoff:  100


 54%|█████████████████████████████████████████████████████████▏                                                | 27/50 [13:17<09:07, 23.80s/it]

Rate limit hit, exponential backoff:  10
Rate limit hit, exponential backoff:  100


 60%|███████████████████████████████████████████████████████████████▌                                          | 30/50 [15:40<10:11, 30.57s/it]

Rate limit hit, exponential backoff:  10
Rate limit hit, exponential backoff:  100


 66%|█████████████████████████████████████████████████████████████████████▉                                    | 33/50 [17:56<09:21, 33.00s/it]

Rate limit hit, exponential backoff:  10


 68%|████████████████████████████████████████████████████████████████████████                                  | 34/50 [18:13<07:29, 28.11s/it]

Rate limit hit, exponential backoff:  10
Rate limit hit, exponential backoff:  100


 74%|██████████████████████████████████████████████████████████████████████████████▍                           | 37/50 [20:32<06:58, 32.23s/it]

Rate limit hit, exponential backoff:  10
Rate limit hit, exponential backoff:  100


 78%|██████████████████████████████████████████████████████████████████████████████████▋                       | 39/50 [22:39<08:00, 43.64s/it]

Rate limit hit, exponential backoff:  10
Rate limit hit, exponential backoff:  100


 80%|████████████████████████████████████████████████████████████████████████████████████▊                     | 40/50 [25:28<13:32, 81.27s/it]

Rate limit hit, exponential backoff:  10


 88%|█████████████████████████████████████████████████████████████████████████████████████████████▎            | 44/50 [26:35<03:05, 30.93s/it]

Rate limit hit, exponential backoff:  10
Rate limit hit, exponential backoff:  100


 90%|███████████████████████████████████████████████████████████████████████████████████████████████▍          | 45/50 [28:33<04:44, 56.83s/it]

Rate limit hit, exponential backoff:  10


 92%|█████████████████████████████████████████████████████████████████████████████████████████████████▌        | 46/50 [28:58<03:09, 47.31s/it]

Rate limit hit, exponential backoff:  10


 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 49/50 [29:47<00:26, 26.41s/it]

Rate limit hit, exponential backoff:  10


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [30:12<00:00, 36.25s/it]


In [21]:
subset = results[1]
full_treats = [parse_unique_treatments(x['text']) for x in subset['choices']]

In [30]:
# Link results to the original dataframe

# Split results into choices
nest_results = [x['choices'] for x in results]

In [34]:
flat_results = [item for sublist in nest_results for item in sublist]

In [37]:
sub_groups['openai'] = pd.Series(flat_results)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_groups['openai'] = pd.Series(flat_results)


In [53]:
def openai_treats(text):
    try:
        return parse_unique_treatments(text)
    except:
        return float('nan')

sub_groups['openai_treats'] = sub_groups['openai'].apply(lambda x: openai_treats(x['text']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_groups['openai_treats'] = sub_groups['openai'].apply(lambda x: openai_treats(x['text']))


In [60]:
# wow literally .9% formatting error

sub_groups['openai_treats'].isna().sum()

9

In [62]:
sub_groups[['title', 'study', 'description', 'openai_treats']].to_csv('gpt_treats.csv')

In [63]:
sub_groups.to_pickle('openai-pre-trained.pkl')

## Fine Tune Again

In [73]:
sub_groups.head()

Unnamed: 0,id,title,study_id,description,study,annotated,prompt,simple_prompt,batch_group,openai,openai_treats,completion
0,0,Vorinostat,OG000,Women in the vorinostat group were scheduled t...,NCT00262834,,Title: Vorinostat Description: Women in the vo...,"\n ""Title: Vorinostat Description: Women in...",0,"{'text': ' [  {  ""medication"": ""Vori...",[vorinostat],"\n[\n {\n ""medication"": ""Vorinostat""..."
1,1,Tissue Only,OG001,Women who declined vorinostat but agreed to do...,NCT00262834,,Title: Tissue Only Description: Women who decl...,"\n ""Title: Tissue Only Description: Women w...",1,"{'text': ' [{""medication"": ""Vorinostat...",[vorinostat],"\n [{""medication"": ""Vorinostat"", ""dosag..."
2,2,Arm I,OG000,Patients receive oral vorinostat twice daily o...,NCT00262834,,Title: Arm I Description: Patients receive ora...,"\n ""Title: Arm I Description: Patients rece...",2,"{'text': ' [  {  ""medication"": ""vori...","[conventional surgery, vorinostat]","\n[\n {\n ""medication"": ""vorinostat""..."
3,7,Ruboxistaurin,OG000,32 mg given once daily as oral tablet for 2 ye...,NCT00266695,,Title: Ruboxistaurin Description: 32 mg given ...,"\n ""Title: Ruboxistaurin Description: 32 mg...",3,"{'text': ' [  {  ""medication"": ""Rubo...",[ruboxistaurin],"\n[\n {\n ""medication"": ""Ruboxistaur..."
4,8,Pancreatic Adenocarcinoma Patients,OG000,Pancreatic Adenocarcinoma Patients treated wit...,NCT00262951,,Title: Pancreatic Adenocarcinoma Patients Desc...,"\n ""Title: Pancreatic Adenocarcinoma Patien...",4,"{'text': ' [  {  ""medication"": ""5-FU...","[radiation therapy, 5-fu, ifn-alpha-2b, cispla...","\n[\n {\n ""medication"": ""5-FU"",\n ..."


In [68]:
sub_groups['completion'] = sub_groups['openai'].apply(lambda x: x['text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_groups['completion'] = sub_groups['openai'].apply(lambda x: x['text'])


In [74]:
sub_groups[['title', 'description', 'completion']].to_csv('gpt_3_training.csv')

In [77]:
annotated = pd.read_csv('gpt_annotated_215.csv')[:218]

In [80]:
annotated = annotated[['title','description','completion']]

In [81]:
annotated['prompt']

Unnamed: 0,title,description,completion
0,Vorinostat,Women in the vorinostat group were scheduled t...,"[\n {\n ""medication"": ""Vorinostat"",\..."
1,Tissue Only,Women who declined vorinostat but agreed to do...,"[{“medication"": “N/A"", ""dosage"": ""N/A"", ""frequ..."
2,Arm I,Patients receive oral vorinostat twice daily o...,"[\n {\n ""medication"": ""vorinostat"",\..."
3,Ruboxistaurin,32 mg given once daily as oral tablet for 2 ye...,"[\n {\n ""medication"": ""Ruboxistaurin..."
4,Pancreatic Adenocarcinoma Patients,Pancreatic Adenocarcinoma Patients treated wit...,"[\n {\n ""medication"": ""5-FU"",\n ..."
...,...,...,...
213,"Treatment (Chemotherapy, Enzyme Inhibitor Ther...",INDUCTION THERAPY: Patients receive methotrexa...,"[\n {\n ""medication"": ""Methotrexate""..."
214,"Bortezomib, Gemcitabine Hydrochloride",bortezomib\n\ngemcitabine hydrochloride\n\nBor...,"[\n {\n ""medication"": ""Bortezomib"",\..."
215,"Standard 24+4 Treatment of EE20/DRSP (YAZ, BAY...","13 cycles of treatment, each cycle comprising ...","[\n {\n ""medication"": ""YAZ"",\n ..."
216,OAT + Omalizumab,"During the 8-week Run-in phase, asthma therapy...","[\n {\n ""medication"": ""OAT"",\n ..."


In [82]:
example_prompt="""
    "$$$"
    
    Please respond in a JSON list where each element of the list is a dictionary of the form
    
    ```
    {
        "medication": "medication name",
        "dosage": "dosage amount",
        "frequency": "frequency of dosage",
        "start": "start time",
        "end": "end time"
    }
    ```
"""

In [84]:
annotated['title_desc'] = 'Title: ' + annotated['title'] + ' Description: ' + annotated['description'] + ''

In [87]:
annotated['prompt'] = annotated['title_desc'].apply(lambda x: example_prompt.replace('$$$', str(x)))

In [101]:
annotated['completion'] = annotated['completion'] +'###'

In [102]:
annotated[['prompt', 'completion']].to_json('prompt_data_2.jsonl', orient='records', lines=True)

### Let's check it out

In [93]:
test = groups[1000:1100]

In [103]:
response = openai.Completion.create(
    model="davinci:ft-the-medical-board-2023-01-26-07-56-18",
    prompt=list(test['simple_prompt'][:20]),
    max_tokens=1000,
    temperature=.3)

  prompt=list(test['simple_prompt'][:20]),


APIError: The server experienced an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. {
  "error": {
    "message": "The server experienced an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists.",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'The server experienced an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists.', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Thu, 26 Jan 2023 08:03:04 GMT', 'Content-Type': 'application/json', 'Content-Length': '292', 'Connection': 'keep-alive', 'Access-Control-Allow-Origin': '*', 'Openai-Model': 'davinci:ft-the-medical-board-2023-01-26-07-56-18', 'Openai-Organization': 'the-medical-board', 'Openai-Processing-Ms': '31544', 'Openai-Version': '2020-10-01', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains', 'X-Request-Id': '8f3d78ded24feba4561dc39f79d3b601'}

In [99]:
response

<OpenAIObject text_completion id=cmpl-6cr9FnoLuSuTA7zfUjU7Ocj73deJA at 0x15ea92990> JSON: {
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": "[\n        {\n            \"medication\": \"Matching placebo\",\n            \"dosage\": \"NA\",\n            \"frequency\": \"NA\",\n            \"start\": \"NA\",\n            \"end\": \"NA\"\n        }\n]\n[\n        {\n            \"medication\": \"Matching placebo\",\n            \"dosage\": \"NA\",\n            \"frequency\": \"NA\",\n            \"start\": \"NA\",\n            \"end\": \"NA\"\n        }\n]\n[\n       {\n            \"medication\": \"Matching placebo\",\n            \"dosage\": \"NA\",\n            \"frequency\": \"NA\",\n            \"start\": \"NA\",\n            \"end\": \"NA\"\n        }\n]\n[\n        {\n            \"medication\": \"Matching placebo\",\n            \"dosage\": \"NA\",\n            \"frequency\": \"NA\",\n            \"start\": \"NA\",\n     

In [None]:
response

In [105]:
groups['simple_prompt'][0]

'\n    "Title: Vorinostat Description: Women in the vorinostat group were scheduled to receive 6 doses of oral vorinostat at 300 mg twice daily (bid), with the last dose administered by study personnel approximately 2 hours before the scheduled breast surgery (or biopsy)."\n    \n    Please respond in a JSON list where each element of the list is a dictionary of the form\n    \n    ```\n    {\n        "medication": "medication name",\n        "dosage": "dosage amount",\n        "frequency": "frequency of dosage",\n        "start": "start time",\n        "end": "end time"\n    }\n    ```\n'

# BERT Training Set

### Open AI set

In [3]:
### Use our labeled set to create a good NER set
# format such that each word is a B I or O
import pandas as pd

openai_groups = pd.read_pickle('openai-pre-trained.pkl')

In [4]:
openai_groups.head()

Unnamed: 0,id,title,study_id,description,study,annotated,prompt,simple_prompt,batch_group,openai,openai_treats
0,0,Vorinostat,OG000,Women in the vorinostat group were scheduled t...,NCT00262834,,Title: Vorinostat Description: Women in the vo...,"\n ""Title: Vorinostat Description: Women in...",0,"{'text': ' [  {  ""medication"": ""Vori...",[vorinostat]
1,1,Tissue Only,OG001,Women who declined vorinostat but agreed to do...,NCT00262834,,Title: Tissue Only Description: Women who decl...,"\n ""Title: Tissue Only Description: Women w...",1,"{'text': ' [{""medication"": ""Vorinostat...",[vorinostat]
2,2,Arm I,OG000,Patients receive oral vorinostat twice daily o...,NCT00262834,,Title: Arm I Description: Patients receive ora...,"\n ""Title: Arm I Description: Patients rece...",2,"{'text': ' [  {  ""medication"": ""vori...","[conventional surgery, vorinostat]"
3,7,Ruboxistaurin,OG000,32 mg given once daily as oral tablet for 2 ye...,NCT00266695,,Title: Ruboxistaurin Description: 32 mg given ...,"\n ""Title: Ruboxistaurin Description: 32 mg...",3,"{'text': ' [  {  ""medication"": ""Rubo...",[ruboxistaurin]
4,8,Pancreatic Adenocarcinoma Patients,OG000,Pancreatic Adenocarcinoma Patients treated wit...,NCT00262951,,Title: Pancreatic Adenocarcinoma Patients Desc...,"\n ""Title: Pancreatic Adenocarcinoma Patien...",4,"{'text': ' [  {  ""medication"": ""5-FU...","[radiation therapy, 5-fu, ifn-alpha-2b, cispla..."


In [10]:
openai_groups['prompt'][2]

'Title: Arm I Description: Patients receive oral vorinostat twice daily on days -3 to 0. Approximately 2 hours after the final dose of vorinostat, patients undergo conventional surgery of the tumor on day 0. After completion of study treatment, patients are followed for 30 days.\n\nvorinostat: Given orally, conventional surgery to follow.\n\nconventional surgery: Undergo conventional surgery'

### Tokenizers

In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT', do_lower_case=True)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)solve/main/vocab.txt: 100%|██████████████████████████████████████████████████████████████████| 213k/213k [00:00<00:00, 1.39MB/s]
Downloading (…)lve/main/config.json: 100%|█████████████████████████████████████████████████████████████████████| 385/385 [00:00<00:00, 128kB/s]


In [27]:
import nltk 

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/porterhunley/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [37]:
# Split the words by empty space
# Create a dict linking 
from nltk.tokenize import word_tokenize


def create_treatment_dict(treatments):
    treat_dict = {}
    for treatment in treatments:
        treat_words = word_tokenize(treatment)
        
        treat_dict[treat_words[0]] = treat_words
        
    return treat_dict


# We can tweak with the BERT tokenizer later
def label_text(prompt, treatments):
    treat_dict = create_treatment_dict(treatments)
    
    labels_and_words = []
    
    treat_idx = 0
    curr_treat = None
    for word in word_tokenize(prompt):
        if (word in treat_dict):
            treat_idx = 0
            curr_treat = word
            labels_and_words.append((word, 'B'))
            treat_idx += 1
            continue
            
        if (curr_treat and len(treat_dict[curr_treat]) > treat_idx and word == treat_dict[curr_treat][treat_idx]):
            labels_and_words.append((word, 'I'))
            treat_idx += 1
            continue
                
        if word not in treat_dict:
            labels_and_words.append((word, 'O'))
            treat_idx = 0
            curr_treat = None
            continue
            
    return labels_and_words
        

In [31]:
label_text(openai_groups['prompt'][2], openai_groups['openai_treats'][2])

{'conventional': ['conventional', 'surgery'], 'vorinostat': ['vorinostat']}


[('Title', 'O'),
 (':', 'O'),
 ('Arm', 'O'),
 ('I', 'O'),
 ('Description', 'O'),
 (':', 'O'),
 ('Patients', 'O'),
 ('receive', 'O'),
 ('oral', 'O'),
 ('vorinostat', 'B'),
 ('twice', 'O'),
 ('daily', 'O'),
 ('on', 'O'),
 ('days', 'O'),
 ('-3', 'O'),
 ('to', 'O'),
 ('0', 'O'),
 ('.', 'O'),
 ('Approximately', 'O'),
 ('2', 'O'),
 ('hours', 'O'),
 ('after', 'O'),
 ('the', 'O'),
 ('final', 'O'),
 ('dose', 'O'),
 ('of', 'O'),
 ('vorinostat', 'B'),
 (',', 'O'),
 ('patients', 'O'),
 ('undergo', 'O'),
 ('conventional', 'B'),
 ('surgery', 'I'),
 ('of', 'O'),
 ('the', 'O'),
 ('tumor', 'O'),
 ('on', 'O'),
 ('day', 'O'),
 ('0', 'O'),
 ('.', 'O'),
 ('After', 'O'),
 ('completion', 'O'),
 ('of', 'O'),
 ('study', 'O'),
 ('treatment', 'O'),
 (',', 'O'),
 ('patients', 'O'),
 ('are', 'O'),
 ('followed', 'O'),
 ('for', 'O'),
 ('30', 'O'),
 ('days', 'O'),
 ('.', 'O'),
 ('vorinostat', 'B'),
 (':', 'O'),
 ('Given', 'O'),
 ('orally', 'O'),
 (',', 'O'),
 ('conventional', 'B'),
 ('surgery', 'I'),
 ('to', 'O'),
 (

### OpenAI dataset

In [49]:
openai_cleaned = openai_groups[openai_groups['prompt'].apply(lambda x: isinstance(x, str))]

In [51]:
openai_cleaned = openai_cleaned[openai_cleaned['openai_treats'].apply(lambda x: isinstance(x, list))]

In [56]:
openai_cleaned['labels'] = openai_cleaned.apply(lambda x: label_text(x['prompt'].lower(), x['openai_treats']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  openai_cleaned['labels'] = openai_cleaned.apply(lambda x: label_text(x['prompt'].lower(), x['openai_treats']), axis=1)


In [57]:
openai_cleaned

Unnamed: 0,id,title,study_id,description,study,annotated,prompt,simple_prompt,batch_group,openai,openai_treats,labels
0,0,Vorinostat,OG000,Women in the vorinostat group were scheduled t...,NCT00262834,,Title: Vorinostat Description: Women in the vo...,"\n ""Title: Vorinostat Description: Women in...",0,"{'text': ' [  {  ""medication"": ""Vori...",[vorinostat],"[(title, O), (:, O), (vorinostat, B), (descrip..."
1,1,Tissue Only,OG001,Women who declined vorinostat but agreed to do...,NCT00262834,,Title: Tissue Only Description: Women who decl...,"\n ""Title: Tissue Only Description: Women w...",1,"{'text': ' [{""medication"": ""Vorinostat...",[vorinostat],"[(title, O), (:, O), (tissue, O), (only, O), (..."
2,2,Arm I,OG000,Patients receive oral vorinostat twice daily o...,NCT00262834,,Title: Arm I Description: Patients receive ora...,"\n ""Title: Arm I Description: Patients rece...",2,"{'text': ' [  {  ""medication"": ""vori...","[conventional surgery, vorinostat]","[(title, O), (:, O), (arm, O), (i, O), (descri..."
3,7,Ruboxistaurin,OG000,32 mg given once daily as oral tablet for 2 ye...,NCT00266695,,Title: Ruboxistaurin Description: 32 mg given ...,"\n ""Title: Ruboxistaurin Description: 32 mg...",3,"{'text': ' [  {  ""medication"": ""Rubo...",[ruboxistaurin],"[(title, O), (:, O), (ruboxistaurin, B), (desc..."
4,8,Pancreatic Adenocarcinoma Patients,OG000,Pancreatic Adenocarcinoma Patients treated wit...,NCT00262951,,Title: Pancreatic Adenocarcinoma Patients Desc...,"\n ""Title: Pancreatic Adenocarcinoma Patien...",4,"{'text': ' [  {  ""medication"": ""5-FU...","[radiation therapy, 5-fu, ifn-alpha-2b, cispla...","[(title, O), (:, O), (pancreatic, O), (adenoca..."
...,...,...,...,...,...,...,...,...,...,...,...,...
995,977,Atomoxetine,OG000,Participants in this group received Atomoxetin...,NCT01522404,,Title: Atomoxetine Description: Participants i...,"\n ""Title: Atomoxetine Description: Partici...",995,"{'text': ' [  {  ""medication"": ""Atom...",[atomoxetine],"[(title, O), (:, O), (atomoxetine, B), (descri..."
996,978,Inactive Compound / Placebo,OG001,Participants in this group receive matching pl...,NCT01522404,,Title: Inactive Compound / Placebo Description...,"\n ""Title: Inactive Compound / Placebo Desc...",996,"{'text': ' [  {  ""medication"": ""Plac...",[placebo],"[(title, O), (:, O), (inactive, O), (compound,..."
997,979,Atomoxetine,OG000,"Participants in this arm received Atomoxetine,...",NCT01522404,,Title: Atomoxetine Description: Participants i...,"\n ""Title: Atomoxetine Description: Partici...",997,"{'text': ' Answer: [{  ""medicat...",[atomoxetine],"[(title, O), (:, O), (atomoxetine, B), (descri..."
998,980,Inactive Compound (Placebo),OG001,Participants in this arm will receive a matchi...,NCT01522404,,Title: Inactive Compound (Placebo) Description...,"\n ""Title: Inactive Compound (Placebo) Desc...",998,"{'text': ' [{""medication"": ""Placebo"", ...",[placebo],"[(title, O), (:, O), (inactive, O), (compound,..."
