In [1]:
import polars as pl
import re

In [2]:
splits = {'train': 'phrases_no_exclude_train.jsonl', 'test': 'phrases_no_exclude_test.jsonl'}
df = pl.read_ndjson('hf://datasets/GBaker/MedQA-USMLE-4-options/' + splits['train'])

In [3]:
df.head()

question,answer,options,meta_info,answer_idx,metamap_phrases
str,str,struct[4],str,str,list[str]
"""A 23-year-old pregnant woman a…","""Nitrofurantoin""","{""Ampicillin"",""Ceftriaxone"",""Doxycycline"",""Nitrofurantoin""}","""step2&3""","""D""","[""23 year old pregnant woman"", ""weeks presents"", … ""patient""]"
"""A 3-month-old baby died sudden…","""Placing the infant in a supine…","{""Placing the infant in a supine position on a firm mattress while sleeping"",""Keeping the infant covered and maintaining a high room temperature"",""Application of a device to maintain the sleeping position"",""Avoiding pacifier use during sleep""}","""step2&3""","""A""","[""3 month old baby died"", ""night"", … ""baby""]"
"""A mother brings her 3-week-old…","""Abnormal migration of ventral …","{""Abnormal migration of ventral pancreatic bud"",""Complete failure of proximal duodenum to recanalize"",""Abnormal hypertrophy of the pylorus"",""Failure of lateral body folds to move ventrally and fuse in the midline""}","""step1""","""A""","[""mother"", ""week old infant"", … ""presentation""]"
"""A pulmonary autopsy specimen f…","""Thromboembolism""","{""Thromboembolism"",""Pulmonary ischemia"",""Pulmonary hypertension"",""Pulmonary passive congestion""}","""step1""","""A""","[""pulmonary autopsy specimen"", ""58 year old woman"", … ""present findings""]"
"""A 20-year-old woman presents w…","""Von Willebrand disease""","{""Hemophilia A"",""Lupus anticoagulant"",""Protein C deficiency"",""Von Willebrand disease""}","""step1""","""D""","[""20 year old woman presents"", ""menorrhagia"", … ""patients symptoms""]"


In [4]:
smoking_key_terms = {
        'smoke': ['smoking behavior', 'tobacco use', 'cigarette consumption',
                  'nicotine exposure', 'secondhand smoke', 'smoking cessation'],
        'smoking': ['smoking behavior', 'tobacco use', 'cigarette consumption',
                    'nicotine exposure', 'secondhand smoke', 'smoking cessation'],
        'tobacco': ['tobacco use', 'smoking', 'nicotine', 'cigarette'],
        'nicotine': ['nicotine exposure', 'smoking', 'tobacco', 'cigarette'],
        'cigarette': ['cigarette consumption', 'smoking', 'tobacco', 'nicotine']
      }

obesity_key_terms = {  
        'obesity': ['overweight', 'adiposity', 'morbid obesity', 'excessive body weight',
                    'obesity disorder', 'high BMI', 'body fat accumulation'],
        'overweight': ['obesity', 'excess weight', 'adiposity', 'weight problem',
                       'high body mass', 'increased BMI'],
        'adiposity': ['obesity', 'overweight', 'body fat', 'fat accumulation']
}

covid_key_terms = {
    'covid': ['covid-19', 'coronavirus', 'sars-cov-2', 'pandemic', 'covid infection', 'covid infection', 'respiratory illness']
}

In [5]:
smoking_matches = set()
for terms in smoking_key_terms.values():
    smoking_matches.update(terms)

obesity_matches = set()
for terms in obesity_key_terms.values():
    obesity_matches.update(terms)

covid_matches = set()
for terms in covid_key_terms.values():
    covid_matches.update(terms)

smoking_regex_pattern = '|'.join(re.escape(term) for term in smoking_matches)
filtered_for_smoking = df.filter(pl.col('question').str.contains(smoking_regex_pattern))

obesity_regex_pattern = '|'.join(re.escape(term) for term in obesity_matches)
filtered_for_obesity = df.filter(pl.col('question').str.contains(obesity_regex_pattern))

covid_regex_pattern = '|'.join(re.escape(term) for term in covid_matches)
filtered_for_covid = df.filter(pl.col('question').str.contains(covid_regex_pattern))

In [6]:
print(f"Number of questions which match our key terms, is: {filtered_for_smoking.shape[0]}")

Number of questions which match our key terms, is: 1057


In [7]:
print(f"Number of questions which match our key terms, is: {filtered_for_obesity.shape[0]}")

Number of questions which match our key terms, is: 187


In [8]:
print(f"Number of questions which match our key terms, is: {filtered_for_covid.shape[0]}")

Number of questions which match our key terms, is: 3


# Question1

In [9]:
filtered_for_smoking.select('question')[1, 0]

'A 76-year-old African American man presents to his primary care provider complaining of urinary frequency. He wakes up 3-4 times per night to urinate while he previously only had to wake up once per night. He also complains of post-void dribbling and difficulty initiating a stream of urine. He denies any difficulty maintaining an erection. His past medical history is notable for non-alcoholic fatty liver disease, hypertension, hyperlipidemia, and gout. He takes aspirin, atorvastatin, enalapril, and allopurinol. His family history is notable for prostate cancer in his father and lung cancer in his mother. He has a 15-pack-year smoking history and drinks alcohol socially. On digital rectal exam, his prostate is enlarged, smooth, and non-tender. Which of the following medications is indicated in this patient?'

In [10]:
filtered_for_smoking.select('options')[1, 0]

{'A': 'Hydrochlorothiazide',
 'B': 'Midodrine',
 'C': 'Oxybutynin',
 'D': 'Tamsulosin'}

In [11]:
filtered_for_smoking.select('answer')[1, 0]

'Tamsulosin'

# Question2

In [12]:
filtered_for_obesity.select('question')[2, 0]

'An investigator is studying obesity in mice. Over the course of 2 weeks, mice in the experimental group receive a daily injection with a synthetic analog of an endogenous hormone. Compared to the control group, the hormone-injected mice eat more and gain significantly more weight. Which of the following is the most likely explanation for the observed weight gain in the experimental group?'

In [13]:
filtered_for_obesity.select('options')[2, 0]

{'A': 'Cholecystokinin stimulation of the nucleus tractus solitarius',
 'B': 'Somatostatin inhibition of the anterior pituitary',
 'C': 'Ghrelin stimulation of the lateral hypothalamus',
 'D': 'Glucagon stimulation of hepatocytes'}

In [14]:
filtered_for_obesity.select('answer')[2, 0]

'Ghrelin stimulation of the lateral hypothalamus'

# Question3

In [15]:
filtered_for_smoking.select('question')[3, 0]

"A 16-year-old boy is brought to the physician by his mother because she is worried about his behavior. Yesterday, he was expelled from school for repeatedly skipping classes. Over the past 2 months, he was suspended 3 times for bullying and aggressive behavior towards his peers and teachers. Once, his neighbor found him smoking cigarettes in his backyard. In the past, he consistently maintained an A grade average and had been a regular attendee of youth group events at their local church. The mother first noticed this change in behavior 3 months ago, around the time at which his father moved out after discovering his wife was having an affair. Which of the following defense mechanisms best describes the change in this patient's behavior?"

In [16]:
filtered_for_smoking.select('options')[3, 0]

{'A': 'Acting out',
 'B': 'Projection',
 'C': 'Passive aggression',
 'D': 'Regression'}

In [17]:
filtered_for_smoking.select('answer')[3, 0]

'Acting out'

# Question4

In [18]:
filtered_for_obesity.select('question')[100, 0]

'A 79-year-old man presents to a physician’s office for a routine appointment. He had a myocardial infarction 3 years ago and was started on aspirin, carvedilol, captopril, and high-dose atorvastatin. He denies shortness of breath or cough. He exercises regularly and is on a healthy diet that is good for his heart. The vital signs include: pulse 80/min, respirations 16/min and blood pressure 122/80 mm Hg. The physical examination reveals an overweight male with a body mass index (BMI) of 28 kg/m2. The fasting lipid profile is as follows:\nTotal cholesterol 200 mg/dL\nHigh-density lipoprotein (HDL)  35 mg/dL\nLow-density lipoprotein (LDL) 140 mg/dL\nTriglycerides 120 mg/dL\nWhich of the following drugs should be added to his regimen?'

In [19]:
filtered_for_obesity.select('options')[3, 0]

{'A': 'Topiramate', 'B': 'Exenatide', 'C': 'Pioglitazone', 'D': 'Acarbose'}

In [20]:
filtered_for_obesity.select('answer')[3, 0]

'Exenatide'

# Question5

In [21]:
filtered_for_smoking.select('question')[700, 0]

'A 54-year-old woman comes to the office complaining of increased urinary frequency and dysuria. She is accompanied by her husband. The patient reports that she goes to the bathroom 6-8 times a day. Additionally, she complains of pain at the end of her urinary stream. She denies fever, abdominal pain, vaginal discharge, or hematuria. Her husband adds, “we also don’t have sex as much as we used to.” The patient reports that even when she is “in the mood,” sex is “no longer pleasurable.” She admits feeling guilty about this. The patient’s last menstrual period was 15 months ago. Her medical history is significant for hyperlipidemia and coronary artery disease. She had a non-ST elevation myocardial infarction (NSTEMI) 3 months ago, and she has had multiple urinary tract infections (UTIs) in the past year. She smokes 1 pack of cigarettes a day and denies alcohol or illicit drug use. Body mass index is 32 kg/m^2. Pelvic examination reveals vaginal dryness and vulvar tissue thinning. A urina

In [22]:
filtered_for_smoking.select('options')[700, 0]

{'A': 'Antibiotic prophylaxis',
 'B': 'Topical clobetasol',
 'C': 'Topical estrogen',
 'D': 'Venlafaxine'}

In [23]:
filtered_for_smoking.select('answer')[700, 0]

'Topical estrogen'

# Question6

In [24]:
filtered_for_obesity.select('question')[50, 0]

'A 70-year-old woman is brought to her physician by her daughter who reports that the patient has been increasingly confused and forgetful over the past year. The daughter reports that the patient has difficulty finding words, remembering names, and maintaining a conversation. She has gotten lost twice while driving. Her past medical history is known for obesity, diabetes, and atrial fibrillation. She takes metformin, glyburide, and warfarin. She drinks socially and has a 30 pack-year smoking history. Her family history is notable for Parkinson’s disease in her father and stroke in her mother. A head CT demonstrates sulcal widening and narrowing of the gyri. The physician decides to start the patient on a medication known to inhibit a cell surface glutamate receptor. Which of the following is a downstream effect of this medication?'

In [25]:
filtered_for_obesity.select('options')[50, 0]

{'A': 'Decreased intracellular calcium',
 'B': 'Increased intracellular sodium',
 'C': 'Increased intracellular acetylcholine',
 'D': 'Decreased intracellular acetylcholine'}

In [26]:
filtered_for_obesity.select('answer')[50, 0]

'Decreased intracellular calcium'

# Question7

In [27]:
filtered_for_covid.select('question')[0, 0]

'A 12-year-old boy is brought to his pediatrician with a high fever. He was feeling fatigued yesterday and then developed a high fever overnight that was accompanied by chills and malaise. This morning he also started complaining of headaches and myalgias. He has otherwise been healthy and does not take any medications. He says that his friends came down with the same symptoms last week. He is given oseltamivir and given instructions to rest and stay hydrated. He is also told that this year the disease is particularly infectious and is currently causing a global pandemic. He asks the physician why the same virus can infect people who have already had the disease and is told about a particular property of this virus. Which of the following properties is required for the viral genetic change that permits global pandemics of this virus?'

In [28]:
filtered_for_covid.select('options')[0, 0]

{'A': 'Concurrent infection with 2 viruses',
 'B': 'Crossing over of homologous regions',
 'C': 'One virus that produces a non-functional protein',
 'D': 'Segmented genomic material'}

In [29]:
filtered_for_covid.select('answer')[0, 0]

'Segmented genomic material'

# Question8

In [30]:
filtered_for_covid.select('question')[1, 0]

'A 55-year-old man with a history of fatigue and exertional dyspnea presents to the urgent care clinic following an acute upper respiratory illness. On physical examination, his pulses are bounding, his complexion is very pale, and scleral icterus is apparent. The spleen is moderately enlarged. Oxygen saturation is 79% at rest, with a new oxygen requirement of 9 L by a non-rebreather mask. Laboratory analysis results show a hemoglobin level of 6.8 g/dL. Of the following options, which hypersensitivity reaction does this condition represent?'

In [31]:
filtered_for_covid.select('options')[1, 0]

{'A': 'Type I–anaphylactic hypersensitivity reaction',
 'B': 'Type II–cytotoxic hypersensitivity reaction',
 'C': 'Type III–immune complex-mediated hypersensitivity reaction',
 'D': 'Type II and II–mixed cytotoxic and immune complex hypersensitivity reaction'}

In [32]:
filtered_for_covid.select('answer')[1, 0]

'Type II–cytotoxic hypersensitivity reaction'

# Question9

In [33]:
filtered_for_covid.select('question')[2, 0]

'A homeless 45-year-old man presents to the emergency room in December complaining of malaise, body aches, chills, and fever. He reports that his symptoms started 4 days ago. His myalgias and chills have begun to resolve, but now he is starting to develop a dry cough, dyspnea, and a sore throat. He does not have a primary care provider and has not had any vaccinations in over 2 decades. He receives medical care from the emergency room whenever he is feeling ill. His temperature is 103°F (39.4°C), blood pressure is 130/70 mmHg, pulse is 115/min, and respirations are 22/min. On exam, he appears fatigued with mildly increased work of breathing. A chest radiograph is negative. A nasopharyngeal viral culture is positive for an orthomyxovirus. Upon further review of the patient’s medical record, he was diagnosed with the same condition 1 year ago in November. Which of the following mechanisms is responsible for pandemics of this patient’s disease?'

In [34]:
filtered_for_covid.select('options')[2, 0]

{'A': 'Complementation',
 'B': 'Reassortment',
 'C': 'Recombination',
 'D': 'Transduction'}

In [35]:
filtered_for_covid.select('answer')[2, 0]

'Reassortment'