In [14]:
import pandas as pd

In [None]:
# Load the source CSV into `df` and print its structural summary.
source_csv = "../sources/afri_med_qa_15k_v2.4_phase_2_15275.csv"
df = pd.read_csv(source_csv)
df.info()

In [16]:
# Row count per discipline (value_counts lists the most frequent first).
discipline_counts = df["discipline"].value_counts()
discipline_counts

discipline
Medicine                    6986
Non-Health                  1952
Health Research             1036
Other Health Professions     900
Nursing                      831
Pharmacy                     772
Laboratory Sciences          350
Name: count, dtype: int64

In [17]:
# Row count per specialty (value_counts lists the most frequent first).
specialty_counts = df["specialty"].value_counts()
specialty_counts

specialty
Obstetrics_and_Gynecology               824
General_Surgery                         757
Pediatrics                              747
Pathology                               381
Infectious_Disease                      321
Neurology                               310
Psychiatry                              299
Cardiology                              258
Internal_Medicine                       238
Endocrinology                           236
Pulmonary_Medicine                      231
Gastroenterology                        225
Allergy_and_Immunology                  217
Hematology                              211
Ophthalmology                           202
Obstetric                               184
Rheumatology                            171
General                                 169
Nephrology                              163
Orthopedic_Surgery                      161
Otolaryngology                          158
Oncology                                135
Urology               

In [18]:
# Columns carried into the filtered subset; "split" is not selected.
cols = [
    # question metadata
    "question_type", "tier",
    # question text
    "prompt", "question_clean",
    # answer fields
    "answer_options", "correct_answer", "answer_rationale",
]

In [19]:
# Row count per question type (value_counts lists the most frequent first).
question_type_counts = df["question_type"].value_counts()
question_type_counts

question_type
consumer_queries    10000
mcq                  4039
saq                  1236
Name: count, dtype: int64

In [20]:
# Keep expert-tier MCQ/SAQ rows whose specialty mentions "Obstetric"
# (case-insensitive; rows with a missing specialty are excluded).
is_obstetric = df["specialty"].str.contains("Obstetric", case=False, na=False)
is_mcq_or_saq = df["question_type"].isin(["mcq", "saq"])
is_expert = df["tier"] == "expert"

obgyn = df.loc[is_obstetric & is_mcq_or_saq & is_expert, cols].reset_index(drop=True)


In [21]:
# Display the filtered subset built in the previous cell.
obgyn

Unnamed: 0,question_type,tier,prompt,question_clean,answer_options,correct_answer,answer_rationale
0,mcq,expert,,56 years old woman has come to you with the co...,"{""option1"": ""Hysterectomy."", ""option2"": ""Vitam...",option3,
1,mcq,expert,,A newly married girl comes to gynae OPD with h...,"{""option1"": ""Trichomonas vaginalis."", ""option2...",option4,
2,mcq,expert,,28 years old woman with previous history of ha...,"{""option1"": ""Obstetric ultrasound."", ""option2""...",option2,
3,mcq,expert,,A 20-year-old medical student presents with fi...,"{""option1"": ""Polycystic ovarian disease."", ""op...",option1,
4,mcq,expert,,A large cystic ovarian tumour is detected in a...,"{""option1"": ""Torsion."", ""option2"": ""Rupture. ....",option1,
...,...,...,...,...,...,...,...
692,mcq,expert,,Which of the following Is not a screening meth...,"{""option1"": ""HbA1C"", ""option2"": ""Fasting Blood...",option3,
693,mcq,expert,,which of these does not define antenatal cardi...,"{""option1"": ""a normal baseline should be 120-1...",option4,
694,mcq,expert,,Which of the following best describes the use ...,"{""option1"": ""should be used for instrumental d...",option3,
695,mcq,expert,,Elective cervical cerclage is indicated in whi...,"{""option1"": ""Three spontaneous first \u2013 tr...",option3,


In [22]:
# Show the first five rows as a record array so the full cell text is visible.
obgyn.iloc[:5].to_records()

rec.array([(0, 'mcq', 'expert', nan, '56 years old woman has come to you with the complaints of hot flushes irritability, joint pains with lack of sleep. Most appropriate treatment would be: \r\n', '{"option1": "Hysterectomy.", "option2": "Vitamins.", "option3": "Combined oestrogen, progesterone preparations.", "option4": "Phytooestrogens.", "option5": "Selective estrogen receptor modulators (SERMS)."}', 'option3', nan),
           (1, 'mcq', 'expert', nan, 'A newly married girl comes to gynae OPD with history of dysuria, burning, micturition and sore perineum. What is your likely diagnosis: \r\n', '{"option1": "Trichomonas vaginalis.", "option2": "Candida infection.", "option3": "Trauma due to coitus.", "option4": "Honey moon cystitis.", "option5": "Genital herpes."}', 'option4', nan),
           (2, 'mcq', 'expert', nan, "28 years old woman with previous history of having baby with Down's \r\nSyndrome is now 12 weeks pregnant. Which of the following would you suggest to her: \r\nAmni

In [23]:
# Flatten question_clean to a single line: every run of whitespace (including
# "\r\n" pairs and tabs) becomes one space, then both ends are trimmed.
# Replacing "\n" and "\r" separately turned each "\r\n" into two spaces and
# left tabs in place, which is awkward for the tab-separated exports below.
obgyn['question_clean'] = (
    obgyn['question_clean']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [24]:
# Inspect per-column non-null counts for the short-answer (saq) rows only.
saq_mask = obgyn["question_type"] == "saq"
obgyn[saq_mask].info()

<class 'pandas.core.frame.DataFrame'>
Index: 37 entries, 499 to 535
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   question_type     37 non-null     object
 1   tier              37 non-null     object
 2   prompt            0 non-null      object
 3   question_clean    37 non-null     object
 4   answer_options    0 non-null      object
 5   correct_answer    0 non-null      object
 6   answer_rationale  37 non-null     object
dtypes: object(7)
memory usage: 2.3+ KB


In [None]:
# Export the short-answer questions together with their answer_rationale as TSV.
saq = obgyn.loc[obgyn['question_type'] == 'saq', ['question_clean', 'answer_rationale']].copy()

# Collapse every whitespace run (newlines, "\r\n" pairs, tabs) into one space.
# Replacing "\n" and "\r" separately left double spaces for "\r\n" and kept
# embedded tabs, which collide with the tab separator used for the export.
saq['answer_rationale'] = (
    saq['answer_rationale']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

saq.to_csv("../data/obgyn_saq.tsv", sep="\t", index=False)
saq.head(3)

In [None]:
import json

def parse_options(row):
    """Format one MCQ row's options and correct answer(s) using letters.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``answer_options`` (a JSON object string keyed
        ``option1``, ``option2``, ...) and ``correct_answer`` (one option key
        or a comma-separated list of option keys).

    Returns
    -------
    tuple[str, str]
        ``(formatted, answer_letters)``: the options on a single line, e.g.
        ``"A. Hysterectomy. | B. Vitamins."``, and the correct answer(s) as
        letters, e.g. ``"A,C"``. ``answer_letters`` is ``""`` when
        ``correct_answer`` is missing (not a string, e.g. NaN).

    Raises
    ------
    ValueError
        If a key is not of the form ``option<N>`` with ``1 <= N <= 26``.
    """
    def option_letter(key):
        # option1 -> A, option2 -> B, ... The letter is derived from the
        # number, so rows with more than five options no longer fail.
        name = key.strip()
        number = name[len("option"):]
        if not name.startswith("option") or not number.isdigit():
            raise ValueError(f"unexpected option key: {key!r}")
        index = int(number)
        if not 1 <= index <= 26:
            raise ValueError(f"option number out of range A-Z: {key!r}")
        return chr(ord("A") + index - 1)

    opts = json.loads(row['answer_options'])

    # Format options on a single line: "A. Hysterectomy | B. Vitamins | ..."
    formatted = " | ".join(f"{option_letter(k)}. {v}" for k, v in opts.items())

    answer = row['correct_answer']
    if not isinstance(answer, str):
        # Missing answer (NaN/None): keep the options, emit no letters.
        return formatted, ""

    # Handle multiple correct answers: "option1,option3" -> "A,C"
    answer_letters = ",".join(option_letter(o) for o in answer.split(","))

    return formatted, answer_letters

# Build the MCQ export: letter-formatted options plus the correct letter(s).
export_cols = ['question_clean', 'options_formatted', 'correct_letter']

mcq = obgyn.loc[obgyn['question_type'] == 'mcq'].copy()
parsed = mcq.apply(parse_options, axis=1, result_type='expand')
mcq[['options_formatted', 'correct_letter']] = parsed

mcq[export_cols].to_csv("../data/obgyn_mcq.tsv", sep="\t", index=False)
mcq[export_cols].head(3)