In [2]:
import pandas as pd
import random

In [3]:
target_df = pd.read_csv('process_data/target_data.csv')

In [4]:
target_df.columns

Index(['bcr_patient_uuid', 'pharmaceutical_therapy_drug_name',
       'pharmaceutical_therapy_type', 'treatment_outcome_at_tcga_followup',
       'vital_status_x', 'tumor_status_x', 'followup_reason',
       'treatment_outcome_first_course_x', 'death_days_to_x', 'gender',
       'age_at_initial_pathologic_diagnosis', 'icd_10', 'tumor_tissue_site',
       'death_days_to_x.1'],
      dtype='object')

In [7]:
target_df = target_df.drop_duplicates()

In [8]:
target_df.shape

(1887, 14)

In [17]:
target_df['vital_status_x'].unique()

array(['Dead', 'Alive', nan], dtype=object)

In [23]:
def generate_drug_vital_questions(df, qa_pairs):
    for index, row in df.iterrows():
        patient_id = row['bcr_patient_uuid']
        drug = row['pharmaceutical_therapy_drug_name']
        therapy = row['pharmaceutical_therapy_type']
        vital_status = row['vital_status_x']
        if pd.notna(drug) and pd.notna(therapy):
            question = f"What is the vital status of the patient treated with {drug} and therapy {therapy}?"
            answer = f'Given the features present in this image, the patient will most likely {str(vital_status).lower()}'
            qa_pairs.append((patient_id, question, answer))
    return qa_pairs

In [33]:
def generate_drug_questions(df, qa_pairs):
    for index, row in df.iterrows():
        patient_id = row['bcr_patient_uuid']
        drug = row['pharmaceutical_therapy_drug_name']
        therapy = row['pharmaceutical_therapy_type']
        age = row['age_at_initial_pathologic_diagnosis']
        gender = row['gender']
        vital_status = row['vital_status_x']
        if pd.notna(drug):
            question = f"What drug was prescribed to the patient?"
            answer = f'Given the features present in this image, the patient will most likely {str(drug).lower()}'
            qa_pairs.append((patient_id, question, answer))
        if pd.notna(therapy):
            question = "What therapy is prescribed to the patient?"
            answer = f'Given the features present in this image, the patient will most likely respond best to {str(therapy).lower()}'
            qa_pairs.append((patient_id, question, answer))
        if pd.notna(drug) and pd.notna(age) and vital_status == 'Alive':
            question = f"What therapy and drug will be best prescribed to the patient at {age} and {str(gender).lower()} ?"
            answer = f'Given the features present in this image, the patient will most likely respond best to {str(drug).lower()} and {str(therapy).lower()}'
            qa_pairs.append((patient_id, question, answer))

    return qa_pairs




In [38]:
def generate_drug_multiple_choice_questions(df, qa_pairs):
    for index, row in df.iterrows():
        patient_id = row['bcr_patient_uuid']
        drug = row['pharmaceutical_therapy_drug_name']
        therapy = row['pharmaceutical_therapy_type']

        # Multiple-choice question for drug
        if pd.notna(drug):
            drug_choices = df['pharmaceutical_therapy_drug_name'].dropna().unique().tolist()
            drug_distractors = random.sample([d for d in drug_choices if d != drug], min(2, len(drug_choices)-1))
            choices = [drug] + drug_distractors + ["None"]
            random.shuffle(choices)

            mc_question = f"What drug was prescribed to the patient? Choices: {', '.join(choices)}"
            mc_answer = f"The prescribed drug for the patient is {drug}."
            qa_pairs.append((patient_id, mc_question, {"choices": choices, "answer": mc_answer}))

        # Multiple-choice question when no drug is prescribed
        if pd.isna(drug):
            drug_choices = df['pharmaceutical_therapy_drug_name'].dropna().unique().tolist()
            drug_distractors = random.sample(drug_choices, min(2, len(drug_choices)))
            choices = drug_distractors + ["None"]
            random.shuffle(choices)

            mc_question = f"What drug was prescribed to the patient? Choices: {', '.join(choices)}"
            mc_answer = "No drug was prescribed to the patient."
            qa_pairs.append((patient_id, mc_question, {"choices": choices, "answer": mc_answer}))

        # Multiple-choice question for therapy
        if pd.notna(therapy):
            therapy_choices = df['pharmaceutical_therapy_type'].dropna().unique().tolist()
            therapy_distractors = random.sample([t for t in therapy_choices if t != therapy], min(2, len(therapy_choices)-1))
            choices = [therapy] + therapy_distractors + ["None"]
            random.shuffle(choices)

            mc_question = f"What type of therapy is prescribed to the patient? Choices: {', '.join(choices)}"
            mc_answer = f"The prescribed therapy for the patient is {therapy}."
            qa_pairs.append((patient_id, mc_question, {"choices": choices, "answer": mc_answer}))

        # Combination multiple-choice question for drug and therapy
        if pd.notna(drug) and pd.notna(therapy):
            drug_choices = df['pharmaceutical_therapy_drug_name'].dropna().unique().tolist()
            therapy_choices = df['pharmaceutical_therapy_type'].dropna().unique().tolist()
            combo_answer = f"{drug} and {therapy}"

            distractor_combos = [
                                    f"{random.choice(drug_choices)} and {random.choice(therapy_choices)}" for _ in range(2)
                                ] + ["None"]
            choices = [combo_answer] + distractor_combos
            random.shuffle(choices)

            mc_question = f"Which combination of drug and therapy is prescribed to the patient? Choices: {', '.join(choices)}"
            mc_answer = f"The prescribed treatment includes the drug {drug} along with {therapy}."
            qa_pairs.append((patient_id, mc_question, {"choices": choices, "answer": mc_answer}))

    return qa_pairs

In [26]:
def generate_tumor_questions(df, qa_pairs):
    for index, row in df.iterrows():
        patient_id = row['bcr_patient_uuid']
        tumor_location = row['tumor_tissue_site']

        if pd.notna(tumor_location):
            question = f"What site is this tumor located?"
            answer = tumor_location
            qa_pairs.append((patient_id, question, answer))

    return qa_pairs

In [42]:
def save_qa_pairs(qa_pairs_df, output_file='drug_qa_pairs.csv'):
    qa_pairs_df.to_csv(output_file, index=False)
    print(f"Generated {len(qa_pairs)} question-answer pairs")

In [34]:
qa_pairs = []

In [35]:
qa_pairs = generate_tumor_questions(target_df, qa_pairs)

In [36]:
qa_pairs = generate_drug_vital_questions(target_df, qa_pairs)

In [37]:
qa_pairs = generate_drug_questions(target_df, qa_pairs)

In [39]:
qa_pairs = generate_drug_multiple_choice_questions(target_df, qa_pairs)

In [40]:
df = pd.DataFrame(qa_pairs, columns=['Patient_ID', 'Question', 'Answer'])

In [43]:
save_qa_pairs(df, 'drug_qa_pairs_with_mcq_descriptive.csv')

Generated 13758 question-answer pairs


In [44]:
df.tail()

Unnamed: 0,Patient_ID,Question,Answer
13753,284680B2-F961-402D-8740-E5F9E4FB4A98,What type of therapy is prescribed to the pati...,"{'choices': ['Immunotherapy', 'Hormone Therapy..."
13754,284680B2-F961-402D-8740-E5F9E4FB4A98,Which combination of drug and therapy is presc...,{'choices': ['Cloretazine and Hormone Therapy'...
13755,42156C6A-416E-4832-9C95-CEE6084B0910,What drug was prescribed to the patient? Choic...,"{'choices': ['Temodar', 'IL-13 with Pseudomona..."
13756,42156C6A-416E-4832-9C95-CEE6084B0910,What type of therapy is prescribed to the pati...,"{'choices': ['None', 'Hormone Therapy', 'Ancil..."
13757,42156C6A-416E-4832-9C95-CEE6084B0910,Which combination of drug and therapy is presc...,"{'choices': ['None', 'Imatinib and Targeted Mo..."
