In [3]:
# %%writefile generate_training_data.py
from skimpy import skim
import pandas as pd
import os
import xml.etree.ElementTree as ET

# Change the current working directory to the script's directory
# os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Locations of the TREC-2017 LiveQA medical XML files, relative to the current
# working directory. Built with os.path.join so the separator matches the host
# OS (a literal ".\..." only resolves on Windows).
train1_path = os.path.join(".", "TrainingDatasets", "TREC-2017-LiveQA-Medical-Train-1.xml")
train2_path = os.path.join(".", "TrainingDatasets", "TREC-2017-LiveQA-Medical-Train-2.xml")
test_path = os.path.join(".", "TestDataset", "TREC-2017-LiveQA-Medical-Test.xml")


def _read_xml_text(path):
    """Return the full contents of the UTF-8 text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()


# Raw XML text of each dataset. Paths and contents are kept in separate names
# so a re-run of this cell never tries to open XML text as a path.
train1_data = _read_xml_text(train1_path)
train2_data = _read_xml_text(train2_path)
test_data = _read_xml_text(test_path)

def _child_text(parent, tag, default=""):
    """Return the text of the direct child ``tag`` of ``parent``.

    ``default`` is returned when ``parent`` is None or has no such child.
    A child that exists but carries no text yields None (the value of
    ``Element.text``), matching how SUBJECT/MESSAGE were already handled.
    """
    if parent is None:
        return default
    child = parent.find(tag)
    return child.text if child is not None else default


def parse_xml_training_data(xml_training_data):
    """Flatten a training XML document into one row per answer.

    Parameters
    ----------
    xml_training_data : str
        XML text whose root contains ``NLM-QUESTION`` elements.

    Returns
    -------
    pandas.DataFrame
        Columns: question_id, f_ref, subject, question, sub_question_id,
        focus, type, answer_id, pair_id, answer_text. The columns are present
        even when no answer rows were found.

    Notes
    -----
    Questions without ``SUB-QUESTIONS`` and sub-questions without ``ANSWERS``
    contribute no rows instead of raising ``AttributeError``. A missing
    ``ANNOTATIONS``/``FOCUS``/``TYPE`` element yields an empty string.
    """
    columns = [
        'question_id', 'f_ref', 'subject', 'question', 'sub_question_id',
        'focus', 'type', 'answer_id', 'pair_id', 'answer_text',
    ]
    root = ET.fromstring(xml_training_data)

    questions_data = []
    for question in root.findall('NLM-QUESTION'):
        question_id = question.get('questionid')
        f_ref = question.get('fRef')
        subject = _child_text(question, 'SUBJECT')
        question_text = _child_text(question, 'MESSAGE')

        sub_questions = question.find('SUB-QUESTIONS')
        if sub_questions is None:
            # Nothing to expand for this question.
            continue

        for sub_question in sub_questions.findall('SUB-QUESTION'):
            sub_question_id = sub_question.get('subqid')
            annotations = sub_question.find('ANNOTATIONS')
            focus = _child_text(annotations, 'FOCUS')
            q_type = _child_text(annotations, 'TYPE')

            answers = sub_question.find('ANSWERS')
            if answers is None:
                continue

            for answer in answers.findall('ANSWER'):
                questions_data.append({
                    'question_id': question_id,
                    'f_ref': f_ref,
                    'subject': subject,
                    'question': question_text,
                    'sub_question_id': sub_question_id,
                    'focus': focus,
                    'type': q_type,
                    'answer_id': answer.get('answerid'),
                    'pair_id': answer.get('pairid'),
                    'answer_text': answer.text,
                })

    return pd.DataFrame(questions_data, columns=columns)

def _raw_child_text(parent, tag):
    """Return the unmodified text of child ``tag``, or "" when the child is absent.

    A child that exists but has no text yields None (``Element.text``).
    """
    child = parent.find(tag)
    return child.text if child is not None else ""


def _stripped_child_text(parent, tag):
    """Return the whitespace-stripped text of child ``tag``.

    Yields "" when the child is absent or has no text, so callers never call
    ``.strip()`` on None.
    """
    child = parent.find(tag)
    if child is None or child.text is None:
        return ""
    return child.text.strip()


def parse_xml_test_data(xml_test_data):
    """Flatten a test XML document into one row per reference answer.

    Parameters
    ----------
    xml_test_data : str
        XML text whose root contains ``NLM-QUESTION`` elements.

    Returns
    -------
    pandas.DataFrame
        Columns: qid, subject, message, nist_paraphrase, aid, answer,
        answer_url, comment. The columns are present even when no reference
        answers were found.

    Notes
    -----
    Reference answers appear under two tag names, ``RefAnswer`` and
    ``ReferenceAnswer``; both are read with the same logic. For each question
    all ``RefAnswer`` rows are emitted first, then all ``ReferenceAnswer``
    rows. Questions without a ``ReferenceAnswers`` element contribute no rows.
    """
    columns = [
        'qid', 'subject', 'message', 'nist_paraphrase',
        'aid', 'answer', 'answer_url', 'comment',
    ]
    root = ET.fromstring(xml_test_data)

    # List to store question and answer data
    data = []

    for question in root.findall('NLM-QUESTION'):
        qid = question.get('qid')

        # SUBJECT/MESSAGE live under Original-Question when it exists,
        # otherwise directly under the question element.
        original_question = question.find('Original-Question')
        if original_question is None:
            original_question = question
        subject = _raw_child_text(original_question, 'SUBJECT')
        message = _raw_child_text(original_question, 'MESSAGE')
        nist_paraphrase = _raw_child_text(question, 'NIST-PARAPHRASE')

        reference_answers = question.find('ReferenceAnswers')
        if reference_answers is None:
            continue

        for tag in ('RefAnswer', 'ReferenceAnswer'):
            for ref_answer in reference_answers.findall(tag):
                data.append({
                    'qid': qid,
                    'subject': subject,
                    'message': message,
                    'nist_paraphrase': nist_paraphrase,
                    'aid': ref_answer.get('aid'),
                    'answer': _stripped_child_text(ref_answer, 'ANSWER'),
                    'answer_url': _stripped_child_text(ref_answer, 'AnswerURL'),
                    # Compare against None explicitly: a childless Element is
                    # falsy, so a truthiness test would drop every comment.
                    'comment': _stripped_child_text(ref_answer, 'COMMENT'),
                })

    return pd.DataFrame(data, columns=columns)

# Parse each raw XML document into a flat DataFrame.
df_train1 = parse_xml_training_data(train1_data)
df_train2 = parse_xml_training_data(train2_data)
df_test = parse_xml_test_data(test_data)

# Join the two training datasets
df_train = pd.concat([df_train1, df_train2], ignore_index=True)

# Save the data to csv. Paths are built with os.path.join so the separator
# matches the host OS (a literal ".\..." only resolves on Windows).
output_dir = os.path.join(".", "TrainingDatasets")
df_train1.to_csv(os.path.join(output_dir, "TREC-2017-LiveQA-Medical-Train-1.csv"), index=False)
df_train2.to_csv(os.path.join(output_dir, "TREC-2017-LiveQA-Medical-Train-2.csv"), index=False)
df_train.to_csv(os.path.join(output_dir, "TREC-2017-LiveQA-Medical-Train.csv"), index=False)
# NOTE(review): the test CSV is written next to the training CSVs, not into
# TestDataset where the test XML is read from — confirm this is intended.
df_test.to_csv(os.path.join(output_dir, "TREC-2017-LiveQA-Medical-Test.csv"), index=False)

print("trains dataset size:", df_train.shape)
print("test dataset size:", df_test.shape)


trains dataset size: (634, 10)
test dataset size: (167, 8)


In [5]:
# skim() renders its summary itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output. Call it directly.
print("Train Dataset")
skim(df_train)
print("Test Dataset")
skim(df_test)
# Last expression of the cell: preview the first rows of the test set.
df_test.head()

Train Dataset


None

Test Dataset


None

Unnamed: 0,qid,subject,message,nist_paraphrase,aid,answer,answer_url,comment
0,TQ1,Noonan syndrome,What are the references with noonan syndrome a...,What is the relationship between Noonan syndro...,TQ1A1,Noonan's syndrome is an eponymic designation t...,https://www.ncbi.nlm.nih.gov/pubmed/765504,
1,TQ1,Noonan syndrome,What are the references with noonan syndrome a...,What is the relationship between Noonan syndro...,TQ1A2,10% of patients with Noonan syndrome have rena...,http://www.thelancet.com/journals/lancet/artic...,
2,TQ1,Noonan syndrome,What are the references with noonan syndrome a...,What is the relationship between Noonan syndro...,TQ1A3,"Genitourinary. Renal abnormalities, generally ...",https://www.ncbi.nlm.nih.gov/books/NBK1124/,
3,TQ2,Gluten information,Re:NDC# 0115-0672-50 Zolmitriptan tabkets 5mg....,Do 5 mg. Zolmitriptan tabkets contain gluten?,TQ2A1,Zolmitriptan tablets are available as 2.5 mg (...,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,A list of ingredients is a relevant answer.
4,TQ3,amphetamine salts 20 mg,are they gluten free\t,Are amphetamine salts of 20 mg dosage gluten f...,TQ3A1,Active Ingredients\n\t\t\t\t\tAmphetamine Aspa...,https://www.cvs.com/drug/amphetamine-salts/ora...,A list of ingredients is relevant


In [130]:
# Identifier columns that are dropped from the cleaned training frame.
columns_to_drop = ['question_id', 'f_ref', 'sub_question_id', 'answer_id', 'pair_id']

# Build the cleaned frame in one chain from df_train so re-running this cell
# always gives the same result:
#   * 'type' and 'focus' are capitalised with the vectorised .str accessor,
#     which leaves missing values untouched instead of raising;
#   * 'Diagnose_me' is renamed to 'Diagnosis' in the 'type' column.
df_train_clean = (
    df_train
    .drop(columns=columns_to_drop)
    .assign(
        type=lambda frame: frame['type'].str.capitalize().replace({'Diagnose_me': 'Diagnosis'}),
        focus=lambda frame: frame['focus'].str.capitalize(),
    )
)

# Only the last expression of a cell is rendered automatically, so the
# intermediate inspections are displayed explicitly.
display(df_train_clean.head())
# Get the total number of each type of question
display(df_train_clean['type'].value_counts())
# Only get the rows where the question is None
display(df_train_clean[df_train_clean['question'].isnull()])

# Number of rows per focus, in alphabetical order.
df_train_clean['focus'].value_counts().sort_index()

focus
A knot about the size of a bb in my right forearm    1
Abdominal aortic aneurysm                            1
Abetalipoproteinemia                                 5
Abscess teeth/heart attack                           1
Achalasia                                            1
                                                    ..
Williams syndrome                                    2
Wilson's disease                                     2
Worms coming out my nose and out of my eyes          1
Wpw                                                  2
X-linked congenital stationary night blindness       1
Name: count, Length: 376, dtype: int64