# Loading the datasets and parsing the xml files

In [67]:
# %%writefile generate_training_data.py
from skimpy import skim
import pandas as pd
import os
import xml.etree.ElementTree as ET
import re

# Silence pandas' chained-assignment warnings such as SettingWithCopyWarning
pd.options.mode.chained_assignment = None

# Change the current working directory to the script's directory
# os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Dataset locations, relative to the current working directory. They are built
# with os.path.join so the separator matches the host OS (on Windows this
# yields the same ".\dir\file" strings as before; on POSIX it yields "./dir/file").
train1_data = os.path.join(".", "TrainingDatasets", "TREC-2017-LiveQA-Medical-Train-1.xml")
train2_data = os.path.join(".", "TrainingDatasets", "TREC-2017-LiveQA-Medical-Train-2.xml")
test_data = os.path.join(".", "TestDataset", "TREC-2017-LiveQA-Medical-Test.xml")
test_summary_data = os.path.join(".", "TestDataset", "TREC-2017-LiveQA-Medical-Test-Questions-w-summaries.xml")

# Read the XML files. Each of the three names below is rebound from the file
# path to the file's text content; test_summary_data is NOT read here and
# therefore remains a path.
with open(train1_data, 'r', encoding='utf-8') as file:
    train1_data = file.read()

with open(train2_data, 'r', encoding='utf-8') as file:
    train2_data = file.read()

with open(test_data, 'r', encoding='utf-8') as file:
    test_data = file.read()

def parse_xml_training1_data(xml_training1_data):
    """Flatten the first training XML document into one row per answer.

    Args:
        xml_training1_data: XML document as a string. Every ``NLM-QUESTION``
            child of the root is expected to hold a ``SUB-QUESTIONS`` element
            whose ``SUB-QUESTION`` children each carry ``ANNOTATIONS``
            (with ``FOCUS`` and ``TYPE``) and ``ANSWERS``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``question_id``, ``f_ref``,
        ``subject``, ``question``, ``sub_question_id``, ``focus``, ``type``,
        ``answer_id``, ``pair_id`` and ``answer``.

    Raises:
        AttributeError: if ``SUB-QUESTIONS``, ``ANNOTATIONS``, ``FOCUS``,
            ``TYPE`` or ``ANSWERS`` is missing where it is expected.
    """
    rows = []

    for q_node in ET.fromstring(xml_training1_data).findall('NLM-QUESTION'):
        subject_node = q_node.find('SUBJECT')
        message_node = q_node.find('MESSAGE')

        # Values shared by every answer row of this question. A missing
        # SUBJECT/MESSAGE element becomes "", an empty one keeps its None text.
        question_fields = {
            'question_id': q_node.get('questionid'),
            'f_ref': q_node.get('fRef'),
            'subject': "" if subject_node is None else subject_node.text,
            'question': "" if message_node is None else message_node.text,
        }

        for sub_node in q_node.find('SUB-QUESTIONS').findall('SUB-QUESTION'):
            annotations = sub_node.find('ANNOTATIONS')
            sub_fields = {
                'sub_question_id': sub_node.get('subqid'),
                'focus': annotations.find('FOCUS').text,
                'type': annotations.find('TYPE').text,
            }

            for ans_node in sub_node.find('ANSWERS').findall('ANSWER'):
                rows.append({
                    **question_fields,
                    **sub_fields,
                    'answer_id': ans_node.get('answerid'),
                    'pair_id': ans_node.get('pairid'),
                    'answer': ans_node.text,
                })

    return pd.DataFrame(rows)

def parse_xml_training2_data(xml_training2_data):
    """Flatten the second training XML document into one row per answer.

    Args:
        xml_training2_data: XML document as a string. Every ``NLM-QUESTION``
            child of the root must contain ``SUBJECT``, ``MESSAGE`` and
            ``SUB-QUESTIONS``; every ``SUB-QUESTION`` must contain
            ``ANNOTATIONS`` (with ``FOCUS`` and ``TYPE``) and ``ANSWERS``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``question_id``, ``subject``,
        ``question``, ``focus``, ``type`` and ``answer``.

    Raises:
        AttributeError: if any of the required elements is missing.
    """
    records = []

    for q_node in ET.fromstring(xml_training2_data).findall('NLM-QUESTION'):
        # Fields repeated on every answer row that belongs to this question
        question_fields = {
            'question_id': q_node.get('qid'),
            'subject': q_node.find('SUBJECT').text,
            'question': q_node.find('MESSAGE').text,
        }

        for sub_node in q_node.find('SUB-QUESTIONS').findall('SUB-QUESTION'):
            annotations = sub_node.find('ANNOTATIONS')
            focus_text = annotations.find('FOCUS').text
            type_text = annotations.find('TYPE').text

            for ans_node in sub_node.find('ANSWERS').findall('ANSWER'):
                records.append({
                    **question_fields,
                    'focus': focus_text,
                    'type': type_text,
                    'answer': ans_node.text,
                })

    return pd.DataFrame(records)

def parse_xml_test_data(xml_test_data):
    """Parse the test XML document into one row per reference answer.

    Reference answers are read from both ``RefAnswer`` and ``ReferenceAnswer``
    children of ``ReferenceAnswers`` (all ``RefAnswer`` rows of a question
    come first, then its ``ReferenceAnswer`` rows).

    Args:
        xml_test_data: XML document as a string.

    Returns:
        A ``pandas.DataFrame`` with the columns ``question_id``, ``subject``,
        ``question``, ``nist_paraphrase``, ``aid``, ``answer``,
        ``answer_url`` and ``comment``. Questions without a
        ``ReferenceAnswers`` section contribute no rows.
    """

    def raw_text(parent, tag):
        # "" when the element is absent; otherwise its text unchanged
        # (which is None for an empty element).
        node = parent.find(tag)
        return node.text if node is not None else ""

    def stripped_text(parent, tag):
        # "" when the element is absent or empty; otherwise the stripped text.
        # The explicit ``is None`` checks matter: an Element without children
        # is falsy, so a plain truth test would wrongly discard it.
        node = parent.find(tag)
        if node is None or node.text is None:
            return ""
        return node.text.strip()

    root = ET.fromstring(xml_test_data)

    # List to store question and answer data
    data = []

    for question in root.findall('NLM-QUESTION'):
        qid = question.get('qid')

        # SUBJECT/MESSAGE live under Original-Question when present,
        # otherwise directly under the question element
        original_question = question.find('Original-Question')
        if original_question is None:
            original_question = question

        subject = raw_text(original_question, 'SUBJECT')
        message = raw_text(original_question, 'MESSAGE')
        nist_paraphrase = raw_text(question, 'NIST-PARAPHRASE')

        answers_section = question.find('ReferenceAnswers')
        if answers_section is None:
            continue

        # The answer elements appear under two slightly different tag names
        ref_answers = answers_section.findall('RefAnswer') + answers_section.findall('ReferenceAnswer')

        for ref_answer in ref_answers:
            data.append({
                'question_id': qid,
                'subject': subject,
                'question': message,
                'nist_paraphrase': nist_paraphrase,
                'aid': ref_answer.get('aid'),
                'answer': stripped_text(ref_answer, 'ANSWER'),
                'answer_url': stripped_text(ref_answer, 'AnswerURL'),
                'comment': stripped_text(ref_answer, 'COMMENT'),
            })

    return pd.DataFrame(data)

def parse_xml_test_summary_data(xml_test_summary_data):
    """Parse the test-questions-with-summaries XML into one row per question.

    Args:
        xml_test_summary_data: File name or file object accepted by
            ``xml.etree.ElementTree.parse`` (unlike the other parsers in this
            file, this one does not take the XML text itself).

    Returns:
        A ``pandas.DataFrame`` with the columns ``qid``, ``subject``,
        ``question``, ``nist_paraphrase``, ``summary``, ``annotations`` and
        ``reference_answers``. ``annotations`` is a list of dicts with the
        keys ``type``, ``id``, ``category`` and ``text``;
        ``reference_answers`` is a list of dicts with the keys ``aid``,
        ``answer``, ``answer_url`` and ``comment``.
    """

    def raw_text(parent, tag):
        # "" when the element is absent; otherwise its text unchanged
        # (which is None for an empty element).
        node = parent.find(tag)
        return node.text if node is not None else ""

    def stripped_text(parent, tag):
        # "" when the element is absent or empty; otherwise the stripped text.
        # The explicit ``is None`` checks matter: an Element without children
        # is falsy, so a plain truth test would wrongly discard it.
        node = parent.find(tag)
        if node is None or node.text is None:
            return ""
        return node.text.strip()

    root = ET.parse(xml_test_summary_data).getroot()

    # Initialize an empty list to store the parsed data
    parsed_data = []

    # Iterate through each question in the XML
    for question in root.findall('NLM-QUESTION'):
        # SUBJECT/MESSAGE live under Original-Question when present,
        # otherwise directly under the question element
        original_question = question.find('Original-Question')
        if original_question is None:
            original_question = question

        # Extract annotations: one dict per child of ANNOTATIONS. The id and
        # category come from whichever of the tag-specific attributes is set.
        annotations = []
        annotation_section = question.find('ANNOTATIONS')
        if annotation_section is not None:
            for annotation in annotation_section:
                annotations.append({
                    'type': annotation.tag,
                    'id': annotation.get('fid') or annotation.get('tid') or annotation.get('kid'),
                    'category': annotation.get('fcategory') or annotation.get('kcategory') or '',
                    'text': annotation.text or ''
                })

        # Extract reference answers (two slightly different tag names occur)
        reference_answers = []
        reference_answers_section = question.find('ReferenceAnswers')
        if reference_answers_section is not None:
            ref_nodes = (reference_answers_section.findall('RefAnswer')
                         + reference_answers_section.findall('ReferenceAnswer'))
            for ref_answer in ref_nodes:
                reference_answers.append({
                    'aid': ref_answer.get('aid'),
                    'answer': stripped_text(ref_answer, 'ANSWER'),
                    'answer_url': stripped_text(ref_answer, 'AnswerURL'),
                    'comment': stripped_text(ref_answer, 'COMMENT'),
                })

        # Add parsed question data to the list
        parsed_data.append({
            'qid': question.get('qid'),
            'subject': raw_text(original_question, 'SUBJECT'),
            'question': raw_text(original_question, 'MESSAGE'),
            'nist_paraphrase': raw_text(question, 'NIST-PARAPHRASE'),
            'summary': raw_text(question, 'NLM-Summary'),
            'annotations': annotations,
            'reference_answers': reference_answers
        })

    return pd.DataFrame(parsed_data)

# Parse every source into a DataFrame. The first three arguments hold XML text;
# test_summary_data is handed to the summary parser as-is.
df_train1 = parse_xml_training1_data(train1_data)
df_train2 = parse_xml_training2_data(train2_data)
df_test = parse_xml_test_data(test_data)
df_test_summary = parse_xml_test_summary_data(test_summary_data)

# Join the two training datasets; join='inner' keeps only the columns that
# exist in both frames
df_train = pd.concat([df_train1, df_train2], join='inner', ignore_index=True)

# Save the data to csv. Paths are built with os.path.join so the separator
# matches the host OS (same strings as before on Windows, valid on POSIX too).
df_train1.to_csv(os.path.join(".", "TrainingDatasets", "TREC-2017-LiveQA-Medical-Train-1.csv"), index=False)
df_train2.to_csv(os.path.join(".", "TrainingDatasets", "TREC-2017-LiveQA-Medical-Train-2.csv"), index=False)
df_train.to_csv(os.path.join(".", "TrainingDatasets", "TREC-2017-LiveQA-Medical-Train.csv"), index=False)
df_test.to_csv(os.path.join(".", "TestDataset", "TREC-2017-LiveQA-Medical-Test.csv"), index=False)
df_test_summary.to_csv(os.path.join(".", "TestDataset", "TREC-2017-LiveQA-Medical-Test-Summary.csv"), index=False)

# Report (rows, columns) for the row-per-answer frames and the number of
# questions for the summary frame
print("trains dataset size:", df_train.shape)
print("test dataset size:", df_test.shape)
print("test summary dataset size:", len(df_test_summary))

trains dataset size: (634, 6)
test dataset size: (167, 8)
test summary dataset size: 104


# Inspecting the training and testing datasets

In [68]:
# For every parsed frame: print its title, then show the skim summary and the
# first rows, in the order the frames were built
for title, frame in (
    ("Train Dataset 1", df_train1),
    ("Train Dataset 2", df_train2),
    ("Joined Train Dataset", df_train),
    ("Test Dataset", df_test),
    ("Test Summary Dataset", df_test_summary),
):
    print(title)
    display(skim(frame))
    display(frame.head())

Train Dataset 1


None

Unnamed: 0,question_id,f_ref,subject,question,sub_question_id,focus,type,answer_id,pair_id,answer
0,Q1,11373,,Literature on Cardiac amyloidosis. Please let...,Q1-S1,cardiac amyloidosis,information,Q1-S1-A1,1,Cardiac amyloidosis is a disorder caused by de...
1,Q1,11373,,Literature on Cardiac amyloidosis. Please let...,Q1-S1,cardiac amyloidosis,information,Q1-S1-A2,2,"The term ""amyloidosis"" refers not to a single ..."
2,Q2,1-136434885,treatment options versus migraine types,Migraine seems to be a spectrum of conditions ...,Q2-S1,migraine,treatment,Q2-S1-A1,3,There is no specific cure for migraine headach...
3,Q2,1-136434885,treatment options versus migraine types,Migraine seems to be a spectrum of conditions ...,Q2-S1,migraine,treatment,Q2-S1-A2,4,There is no absolute cure for migraine since i...
4,Q3,1-123453375,,DO I USE PYRIDOXINE TABLETS EVEN IF IM PREGNANT?,Q3-S1,pyridoxine,contraindication,Q3-S1-A1,5,"Before taking pyridoxine, tell your doc..."


Train Dataset 2


None

Unnamed: 0,question_id,subject,question,focus,type,answer
0,1.0,Lyme Disease,12 years ago I was bitten by tick while deer h...,Lyme Disease,Information,Lyme disease tests are used to determine if a ...
1,2.0,Raynauds Syndrome,My sons middle toe turned white after being in...,Raynauds Syndrome,Symptom,Only one finger or toe or parts of one or more...
2,5.0,burn to my wrist,Hello I burnt my wrist 2 days ago and after la...,burn,Treatment,"Before giving first aid, it is important to de..."
3,6.0,treatment of parkinson,I AM HAVING PARKINSON FOR LAST 8 YEARS. SO FAR...,Parkinson,Treatment,you should know that people who have Parkinson...
4,8.0,Periventricular Heterotopia. Scoliosis - pos...,Question in laymen terms: Has any genetic or o...,Periventricular Heterotopia/ Scoliosis,information,Isolated lissencephaly sequence (ILS) is a con...


Joined Train Dataset


None

Unnamed: 0,question_id,subject,question,focus,type,answer
0,Q1,,Literature on Cardiac amyloidosis. Please let...,cardiac amyloidosis,information,Cardiac amyloidosis is a disorder caused by de...
1,Q1,,Literature on Cardiac amyloidosis. Please let...,cardiac amyloidosis,information,"The term ""amyloidosis"" refers not to a single ..."
2,Q2,treatment options versus migraine types,Migraine seems to be a spectrum of conditions ...,migraine,treatment,There is no specific cure for migraine headach...
3,Q2,treatment options versus migraine types,Migraine seems to be a spectrum of conditions ...,migraine,treatment,There is no absolute cure for migraine since i...
4,Q3,,DO I USE PYRIDOXINE TABLETS EVEN IF IM PREGNANT?,pyridoxine,contraindication,"Before taking pyridoxine, tell your doc..."


Test Dataset


None

Unnamed: 0,question_id,subject,question,nist_paraphrase,aid,answer,answer_url,comment
0,TQ1,Noonan syndrome,What are the references with noonan syndrome a...,What is the relationship between Noonan syndro...,TQ1A1,Noonan's syndrome is an eponymic designation t...,https://www.ncbi.nlm.nih.gov/pubmed/765504,
1,TQ1,Noonan syndrome,What are the references with noonan syndrome a...,What is the relationship between Noonan syndro...,TQ1A2,10% of patients with Noonan syndrome have rena...,http://www.thelancet.com/journals/lancet/artic...,
2,TQ1,Noonan syndrome,What are the references with noonan syndrome a...,What is the relationship between Noonan syndro...,TQ1A3,"Genitourinary. Renal abnormalities, generally ...",https://www.ncbi.nlm.nih.gov/books/NBK1124/,
3,TQ2,Gluten information,Re:NDC# 0115-0672-50 Zolmitriptan tabkets 5mg....,Do 5 mg. Zolmitriptan tabkets contain gluten?,TQ2A1,Zolmitriptan tablets are available as 2.5 mg (...,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,A list of ingredients is a relevant answer.
4,TQ3,amphetamine salts 20 mg,are they gluten free\t,Are amphetamine salts of 20 mg dosage gluten f...,TQ3A1,Active Ingredients\n\t\t\t\t\tAmphetamine Aspa...,https://www.cvs.com/drug/amphetamine-salts/ora...,A list of ingredients is relevant


Test Summary Dataset


None

Unnamed: 0,qid,subject,question,nist_paraphrase,summary,annotations,reference_answers
0,TQ1,Noonan syndrome,What are the references with noonan syndrome a...,What is the relationship between Noonan syndro...,What is the relationship between Noonan syndro...,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Pr...","[{'aid': 'TQ1A1', 'answer': 'Noonan's syndrome..."
1,TQ2,Gluten information,Re:NDC# 0115-0672-50 Zolmitriptan tabkets 5mg....,Do 5 mg. Zolmitriptan tabkets contain gluten?,Do Zolmitriptan 5mg tablets manufactured by G...,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Dr...","[{'aid': 'TQ2A1', 'answer': 'Zolmitriptan tabl..."
2,TQ3,amphetamine salts 20 mg,are they gluten free\t,Are amphetamine salts of 20 mg dosage gluten f...,Do amphetamine salts 20mg tablets contain gluten?,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Dr...","[{'aid': 'TQ3A1', 'answer': 'Active Ingredient..."
3,TQ4,vdrl positive,vdrl positive patients please tell me what ar...,What are the treatments and precautions for VD...,What are the treatments and precautions for VD...,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Pr...","[{'aid': 'TQ4A1', 'answer': 'Syphilis  If ..."
4,TQ5,how much glucagon,How much glucose is in my GlucaGen HypoKit ? ...,How much glucagon is in my GlucaGen kit?,How much glucagon is in the GlucaGen HypoKit a...,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Dr...","[{'aid': 'TQ5A1', 'answer': 'GLUCAGEN  glu..."


# Cleaning the training dataset

In [69]:
def clean_string(text):
    """Normalise whitespace and capitalise the first letter of each sentence.

    Args:
        text: String to clean, or ``None``.

    Returns:
        ``None`` when ``text`` is ``None``. Otherwise the text with leading
        and trailing whitespace removed, every run of whitespace (including
        newlines and tabs) collapsed to a single space, and the first
        character of the text and of every piece following ". " upper-cased.
        Empty or whitespace-only input yields ``""``.
    """
    if text is None:
        return None

    # Remove leading/trailing whitespace, then collapse inner whitespace runs
    text = re.sub(r'\s+', ' ', text.strip())

    # Nothing left to capitalise (avoids indexing into an empty string)
    if not text:
        return text

    # Split after each period that is followed by a space; the look-behind
    # keeps the period attached to the sentence it ends
    sentences = re.split(r'(?<=\.) ', text)

    # s[:1] rather than s[0] so an empty piece can never raise IndexError
    return ' '.join(s[:1].upper() + s[1:] for s in sentences)

# Drop the rows where question is blank in df_train, which represents 0.32% of the data.
# .copy() makes df_train_clean an independent frame, so the column assignments
# below never write into a slice of df_train.
df_train_clean = df_train.dropna(subset=['question']).copy()

# Normalise whitespace and sentence capitalisation in every text column
# (question_id is left untouched)
for column in ['subject', 'question', 'focus', 'type', 'answer']:
    df_train_clean[column] = df_train_clean[column].apply(clean_string)

# Change 'Diagnose_me' for "Diagnosis" in the type column to form the final 23 categories
df_train_clean['type'] = df_train_clean['type'].apply(lambda x: 'Diagnosis' if x == 'Diagnose_me' else x)

# Show per-column count / unique / top / freq and the skim summary
print("Train Dataset Clean")
display(df_train_clean.describe().T)
display(skim(df_train_clean))


Train Dataset Clean


Unnamed: 0,count,unique,top,freq
question_id,632,444,Q63,13
subject,462,290,ClinicalTrials.gov - Question - general inform...,15
question,632,420,I have a list of questions about Tay sachs dis...,13
focus,632,389,Stroke,12
type,632,23,Treatment,296
answer,632,605,CHANGING YOUR LIFESTYLE An active lifestyle an...,5


None

# Cleaning the test dataset for inspection

In [72]:
# Baseline statistics of the raw test frame, for comparison with the cleaned one
print("TEST BEFORE CLEANING")
display(df_test.describe().T)
display(skim(df_test))

# Work on a copy and run clean_string over the free-text columns
df_test_clean = df_test.copy()
for column in ('subject', 'question', 'nist_paraphrase', 'answer'):
    df_test_clean[column] = df_test_clean[column].apply(clean_string)

# Same treatment for the test summary frame (it has 'summary' instead of 'answer')
df_test_summary_clean = df_test_summary.copy()
for column in ('subject', 'question', 'nist_paraphrase', 'summary'):
    df_test_summary_clean[column] = df_test_summary_clean[column].apply(clean_string)

# Inspect both cleaned frames
for banner, frame in (
    ("TEST AFTER CLEANING", df_test_clean),
    ("TEST SUMMARY AFTER CLEANING", df_test_summary_clean),
):
    print(banner)
    display(frame.describe().T)
    display(skim(frame))


TEST BEFORE CLEANING


Unnamed: 0,count,unique,top,freq
question_id,167,104,TQ9,5
subject,166,94,general health,10
question,167,104,can a streptococcus infection cause an invasiv...,5
nist_paraphrase,163,101,Can a streptococcus infection cause other dise...,5
aid,167,167,TQ1A1,1
answer,167,167,Noonan's syndrome is an eponymic designation t...,1
answer_url,167,167,https://www.ncbi.nlm.nih.gov/pubmed/765504,1
comment,167,119,,43


None

TEST AFTER CLEANING


Unnamed: 0,count,unique,top,freq
question_id,167,104,TQ9,5
subject,166,91,General health,15
question,167,104,Can a streptococcus infection cause an invasiv...,5
nist_paraphrase,163,101,Can a streptococcus infection cause other dise...,5
aid,167,167,TQ1A1,1
answer,167,166,An electrical injury is damage to the skin or ...,2
answer_url,167,167,https://www.ncbi.nlm.nih.gov/pubmed/765504,1
comment,167,119,,43


None

TEST SUMMARY AFTER CLEANING


Unnamed: 0,count,unique,top,freq
qid,104,104,TQ1,1
subject,103,91,General health,12
question,104,104,What are the references with noonan syndrome a...,1
nist_paraphrase,101,101,What is the relationship between Noonan syndro...,1
summary,104,104,What is the relationship between Noonan syndro...,1
annotations,104,104,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Pr...",1
reference_answers,104,104,"[{'aid': 'TQ1A1', 'answer': 'Noonan's syndrome...",1


None

In [74]:
# Copy frames to the clipboard as tab-separated text without the index.
# NOTE(review): the three to_clipboard calls run back to back, so presumably
# only the last one (df_test_summary_clean) is left on the clipboard once the
# cell finishes - run them one at a time if each frame is needed.
df_test_clean.to_clipboard(index=False, sep='\t')
df_train_clean.to_clipboard(index=False, sep='\t')
# Unique (focus, type) pairs sorted by type. The result is neither assigned
# nor the last expression of the cell, so it is computed and then discarded.
df_train_clean[['focus', 'type']].drop_duplicates().sort_values(by='type')
df_test_summary_clean.to_clipboard(index=False, sep='\t')
# Last expression of the cell: shows the first rows of the UNcleaned summary frame
df_test_summary.head()

Unnamed: 0,qid,subject,question,nist_paraphrase,summary,annotations,reference_answers
0,TQ1,Noonan syndrome,What are the references with noonan syndrome a...,What is the relationship between Noonan syndro...,What is the relationship between Noonan syndro...,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Pr...","[{'aid': 'TQ1A1', 'answer': 'Noonan's syndrome..."
1,TQ2,Gluten information,Re:NDC# 0115-0672-50 Zolmitriptan tabkets 5mg....,Do 5 mg. Zolmitriptan tabkets contain gluten?,Do Zolmitriptan 5mg tablets manufactured by G...,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Dr...","[{'aid': 'TQ2A1', 'answer': 'Zolmitriptan tabl..."
2,TQ3,amphetamine salts 20 mg,are they gluten free\t,Are amphetamine salts of 20 mg dosage gluten f...,Do amphetamine salts 20mg tablets contain gluten?,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Dr...","[{'aid': 'TQ3A1', 'answer': 'Active Ingredient..."
3,TQ4,vdrl positive,vdrl positive patients please tell me what ar...,What are the treatments and precautions for VD...,What are the treatments and precautions for VD...,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Pr...","[{'aid': 'TQ4A1', 'answer': 'Syphilis  If ..."
4,TQ5,how much glucagon,How much glucose is in my GlucaGen HypoKit ? ...,How much glucagon is in my GlucaGen kit?,How much glucagon is in the GlucaGen HypoKit a...,"[{'type': 'FOCUS', 'id': 'F1', 'category': 'Dr...","[{'aid': 'TQ5A1', 'answer': 'GLUCAGEN  glu..."
