In [10]:
import itertools
from xml.etree.ElementTree import ElementTree
import pandas as pd
from tqdm import tqdm


# Extract data from XML and create a DataFrame
xml_files = ["NEJM_data.xml", "BMJ_data.xml", 
             "animals_data.xml", 
             "caserepvetmed_data.xml", 
             "jvetmedsci_data.xml", 
             "frontvetsci_data.xml", 
             "jamanimhospassoc_data.xml", 
             "jsmallanimpract_data.xml", 
             "openvetj_data.xml", 
             "vetmedsci_data.xml", 
             "vetsci_data.xml"]
data_path = "../data-querying/results/"


def _child_text(parent, tag):
    """Return the text of ``parent``'s first ``tag`` child, or None if either is absent."""
    node = parent.find(tag) if parent is not None else None
    return node.text if node is not None else None


def _extract_record(rec):
    """Build one row dict from a single ``Rec`` element.

    Returns a dict with the keys 'pmid', 'text_types', 'title', 'abstract'
    and 'meshtermlist'. A missing element yields None for the scalar fields
    and an empty list for the list fields, so a row never carries values
    that belong to a different record.
    """
    common = rec.find('.//Common')
    mesh_term_list = rec.find('.//MeshTermList')
    return {
        'pmid': _child_text(common, 'PMID'),
        'text_types': [] if common is None
                      else [elem.text for elem in common.findall('Type')],
        'title': _child_text(common, 'Title'),
        'abstract': _child_text(common, 'Abstract'),
        'meshtermlist': [] if mesh_term_list is None
                        else [term.text for term in mesh_term_list.findall('MeshTerm')],
    }


# Collect every Rec element from all input files into one flat list.
record_sets = list(itertools.chain.from_iterable(
    ElementTree().parse(data_path + xml).findall('.//Rec') for xml in xml_files
))

data_sets = []
incomplete_pmids = []  # PMIDs (possibly None) of records lacking pmid, title or abstract

# Wrapping the iterable lets tqdm advance and close the bar by itself.
for rec in tqdm(record_sets):
    row = _extract_record(rec)
    if row['pmid'] is None or row['title'] is None or row['abstract'] is None:
        incomplete_pmids.append(row['pmid'])
    data_sets.append(row)

if incomplete_pmids:
    print(f"{len(incomplete_pmids)} record(s) are missing a PMID, title or abstract; "
          f"first PMIDs: {incomplete_pmids[:10]}")

df = pd.DataFrame(data_sets)

100%|██████████| 74757/74757 [12:23<00:00, 100.61it/s]  


In [14]:
# One row per (record, publication type) pair, so records can be selected by type.
# A record carrying several types appears once for each of them.
df_exploded = df.explode("text_types")

# Boolean masks over the exploded publication-type column.
is_journal_article = df_exploded["text_types"].eq("Journal Article")
is_case_report = df_exploded["text_types"].eq("Case Reports")

jour_articles_df = df_exploded.loc[is_journal_article]
case_rep_df = df_exploded.loc[is_case_report]

In [24]:
# Mean character length of journal-article titles; missing titles are skipped by .mean().
avg_title = jour_articles_df["title"].str.len().mean()
print(f"Average title length of all journal articles: {avg_title}")

# Mean character length of journal-article abstracts; missing abstracts are skipped.
avg_abstract = jour_articles_df["abstract"].str.len().mean()
print(f"Average abstract length of all journal articles: {avg_abstract}")

# Title and abstract joined by a single space. Missing values are replaced by ""
# so they contribute no characters (stringifying them would count "None"/"nan").
abstract_title = (
    jour_articles_df["title"].fillna("") + " " + jour_articles_df["abstract"].fillna("")
)
avg_abstract_title = abstract_title.str.len().mean()
print(f"Average abstract+title length of all journal articles: {avg_abstract_title}")

Average title length of all journal articles: 99.11309817464233
Average abstract length of all journal articles: 1039.6407526174423
Average abstract+title length of all journal articles: 1139.7538507920847


In [25]:
# Mean character length of case-report titles; missing titles are skipped by .mean().
avg_title = case_rep_df["title"].str.len().mean()
print(f"Average title length of all case reports: {avg_title}")

# Mean character length of case-report abstracts; missing abstracts are skipped.
avg_abstract = case_rep_df["abstract"].str.len().mean()
print(f"Average abstract length of all case reports: {avg_abstract}")

# Title and abstract joined by a single space. Missing values are replaced by ""
# so they contribute no characters (stringifying them would count "None"/"nan").
abstract_title = (
    case_rep_df["title"].fillna("") + " " + case_rep_df["abstract"].fillna("")
)
avg_abstract_title = abstract_title.str.len().mean()
print(f"Average abstract+title length of all case reports: {avg_abstract_title}")

Average title length of all case reports: 85.38586645286908
Average abstract length of all case reports: 824.8643148932921
Average abstract+title length of all case reports: 911.2501813461612


In [26]:
# Mean character length of titles over every record; missing titles are skipped by .mean().
avg_title = df["title"].str.len().mean()
print(f"Average title length of all text types: {avg_title}")

# Mean character length of abstracts over every record; missing abstracts are skipped.
avg_abstract = df["abstract"].str.len().mean()
print(f"Average abstract length of all text types: {avg_abstract}")

# Title and abstract joined by a single space. Missing values are replaced by ""
# so they contribute no characters (stringifying them would count "None"/"nan").
abstract_title = df["title"].fillna("") + " " + df["abstract"].fillna("")
avg_abstract_title = abstract_title.str.len().mean()
print(f"Average abstract+title length of all text types: {avg_abstract_title}")

Average title length of all text types: 99.15157109033268
Average abstract length of all text types: 1037.1586741040974
Average abstract+title length of all text types: 1137.3102451944299
