In [1]:
import pickle
import os
from xml.etree import ElementTree as ET

import pandas as pd

In [2]:
# Load the MeSH tree table. The file is ';'-separated and has no header row,
# so pandas labels the two columns 0 and 1.
mesh_tree_df = pd.read_csv(
    'data/mesh/mtrees2018.bin',
    sep=';',
    header=None,
)

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded table.
mesh_tree_df.shape

(58744, 2)

In [4]:
# Preview the first rows: column 0 is the heading name, column 1 its tree number.
mesh_tree_df.head()

Unnamed: 0,0,1
0,Body Regions,A01
1,Anatomic Landmarks,A01.111
2,Breast,A01.236
3,"Mammary Glands, Human",A01.236.249
4,Nipples,A01.236.500


In [5]:
# Lookup table: heading name (index) -> tree number (value).
# NOTE(review): a heading can carry several tree numbers, so the index is
# presumably not unique and a lookup may return either a str or a Series.
mesh_tree_sr = pd.Series(
    data=mesh_tree_df[1].values,
    index=mesh_tree_df[0].values,
)

In [6]:
# Preview the name -> tree number lookup built above.
mesh_tree_sr.head()

Body Regions                     A01
Anatomic Landmarks           A01.111
Breast                       A01.236
Mammary Glands, Human    A01.236.249
Nipples                  A01.236.500
dtype: object

MeSH headings used to select articles, grouped by topic (tree numbers in brackets; browse at https://meshb.nlm.nih.gov/)
* "psychiatry" or "psychology"
    * Psychiatry and Psychology [F]
    * EXCEPTIONS (headings under F that are excluded from the selection):
        * Religion and Psychology [F02.880; K01.844.664]
        * Biofeedback, Psychology [E02.190.525.123; F02.830.131; F04.754.137.301; F04.754.308.500]
        * Ergonomics [F02.784.412; J01.293.556]
        * Parapsychology [F02.550; F04.096.462; H01.770.644.364]
        * Signal Detection, Psychological [E01.370.685.814; E05.796.908; F02.463.593.257.800; F02.463.593.710.725; F04.096.753.814; F04.669.908]
* "philosophy"
    * Existentialism [K01.752.304; F02.739.418]
    * Holistic Health [K01.752.667.710; E02.190.321; N01.400.350]
    * Symbolism [K01.400.899; K01.752.798]
* "neuroscience"
    * Neurosciences [H01.158.610]

In [8]:
# Tree-number prefixes that select an article, grouped by the MeSH heading
# they come from. sorted() keeps the final list in lexicographic order.
included_mesh_headings = sorted([
    # Psychiatry and Psychology (the whole F branch)
    'F',
    # Neurosciences
    'H01.158.610',
    # Existentialism (its F02 number is already covered by 'F')
    'K01.752.304',
    # Holistic Health
    'E02.190.321', 'K01.752.667.710', 'N01.400.350',
    # Symbolism
    'K01.400.899', 'K01.752.798',
])

# Tree-number prefixes that veto a selection. Only the F-branch numbers of
# each exception are listed here.
excluded_mesh_headings = sorted([
    # Religion and Psychology
    'F02.880',
    # Biofeedback, Psychology
    'F02.830.131', 'F04.754.137.301', 'F04.754.308.500',
    # Ergonomics
    'F02.784.412',
    # Parapsychology
    'F02.550', 'F04.096.462',
    # Signal Detection, Psychological
    'F02.463.593.257.800', 'F02.463.593.710.725',
    'F04.096.753.814', 'F04.669.908',
])

In [9]:
def any_starts_with(ls1, ls2):
    """Return True if any string in ``ls1`` starts with any prefix in ``ls2``.

    Args:
        ls1: Iterable of strings to test.
        ls2: Iterable of prefix strings.

    Returns:
        bool: True if at least one (string, prefix) pair matches; False
        otherwise, including when either argument is empty.
    """
    # str.startswith accepts a tuple of prefixes, and any() stops at the first
    # match, so the scan no longer visits every pair after a hit.
    prefixes = tuple(ls2)
    return any(x.startswith(prefixes) for x in ls1)

def is_selected_mesh_heading(ls):
    """Decide whether a list of tree numbers selects an article.

    Args:
        ls: List of MeSH tree-number strings.

    Returns:
        bool: True when at least one entry starts with a prefix from
        ``included_mesh_headings`` and no entry starts with a prefix from
        ``excluded_mesh_headings``.
    """
    if not any_starts_with(ls, included_mesh_headings):
        return False
    # An excluded prefix vetoes the whole list, even if another entry matched.
    return not any_starts_with(ls, excluded_mesh_headings)

In [10]:
%%time

# Scan a batch of source XML files and collect the PMID of every article that
# has at least one selected MeSH descriptor.
#
# error_set records (code, filename, mesh_name) tuples:
#   1 -> the lookup returned something that is neither a str nor a Series
#   2 -> the descriptor name is not in the MeSH tree lookup
error_set = set()
article_ids = []

# NOTE(review): the slice bounds are hard-coded; presumably each run of this
# notebook covers one batch of files -- confirm before re-running.
for filename in sorted(os.listdir('data/src/'))[451:600]:
    print(filename)
    root = ET.parse('data/src/' + filename).getroot()
    for article in root.findall('PubmedArticle'):
        descriptors = article.findall('MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName')
        for mesh_name in [d.text for d in descriptors]:
            if mesh_name not in mesh_tree_sr:
                error_set.add((2, filename, mesh_name))
                continue

            # One matching row gives a str, several rows give a Series.
            looked_up = mesh_tree_sr[mesh_name]
            if isinstance(looked_up, pd.Series):
                mesh_headings = list(looked_up)
            elif isinstance(looked_up, str):
                mesh_headings = [looked_up]
            else:
                mesh_headings = []
                error_set.add((1, filename, mesh_name))

            if is_selected_mesh_heading(mesh_headings):
                article_ids.append(article.find('MedlineCitation/PMID').text)
                # One selected descriptor is enough; move to the next article.
                break
        # Drop the article's subtree once it has been processed.
        article.clear()
    root.clear()

medline17n0001.xml
medline17n0002.xml
medline17n0003.xml
CPU times: user 21min 27s, sys: 2.14 s, total: 21min 29s
Wall time: 21min 36s


In [12]:
# Number of PMIDs collected by the scan above.
len(article_ids)

8973

In [11]:
# Number of distinct (code, filename, mesh_name) problems recorded by the scan.
len(error_set)

51

In [13]:
# Persist the collected PMIDs.
with open('data/out/article_ids_4.pickle', mode='wb') as out_file:
    pickle.dump(article_ids, out_file)

In [14]:
# Persist the recorded problems; the set is converted to a list before pickling.
with open('data/out/errors_4.pickle', mode='wb') as out_file:
    pickle.dump(list(error_set), out_file)