In [1]:
import pickle
import os
from xml.etree import ElementTree as ET

import pandas as pd

In [2]:
# Load the MeSH tree file: a headerless, ';'-separated table, so the columns
# get the integer labels 0 and 1.
mesh_tree_df = pd.read_csv(
    'data/mesh/mtrees2018.bin',
    sep=';',
    header=None,
)

In [3]:
# Dimensions of the loaded table as (rows, columns).
mesh_tree_df.shape

(58744, 2)

In [4]:
# Preview the first rows: column 0 holds descriptor names, column 1 their tree numbers.
mesh_tree_df.head()

Unnamed: 0,0,1
0,Body Regions,A01
1,Anatomic Landmarks,A01.111
2,Breast,A01.236
3,"Mammary Glands, Human",A01.236.249
4,Nipples,A01.236.500


In [5]:
# Lookup from descriptor name (index) to tree number (value). The index is
# built straight from column 0 without de-duplication, so a name that occurs
# on several rows keeps one entry per row.
mesh_tree_sr = pd.Series(
    data=mesh_tree_df[1].values,
    index=mesh_tree_df[0].values,
)

In [6]:
# Preview the descriptor name -> tree number Series.
mesh_tree_sr.head()

Body Regions                     A01
Anatomic Landmarks           A01.111
Breast                       A01.236
Mammary Glands, Human    A01.236.249
Nipples                  A01.236.500
dtype: object

MeSH headings (looked up at https://meshb.nlm.nih.gov/) for each quoted term, with their tree numbers in brackets:
* "psychiatry" or "psychology"
    * Psychiatry and Psychology [F]
* "psychology"
    * Symbolism [K01.400.899; K01.752.798]
    * Religion and Psychology [F02.880; K01.844.664]
    * Psychology, Medical [F04.096.628.808; H02.720]
    * Behavioral Medicine [F04.096.080; H02.403.090]
    * Biofeedback, Psychology [E02.190.525.123; F02.830.131; F04.754.137.301; F04.754.308.500]
    * Existentialism [F02.739.418; K01.752.304]
    * Ergonomics [F02.784.412; J01.293.556]
    * Neuropsychology [F04.096.795.600; H01.158.782.795.110]
    * Psychophysiology [E02.190.525.812; F02.830; F04.096.795; H01.158.782.795]
    * Parapsychology [F02.550; F04.096.462; H01.770.644.364]
    * Signal Detection, Psychological [E01.370.685.814; E05.796.908; F02.463.593.257.800; F02.463.593.710.725; F04.096.753.814; F04.669.908]
* "psychiatry"
    * Preventive Psychiatry [F04.096.544.215.508; H02.403.690.150.580; H02.403.720.750.550]
    * Diagnosis, Dual (Psychiatry) [E01.190]
    * Biological Psychiatry [F04.096.544.090; H02.403.690.100]
    * Child Psychiatry [F04.096.544.193; H02.403.690.130]
    * Neuropsychiatry [F04.096.544.504; H02.403.690.754]
    * Geriatric Psychiatry [F04.096.544.380; H02.403.690.260]
    * Military Psychiatry [F04.096.544.480; H02.403.690.508]
    * Psychiatry [F04.096.544; H02.403.690]
    * Psychiatry in Literature [K01.517.584.500]
    * Addiction Medicine [H02.403.007]
    * Adolescent Psychiatry [F04.096.544.065; H02.403.690.080]
    * Forensic Psychiatry [F04.096.544.335; H02.403.690.208; I01.198.780.937.469; I01.880.604.583.310; N03.706.535.351]
    * Community Psychiatry [F04.096.544.215; H02.403.690.150]
* "philosophy"
    * Philosophy [K01.752]
* "neuroscience"
    * Cognitive Neuroscience [F04.096.628.255.500; H01.158.610.030]
    * Neurosciences [H01.158.610]
    * Neuroscience Nursing [H02.478.676.542; N02.421.533.505]

In [7]:
# MeSH tree-number prefixes used to select articles. Matching is done by
# string prefix (see any_starts_with_aux), so each listed number also covers
# every tree number beneath it; 'F' alone covers the whole F category.
# The commented-out entries are descendants of a listed prefix and are
# therefore redundant; they are kept only as a record of what was considered.
my_mesh_headings = [
    'E01.190',
    'E01.370.685.814',
    'E02.190.525.123',
    'E02.190.525.812',
    'E05.796.908',
    'F',
    'H01.158.610',
#     'H01.158.610.030',
    'H01.158.782.795',
#     'H01.158.782.795.110',
    'H01.770.644.364',
    'H02.403.007',
    'H02.403.090',
    'H02.403.690',
#     'H02.403.690.080',
#     'H02.403.690.100',
#     'H02.403.690.130',
#     'H02.403.690.150',
#     'H02.403.690.150.580',
#     'H02.403.690.208',
#     'H02.403.690.260',
#     'H02.403.690.508',
#     'H02.403.690.754',
    'H02.403.720.750.550',
    'H02.478.676.542',
    'H02.720',
    'I01.198.780.937.469',
    'I01.880.604.583.310',
    'J01.293.556',
    'K01.400.899',
    'K01.517.584.500',
    'K01.752',
#     'K01.752.304',
#     'K01.752.798',
    'K01.844.664',
    'N02.421.533.505',
    'N03.706.535.351',
]

In [None]:
def any_starts_with_aux(s):
    """Return True if `s` begins with at least one entry of `my_mesh_headings`.

    The comparison is a plain string-prefix test against the module-level
    list, checked in list order; the first hit ends the search.
    """
    for prefix in my_mesh_headings:
        if s.startswith(prefix):
            return True
    return False

def any_starts_with(ls):
    """Return True if any string in `ls` passes `any_starts_with_aux`.

    An empty iterable yields False.
    """
    return any(any_starts_with_aux(tree_number) for tree_number in ls)

In [None]:
%%time

# Scan every file under data/src/ and collect the PMID of each PubmedArticle
# that carries at least one MeSH descriptor whose tree number starts with one
# of the prefixes in `my_mesh_headings`.
#
# Lookup problems are recorded as (code, filename, descriptor name):
#   code 1 - the lookup in `mesh_tree_sr` returned neither a str nor a Series
#   code 2 - the descriptor name is not present in `mesh_tree_sr`
error_set = set()

article_ids = []
for filename in sorted(os.listdir('data/src/')):
    print(filename)
    filepath = os.path.join('data/src/', filename)
    root = ET.parse(filepath).getroot()
    for a in root.findall('PubmedArticle'):
        mesh_names = [
            e.text
            for e in a.findall('MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName')
        ]
        # Track the match per article so that an article tagged with several
        # matching descriptors is recorded once, not once per descriptor.
        article_matches = False
        for mesh_name in mesh_names:
            if mesh_name in mesh_tree_sr:
                mesh_heading = mesh_tree_sr[mesh_name]
                if isinstance(mesh_heading, str):
                    # The name maps to a single tree number.
                    mesh_headings = [mesh_heading]
                elif isinstance(mesh_heading, pd.Series):
                    # The name occurs several times in the index, so the
                    # lookup returns all of its tree numbers.
                    mesh_headings = list(mesh_heading)
                else:
                    mesh_headings = []
                    error_set.add((1, filename, mesh_name))
                # Keep iterating after a match so that error_set still sees
                # every descriptor, but skip the prefix test once it is moot.
                if not article_matches and any_starts_with(mesh_headings):
                    article_matches = True
            else:
                error_set.add((2, filename, mesh_name))
        if article_matches:
            pmid = a.find('MedlineCitation/PMID').text
            article_ids.append(pmid)
        # Drop the processed article's contents to release memory early.
        a.clear()
    root.clear()

medline17n0001.xml


In [None]:
# Number of distinct (code, filename, descriptor name) lookup problems recorded.
len(error_set)

In [None]:
# Number of PMIDs collected in article_ids.
len(article_ids)

In [None]:
# Persist the collected PMIDs so later steps can load them without re-scanning.
with open('data/article_ids_full_2.pickle', 'wb') as pickle_file:
    pickle.dump(article_ids, pickle_file)