The pubmed data is downloaded from https://www.nlm.nih.gov/databases/download/pubmed_medline.html

This file is used to extract titles, abstracts, Year, and MeSH terms from the pubmed data. The data is then used to train a model to predict the MeSH terms for a given title and abstract.

In [2]:
import gzip
import xml.etree.ElementTree as ET

# open the file and save to a dataframe
def parse_pubmed_xml(file_path):
    with gzip.open(file_path, 'rt') as f:
        tree = ET.parse(f)
        root = tree.getroot()

        # create a list to store the data
        data = []

        # iterate through the articles and extract the title, abstract, and journal name
        for article in root.findall('PubmedArticle'):
            pmid_elem = article.find('MedlineCitation/PMID')
            pmid = pmid_elem.text if pmid_elem is not None else None

            title_elem = article.find('MedlineCitation/Article/ArticleTitle')
            title = title_elem.text if title_elem is not None else None

            abstract_elem = article.find('MedlineCitation/Article/Abstract/AbstractText')
            abstract = abstract_elem.text if abstract_elem is not None else None

            year_elem = article.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/Year')
            year = year_elem.text if year_elem is not None else None

            data.append([pmid, title, abstract, year])

        return data


In [151]:
# data latest version: 2021

data1 = parse_pubmed_xml('data/pubmed24n1216.xml.gz')
print(len(data1))

# data2 = parse_pubmed_xml('data/pubmed24n1218.xml.gz')
# print(len(data2))

# data3 = parse_pubmed_xml('data/pubmed24n1216.xml.gz')
# print(len(data3))

data2 = parse_pubmed_xml('data/pubmed24n1084.xml.gz')
print(len(data2))

data3 = parse_pubmed_xml('data/pubmed24n1143.xml.gz')
print(len(data3))

data = data1 + data2 + data3

30000
30000
30000


In [152]:
# remove the rows with the same pmid
data = [list(x) for x in set(tuple(x) for x in data)]

# remove the rows that have no title and abstract
data = [x for x in data if x[1] is not None and x[2] is not None]

# remove the rows that have no year
data = [x for x in data if len(x) > 2 and x[0] is not None]

print(len(data))

76950


In [153]:
# distribution of years
from collections import Counter
years = [d[3] for d in data]
year_counts1 = Counter(years)
year_counts1


Counter({'2023': 26794,
         '2021': 22364,
         '2022': 24876,
         None: 1796,
         '2024': 687,
         '2020': 305,
         '2014': 9,
         '2019': 48,
         '2013': 8,
         '2017': 12,
         '2018': 23,
         '2003': 2,
         '2012': 6,
         '2002': 1,
         '2015': 7,
         '2016': 7,
         '2010': 1,
         '2009': 1,
         '1999': 1,
         '2011': 2})

In [154]:
import json
import string
import re
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuations
    return text


[nltk_data] Downloading package punkt to /Users/guolu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [156]:

# Filter data for each year
data_2023 = [d for d in data if d[3] == '2023']


# Extract titles and abstracts for 2023
titles_2023 = [clean_text(d[1]) for d in data_2023]

abstracts_2023 = []
for d in data_2023:
    if d[2] is None:
        continue 
    try:
        sentences = sent_tokenize(d[2])
        sentences = [clean_text(s) for s in sentences]
        abstracts_2023 += sentences
    except IndexError as e:
        print(f"Error processing abstract: {d[1]}")
        print(f"Error message: {e}")

title_abstract_2023 = titles_2023 + abstracts_2023
print(len(title_abstract_2023))

Error processing abstract: French National Authority for Health assessment of metabolic surgery for type 2 diabetes remission- a meta-analysis in patients with class I to III obesity.
Error message: list index out of range
170098


In [158]:
# Filter data for each year
data_2022 = [d for d in data if d[3] == '2022']

# Extract titles and abstracts for 2022
titles_2022 = [clean_text(d[1]) for d in data_2022]

abstracts_2022 = []
for d in data_2022:
    if d[2] is None:
        continue 
    try:
        sentences = sent_tokenize(d[2])
        sentences = [clean_text(s) for s in sentences]
        abstracts_2022 += sentences
    except IndexError as e:
        print(f"Error processing abstract: {d[1]}")
        print(f"Error message: {e}")

title_abstract_2022 = titles_2022 + abstracts_2022
print(len(title_abstract_2022))

152527


In [159]:
# Filter data for each year
data_2021 = [d for d in data if d[3] == '2021']

# Extract titles and abstracts for 2021
titles_2021 = [clean_text(d[1]) for d in data_2021]

abstracts_2021 = []
for d in data_2021:
    if d[2] is None:
        continue 
    try:
        sentences = sent_tokenize(d[2])
        sentences = [clean_text(s) for s in sentences]
        abstracts_2021 += sentences
    except IndexError as e:
        print(f"Error processing abstract: {d[1]}")
        print(f"Error message: {e}")

title_abstract_2021 = titles_2021 + abstracts_2021
print(len(title_abstract_2021))


140759


In [164]:
# save the 2023 data to one json file, each line is a sentence
with open('data/title_abstract_2023_sentences.json', 'w') as f:
    for line in title_abstract_2023:
        f.write("%s" % line)

with open('data/title_abstract_2022_sentences.json', 'w') as f:
    for line in title_abstract_2022:
        f.write("%s" % line)

with open('data/title_abstract_2021_sentences.json', 'w') as f:
    for line in title_abstract_2021:
        f.write("%s" % line)

In [168]:
# read in the data
with open('data/title_abstract_2023_sentences.json', 'r') as f:
    title_abstract_2023_1 = f.readlines()

title_abstract_2023_1[:3]

['a novel paradigm examining the remote induction of nocebo effects online\n',
 'topical atropine for childhood myopia control the atropine treatment longterm assessment study\n',
 'hepatocyte nuclear factor 4 a hnf4α a perspective in cancer\n']