In [1]:
import pandas as pd

from utils.funcs import *
from pathlib import Path
from Bio import Medline
from joblib import Parallel, delayed

In [2]:
# Search topics, listed in the order they were originally collected.
raw_topics = (
    "immunology",
    "bioinformatics",
    "single cell",
    "cancer",
    "infectious disease",
    "genomics",
    "microbiology",
    "virology",
    "pathology",
    "biochemistry",
    "molecular biology",
    "cell biology",
    "neuroscience",
    "structural biology",
    "pharmacology",
    "Physiology",
    "allergy",
    "developmental biology",
    "aging",
    "bioengineering",
)
# Normalise every topic to title case, then order them alphabetically so the
# download log below is easy to scan.
queries = sorted(topic.title() for topic in raw_topics)
queries

['Aging',
 'Allergy',
 'Biochemistry',
 'Bioengineering',
 'Bioinformatics',
 'Cancer',
 'Cell Biology',
 'Developmental Biology',
 'Genomics',
 'Immunology',
 'Infectious Disease',
 'Microbiology',
 'Molecular Biology',
 'Neuroscience',
 'Pathology',
 'Pharmacology',
 'Physiology',
 'Single Cell',
 'Structural Biology',
 'Virology']

In [3]:
# Text file that receives the downloaded records; it is later read back with
# Bio.Medline. The path is relative to the notebook's working directory.
out_file = Path("data/recent_papers.txt")

In [4]:
# Start from a clean slate: remove any download file left by a previous run.
# `missing_ok=True` (Python 3.8+) folds the existence check into the delete
# itself, so there is no window between "check" and "unlink" and a missing
# file is simply a no-op.
out_file.unlink(missing_ok=True)

In [5]:
# Run one search-and-download pass per topic, all targeting the same file.
# `search_and_download` is not defined in this notebook — presumably it comes
# from the `utils.funcs` star import and appends to `out_file` (TODO confirm),
# which would be why the file is removed before this loop.
for topic in queries:
    search_and_download(topic, out_file)

Found 2500 results for Aging


100%|██████████| 25/25 [00:38<00:00,  1.53s/it]


Found 2500 results for Allergy


100%|██████████| 25/25 [00:39<00:00,  1.59s/it]


Found 2500 results for Biochemistry


100%|██████████| 25/25 [00:36<00:00,  1.48s/it]


Found 2500 results for Bioengineering


100%|██████████| 25/25 [00:35<00:00,  1.43s/it]


Found 2500 results for Bioinformatics


100%|██████████| 25/25 [00:45<00:00,  1.81s/it]


Found 2500 results for Cancer


100%|██████████| 25/25 [01:09<00:00,  2.79s/it]


Found 2500 results for Cell Biology


100%|██████████| 25/25 [01:13<00:00,  2.93s/it]


Found 2500 results for Developmental Biology


100%|██████████| 25/25 [00:47<00:00,  1.89s/it]


Found 2500 results for Genomics


100%|██████████| 25/25 [00:43<00:00,  1.74s/it]


Found 2500 results for Immunology


100%|██████████| 25/25 [00:35<00:00,  1.43s/it]


Found 2500 results for Infectious Disease


100%|██████████| 25/25 [00:35<00:00,  1.40s/it]


Found 2500 results for Microbiology


100%|██████████| 25/25 [00:36<00:00,  1.45s/it]


Found 2500 results for Molecular Biology


100%|██████████| 25/25 [00:35<00:00,  1.43s/it]


Found 2500 results for Neuroscience


100%|██████████| 25/25 [00:35<00:00,  1.41s/it]


Found 2500 results for Pathology


100%|██████████| 25/25 [00:35<00:00,  1.42s/it]


Found 2500 results for Pharmacology


100%|██████████| 25/25 [00:32<00:00,  1.32s/it]


Found 2500 results for Physiology


100%|██████████| 25/25 [00:40<00:00,  1.62s/it]


Found 2500 results for Single Cell


100%|██████████| 25/25 [00:35<00:00,  1.42s/it]


Found 2500 results for Structural Biology


100%|██████████| 25/25 [00:38<00:00,  1.52s/it]


Found 2500 results for Virology


100%|██████████| 25/25 [00:36<00:00,  1.46s/it]


In [6]:
def parse(record):
    """Flatten one Medline record into a dict of the columns kept downstream.

    Args:
        record: A mapping keyed by Medline field tags (anything exposing
            ``.get(tag)``), such as the records yielded by ``Medline.parse``.

    Returns:
        A dict with the keys ``pubmed_id``, ``doi``, ``title``, ``abstract``,
        ``journal``, ``publication_date`` and ``electronic_publication_date``,
        in that order. A tag missing from ``record`` yields ``None``.

    Note:
        ``doi`` is the raw value of the ``AID`` tag, passed through untouched.
        In the recorded output of this notebook that value is a list that can
        also carry non-DOI identifiers (e.g. ``"... [pii]"``), so it is not a
        bare DOI string.
    """
    # (output column, Medline tag) pairs; the order fixes the column order.
    column_tags = (
        ("pubmed_id", "PMID"),
        ("doi", "AID"),
        ("title", "TI"),
        ("abstract", "AB"),
        ("journal", "TA"),
        ("publication_date", "DP"),
        ("electronic_publication_date", "DEP"),
    )
    return {column: record.get(tag) for column, tag in column_tags}

In [7]:
# Read the downloaded Medline file and turn every record into a flat dict.
# The Parallel call must stay inside the `with` block: the job generator pulls
# records from the open file while the jobs are being dispatched.
# NOTE(review): `parse` is only a handful of dict lookups, so shipping every
# record to worker processes may cost more than it saves — consider a plain
# list comprehension if this cell turns out to be slow.
with open(out_file, "r") as medline_file:
    parse_jobs = (delayed(parse)(entry) for entry in Medline.parse(medline_file))
    records = Parallel(n_jobs=-1)(parse_jobs)
len(records)

50000

In [8]:
# Build the working table: one row per parsed record, one column per key
# returned by `parse`.
articles = pd.DataFrame(data=records)
articles.shape

(50000, 7)

In [9]:
# Preview the first five rows to sanity-check the parsed columns.
articles.head(n=5)

Unnamed: 0,pubmed_id,doi,title,abstract,journal,publication_date,electronic_publication_date
0,37957539,[10.1002/adbi.202300453 [doi]],Identification and Validation of Glomeruli Cel...,Accumulating evidence indicates that cellular ...,Adv Biol (Weinh),2023 Nov 13,20231113
1,37956978,[10.1055/a-2209-6357 [doi]],The potential of anti-coronavirus plant second...,"In early 2020, a global pandemic was announced...",Planta Med,2023 Nov 13,20231113
2,37956941,"[S0014-4835(23)00342-1 [pii], 10.1016/j.exer.2...",Blockade of interleukin-6 trans-signaling prev...,Interleukin-6 (IL-6) is a multifaceted cytokin...,Exp Eye Res,2023 Nov 11,20231111
3,37956927,"[S1568-1637(23)00281-7 [pii], 10.1016/j.arr.20...",Pharmacological modulation of vascular ageing:...,"Vascular ageing, characterized by structural a...",Ageing Res Rev,2023 Nov 11,20231111
4,37956894,"[S0006-2952(23)00505-1 [pii], 10.1016/j.bcp.20...",DEL-1 deficiency aggravates pressure overload-...,Recent studies have shown that neutrophils pla...,Biochem Pharmacol,2023 Nov 11,20231111


In [10]:
# Keep only the first row for each PubMed ID (presumably the same paper can be
# returned by several of the topic searches) and renumber the index 0..n-1.
articles = articles.drop_duplicates(subset=["pubmed_id"], ignore_index=True)
articles.shape

(33968, 7)

In [11]:
# Discard every row that has a missing value in ANY column — this includes
# rows that only lack e.g. `electronic_publication_date`, not just those
# without an abstract.
articles = articles.dropna(axis=0, how="any")
articles.shape

(29107, 7)

In [12]:
# Sanity check: after the row filter above, no abstract should be missing
# (expected result: False).
articles["abstract"].isna().any()

False

In [13]:
# Dropping rows left gaps in the index; renumber it 0..n-1 and discard the old
# labels, then preview the result.
articles = articles.reset_index(drop=True)
articles.head(n=5)

Unnamed: 0,pubmed_id,doi,title,abstract,journal,publication_date,electronic_publication_date
0,37957539,[10.1002/adbi.202300453 [doi]],Identification and Validation of Glomeruli Cel...,Accumulating evidence indicates that cellular ...,Adv Biol (Weinh),2023 Nov 13,20231113
1,37956978,[10.1055/a-2209-6357 [doi]],The potential of anti-coronavirus plant second...,"In early 2020, a global pandemic was announced...",Planta Med,2023 Nov 13,20231113
2,37956941,"[S0014-4835(23)00342-1 [pii], 10.1016/j.exer.2...",Blockade of interleukin-6 trans-signaling prev...,Interleukin-6 (IL-6) is a multifaceted cytokin...,Exp Eye Res,2023 Nov 11,20231111
3,37956927,"[S1568-1637(23)00281-7 [pii], 10.1016/j.arr.20...",Pharmacological modulation of vascular ageing:...,"Vascular ageing, characterized by structural a...",Ageing Res Rev,2023 Nov 11,20231111
4,37956894,"[S0006-2952(23)00505-1 [pii], 10.1016/j.bcp.20...",DEL-1 deficiency aggravates pressure overload-...,Recent studies have shown that neutrophils pla...,Biochem Pharmacol,2023 Nov 11,20231111


In [14]:
# Persist the cleaned table as a Parquet file (path is relative to the
# notebook's working directory).
articles.to_parquet(path="data/pubmed_articles.parquet")