In [149]:
import csv
import pandas as pd
import numpy as np
import xmltodict
from tqdm import tqdm
import pickle
import os
import gzip
import random
from collections import Counter
from itertools import combinations

import torch
import torch_geometric.transforms as T
from torch_sparse import SparseTensor
from torch_geometric.data import HeteroData
import torch_geometric.utils as U

# Work from the project root so the relative "data/..." paths used below resolve.
# The directory can be overridden with the GRAPH_CLF_DIR environment variable;
# when it is unset, the original hard-coded location is used unchanged.
os.chdir(os.environ.get("GRAPH_CLF_DIR", '/Users/lyk/Downloads/graph_clf'))

In [3]:
# This file contains the node IDs, label, and bag-of-words vector.
# Read every tab-separated row up front; the first two rows are skipped below
# when extracting per-paper values.
with open("data/pubmed-diabetes/Pubmed-Diabetes.NODE.paper.tab") as tsv:
    records = list(csv.reader(tsv, delimiter="\t"))

# First field of each data row is the numeric paper ID.
ids = [int(row[0]) for row in records[2:]]
# The class is the last character of the second field, shifted to be zero-based.
labels = [int(row[1][-1]) - 1 for row in records[2:]]

In [5]:
# Vocabulary words are the middle component of the ":"-separated entries in the
# second header row (its first and last entries are skipped).
vocab = [s.split(":")[1] for s in records[1][1:-1]]

# Word -> column lookup built once, replacing a linear `vocab.index(...)` scan
# per token. `setdefault` keeps the first position of a repeated word, which is
# what `list.index` returned. An unknown word now raises KeyError rather than
# ValueError.
vocab_index = {}
for position, word in enumerate(vocab):
    vocab_index.setdefault(word, position)

# One dense row of zeros per paper; the "word=value" tokens fill in the non-zeros.
features = [[float(0)] * len(vocab) for _ in records[2:]]
for i, r in enumerate(records[2:]):
    for s in r[2:-1]:
        parts = s.split("=")  # split once: parts[0] is the word, parts[1] its weight
        features[i][vocab_index[parts[0]]] = float(parts[1])

In [6]:
# One row per paper: its ID, zero-based class label, and dense feature vector.
df = pd.DataFrame(dict(pmid=ids, label=labels, tfidf=features))
df.head()

Unnamed: 0,pmid,label,tfidf
0,12187484,0,"[0.09393489570187145, 0.028698458467273157, 0...."
1,2344352,0,"[0.023617916633613394, 0.0, 0.0147841590601865..."
2,14654069,0,"[0.10226314418677966, 0.0, 0.01066898076508311..."
3,16443886,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2684155,0,"[0.030615817858387732, 0.0, 0.0, 0.0, 0.0, 0.0..."


In [None]:
# Process the baseline files in batches of 100; range(1, 1167) names files
# pubmed23n0001 through pubmed23n1166 (1166 files, i.e. 12 batches).
files = [f"pubmed23n{number:04d}.xml.gz" for number in range(1, 1167)]
split = [files[start:start + 100] for start in range(0, len(files), 100)]
# One independent accumulator list per extracted field.
# NOTE(review): `abs` shadows the builtin of the same name for the rest of the notebook.
pmids, titles, years, abs, authors, journals, mesh, keywords = ([] for _ in range(8))

# Snapshot of the dataset PMIDs as a set, so the per-citation membership test in
# `handler` is a hash lookup instead of a scan of the `ids` list.
_wanted_pmids = frozenset(ids)


def _dig(container, *path):
    """Follow `path` through nested containers; return None if a key is missing."""
    try:
        for key in path:
            container = container[key]
    except KeyError:
        return None
    return container


def handler(_, node):
    """Streaming callback for `xmltodict.parse`: record citations whose PMID is in `ids`.

    Args:
        _: First callback argument supplied by xmltodict (unused).
        node: Parsed item; must contain "MedlineCitation" with a "PMID" entry.

    Returns:
        Always True, so that parsing continues with the next item.

    Side effects:
        For a matching PMID, appends exactly one value to each of the
        module-level lists `pmids`, `titles`, `years`, `abs`, `authors`,
        `journals`, `mesh` and `keywords`. Optional fields whose keys are
        missing are recorded as None; a missing title still raises KeyError.
    """
    parent_citation = node["MedlineCitation"]
    pmid = int(parent_citation["PMID"]["#text"])
    if pmid in _wanted_pmids:
        # Resolve every field before touching the accumulators: if a lookup
        # raises, no list has been extended and the parallel lists stay aligned.
        row = (
            pmid,
            parent_citation["Article"]["ArticleTitle"],
            _dig(node, "PubmedData", "History", "PubMedPubDate", 0, "Year"),
            _dig(parent_citation, "Article", "Abstract", "AbstractText"),
            _dig(parent_citation, "Article", "AuthorList", "Author"),
            _dig(parent_citation, "MedlineJournalInfo", "NlmUniqueID"),
            _dig(parent_citation, "MeshHeadingList", "MeshHeading"),
            _dig(parent_citation, "KeywordList", "Keyword"),
        )
        targets = (pmids, titles, years, abs, authors, journals, mesh, keywords)
        for target, value in zip(targets, row):
            target.append(value)
    return True

def process_gz_batch(files):
    """Parse one batch of gzipped baseline files and pickle the matching citations.

    Each name in `files` is opened from "data/pubmed-diabetes/baseline/" and
    streamed through `xmltodict.parse`, with `handler` collecting matches into
    the module-level accumulator lists.

    Args:
        files: Non-empty list of baseline file names. The last four characters
            of the first and last names (before the first ".") form the
            "from_to" suffix of the output pickle.

    Returns:
        A list of eight lists, in the order [pmids, titles, years, abs, authors,
        journals, mesh, keywords], holding this batch's matches only. The same
        object is pickled to
        "data/pubmed-diabetes/baseline/in_pubmed_diabetes/results_<from>_<to>.pkl".
    """
    accumulators = [pmids, titles, years, abs, authors, journals, mesh, keywords]
    # Start every batch from empty accumulators. Without this, a second call in
    # the same session would pickle the earlier batches again, and concatenating
    # the per-batch pickles afterwards would duplicate those rows.
    for acc in accumulators:
        acc.clear()
    for f in tqdm(files):
        # Close each archive as soon as it has been parsed instead of leaking the handle.
        with gzip.GzipFile("data/pubmed-diabetes/baseline/" + f) as archive:
            xmltodict.parse(archive, item_depth=2, item_callback=handler)
    # Copy the accumulators so the returned/pickled batch is not altered by later calls.
    results = [list(acc) for acc in accumulators]
    fromto = files[0].split(".")[0][-4:] + "_" + files[-1].split(".")[0][-4:]
    with open("data/pubmed-diabetes/baseline/in_pubmed_diabetes/results_" + fromto + ".pkl", "wb") as ff:
        pickle.dump(results, ff)
    return results

# Run a single batch (here the one at index 11 of `split`) and keep its results.
results = process_gz_batch(split[11]) # change the files arg here manually to process all splits.

In [None]:
# Concatenate the per-split results of `process_gz_batch` into one list.
# Entries are visited in sorted order so the resulting row order is reproducible
# (os.listdir returns names in arbitrary order).
files = sorted(os.listdir("data/pubmed-diabetes/baseline/in_pubmed_diabetes/"))
in_pubmed = []
for name in tqdm(files):
    # Only the ".pkl" outputs are loaded; this skips ".DS_Store" as before and
    # any other stray entry in the directory as well.
    if not name.endswith(".pkl"):
        continue
    # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
    with open("data/pubmed-diabetes/baseline/in_pubmed_diabetes/" + name, "rb") as f:
        in_pubmed.append(pickle.load(f))

# Regroup from "one entry per batch, each holding one list per field" to
# "one flat list per field" spanning all batches.
in_pubmed = [[item for batch in field for item in batch] for field in zip(*in_pubmed)]

In [79]:
# `in_pubmed` holds one list per field, so transpose to get one row per citation.
df_meta = pd.DataFrame(in_pubmed).transpose()
df_meta.columns = [
    "pmid",
    "title",
    "year",
    "abstract",
    "authors",
    "journal",
    "mesh",
    "keywords",
]
# Check for IDs for which there was no metadata.
print("Unmatched: ", set(ids) - set(df_meta["pmid"]))

Unmatched:  {17874530}


In [10]:
# Attach the metadata (minus "keywords", which is too incomplete) to the feature frame,
# keeping every row of `df`.
df_meta = df.merge(df_meta.drop(columns="keywords"), on="pmid", how="left")
# Drop the ID with missing data.
df_meta = df_meta.drop(index=df_meta.index[df_meta["pmid"] == 17874530])
df_meta.head()

Unnamed: 0,pmid,label,tfidf,title,year,abstract,authors,journal,mesh,keywords
0,12187484,0,"[0.09393489570187145, 0.028698458467273157, 0....",Retinal metabolic abnormalities in diabetic mo...,2002,"[{'@Label': 'PURPOSE', '@NlmCategory': 'OBJECT...","{'@ValidYN': 'Y', 'LastName': 'Kowluru', 'Fore...",8104312,"[{'DescriptorName': {'@UI': 'D000818', '@Major...",
1,2344352,0,"[0.023617916633613394, 0.0, 0.0147841590601865...",Spatially resolved changes in diabetic rat ske...,1990,Phase-modulated rotating-frame imaging (p.m.r....,"[{'@ValidYN': 'Y', 'LastName': 'Challiss', 'Fo...",2984726R,"[{'DescriptorName': {'@UI': 'D000255', '@Major...",
2,14654069,0,"[0.10226314418677966, 0.0, 0.01066898076508311...",Mitochondria respiration and susceptibility to...,2003,Cardiovascular complications are the primary c...,"[{'@ValidYN': 'Y', 'LastName': 'Lashin', 'Fore...",0372430,"[{'DescriptorName': {'@UI': 'D000818', '@Major...",
3,16443886,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Mean blood glucose and biological variation ha...,2006,"[{'@Label': 'OBJECTIVE', '@NlmCategory': 'OBJE...","[{'@ValidYN': 'Y', 'LastName': 'McCarter', 'Fo...",7805975,"[{'DescriptorName': {'@UI': 'D001786', '@Major...",
4,2684155,0,"[0.030615817858387732, 0.0, 0.0, 0.0, 0.0, 0.0...",Regulation of very-low-density-lipoprotein lip...,1989,Hepatocytes were derived from 2-3-day streptoz...,"[{'@ValidYN': 'Y', 'LastName': 'Duerden', 'For...",2984726R,"[{'DescriptorName': {'@UI': 'D000818', '@Major...",


In [69]:
def _abstract_section_tag(section):
    """Pick the tag for one structured abstract section (a dict)."""
    if "@NlmCategory" in section:
        return section["@NlmCategory"]
    if "@Label" in section:
        return section["@Label"]
    # No category or label available: fall back to the tag used for plain-text abstracts.
    return "ABSTRACT"


def clean_abstract(abstract):
    """Flatten a parsed abstract value into a single tagged string.

    Args:
        abstract: One of
            * a plain string,
            * a dict describing a single section,
            * a list of section dicts,
            * a list whose first item is a string and whose second item is a
              dict with a "#text" entry,
            * None / NaN when the abstract is missing.

    Returns:
        * "[ABSTRACT] <text>" for a plain string;
        * "[TAG] <text>" pieces joined by spaces for section dicts, where TAG
          is "@NlmCategory" if present, else "@Label", else "ABSTRACT";
          sections without "#text" are skipped;
        * "[OBJECTIVE] <first> [METHODS] <second text>" for the string-first list;
        * None for a missing abstract or an empty list.
    """
    # A missing abstract stays missing instead of raising on string concatenation.
    if abstract is None or (isinstance(abstract, float) and abstract != abstract):
        return None
    # A lone structured section is a bare dict; treat it as a one-item list.
    if isinstance(abstract, dict):
        abstract = [abstract]
    if isinstance(abstract, list):
        if not abstract:
            return None
        if not isinstance(abstract[0], dict):
            # Special case kept exactly as before: untagged first part, tagged second part.
            return "[OBJECTIVE] " + abstract[0] + " [METHODS] " + abstract[1]["#text"]
        return " ".join(
            f"[{_abstract_section_tag(section)}] " + section["#text"]
            for section in abstract
            if "#text" in section.keys()
        )
    return "[ABSTRACT] " + abstract

# Flatten every raw abstract value into a single tagged string via `clean_abstract`.
df_meta["abstract"] = df_meta["abstract"].map(clean_abstract)
# Combined text field: the "[TITLE] "-prefixed title, one space, then the cleaned abstract.
df_meta["text_concat"] = "[TITLE] " + df_meta["title"] + " " + df_meta['abstract']

In [70]:
def clean_authors(authors):
    """Turn a parsed author entry (or list of entries) into a list of display names.

    Args:
        authors: None, a single author dict, or a list of author dicts.

    Returns:
        None when `authors` is None; otherwise one string per author:
        the "CollectiveName" if present, else "<Initials> <LastName>" when
        "Initials" is present, else the "LastName" alone.
    """
    if authors is None:
        return None

    def display_name(author):
        # Collective names take precedence over personal name parts.
        if "CollectiveName" in author.keys():
            return author["CollectiveName"]
        if "Initials" in author.keys():
            return " ".join([author["Initials"], author["LastName"]])
        return author["LastName"]

    # A single author arrives as a bare dict rather than a one-item list.
    entries = authors if isinstance(authors, list) else [authors]
    return [display_name(entry) for entry in entries]

# Replace each raw author entry with the name list built by `clean_authors` (None stays None).
df_meta["authors"] = df_meta["authors"].map(clean_authors)

In [71]:
# Remove these headings from the MeSH field, since they are the class labels we're trying to assign.
# Remove these headings from the MeSH field, since they are the class labels we're trying to assign.
# NOTE(review): this rebinds `labels`, which an earlier cell used for the integer class labels;
# re-running that earlier cell after this one will pick up these strings instead.
labels = ["Diabetes Mellitus, Experimental", "Diabetes Mellitus, Type 1", "Diabetes Mellitus, Type 2"]


def _mesh_descriptors(mesh):
    """Return the descriptor names of one paper's MeSH headings, minus the class-label headings.

    Mirrors `clean_authors`: a missing value (None/NaN) yields None, and a
    single heading given as a bare dict is treated as a one-item list.
    """
    if mesh is None or (isinstance(mesh, float) and mesh != mesh):
        return None
    if isinstance(mesh, dict):
        mesh = [mesh]
    names = (heading["DescriptorName"]["#text"] for heading in mesh)
    return [name for name in names if name not in labels]


df_meta["mesh"] = [_mesh_descriptors(mesh) for mesh in df_meta["mesh"]]

In [80]:
# Preview the first rows of the metadata frame after the cleaning steps above.
df_meta.head()

Unnamed: 0,pmid,label,tfidf,title,year,abstract,authors,journal,mesh,text_concat
0,12187484,0,"[0.09393489570187145, 0.028698458467273157, 0....",Retinal metabolic abnormalities in diabetic mo...,2002,[OBJECTIVE] Dogs and rats are commonly used to...,[RA Kowluru],8104312,"[Animals, Diabetic Retinopathy, Galactosemias,...",[TITLE] Retinal metabolic abnormalities in dia...
1,2344352,0,"[0.023617916633613394, 0.0, 0.0147841590601865...",Spatially resolved changes in diabetic rat ske...,1990,[ABSTRACT] Phase-modulated rotating-frame imag...,"[RA Challiss, MJ Blackledge, GK Radda]",2984726R,"[Adenosine Triphosphate, Animals, Electric Sti...",[TITLE] Spatially resolved changes in diabetic...
2,14654069,0,"[0.10226314418677966, 0.0, 0.01066898076508311...",Mitochondria respiration and susceptibility to...,2003,[ABSTRACT] Cardiovascular complications are th...,"[O Lashin, A Romani]",0372430,"[Animals, Body Weight, Cell Respiration, Disea...",[TITLE] Mitochondria respiration and susceptib...
3,16443886,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Mean blood glucose and biological variation ha...,2006,[OBJECTIVE] Mean blood glucose (MBG) over 2-3 ...,"[RJ McCarter, JM Hempe, SA Chalew]",7805975,"[Blood Glucose, Glycated Hemoglobin, Humans, H...",[TITLE] Mean blood glucose and biological vari...
4,2684155,0,"[0.030615817858387732, 0.0, 0.0, 0.0, 0.0, 0.0...",Regulation of very-low-density-lipoprotein lip...,1989,[ABSTRACT] Hepatocytes were derived from 2-3-d...,"[JM Duerden, SM Bartlett, GF Gibbons]",2984726R,"[Animals, Cells, Cultured, Cholesterol, Choles...",[TITLE] Regulation of very-low-density-lipopro...


In [153]:
# Persist the metadata frame as a gzip-compressed Parquet file.
df_meta.to_parquet("data/pubmed-diabetes/pubmed_metadata.parquet.gzip", compression="gzip")