# Publication matching
- Identify publications that belong to the same scientific research field
- Matching is based on (1) text similarity and (2) reference similarity

## Outline
- generate target corpus (the publications for which we want to find matches)
    - retrieve XML publications information from PubMed for specific search term
    - restructure information into Article dataclass for easier processing
    - pickle parsed articles (create break point in work flow)
- generate general corpus (the publications from which we extract matches)
- retrieve references for target corpus
- determine text similarity 
    - term frequency–inverse document frequency (tf-idf) approach on titles and abstracts

### Python library imports

In [40]:
from pathlib import Path  # construct file paths
from Bio import Entrez    # query the NCBI API
import configparser       # retrieve private credentials from file (which is ignored by git)
import xml.etree.ElementTree as ET
import pickle
from crossref.restful import Works    # query the Crossref REST API
from crossref.restful import Works, Etiquette



### Generate target corpus
- As an example, we use the publications of Madlen Vetter
- Retrieve publication info from PubMed

In [33]:
# Output locations for the target corpus.
# All files of one search live in a directory named after that search.
main_dir = Path("./")
resultdir_string = "my_publications"

# make sure the result directory exists before anything is written into it
(main_dir / resultdir_string).mkdir(parents=True, exist_ok=True)

# raw batched download, cleaned XML, and pickled Article objects
file_to_open_batched = main_dir / resultdir_string / 'batched.xml'
file_to_open_cleaned = main_dir / resultdir_string / 'cleaned.xml'
file_to_open_parsed = main_dir / resultdir_string / 'parsed_articles.pickle'

In [83]:
# search term for the PubMed query that selects the target publications
search_term = '(madlen vetter[author])'

# credentials for the NCBI API (Entrez) are kept in a separate file
# (ignored by git) and are read from there
config = configparser.ConfigParser()
config.read("../credentials/publication_matching_creds.txt")
pubmed_user, pubmed_key = (
    config.get("pubmed", option) for option in ("user", "api_key")
)

In [29]:
def get_clean_xml(search_term, pubmed_user, api_key, batch_size, file_to_open_batched, file_to_open_cleaned):
    """
    Download all PubMed records for a search term and store them as one XML file.

    Requirements:
    - search_term: PubMed query string
    - pubmed_user: e-mail address that is handed to Entrez (Entrez.email)
    - api_key: NCBI API key that is sent with every efetch request
    - batch_size: number of records requested per efetch call
    - file_to_open_batched: path object for the raw concatenation of all batches
    - file_to_open_cleaned: path object for the cleaned XML file
    Actions:
    - calls the Entrez API
    - prints the number of records for the search term
    - posts the record IDs to the Entrez history server
    - retrieves result in batches using the history server
    - retries a batch (up to 3 attempts, 15 s pause) when the server answers with a 5xx error
    - deposits search_term as XML comment at the end of the batched file
    Output:
    - prints progress along the way
    - deposits batched file according to file_to_open_batched path object
    - cleans repetitive XML headers (result of batching)
    - deposits cleaned file according to file_to_open_cleaned path object
    - returns None
    Raises:
    - AssertionError if the number of returned IDs differs from the reported count
    - HTTPError for non-5xx responses, or if a batch still fails on the last attempt
    """
    # imported here because the notebook's import cell provides neither name
    import time
    from urllib.error import HTTPError

    max_attempts = 3
    Entrez.email = pubmed_user

    # test the PubMed waters: get the record count
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=30000, usehistory="y")
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()
    count = int(record["Count"])

    # now that the count is known, ask for the complete ID list
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=count)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    id_list = record["IdList"]
    assert count == len(id_list), "PubMed returned {} IDs for {} records".format(len(id_list), count)
    print("There are {} records for {}".format(count, search_term))

    # post the IDs to the history server; the batches below are fetched
    # through the WebEnv/QueryKey of this post
    post_xml = Entrez.epost("pubmed", id=",".join(id_list))
    try:
        search_results = Entrez.read(post_xml)
    finally:
        post_xml.close()

    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]

    # generate file handle for the path object
    with file_to_open_batched.open("w", encoding="utf-8") as out_handle:
        for start in range(0, count, batch_size):
            end = min(count, start + batch_size)
            print("Going to download record %i to %i" % (start + 1, end))
            for attempt in range(1, max_attempts + 1):
                try:
                    fetch_handle = Entrez.efetch(db="pubmed", retmode="xml",
                                                 retstart=start, retmax=batch_size,
                                                 webenv=webenv, query_key=query_key,
                                                 api_key=api_key)
                    # success: do not download the same batch again
                    break
                except HTTPError as err:
                    # only server-side errors are worth a retry, and only
                    # while attempts are left; everything else is passed on
                    if not 500 <= err.code <= 599 or attempt == max_attempts:
                        raise
                    print("Received error from server %s" % err)
                    print("Attempt %i of 3" % attempt)
                    time.sleep(15)
            try:
                data = fetch_handle.read()
            finally:
                fetch_handle.close()
            # NOTE(review): newer Biopython releases appear to return bytes for
            # XML handles - decode so the text-mode file accepts the data
            if isinstance(data, bytes):
                data = data.decode("utf-8")
            out_handle.write(data)
            # keep the header of the next batch on a line of its own
            if data and not data.endswith("\n"):
                out_handle.write("\n")

        # deposit search term as comment at the end of the file
        search_term_comment = "".join(['\n<!--Generated by PubMed search term: ', search_term, "-->\n"])
        out_handle.write(search_term_comment)

    # remove XML header lines that are artifacts of batch process:
    # every header line is kept the first time it shows up, all closing tags
    # are dropped and a single closing tag is written at the very end
    header_starts = ('<?xml version', "<!DOCTYPE PubmedArticleSet PUBLIC", "<PubmedArticleSet")
    closing_start = "</PubmedArticleSet"
    headers_written = set()
    with file_to_open_batched.open("r", encoding="utf-8") as f:
        with file_to_open_cleaned.open("w", encoding="utf-8") as out_file:
            for line in f:
                if line.startswith(closing_start):
                    continue
                header = next((h for h in header_starts if line.startswith(h)), None)
                if header is not None:
                    if header in headers_written:
                        continue
                    headers_written.add(header)
                out_file.write(line)
            out_file.write("</PubmedArticleSet>\n")


In [56]:
# Download the target corpus.
# Arguments: pubmed search term, pubmed user name, pubmed api key,
# batch size (records per request), intermediate batch file path,
# and path object for the final (cleaned) file
get_clean_xml(search_term, pubmed_user, pubmed_key, 5000, file_to_open_batched, file_to_open_cleaned)

There are 4 records for (madlen vetter[author])
Going to download record 1 to 4


### Build target corpus from PubMed XML information

In [35]:
# Create dataclass for articles
from dataclasses import dataclass, field
from typing import Dict, List, Any

# @dataclass_json (currently disabled)
@dataclass
class Article:
    """
    Bibliographic information of one publication.

    Every attribute has a default, so an Article can be created with
    whatever subset of information is available.
    """
    # identifiers
    my_id: str = None
    doi: str = None
    pmid: str = None
    # bibliographic details
    authors: List[Any] = field(default_factory=list)  # fresh list per instance
    title: str = None
    abstract: str = None
    content: str = None
    journal: str = None
    year: int = 0
    # publications cited by this article
    references: List[Any] = field(default_factory=list)  # fresh list per instance

In [58]:
# parse the cleaned XML file of the target corpus; the file is only needed
# while parsing, the root element is taken from the finished tree
with file_to_open_cleaned.open("r", encoding="utf-8") as infile:
    tree = ET.parse(infile)
root = tree.getroot()

In [59]:
# Explore XML structure
#[elem.tag for elem in root.iter()]
# articles = root.findall('.//PubmedArticle')
# print(ET.tostring(articles[1]).decode("utf-8"))
# abstract = articles[9].find('.//Abstract')
# print(ET.tostring(abstract, encoding='utf-8', method='xml').decode("utf-8"))

In [33]:
def article_from_pubmed(root):
    """
    Build an Article from one PubmedArticle XML element.

    Requirements:
    - root: ElementTree element with the PubmedArticle tag
    Actions:
    - reads DOI and PMID from the article's own ArticleIdList
    - uses the DOI as my_id, and the PMID only if there is no DOI
    - collects the last names of all authors
    - reads title, journal abbreviation, publication year and abstract
    Output:
    - returns an Article; information that is missing in the XML keeps the
      default value of the Article dataclass
    """
    fields = {}

    # The identifiers of the article itself are listed directly below
    # PubmedData. A search over all descendants would also pick up the
    # ArticleId elements of cited references, so that search is only the
    # fallback for elements without the expected structure.
    articleids = root.findall('./PubmedData/ArticleIdList/ArticleId')
    if not articleids:
        articleids = root.findall('.//ArticleId')
    for article_id in articleids:
        id_type = article_id.get('IdType')
        # first hit wins: in document order the article's own identifiers
        # come before the identifiers of its references
        if id_type == 'doi':
            fields.setdefault('doi', article_id.text)
        elif id_type == 'pubmed':
            fields.setdefault('pmid', article_id.text)
    if 'doi' in fields:
        fields['my_id'] = fields['doi']
    elif 'pmid' in fields:
        # Only use pmid for my_id if no doi
        fields['my_id'] = fields['pmid']

    # TODO parse full name if needed
    fields['authors'] = [surname.text for surname in root.findall(".//AuthorList/Author/LastName")]

    # titles may contain inline markup (e.g. <i>...</i>); itertext() collects
    # the complete text, findtext() would stop at the first child element
    title = root.find('.//ArticleTitle')
    if title is not None:
        fields['title'] = "".join(title.itertext())

    fields['journal'] = root.findtext('.//ISOAbbreviation')

    # Article.year is an int with default 0: convert the text and keep the
    # default if there is no (purely numeric) Year element
    year = root.findtext('.//JournalIssue/PubDate/Year')
    if year is not None and year.strip().isdecimal():
        fields['year'] = int(year)

    # compare with None: the truth value of an Element only tells whether it
    # has child elements, not whether it was found
    abstract = root.find('.//Abstract')
    if abstract is not None:
        fields['abstract'] = ET.tostring(abstract, encoding='utf-8', method='text').decode("utf-8")
    return Article(**fields)

In [61]:
# turn every PubmedArticle element of the target corpus into an Article
target_articles = [
    article_from_pubmed(pubmed_article)
    for pubmed_article in root.findall('.//PubmedArticle')
]

In [62]:
# write out pickle with processed publication information (natural break point in work flow):
# the list of target Article objects is stored in file_to_open_parsed,
# an existing file at that path is overwritten
with file_to_open_parsed.open("wb") as outfile:
    pickle.dump(target_articles, outfile)

In [63]:
# read in the pickle with processed publication information
# (allows resuming the work flow here without querying PubMed again)
# NOTE: only unpickle files that were written by this notebook - loading a
# pickle from an untrusted source can execute arbitrary code
with file_to_open_parsed.open("rb") as infile:
    target_articles = pickle.load(infile)

## Generate general publication pool

In [15]:
# search term for the PubMed query that selects the general publication pool
search_term = 'plants[mh] AND immun*[MH]'

# credentials for the NCBI API (Entrez) are kept in a separate file
# (ignored by git) and are read from there
config = configparser.ConfigParser()
config.read("../credentials/publication_matching_creds.txt")
pubmed_user, pubmed_key = (
    config.get("pubmed", option) for option in ("user", "api_key")
)

In [16]:
# Output locations for the general publication pool.
# All files of one search live in a directory named after that search.
main_dir = Path("./")
resultdir_string = "plant_publications"

# make sure the result directory exists before anything is written into it
(main_dir / resultdir_string).mkdir(parents=True, exist_ok=True)

# raw batched download, cleaned XML, and pickled Article objects
file_to_open_batched = main_dir / resultdir_string / 'batched.xml'
file_to_open_cleaned = main_dir / resultdir_string / 'cleaned.xml'
file_to_open_parsed = main_dir / resultdir_string / 'parsed_articles.pickle'

In [24]:
# before retrieving anything, check how many records the search term yields
Entrez.email = pubmed_user
apikey = pubmed_key

# usehistory = "y" stores the result on the Entrez history server; WebEnv and
# QueryKey identify it in the follow-up requests
handle = Entrez.esearch(db = "pubmed", term = search_term, retmax = 500000, usehistory = "y")
try:
    record = Entrez.read(handle)
finally:
    # release the connection even if the response cannot be parsed
    handle.close()

webenv = record["WebEnv"]
query_key = record["QueryKey"]

# number of IDs that were returned (at most retmax)
id_list = record["IdList"]
print(len(id_list))

39198


In [25]:
# retrieve info on frequency of individual terms:
# the TranslationStack of the search result lists the terms that the query
# was expanded to, together with the record count of every term
record['TranslationStack']

[{'Term': '"plants"[MeSH Terms]', 'Field': 'MeSH Terms', 'Count': '774600', 'Explode': 'Y'}, {'Term': 'immunity[MH]', 'Field': 'MH', 'Count': '335785', 'Explode': 'Y'}, {'Term': 'immunization[MH]', 'Field': 'MH', 'Count': '172778', 'Explode': 'Y'}, 'OR', {'Term': 'immunoassay[MH]', 'Field': 'MH', 'Count': '486189', 'Explode': 'Y'}, 'OR', {'Term': 'immunoblotting[MH]', 'Field': 'MH', 'Count': '203924', 'Explode': 'Y'}, 'OR', {'Term': 'immunochemistry[MH]', 'Field': 'MH', 'Count': '299867', 'Explode': 'Y'}, 'OR', {'Term': 'immunocompetence[MH]', 'Field': 'MH', 'Count': '7453', 'Explode': 'Y'}, 'OR', {'Term': 'immunoconglutinins[MH]', 'Field': 'MH', 'Count': '30', 'Explode': 'Y'}, 'OR', {'Term': 'immunoconjugates[MH]', 'Field': 'MH', 'Count': '11091', 'Explode': 'Y'}, 'OR', {'Term': 'immunodiffusion[MH]', 'Field': 'MH', 'Count': '46121', 'Explode': 'Y'}, 'OR', {'Term': 'immunoelectrophoresis[MH]', 'Field': 'MH', 'Count': '25514', 'Explode': 'Y'}, 'OR', {'Term': 'immunogenetics[MH]', 'Fiel

In [26]:
def get_pubmed_summary(webenv, query_key, apikey, numrec):
    """
    Retrieve summary records of a search that is stored on the Entrez history server.

    Requirements:
    - webenv, query_key: identify the stored search (as returned by esearch/epost)
    - apikey: NCBI API key that is sent with the request
    - numrec: maximum number of summary records to retrieve
    Actions:
    - calls Entrez.esummary for the stored search
    - prints the title of every summary record
    Output:
    - returns a dict of dicts: position of the record (0, 1, ...) -> summary record
      (the dict used to be built and thrown away; existing calls that ignore
      the return value keep working)
    """
    handle = Entrez.esummary(db="pubmed", retmax=numrec, retmode="xml",
                             webenv=webenv, query_key=query_key, api_key=apikey)
    data = {}
    try:
        # the records are read while the handle is still open, hence the handle
        # is only closed after the loop (also if parsing fails)
        for record_id, record in enumerate(Entrez.parse(handle)):
            data[record_id] = dict(record)
            print(record['Title'])  # , record["AuthorList"]
    finally:
        handle.close()
    return data

In [27]:
# Retrieve the titles of some summary records to evaluate topical fit
# (webenv and query_key refer to the search stored on the history server)
numrec = 10 # number of summary records to retrieve
get_pubmed_summary(webenv, query_key, pubmed_key, numrec)

Origins of peanut allergy-causing antibodies.
Atypical Resistance Protein RPW8/HR Triggers Oligomerization of the NLR Immune Receptor RPP7 and Autoimmunity.
Phenolic Amides with Immunomodulatory Activity from the Nonpolysaccharide Fraction of <i>Lycium barbarum</i> Fruits.
Vaccarin hastens wound healing by promoting angiogenesis via activation of MAPK/ERK and PI3K/AKT signaling pathways in vivo.
Cell Wall Membrane Fraction of <i>Chlorella sorokiniana</i> Enhances Host Antitumor Immunity and Inhibits Colon Carcinoma Growth in Mice.
Identification of lncRNAs and their regulatory relationships with target genes and corresponding miRNAs in melon response to powdery mildew fungi.
Genetic mapping using a wheat multi-founder population reveals a locus on chromosome 2A controlling resistance to both leaf and glume blotch caused by the necrotrophic fungal pathogen Parastagonospora nodorum.
Identification of a Recessive Gene <i>PmQ</i> Conferring Resistance to Powdery Mildew in Wheat Landrace Qi

In [30]:
# retrieve XML records for general publications
# (downloaded in batches of 5000 records, then cleaned into one XML file)
get_clean_xml(search_term, pubmed_user, pubmed_key, 5000, file_to_open_batched, file_to_open_cleaned)

There are 39198 records for plants[mh] AND immun*[MH]
Going to download record 1 to 5000
Going to download record 5001 to 10000
Going to download record 10001 to 15000
Going to download record 15001 to 20000
Going to download record 20001 to 25000
Going to download record 25001 to 30000
Going to download record 30001 to 35000
Going to download record 35001 to 39198


In [31]:
# parse the cleaned XML file of the general articles; the file is only needed
# while parsing, the root element is taken from the finished tree
with file_to_open_cleaned.open("r", encoding="utf-8") as infile:
    tree = ET.parse(infile)
root = tree.getroot()

In [36]:
# turn every PubmedArticle element of the general pool into an Article
general_articles = [
    article_from_pubmed(pubmed_article)
    for pubmed_article in root.findall('.//PubmedArticle')
]

In [37]:
# write out pickle with processed publication information (natural break point in work flow):
# the list of general Article objects is stored in file_to_open_parsed,
# an existing file at that path is overwritten
with file_to_open_parsed.open("wb") as outfile:
    pickle.dump(general_articles, outfile)

### Add references using Crossref

In [41]:
# set up crossref etiquette: application name, version, project url and
# contact e-mail; url and e-mail are read from the credentials file
config = configparser.ConfigParser()
config.read("../credentials/publication_matching_creds.txt")
crossref_url, crossref_email = (
    config.get("crossref", option) for option in ("url", "email")
)
my_etiquette = Etiquette('Publication Matching', '0.1', crossref_url, crossref_email)

In [42]:
# client for the Crossref works endpoint; the etiquette is handed over so
# that the requests carry the project information
works = Works(etiquette=my_etiquette)

# TODO: write functions and apply to target_articles and general_articles

In [None]:
# Look up every general article with a DOI in Crossref and attach its
# reference list. Articles without a DOI are skipped.
# - ref_articles: articles for which Crossref returned references
# - not_in_crossref: articles with DOI for which Crossref returned no record
# - no_references: articles with a Crossref record that has no reference list
no_references = not_in_crossref = 0
ref_articles = []
for article in general_articles:
    if not article.doi:
        continue
    record = works.doi(article.doi)
    if not record:
        not_in_crossref += 1
        continue
    if 'reference' not in record:
        no_references += 1
        continue
    ref_list = []
    for ref in record['reference']:
        title = ref.get('article-title', None)
        # NOTE(review): Crossref presumably reports 'author' as one string
        # (first author) - verify. Article.authors is a list, so the value is
        # wrapped and a missing author becomes an empty list instead of None
        author = ref.get('author', None)
        authors = [author] if author else []
        year = ref.get('year', None)
        journal = ref.get('journal-title', None)
        doi = ref.get('DOI', None)
        ref_list.append(Article(my_id=doi, doi=doi, title=title, authors=authors, year=year, journal=journal))
    # the Article inside general_articles is updated in place
    article.references = ref_list
    ref_articles.append(article)
# the articles that were processed are general_articles (the name "articles"
# used here before is not defined anywhere in the notebook)
print("Total number or articles: {}".format(len(general_articles)))
print("Not in crossref: {}".format(not_in_crossref))
print("No references: {}".format(no_references))

In [72]:
# pickle result articles with references:
# the references are attached to the Article objects in general_articles, so
# this list is stored (the name "articles" used here before is not defined);
# the earlier pickle without references at the same path is overwritten
with file_to_open_parsed.open("wb") as outfile:
    pickle.dump(general_articles, outfile)

## Text similarity

### Prep data structures

In [None]:
# retrieve general articles with reference data
# with XXXXfile_to_open_parsed.open("rb") as infile:
#     general_articles = pickle.load(infile)

In [None]:
# all_articles_dict = {}
# for article in all_articles:
#     all_articles_dict[article.my_id] = article

In [None]:
# list of articles and abstracts from publications, if abstract is sufficiently long
# general_articles = []
# general_abstracts = []
# for article in all_articles:
#     abstract = article.abstract or ''
#     abstract = abstract.strip()
#     if len(abstract) > 50:
#         general_articles.append(article)
#         general_abstracts.append(abstract)
        
# build a list with all target abstracts, and list of all target articles in same order
# target_articles = []
# target_abstracts = []
# for article in target.values():
#     target_articles.append(article)
#     target_abstracts.append(article.abstract)

# build a joint corpus with identifier
# all_corpus = general_abstracts + target_abstracts