# pypmed

In [1]:
import requests
from xml.etree import ElementTree as ET
from datetime import datetime
from thefuzz import fuzz
current_year = datetime.now().year

# function to convert XML ElementTree element to a dictionary
def elem_to_dict(elem):
    if len(elem) == 0:
        return elem.text
    d = {}
    for child in elem:
        child_dict = elem_to_dict(child)
        if child.tag in d:
            if isinstance(d[child.tag], list):
                d[child.tag].append(child_dict)
            else:
                d[child.tag] = [d[child.tag], child_dict]
        else:
            d[child.tag] = child_dict
    return d
        
def build_query_string(search_criteria: dict) -> str:
    # build query string
    search_string = ""
    if 'author_first_name' in search_criteria.keys() or 'author_last_name' in search_criteria.keys():
        if "author_first_name" in search_criteria.keys():
            search_string += f'{search_criteria["author_first_name"]} '
        if "author_last_name" in search_criteria.keys():
            search_string += f'{search_criteria["author_last_name"]} '
        search_string = search_string[:-1] + '[Author] '
            
    if "publication_year" in search_criteria.keys():
        search_string += f'{search_criteria["publication_year"]}[PDAT] '
    
    if "journal" in search_criteria.keys():
        search_string += f'{search_criteria["journal"]}[Journal]'
        
    search_string = search_string.strip()
    return search_string


def query_pubmed(search_criteria: dict, limit: int = 10):
    # build query string from input search_criteria
    query_str = build_query_string(search_criteria)
            
    # setup api endpoint to retrieve pubmed ids based on input query
    endpoint_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {
        'db': 'pubmed',
        'term': f'{query_str}',
        'retmode': 'json'
    }

    # Make API request and retrieve json response with ids
    response = requests.get(endpoint_url, params=params)
    
    # if response is ok, retrieve the ids and use them to retrieve the desired fields for each article
    if response.ok:
        data = response.json()
        id_list = data["esearchresult"]["idlist"]
        
        # Use the IDs to retrieve the desired fields for each article
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "xml", # return type - supposedly can be json but couldn't get this return type in my tests
            # "rettype": "abstract"
        }
        
        # make request
        response = requests.get(url, params=params)
        print('a')
        
        # if response is ok, parse the xml and convert to a dictionary
        if response.ok:
            # # Parse the XML file
            root = ET.fromstring(response.text)

            # Convert the root element to a dictionary
            article_list = []
            try:
                article_list = elem_to_dict(root)['PubmedArticle']
                if not isinstance(article_list, list):
                    article_list = [article_list]
            except:
                print(f"No articles found for search criteria: {search_criteria}")
            return article_list


In [2]:
search_criteria = {'author_first_name': 'rachel', 
                   'author_last_name': 'gottschalk', 
                   'publication_year': 2022
                   }
response = query_pubmed(search_criteria=search_criteria)

a


In [3]:
example = response[0]

In [4]:
pmid = example['MedlineCitation']['PMID']
author_list = example['MedlineCitation']['Article']['AuthorList']
example['MedlineCitation'].keys()

dict_keys(['PMID', 'DateCompleted', 'DateRevised', 'Article', 'MedlineJournalInfo', 'ChemicalList', 'CitationSubset', 'MeshHeadingList'])

In [5]:
author_ = author_list['Author'][1]
last_name = author_['LastName'].lower()
first_name = author_['ForeName'].lower()

In [12]:
university = 'university of pittsburgh at pittsburgh'
affiliation_info = author_['AffiliationInfo']['Affiliation']
fuzz.ratio(university, affiliation_info.lower())

27

In [6]:
search_criteria['author_first_name']

'rachel'

In [7]:
from thefuzz import fuzz

fuzz.ratio(search_criteria['author_first_name'] + search_criteria['author_last_name'], first_name + last_name)

94

In [14]:
from thefuzz import process
process.extractOne(university, affiliation_info.lower())

('e', 60)

In [21]:
university
overall_score = 0
university_split = university.split()
aff_split = affiliation_info.lower().split()
for u in university_split:
    ind_best=0
    val_best = 0
    for i, a in enumerate(aff_split):
        s = fuzz.ratio(u, a)
        if s > val_best:
            val_best = s
            ind_best = i
    
    # remove index of best match
    overall_score += val_best
    del aff_split[ind_best]
# average overall score
overall_score /= len(university_split)

In [22]:
overall_score

87.0