# pypmed

In [61]:
import requests
from xml.etree import ElementTree as ET
from datetime import datetime
from thefuzz import fuzz
# Calendar year captured once, when this cell/module is executed.
# NOTE(review): no function in the visible part of this file reads it — confirm it is still needed.
current_year = datetime.now().year


def institution_check(institution: str,
                      affiliation_info: str) -> float:
    """Score how well an institution name is covered by an affiliation string.

    Every whitespace-separated word of ``institution`` is greedily paired
    with the not-yet-used word of ``affiliation_info`` that gives the highest
    ``fuzz.ratio``. Each affiliation word can be consumed only once. The
    per-word best ratios are then averaged over the number of institution
    words.

    Both inputs are lower-cased before comparison. Words are split on
    whitespace only, so punctuation attached to a word (e.g. a trailing
    comma) takes part in the comparison.

    Args:
        institution: Institution name to look for.
        affiliation_info: Free-text affiliation to search in.

    Returns:
        The average best ratio. ``0.0`` when ``institution`` contains no
        words. Institution words left over after the affiliation words run
        out contribute a score of 0.
    """
    institution_words = institution.lower().split()
    if not institution_words:
        # nothing to match - avoid dividing by zero below
        return 0.0
    remaining_words = affiliation_info.lower().split()

    total_score = 0
    for word in institution_words:
        if not remaining_words:
            # affiliation exhausted: the remaining institution words score 0
            break
        scores = [fuzz.ratio(word, candidate) for candidate in remaining_words]
        # max() returns the first maximal index, so ties go to the earliest word
        best_index = max(range(len(scores)), key=scores.__getitem__)
        total_score += scores[best_index]
        # consume the matched word so it cannot be paired twice
        del remaining_words[best_index]

    # average over all institution words, matched or not
    return total_score / len(institution_words)


def name_score(query_author_name: str,
               author_list_name: str) -> int:
    """Return the ``fuzz.ratio`` similarity of the two author names.

    The strings are compared exactly as passed in; any normalisation
    (such as lower-casing) is left to the caller.
    """
    similarity = fuzz.ratio(query_author_name, author_list_name)
    return similarity


def elem_to_dict(elem):
    """Recursively convert an XML ElementTree element into plain Python data.

    An element without children becomes its ``text`` (possibly ``None``).
    An element with children becomes a dict keyed by child tag. A tag that
    occurs once maps to the converted child, and a tag that occurs several
    times maps to a list of converted children in document order.
    Attributes, and the text of elements that have children, are not
    included.
    """
    children = list(elem)
    if not children:
        return elem.text

    converted = {}
    for node in children:
        value = elem_to_dict(node)
        tag = node.tag

        # first occurrence of this tag: store the value directly
        if tag not in converted:
            converted[tag] = value
            continue

        # repeated tag: collect all occurrences in a list
        existing = converted[tag]
        if isinstance(existing, list):
            existing.append(value)
        else:
            converted[tag] = [existing, value]
    return converted
        
def build_query_string(search_criteria: dict) -> str:
    """Build a PubMed search term from the supported search criteria.

    Recognised keys are ``author_first_name`` and ``author_last_name``
    (combined into one ``[Author]`` term), ``publication_year`` (``[PDAT]``)
    and ``journal`` (``[Journal]``). Other keys are ignored. A key whose
    value is ``None`` or blank is treated as absent, so it never ends up in
    the query as the literal text "None".

    Args:
        search_criteria: Mapping of criterion name to value.

    Returns:
        The terms joined by single spaces, or an empty string when no
        supported criterion has a value.
    """
    def _value(key: str):
        # normalise a criterion to stripped text, or None when missing/blank
        raw = search_criteria.get(key)
        if raw is None:
            return None
        text = str(raw).strip()
        return text or None

    terms = []

    # first and last name share a single [Author] tag
    name_parts = [
        part
        for part in (_value("author_first_name"), _value("author_last_name"))
        if part is not None
    ]
    if name_parts:
        terms.append(" ".join(name_parts) + "[Author]")

    publication_year = _value("publication_year")
    if publication_year is not None:
        terms.append(f"{publication_year}[PDAT]")

    journal = _value("journal")
    if journal is not None:
        terms.append(f"{journal}[Journal]")

    return " ".join(terms)


def author_metadata_filter(author_metadata: dict,
                           article_dict: dict,
                           author_list,
                           name_threshold: int = 80,
                           institution_threshold: int = 80) -> "dict | None":
    """Confirm that the queried author appears in an article's author list.

    The queried name ("first last", lower-cased) is compared with every
    author entry using ``name_score``. The best entry scoring strictly above
    ``name_threshold`` confirms the article. If an institution was supplied,
    it is then compared with the matched author's affiliation(s) using
    ``institution_check``.

    Args:
        author_metadata: Must contain ``author_first_name`` and
            ``author_last_name``. May contain ``institution``; ``None``,
            blank or missing means no institution check.
        article_dict: Article package. It is updated in place and returned
            when the author is confirmed.
        author_list: Author entries as dicts with ``LastName`` /
            ``ForeName`` and optionally ``AffiliationInfo``. A single dict
            is accepted as a one-element list. Entries without a
            ``LastName`` are skipped.
        name_threshold: Name score that must be exceeded for a match.
        institution_threshold: Institution score that must be exceeded for
            the institution to be recorded.

    Returns:
        ``article_dict`` with ``author_first_name`` and ``author_last_name``
        added (plus ``institution`` when it was confirmed), or ``None`` when
        no author entry matches.
    """
    # lower-case the query so it is compared like-for-like with the
    # lower-cased names built from the author list
    first = author_metadata['author_first_name']
    last = author_metadata['author_last_name']
    author_query_name = f'{first} {last}'.lower()
    institution = author_metadata.get('institution')

    if author_list is None:
        return None
    if isinstance(author_list, dict):
        # a lone author arrives as a dict rather than a list of dicts
        author_list = [author_list]

    # find the best-scoring author strictly above the threshold
    best_match = None
    best_score = name_threshold
    for item in author_list:
        if not isinstance(item, dict):
            continue
        last_name = item.get('LastName')
        if not last_name:
            # entry without a personal name - nothing to compare
            continue
        first_name = item.get('ForeName') or ''
        author_list_name = f'{first_name} {last_name}'.strip().lower()

        score = name_score(author_query_name, author_list_name)
        if score > best_score:
            best_score = score
            best_match = item

    if best_match is None:
        return None

    # name matched - article is confirmed for this author
    article_dict['author_first_name'] = author_metadata['author_first_name']
    article_dict['author_last_name'] = author_metadata['author_last_name']

    # check affiliation if an institution was provided
    if institution is not None and str(institution).strip():
        affiliation_entries = best_match.get('AffiliationInfo')
        if not isinstance(affiliation_entries, list):
            affiliation_entries = [affiliation_entries]
        for entry in affiliation_entries:
            affiliation = entry.get('Affiliation') if isinstance(entry, dict) else None
            if not isinstance(affiliation, str) or not affiliation.strip():
                # missing or empty affiliation text cannot confirm anything
                continue
            if institution_check(institution, affiliation) > institution_threshold:
                # author and institution confirmed
                article_dict['institution'] = institution
                break

    return article_dict
                            

def _dig(node, *keys):
    """Follow ``keys`` through nested dicts; return None once a level is missing."""
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def query_pubmed(search_criteria: dict, limit: int = 10):
    """Search PubMed and return a summary dict for each matching article.

    The search term is built with ``build_query_string``. The matching ids
    are fetched with ESearch and the article records with EFetch (XML).
    When both ``author_first_name`` and ``author_last_name`` are supplied,
    each article is kept only if ``author_metadata_filter`` confirms the
    author. Otherwise every article is returned with its raw ``author_list``.

    Args:
        search_criteria: Supported keys are ``author_first_name``,
            ``author_last_name``, ``publication_year``, ``journal`` and
            ``institution``. ``institution`` is not sent to PubMed; it is
            only used when filtering authors.
        limit: Maximum number of ids requested from ESearch (``retmax``).
            ``None`` leaves the service default in place.

    Returns:
        A list of article dicts with ``pmid``, ``journal_issn``,
        ``journal_title``, ``article_title`` and ``publication_year``.
        Fields missing from a record are ``None``. Author searches add the
        author fields set by ``author_metadata_filter``; other searches add
        ``author_list``. Returns ``None`` when a request fails or no
        articles are found.
    """
    # institution is not a search criterion for the official pubmed api
    institution = search_criteria.get('institution')
    first_name = search_criteria.get('author_first_name')
    last_name = search_criteria.get('author_last_name')
    author_search = first_name is not None and last_name is not None

    query_str = build_query_string(search_criteria)
    timeout_seconds = 30  # do not hang forever on an unresponsive server

    # step 1: retrieve the pubmed ids matching the query
    search_params = {
        'db': 'pubmed',
        'term': query_str,
        'retmode': 'json',
    }
    if limit is not None:
        search_params['retmax'] = limit
    response = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
        params=search_params,
        timeout=timeout_seconds,
    )
    if not response.ok:
        return None

    id_list = _dig(response.json(), 'esearchresult', 'idlist') or []
    if not id_list:
        # nothing to fetch - skip the second request
        print(f"No articles found for search criteria: {search_criteria}")
        return None

    # step 2: retrieve the article records for those ids
    fetch_params = {
        'db': 'pubmed',
        'id': ','.join(id_list),
        'retmode': 'xml',
    }
    response = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params=fetch_params,
        timeout=timeout_seconds,
    )
    if not response.ok:
        return None

    # parse the xml and convert it to nested dicts / lists
    root = ET.fromstring(response.text)
    article_list = _dig(elem_to_dict(root), 'PubmedArticle')
    if article_list is None:
        print(f"No articles found for search criteria: {search_criteria}")
        return None
    if not isinstance(article_list, list):
        # a single article is returned as a dict, not a list
        article_list = [article_list]

    final_articles = []
    for article in article_list:
        article_info = _dig(article, 'MedlineCitation', 'Article')
        journal = _dig(article_info, 'Journal')
        pub_date = _dig(journal, 'JournalIssue', 'PubDate')

        publication_year = _dig(pub_date, 'Year')
        if publication_year is None:
            # NOTE(review): records without <Year> appear to carry a free-text
            # <MedlineDate> starting with the year - confirm against the PubMed DTD
            medline_date = _dig(pub_date, 'MedlineDate')
            if isinstance(medline_date, str) and medline_date[:4].isdigit():
                publication_year = medline_date[:4]

        # start article package
        article_dict = {
            'pmid': _dig(article, 'MedlineCitation', 'PMID'),
            'journal_issn': _dig(journal, 'ISSN'),
            'journal_title': _dig(journal, 'Title'),
            'article_title': _dig(article_info, 'ArticleTitle'),
            'publication_year': publication_year,
        }
        author_list = _dig(article_info, 'AuthorList', 'Author')

        if not author_search:
            article_dict['author_list'] = author_list
            final_articles.append(article_dict)
            continue

        # author search - double check that the correct author is present
        if author_list is None:
            # no authors recorded, so the author cannot be confirmed
            continue
        if isinstance(author_list, dict):
            # a lone author is a dict; the filter iterates over a list
            author_list = [author_list]
        author_metadata = {
            'author_first_name': first_name,
            'author_last_name': last_name,
            'institution': institution,
        }
        filtered_article = author_metadata_filter(author_metadata, article_dict, author_list)
        if filtered_article is not None:
            final_articles.append(filtered_article)

    # return only after every article has been processed
    return final_articles

In [64]:
# Example run: author search for "rachel gottschalk" restricted to 2022.
# 'institution' is not part of the PubMed query string; query_pubmed passes it
# to the author filter, which compares it with the matched author's affiliation.
search_criteria = {'author_first_name': 'rachel', 
                   'author_last_name': 'gottschalk', 
                   'publication_year': 2022,
                   'institution':  'university of pittsburgh at pittsburgh'
                   }
response = query_pubmed(search_criteria=search_criteria)

In [65]:
response


[{'pmid': '35063833',
  'journal_issn': '1879-0372',
  'journal_title': 'Current opinion in immunology',
  'article_title': 'Mechanisms encoding STAT functional diversity for context-specific inflammatory responses.',
  'publication_year': '2022',
  'author_first_name': 'rachel',
  'author_last_name': 'gottschalk',
  'institution': 'university of pittsburgh at pittsburgh'}]

In [7]:
# Baseline experiment: one fuzz.ratio call comparing the whole institution
# string with the whole lower-cased affiliation text (recorded output: 27).
# `author_` is assigned in a different notebook cell (In [22]).
university = 'university of pittsburgh at pittsburgh'
affiliation_info = author_['AffiliationInfo']['Affiliation']
fuzz.ratio(university, affiliation_info.lower())

27

In [6]:
search_criteria['author_first_name']

'rachel'

In [18]:
example = response[0]


In [19]:
len(response)

1

In [43]:
best_match

{'LastName': 'Gottschalk',
 'ForeName': 'Rachel A',
 'Initials': 'RA',
 'AffiliationInfo': {'Affiliation': 'Department of Immunology, University of Pittsburgh School of Medicine, University of Pittsburgh, Pittsburgh, PA, USA; Center for Systems Immunology, University of Pittsburgh, Pittsburgh, PA, USA. Electronic address: rachel.gottschalk@pitt.edu.'}}

In [45]:
institution = 'university of pittsburgh at pittsburgh'




Found match with score 87.0


In [None]:
 institution_check(institution, affiliation_info)

In [22]:
    author_ = author_list['Author'][1]
    last_name = author_['LastName'].lower()
    first_name = author_['ForeName'].lower()

overall_score

87.0