# PubMed PhD Supervisor Search

## Load Packages

In [1]:
import re
import ast
import pandas as pd
from functions import get_article_ids
from collections import Counter
from tqdm.notebook import tqdm
from geotext import GeoText


In [2]:
def get_locations(affiliations):
    locations = []
    for affiliation in affiliations:
        places = GeoText(affiliation)
        if places:
            response = list(set(list(places.countries) + list(places.cities)))
            locations = locations + response
        else:
            pass
        if 'University' in locations: locations.remove('University')
        return locations

## Set Variables

In [79]:
query = 'inflammation depression'
loi = ['united states', 'france', 'netherlands', 'denmark', 'sweden', 'germany', 'switzerland', 'norway', 
                   'finland', 'luxembourg', 'belgium', 'austria', 'cambridge', 'oxford', 'london']

## Fetch Entries from PubMed

In [4]:
response = get_article_ids(query, sort = 'relevance', from_year = 2018, api_key="9f66a38099f29d882365afb5ea170b1ef608")
papers_result = response[0].to_dict('records')
affiliations_result = response[1]

Query:inflammation depression
Number of Results: 2359


HBox(children=(FloatProgress(value=0.0, description='Downloading inflammation depression data in chunks : ', m…




In [6]:
papers_result

[{'title': 'Stress, depression, diet, and the gut microbiota: human-bacteria interactions at the core of psychoneuroimmunology and nutrition.',
  'pmid': '32395568',
  'keywords': [],
  'pub_type_list': ['Journal Article'],
  'journal_info_list': ['Current opinion in behavioral sciences',
   '2352-1546',
   'Print',
   'Curr Opin Behav Sci'],
  'author_list': [['Annelise', 'A', 'Madison'],
   ['Janice K', 'JK', 'Kiecolt-Glaser']],
  'affil_list': ['Institute for Behavioral Medicine Research, The Ohio State University College of Medicine, United States.',
   'Department of Psychiatry and Behavioral Health, The Ohio State University College of Medicine, United States.'],
  'pubdate': '2020',
  'link': 'https://www.ncbi.nlm.nih.gov/pubmed/32395568',
  'abstract': ["Humans and their gut bacteria have evolved multiple ways to communicate with and regulate one another. Psychological stress and depression can promote consumption of highly palatable foods, influencing which gut bacteria thrive

## Create Tables

In [8]:
#Create Author List
def create_author_list(papers_result):
    authors = []
    for paper_dictionary in papers_result:
        for author in paper_dictionary['author_list']:
            try:
                authors.append({'author_list' : author, 'title' : paper_dictionary['title'],
                                'pmid' : paper_dictionary['pmid'], 
                                'author_string' : author[2] + ", " + author[0]})
            except:
                print(author)
                pass
    return authors

authors = create_author_list(papers_result)

In [9]:
#Create Affiliations List
def create_affil_list(affiliations_result):
    affiliations = []
    for affiliation in affiliations_result:
        try:
            author_string = affiliation['author'][2] + ', ' + affiliation['author'][0]
            affiliations.append({'author' : str(author_string), 
                                 'affiliation' : affiliation['affiliation'], 
                                 'locations' : get_locations(affiliation['affiliation']), 
                                 'pmid' : affiliation['pmid']})
        except:
            print(affiliation)
            pass
    return affiliations

affiliations = create_affil_list(affiliations_result)

In [80]:
def filter_affiliations_by_location(affiliations_by_author, locations_of_interest = loi):
    """
    Given a list of author's affiliations and the user's locations of interest, filter the affiliations
    to only contain locations the user is interested in.
    
    Args:
        affiliations_by_author - 
        locations_of_interest - List: Hardcoded list of interesting locations
    Returns:
        
    """
    keep_affiliations_by_author = []
    for author in affiliations_by_author:
        for affiliation in [d['affiliation'] for d in author['affiliations']]:
            for location in locations_of_interest:
                if location.lower() in affiliation.lower():
                    keep_affiliations_by_author.append(author)
                    
    return keep_affiliations_by_author

def map_author_to_affil(top_authors, affiliations, locations_of_interest = loi):
    """
    Get `n` most common authors, find their top 3 affiliations and their geographic locations. 
    
    Filter list of authors by whether one of their affiliation locations lands in a location_of_interest
    
    Args:
    
    Returns:
        List - List elements are dictionaries containing data on an author's top 3 affiliations 
            if they land in the supplied `locations_of_interest`. 
    """
    #Get Affiliations for Top 200 Authors
    affiliations_by_author = []
    ### Count up most common authors
    author_list = list(top_authors.keys())
    
    for author_name in author_list:
        matching_affiliations = []
        for entry in affiliations:
            if author_name == entry['author']:
                if entry.get('locations'):
                    for location in entry['locations']:
                        matching_affiliations.append(location)
                else:
                    matching_affiliations.append('none')
        affs = Counter(matching_affiliations)
        
        reformatted_affs = []
        for aff in affs.most_common(3):
            reformatted_affs.append({'affiliation' : aff[0], 'count' : aff[1]})
        affiliations_by_author.append({'author' : author_name, 
                                       'total_papers' : sum(list(affs.values())),
                                       'affiliations' : reformatted_affs})

    #Filter authors down to authors matching locations of interest
    keep_affiliations_by_author = filter_affiliations_by_location(affiliations_by_author, locations_of_interest = loi)
                
    return keep_affiliations_by_author

top_authors = dict(Counter(pd.DataFrame(authors)['author_string']).most_common(200))
affiliations_by_author = map_author_to_affil(top_authors, affiliations)

In [88]:

sorted(affiliations_by_author, key=lambda k: k['total_papers'], reverse=True)

[{'author': 'Dantzer, Robert',
  'total_papers': 30,
  'affiliations': [{'affiliation': 'Houston', 'count': 15},
   {'affiliation': 'Texas', 'count': 13},
   {'affiliation': 'United States', 'count': 2}]},
 {'author': 'Passerieux, C',
  'total_papers': 27,
  'affiliations': [{'affiliation': 'Versailles', 'count': 6},
   {'affiliation': 'France', 'count': 6},
   {'affiliation': 'Créteil', 'count': 6}]},
 {'author': 'Urbach, M',
  'total_papers': 27,
  'affiliations': [{'affiliation': 'Versailles', 'count': 6},
   {'affiliation': 'France', 'count': 6},
   {'affiliation': 'Créteil', 'count': 6}]},
 {'author': 'Miller, Andrew H',
  'total_papers': 26,
  'affiliations': [{'affiliation': 'Atlanta', 'count': 17},
   {'affiliation': 'United States', 'count': 5},
   {'affiliation': 'Georgia', 'count': 3}]},
 {'author': 'Pariante, Carmine M',
  'total_papers': 22,
  'affiliations': [{'affiliation': 'London', 'count': 15},
   {'affiliation': 'United Kingdom', 'count': 5},
   {'affiliation': 'none

In [38]:
def author_affil_total_df(affiliations_by_author, n=25):
    """
    Combines data from previous steps to generate a dataframe of top publishing authors and their most frequently
    cited affiliations.
    """
    #Get top 25 from remaining authors
    affiliations_by_author_df = pd.DataFrame(affiliations_by_author)

    ### affiliations_by_author_df['totalPapers'] = affiliations_by_author_df['author'].map(top_authors)
    ### Removed because occasionally totalPapers was < sum of counts of topAffiliations which seems like a no-no
    ### `total_papers` is calculated in the `map_author_to_affil()` function and is calculated before 
    ### undesirable locations are filtered out
    affiliations_by_author_df['totalPapers'] = affiliations_by_author_df['total_papers']

    paper_counts_affiliations = affiliations_by_author_df.to_dict('records')

    flat_dict = []
    for author in paper_counts_affiliations:
        topAffiliations = []
        topAffiliations = {affiliation['affiliation'] : str(affiliation['count']) for affiliation in \
                           author['affiliations']}
        
        flat_dict.append({'author' : author['author'],
                          'topAffiliations' : topAffiliations,
                          'totalPapers' : author['totalPapers']})

    out_df = pd.DataFrame(flat_dict).sort_values(by='totalPapers', ascending=False).reset_index(drop=True)
    return out_df.head(n)
    
author_affil_total_df(affiliations_by_author)

Unnamed: 0,author,topAffiliations,totalPapers
0,"Urbach, M","{'Versailles': '6', 'France': '6', 'Créteil': ...",27
1,"Passerieux, C","{'Versailles': '6', 'France': '6', 'Créteil': ...",27
2,"Pariante, Carmine M","{'London': '15', 'United Kingdom': '5', 'none'...",22
3,"Fuchs, Dietmar","{'Austria': '11', 'Innsbruck': '11'}",22
4,"Boyer, L","{'France': '8', 'Marseille': '7', 'Créteil': '6'}",21
5,"Fond, G","{'France': '8', 'Créteil': '6', 'Marseille': '6'}",20
6,"Lançon, C","{'France': '8', 'Marseille': '8', 'Créteil': '4'}",20
7,"Mallet, J","{'Colombes': '6', 'France': '6', 'Créteil': '6'}",19
8,"Dubertret, C","{'Colombes': '6', 'France': '6', 'Créteil': '6'}",19
9,"Llorca, P M","{'Clermont-Ferrand': '6', 'France': '6', 'Crét...",18


In [78]:
def get_authors_papers(author_of_interest, authors, papers_result):
    matching_pmids = []
    ### Find an authors PMIDs
    for author in authors:
        if author['author_string'] == author_of_interest:
            matching_pmids.append(author['pmid'])
    
    matching_papers = []
    for paper in papers_result:
        if paper['pmid'] in matching_pmids:
            matching_papers.append(paper)

    papers_sample = pd.DataFrame(matching_papers).to_dict('records')
    matching_papers_df = pd.DataFrame(matching_papers)
    matchedPapers = matching_papers_df.drop(columns=['pub_type_list', 'journal_info_list', 'author_list', 'keywords'])
    matchedPapers_dicts = matchedPapers.to_dict('records')
    
        
    return matchedPapers_dicts

#Get papers for top 25 authors
paper_top_author_list = []
for author_of_interest in top_authors:
    matchedPapers_dicts = get_authors_papers(author_of_interest, authors, papers_result)
    for matchedPaper in matchedPapers_dicts:
        top_paper = {'author' : author_of_interest, 
                     'title' : matchedPaper['title'], 
                     'pubdate' : matchedPaper['pubdate'], 
                     'link' : matchedPaper['link'], 
                     'pmid' : matchedPaper['pmid']}
        paper_top_author_list.append(top_paper)

paper_top_author_df = pd.DataFrame(paper_top_author_list)

affiliations_df = pd.DataFrame(affiliations)[['author', 'affiliation', 'pmid']]

big_df = pd.merge(paper_top_author_df, affiliations_df, on = ['author', 'pmid'])
out_dict = {}
for author_dict in affiliations_by_author:
    out_dict[author_dict['author']] = {'author' : author_dict['author'], 
                         'total_count' : author_dict['total_papers'],
                         'locations' : str([d['affiliation'] for d in author_dict['affiliations']]),
                           'paper_data' : big_df.loc[big_df['author'] == author_dict['author'], :].to_dict('records')
                       }
    
    
out_dict

{'Pariante, Carmine M': {'author': 'Pariante, Carmine M',
  'total_count': 22,
  'locations': "['London', 'United Kingdom', 'none']",
  'paper_data': [{'author': 'Pariante, Carmine M',
    'link': 'https://www.ncbi.nlm.nih.gov/pubmed/32194233',
    'pmid': '32194233',
    'pubdate': '2020',
    'title': 'Glucocorticoids prime the inflammatory response of human hippocampal cells through up-regulation of inflammatory pathways.',
    'affiliation': ["Stress, Psychiatry and Immunology (SPI) Lab, Institute of Psychiatry, Psychology & Neuroscience, King's College London, London, UK."]},
   {'author': 'Pariante, Carmine M',
    'link': 'https://www.ncbi.nlm.nih.gov/pubmed/32180741',
    'pmid': '32180741',
    'pubdate': '2020',
    'title': 'The Anti-Inflammatory Role of Omega-3 Polyunsaturated Fatty Acids Metabolites in Pre-Clinical Models of Psychiatric, Neurodegenerative, and Neurological Disorders.',
    'affiliation': ["Stress, Psychiatry and Immunology Laboratory, Department of Psychol

In [76]:
affiliations_by_author

[{'author': 'Pariante, Carmine M',
  'total_papers': 22,
  'affiliations': [{'affiliation': 'London', 'count': 15},
   {'affiliation': 'United Kingdom', 'count': 5},
   {'affiliation': 'none', 'count': 1}]},
 {'author': 'Penninx, Brenda W J H',
  'total_papers': 16,
  'affiliations': [{'affiliation': 'Amsterdam', 'count': 12},
   {'affiliation': 'Netherlands', 'count': 3},
   {'affiliation': 'none', 'count': 1}]},
 {'author': 'Fuchs, Dietmar',
  'total_papers': 22,
  'affiliations': [{'affiliation': 'Austria', 'count': 11},
   {'affiliation': 'Innsbruck', 'count': 11}]},
 {'author': 'Lamers, Femke',
  'total_papers': 11,
  'affiliations': [{'affiliation': 'Amsterdam', 'count': 8},
   {'affiliation': 'Netherlands', 'count': 2},
   {'affiliation': 'none', 'count': 1}]},
 {'author': 'Khandaker, Golam M',
  'total_papers': 14,
  'affiliations': [{'affiliation': 'Cambridge', 'count': 8},
   {'affiliation': 'Peterborough', 'count': 6}]},
 {'author': 'Milaneschi, Yuri',
  'total_papers': 10,


## Save results

In [None]:
#Save papers by author in order of total number of papers
lengths = []
for key, df in big_df_grouped:
    df = df[['author', 'pmid']].drop_duplicates()
    lengths.append({'key' : key, 'length' : df.shape[0]})

lengths = sorted(lengths, key=lambda k: k['length'], reverse=True) 

filename = query + '_TopAuthors.xlsx'
writer = pd.ExcelWriter(filename)

final_authors_df.to_excel(writer, 'LocationsByAuthor')

for entry in lengths[0:20]:
    for key, df in big_df_grouped:
        if key == entry['key']:
            loc_list = ast.literal_eval(key[1])
            locs = []
            for loc in loc_list:
                if loc != 'none':
                    locs.append(loc)
            loc_string = '-'.join(locs)
            key_string = key[0] + ' ' + str(key[2])
            trimmed_df = df[['title', 'pubdate', 'link', 'affiliation']]
            trimmed_df['affiliation'] = trimmed_df['affiliation'].apply(' -- '.join)
            trimmed_df = trimmed_df.drop_duplicates()
            trimmed_df.to_excel(writer, key_string)
        else: pass
    
writer.save()

### BREAK

In [None]:
# from fuzzywuzzy import fuzz
# import copy
# ### If you remove this, it edits the original `paper_counts_affiliations` list
# paper_counts_affiliations_copy = copy.deepcopy(paper_counts_affiliations)

# fuzzy_threshold = 90

# out_author_affils = []
# for author_index, author_dict in enumerate(paper_counts_affiliations_copy):
#     author = paper_counts_affiliations_copy[author_index]['author']
#     main_affil = paper_counts_affiliations_copy[author_index]['affiliations'][0]
#     main_affil_count = main_affil['count']
#     out_affils = []
    
#     for other_affil in paper_counts_affiliations_copy[author_index]['affiliations'][1:]:
#         if fuzz.partial_ratio(main_affil['affiliation'].lower(), \
#                               other_affil['affiliation'].lower()) >= fuzzy_threshold:
#             main_affil_count = main_affil_count + other_affil['count']
#         else:
#             out_affils.append(other_affil)
#     main_affil['count'] = main_affil_count
#     out_affils = [main_affil] + out_affils
#     out_author_affils.append({'author' : author, 'affiliations' : out_affils})
    
# out_author_affils

In [None]:
#Get matching papers
author_of_interest = top_authors_df.loc[1]['author']
print('Papers Matching: ' + author_of_interest)
matching_pmids = []
for author in authors:
    if author['author_string'] == author_of_interest:
        matching_pmids.append(author['pmid'])
        
matching_papers = []
for paper in res_dicts:
    if paper['pmid'] in matching_pmids:
        matching_papers.append(paper)
        
papers_sample = pd.DataFrame(matching_papers).to_dict('records')
matching_papers_df = pd.DataFrame(matching_papers)[0:10]
matchedPapers = matching_papers_df.drop(columns=['pub_type_list', 'journal_info_list', 'author_list', 'link', 'keywords'])
top_papers

In [None]:
#Get matching affiliations
matching_affiliations = []
for author in result[1]:
    author_name = author['author'][2] + ', ' + author['author'][0]
    if author_name == author_of_interest:
        for affiliation in author['affiliation']:
            matching_affiliations.append(affiliation)
print('Most Common Affiliations Matching: ' + author_of_interest)
affs = Counter(matching_affiliations)
d = dict(Counter(affs).most_common(5))
pd.DataFrame.from_dict(d, orient='index').reset_index().rename(columns={'index':'Affiliation', 0:'Count'})

In [None]:
#Journals the author has published in
journals = []
for paper in papers_sample:
    journals.append(paper['journal_info_list'][0])
print('Most Common Journals Matching: ' + author_of_interest)
d = dict(Counter(journals).most_common(5))
pd.DataFrame.from_dict(d, orient='index').reset_index().rename(columns={'index':'Journal', 0:'Count'})

In [None]:
#Keywords related to the author
keywords = []
for paper in papers_sample:
    if len(paper['keywords']) > 0:
        for i in paper['keywords']:
            keywords.append(i)
print('Keywords Matching: ' + author_of_interest)
d = dict(Counter(keywords).most_common(25))
pd.DataFrame.from_dict(d, orient='index').reset_index().rename(columns={'index':'Journal', 0:'Count'})

In [None]:
#author_of_interest = top_authors_df.loc[0]['author']
authors_info = []
for author_of_interest in list(top_authors_df['author']):
    matching_pmids = []
    for author in authors:
        if author['author_string'] == author_of_interest:
            matching_pmids.append(author['pmid'])

    matching_papers = []
    for paper in res_dicts:
        if paper['pmid'] in matching_pmids:
            matching_papers.append(paper)

    papers_sample = pd.DataFrame(matching_papers).to_dict('records')

    #Keywords related to the author
    keywords = []
    for paper in papers_sample:
        if len(paper['keywords']) > 0:
            for i in paper['keywords']:
                keywords.append(i)
    d = dict(Counter(keywords).most_common(25))
    keywords = list(pd.DataFrame.from_dict(d, orient='index').reset_index().rename(columns={'index':'Keyword', 0:'Count'})['Keyword'])
    
    #Get matching affiliations
    matching_affiliations = []
    for author in result[1]:
        author_name = author['author'][2] + ', ' + author['author'][0]
        if author_name == author_of_interest:
            for affiliation in author['affiliation']:
                matching_affiliations.append(affiliation)
    affs = Counter(matching_affiliations)
    d = dict(Counter(affs).most_common(5))
    aff_terms = []
    for i in pd.DataFrame.from_dict(d, orient='index').reset_index().rename(columns={'index':'Affiliation', 0:'Count'})['Affiliation']:
        aff_terms = aff_terms + [d.lower().strip() for d in i.split(',')]
    aff_terms = list(set(aff_terms))
    
    authors_info.append({'author' : author_of_interest , 'keywords' : keywords, 'aff_terms' : aff_terms})

In [None]:
def search_author(keywords, location = []):
    accept = []
    for author in authors_info:
        for keyword in keywords:
            if keyword.lower() in [d.lower() for d in author['keywords']]:
                if location:
                    for term in author['aff_terms']:
                            if location in term:
                                if author not in accept:
                                    accept.append(author)
                else:
                     if author not in accept:
                        accept.append(author)   
    return pd.DataFrame(accept)

In [None]:
search_author(['CD-8', 'Depression', 'Anxiety', 'neuroimmunology'], 'boston')

In [None]:
pd.DataFrame(authors_info)