# PubMed Dashboarding to Identify Research Trends with EntrezPy, Plotly, Dash

Goal is to pull all publications by a search query, and visualize its NIH compliance, the journals it appears in, and research trends. 

In [1]:
from Bio import Entrez #use this library to access PubMed database and its Medline records
import pandas as pd #to export PMID's or PubMed Queries as needed
from pandas import DataFrame
import plotly.express as px

from jupyter_dash import JupyterDash #use plotly + Dash for building dashboard instead of using default
import dash
import dash_core_components as dcc
import dash_html_components as html

pd.set_option('display.max_rows', 500)


In [2]:
#to let NLM know who I am
Entrez.email = "sarah.ngo@ucsf.edu" 

In [3]:
#the following formulas are for setting up the dataframe by searching Entrzpy to retrieve the Medline records according to a search query

#this is to return all the PMIDs corresponding to a search term
def get_term_pmids(term):
    links = Entrez.esearch(db="pubmed", retmax = 100000, term=term)
    record = Entrez.read(links)
    link_list = record['IdList']
    return link_list

#this is to check medline record for PMID that the medline records correspond to
def get_medline(pubmed_list):
    handle = Entrez.efetch(db='pubmed',id=pubmed_list, retmode='text', rettype='medline')
    return handle.read()

In [4]:
#prompt input for search search, and retrieve results
query = input("what do you want to search for?")


what do you want to search for?((Kotwal A San Francisco AND (2020:2021[pdat])) AND (Kotwal A San Francisco)) AND (Ashwin Kotwal AND (2020:2021[pdat]))


In [5]:
pmids_list = get_term_pmids(query)
pmids_list

['33099456', '33048142', '32965024', '32865472', '32803486', '32422084', '32359072', '32356919', '31775021']

In [6]:

draft_pubmed_list = []

for pmid_query in pmids_list:
    try:
        pmid= get_medline(pmid_query).split('\n')
    
        for element in pmid:
            new_string = element.split('- ')
            draft_pubmed_list.append(new_string)
        
        draft_df = DataFrame(draft_pubmed_list, columns=['key', 'value'])
        draft_df.key = draft_df.key.apply(str.strip)
        draft_df.dropna(inplace=True)
        
    except Exception:
        print(f'{pmid_query} needs troubleshooting')


In [7]:
draft_df

Unnamed: 0,key,value
1,PMID,33099456
2,OWN,NLM
3,STAT,MEDLINE
4,DCOM,20201030
5,LR,20201030
...,...,...
832,PHST,2019/11/28 06:00 [entrez]
833,AID,S0885-3924(19)30669-4 [pii]
834,AID,10.1016/j.jpainsymman.2019.11.014 [doi]
835,PST,ppublish


In [8]:
draft_df['key'].count()

622

In [9]:
is_pmid = draft_df['key']=='PMID'
df_pmid = draft_df[is_pmid]
df_pmid

Unnamed: 0,key,value
1,PMID,33099456
88,PMID,33048142
151,PMID,32965024
297,PMID,32865472
393,PMID,32803486
463,PMID,32422084
522,PMID,32359072
583,PMID,32356919
712,PMID,31775021


In [10]:
is_MH = draft_df['key']=='MH'
df_MH = draft_df[is_MH]
df_MH

Unnamed: 0,key,value
50,MH,Aged
51,MH,"Aged, 80 and over"
52,MH,Communication
53,MH,Decision Making
54,MH,*Early Detection of Cancer
55,MH,*Health Services for the Aged
56,MH,Humans
57,MH,United States
131,MH,Adult
132,MH,Chronic Disease


In [11]:
df_pmid.shape

(9, 2)

In [12]:
df_MH.shape

(69, 2)

In [26]:
pmids_list = ['33099456']
pmid= get_medline(pmids_list).split('\n')

pubmed_list = []
for element in pmid:
    new_string = element.split('- ')
    pubmed_list.append(new_string)
    
df = DataFrame(pubmed_list, columns=['key', 'value'])
df.key = df.key.apply(str.strip)
df.dropna(inplace=True)
df['PMID'] = 33099456
df['DP']='2020 Nov'
df

Unnamed: 0,key,value,PMID,DP
1,PMID,33099456,33099456,2020 Nov
2,OWN,NLM,33099456,2020 Nov
3,STAT,MEDLINE,33099456,2020 Nov
4,DCOM,20201030,33099456,2020 Nov
5,LR,20201030,33099456,2020 Nov
6,IS,1557-9859 (Electronic),33099456,2020 Nov
7,IS,0025-7125 (Linking),33099456,2020 Nov
8,VI,104,33099456,2020 Nov
9,IP,6,33099456,2020 Nov
10,DP,2020 Nov,33099456,2020 Nov


In [27]:
pmids_list = ['33048142']
pmid= get_medline(pmids_list).split('\n')

pubmed_list = []
for element in pmid:
    new_string = element.split('- ')
    pubmed_list.append(new_string)
    
df2 = DataFrame(pubmed_list, columns=['key', 'value'])
df2.key = df.key.apply(str.strip)
df2.dropna(inplace=True)
df2['PMID'] = 33048142
df2['DP']= '2020 Oct 13'
df2

Unnamed: 0,key,value,PMID,DP
1,PMID,33048142,33048142,2020 Oct 13
2,OWN,NLM,33048142,2020 Oct 13
3,STAT,MEDLINE,33048142,2020 Oct 13
4,DCOM,20201016,33048142,2020 Oct 13
5,LR,20201231,33048142,2020 Oct 13
6,IS,1538-3598 (Electronic),33048142,2020 Oct 13
7,IS,0098-7484 (Linking),33048142,2020 Oct 13
8,VI,324,33048142,2020 Oct 13
9,IP,14,33048142,2020 Oct 13
10,DP,2020 Oct 13,33048142,2020 Oct 13


In [28]:
new_df = pd.concat([df,df2], ignore_index=True)
new_df

Unnamed: 0,key,value,PMID,DP
0,PMID,33099456,33099456,2020 Nov
1,OWN,NLM,33099456,2020 Nov
2,STAT,MEDLINE,33099456,2020 Nov
3,DCOM,20201030,33099456,2020 Nov
4,LR,20201030,33099456,2020 Nov
5,IS,1557-9859 (Electronic),33099456,2020 Nov
6,IS,0025-7125 (Linking),33099456,2020 Nov
7,VI,104,33099456,2020 Nov
8,IP,6,33099456,2020 Nov
9,DP,2020 Nov,33099456,2020 Nov


In [24]:
is_pmid = new_df['key']=='PMID'
df_pmid = new_df[is_pmid]
df_pmid


Unnamed: 0,key,value,PMID
0,PMID,33099456,33099456
63,PMID,33048142,33048142


# Creating plots with Plotly


In [16]:
#publication year
fig1 = px.scatter(draft_df, x=df_pmid) #show this - color by Journal and by First Author
fig1.show() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  args[field_name][i] = str(col_name)


KeyError: 0