# PubMed Dashboarding to Identify Research Trends with EntrezPy, Plotly, Dash

Goal is to pull all publications by a search query, and visualize its NIH compliance, the journals it appears in, and research trends. 

In [1]:
from Bio import Entrez #use this library to access PubMed database and its Medline records
import pandas as pd #to export PMID's or PubMed Queries as needed
from pandas import DataFrame
import plotly.express as px

from jupyter_dash import JupyterDash #use plotly + Dash for building dashboard instead of using default
import dash
import dash_core_components as dcc
import dash_html_components as html


In [2]:
#to let NLM know who I am
Entrez.email = "sarah.ngo@ucsf.edu" 

In [3]:
#the following formulas are for setting up the dataframe by searching Entrzpy to retrieve the Medline records according to a search query

#this is to return all the PMIDs corresponding to a search term
def get_term_pmids(term):
    links = Entrez.esearch(db="pubmed", retmax = 10000, term=term)
    record = Entrez.read(links)
    link_list = record['IdList']
    return link_list

#this is to check medline record for PMID that the medline records correspond to
def get_medline(pubmed_list):
    handle = Entrez.efetch(db='pubmed',id=pubmed_list, retmode='text', rettype='medline')
    return handle.read()

#this returns all the mesh terms of a set of PMID list
def get_mesh_terms(link_list):
    record = Entrez.efetch(db="pubmed", retmax = 10000, term=term)







    
    
    
    

In [4]:
#prompt input for search search, and retrieve results
query = input("what do you want to search for?")


what do you want to search for?((Kotwal A San Francisco AND (2020:2021[pdat])) AND (Kotwal A San Francisco)) AND (Ashwin Kotwal AND (2020:2021[pdat]))


In [5]:
pmids_list = get_term_pmids(query)
pmids_list

['33099456', '33048142', '32965024', '32865472', '32803486', '32422084', '32359072', '32356919', '31775021']

In [6]:
#this is to format the text blob. retrieve medline record and split record by line break so it will not be one big text blob
pmid= get_medline(pmids_list).split('\n')

pubmed_list = []
for element in pmid:
    new_string = element.split('- ')
    pubmed_list.append(new_string)
    #print(new_string) #print to see that it retrieves the right records, and see if it formatted correctly



In [7]:
pubmed_list

[[''],
 ['PMID', '33099456'],
 ['OWN ', 'NLM'],
 ['STAT', 'MEDLINE'],
 ['DCOM', '20201030'],
 ['LR  ', '20201030'],
 ['IS  ', '1557-9859 (Electronic)'],
 ['IS  ', '0025-7125 (Linking)'],
 ['VI  ', '104'],
 ['IP  ', '6'],
 ['DP  ', '2020 Nov'],
 ['TI  ',
  'Cancer Screening in Older Adults: Individualized Decision-Making and'],
 ['      Communication Strategies.'],
 ['PG  ', '989-1006'],
 ['LID ', 'S0025-7125(20)30080-8 [pii]'],
 ['LID ', '10.1016/j.mcna.2020.08.002 [doi]'],
 ['AB  ',
  'Cancer screening decisions in older adults can be complex due to the unclear'],
 ['      cancer-specific mortality benefits of screening and several known harms including'],
 ['      false positives, overdiagnosis, and procedural complications from downstream'],
 ['      diagnostic interventions. In this review, we provide a framework for'],
 ['      individualized cancer screening decisions among older adults, involving'],
 ['      accounting for overall health and life expectancy, individual values, a

df= {}

def create_df(pubmed_list):
    for line_item in pubmed_list:
        try:
            df[line_item[0]] = line_item[1] 
        except Exception:
            if line_item == ['']:
                pass
make_df = create_df(pubmed_list)
df

In [14]:
df = DataFrame(pubmed_list, columns=['key', 'value'])
df

Unnamed: 0,key,value
0,,
1,PMID,33099456
2,OWN,NLM
3,STAT,MEDLINE
4,DCOM,20201030
...,...,...
829,AID,10.1016/j.jpainsymman.2019.11.014 [doi]
830,PST,ppublish
831,SO,J Pain Symptom Manage. 2020 Apr;59(4):916-931....
832,10.1016/j.jpainsymman.2019.11.014. Epub ...,


In [15]:
df.transpose()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,824,825,826,827,828,829,830,831,832,833
key,,PMID,OWN,STAT,DCOM,LR,IS,IS,VI,IP,...,PHST,PHST,PHST,PHST,AID,AID,PST,SO,10.1016/j.jpainsymman.2019.11.014. Epub ...,
value,,33099456,NLM,MEDLINE,20201030,20201030,1557-9859 (Electronic),0025-7125 (Linking),104,6,...,2019/11/13 00:00 [accepted],2019/11/28 06:00 [pubmed],2019/11/28 06:00 [medline],2019/11/28 06:00 [entrez],S0885-3924(19)30669-4 [pii],10.1016/j.jpainsymman.2019.11.014 [doi],ppublish,J Pain Symptom Manage. 2020 Apr;59(4):916-931....,,


In [26]:
df = df[~df.rows[0:].str.contains('PMID')].reset_index(drop=True)

AttributeError: 'DataFrame' object has no attribute 'rows'

In [None]:
#publications_list = pd.read_csv('csv-P30AG04428-set.csv')
#publications_list

In [None]:
dataframe = pd.DataFrame(publications_list)

In [None]:
dataframe

In [None]:
df = pd.DataFrame(publications_list, columns=['PMID', 'PMCID', 'Publication_Year', "Journal"])

In [None]:
df.columns

In [None]:
import numpy as np
import matplotlib.pyplot as plt


In [None]:
x = dataframe.Publication_Year
y = dataframe.Journal

In [None]:
plt.scatter(x, y)
plt.title('Journals We Published To Over the Years')

In [None]:
plt.hist(y)
plt.title('Journals Histogram')

In [None]:
plt.bar(x, height=y)
plt.title('Journals over the Years')

In [None]:
x = dataframe.Publication_Year
z = dataframe.First_Author

plt.scatter(x, z)
plt.title('First Authors Who Published To Over the Years')

In [None]:
x = dataframe.Publication_Year
t = dataframe.Title

plt.scatter(x, t)
plt.title('Articles Published To Over the Years')