In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re

In [2]:
# Get_Beautifulsoup
PMID = "30135580"
URL = f"https://pubmed.ncbi.nlm.nih.gov/{PMID}/"

r = requests.get(URL)
print(f"Status code: {r.status_code}")
html_pubmed = bs(r.text, 'html.parser')


Status code: 200


In [3]:
# Get_year_of_pub
year = int(html_pubmed.find('div', {'class': "article-source"}).find('span', {'class': "cit"}).text.split(' ', 1)[0])
year_dict = {'year_of_publication': year}
print(year_dict)

{'year_of_publication': 2018}


In [4]:
# Get_PMCID
try:
    PMCID = html_pubmed.find('span', {'class': "identifier pmc"}).a.text.strip()
except AttributeError:
    print("No PMCID")
else:
    PMCID_dict = {'PMCID': PMCID}
print(PMCID_dict)

{'PMCID': 'PMC6245578'}


In [5]:
# Get_DOI
DOI = html_pubmed.find('span', {'class': "identifier doi"}).a.text.strip()
DOI_dict ={'DOI': DOI}
print(DOI_dict)

{'DOI': '10.1038/s41586-018-0442-2'}


In [6]:
# Get_PMID
PMID = html_pubmed.find('strong', {'class': "current-id"}).text.strip()
PMID_dict ={'PMID': PMID}
print(PMID_dict)

{'PMID': '30135580'}


In [7]:
# Get_title
title = html_pubmed.h1.text.strip()
title_dict = {'title': title}
print(title_dict)


{'title': 'Structure of paused transcription complex Pol II-DSIF-NELF'}


In [8]:
# Get_authors_and_affiliations
auteurs = []
affiliations = []
authors = html_pubmed.find_all('span', {'class': "authors-list-item"})
for author in authors:
    try:
        aut = author.find('a', {'class': "full-name"})['data-ga-label']
        auteurs.append(aut)
        affil = author.find('a', {'class': "affiliation-link"})['title']
        affiliations.append(affil)
    except TypeError:
        continue

authors_affiliation_dict = dict(zip(auteurs, affiliations))
print(authors_affiliation_dict)



{'Seychelle M Vos': 'Max Planck Institute for Biophysical Chemistry, Department of Molecular Biology, Göttingen, Germany.', 'Lucas Farnung': 'Max Planck Institute for Biophysical Chemistry, Department of Molecular Biology, Göttingen, Germany.', 'Henning Urlaub': 'Max Planck Institute for Biophysical Chemistry, Bioanalytical Mass Spectrometry, Göttingen, Germany.', 'Patrick Cramer': 'Max Planck Institute for Biophysical Chemistry, Department of Molecular Biology, Göttingen, Germany. patrick.cramer@mpibpc.mpg.de.'}


In [9]:
# Get_Abstract
abstract = html_pubmed.find('div', {'class': "abstract-content selected"}).text.strip().replace('\n', '')
abstract_dict = {'abstract': abstract}
print(abstract_dict)


{'abstract': "Metazoan gene regulation often involves the pausing of RNA polymerase II (Pol II) in the promoter-proximal region. Paused Pol II is stabilized by the protein complexes DRB sensitivity-inducing factor (DSIF) and negative elongation factor (NELF). Here we report the cryo-electron microscopy structure of a paused transcription elongation complex containing Sus scrofa Pol II and Homo sapiens DSIF and NELF at 3.2 Å resolution. The structure reveals a tilted DNA-RNA hybrid that impairs binding of the nucleoside triphosphate substrate. NELF binds the polymerase funnel, bridges two mobile polymerase modules, and contacts the trigger loop, thereby restraining Pol II mobility that is required for pause release. NELF prevents binding of the anti-pausing transcription elongation factor IIS (TFIIS). Additionally, NELF possesses two flexible 'tentacles' that can contact DSIF and exiting RNA. These results define the paused state of Pol II and provide the molecular basis for understandi

In [10]:
# Get_references_as_PMID

def not_PMC(href):
    return href and not re.compile("pmc").search(href)


URL_ref_pubmed = f"https://pubmed.ncbi.nlm.nih.gov/{PMID_dict['PMID']}/references/"
r = requests.get(URL_ref_pubmed)
print(f"Status code: {r.status_code}")

references_all = bs(r.text, 'html.parser')
references_incr = references_all.find_all('li')

cited_in_ref = []
for ref in references_incr:
    only_pub = ref.find(href=not_PMC)['href'].strip('/')
    if len(only_pub) > 0:
        cited_in_ref.append(only_pub)
print(cited_in_ref)
  

Status code: 200
['12676794', '12676794', '3520340', '3520340', '2825027', '2825027', '3099167', '3099167', '1505520', '1505520', '9450929', '9450929', '10199401', '10199401', '12612062', '12612062', '7759473', '7759473', '9491887', '9491887', '14701750', '14701750', '16427012', '16427012', '8900211', '8900211', '20434984', '20434984', '25773599', '25773599', '19820169', '19820169', '12782658', '12782658', '22982352', '22982352', '28892040', '28892040', '28775211', '28775211', '27282391', '27282391', '16898873', '16898873', '9651586', '9651586', '24789973', '24789973', '21346759', '21346759', '22056778', '22056778', '29499135', '29499135', '29499136', '29499136', '11313498', '11313498', '10499798', '10499798', '12914699', '12914699', '17129781', '17129781', '17581590', '17581590', '27193681', '27193681', '11940650', '11940650', '24453987', '24453987', '28994650', '28994650', '20534440', '20534440', '17073751', '17073751', '1379232', '1379232', '15629721', '15629721', '20007866', '20007

In [16]:
# GetHTML_Pubmed_Citedby

URL = "https://pubmed.ncbi.nlm.nih.gov/?linkname=pubmed_pubmed_citedin&from_uid=" + f"{PMID}"

r = requests.get(URL)
print(f"Status code: {r.status_code}")
citedby = bs(r.text, 'html.parser')

Status code: 200


In [96]:
# GetNumberPages
ref_citing =[]
try:
    number_total_pages = int(citedby.find('label', {'class': "of-total-pages"}).text.split(' ')[1])
except AttributeError:
    print("Not cited")
else:
    # Get RefCytedBy
    
    ref_page_1 = citedby.find_all('span', {'class': "docsum-journal-citation full-journal-citation"})
    
    for doi in ref_page_1:
        ref_doi = doi.text.split('doi:')[1].lstrip(' ')
        if len(ref_doi.split(' ')) > 1:
            ref_citing.append(ref_doi.split(' ')[0].rstrip('.'))
        else:
            ref_citing.append(ref_doi.rstrip('.'))


if number_total_pages > 1:
    for i in range(2, (number_total_pages +1)):
        URL_suite = f"{URL}&page={i}"
        
        r = requests.get(URL_suite)
        print(f"Status code: {r.status_code}")
        citedby_suite = bs(r.text, 'html.parser')
            
        ref_pages_suite = citedby_suite.find_all('span', {'class': "docsum-journal-citation full-journal-citation"})
        for doi in ref_pages_suite:
            ref_doi = doi.text.split('doi:')[1].lstrip(' ')
            if len(ref_doi.split(' ')) > 1:
                ref_citing.append(ref_doi.split(' ')[0].rstrip('.'))
            else:
                ref_citing.append(ref_doi.rstrip('.'))
del ref_citing[0]
references_citedby_dict_DOI = {'cited_by_DOI': ref_citing}
print(references_citedby_dict_DOI)

Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
{'cited_by_DOI': ['10.1186/s12964-021-00776-1', '10.1126/sciadv.abh1684', '10.1126/sciadv.abb2947', '10.1016/j.devcel.2021.02.025', '10.1016/j.molcel.2020.12.002', '10.1016/j.ceb.2020.11.003', '10.15252/embj.2020105564', '10.1093/nar/gkaa1208', '10.1371/journal.ppat.1009055', '10.1038/d41586-018-05971-8', '10.1038/s41586-018-0440-4', '10.1073/pnas.1000681107', '10.1093/nar/gkw571', '10.1016/j.bbagrm.2012.11.007', '10.1146/annurev-genet-110711-155440', '10.1016/j.molcel.2021.12.020', '10.1098/rsob.210220', '10.1038/s41596-021-00640-z', '10.1038/s41467-021-26935-z', '10.1038/s41568-021-00411-8', '10.1038/d41586-018-05971-8', '10.1038/s41586-018-0440-4', '10.1073/pnas.1000681107', '10.1093/nar/gkw571', '10.1016/j.bbagrm.2012.11.007', '10.1146/annurev-genet-110711-155440', '10.1016/j.molcel.2021.12.020', '10.1098/rsob.210220', '10.1038/s41596-021-00640-z', '10.1038/s41467-021-26935-z', '10