In [36]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import time
import shutil
import requests
import os
import re

# Get figures from PubMed IDs (via their PMC article pages)

In [37]:
# Base URL of PubMed Central article pages; get_page_soup appends a PMC id and '/'.
base_article_url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/'
# Ids for which pubmed_id_to_PMC got back a value that does not start with 'PMC'.
not_found_ids = []
# Ids for which take_imgs_from_site downloaded no figure at all.
no_pathway_papers = []

def create_no_pathway_file():
    """Write every entry of ``no_pathway_papers`` to 'no_pathway.txt', one per line.

    The file is opened in 'w' mode, so previous contents are overwritten.
    """
    with open('no_pathway.txt', 'w') as report:
        report.writelines("%s\n" % paper_id for paper_id in no_pathway_papers)

def create_not_found_file():
    """Write every entry of ``not_found_ids`` to 'not_found_ids.txt', one per line.

    The file is opened in 'w' mode, so previous contents are overwritten.
    """
    with open('not_found_ids.txt', 'w') as report:
        report.writelines("%s\n" % paper_id for paper_id in not_found_ids)

def pubmed_id_to_PMC(id):
    """Look up the PubMed Central id for a PubMed id via the NCBI ID converter.

    Parameters
    ----------
    id : str or int
        PubMed id; it is converted with ``str()`` before being put in the URL.

    Returns
    -------
    str
        The PMC id (a string starting with 'PMC') when the converter reports
        one. Otherwise ``id`` is appended to ``not_found_ids`` and the first
        value of the returned record is returned as before, or ``str(id)``
        when the response carries no record at all.
    """
    response = requests.get("https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids=" + str(id) + "&format=json")
    # A missing or empty 'records' list used to raise TypeError/IndexError.
    records = response.json().get('records') or []
    record = records[0] if records else {}

    # Read the PMC id by key instead of relying on it being the first value
    # of the record, which depends on the key order of the JSON payload.
    pmcid = record.get('pmcid')
    if isinstance(pmcid, str) and pmcid[:3] == 'PMC':
        return pmcid

    not_found_ids.append(id)

    # Same fallback value the old code returned for unresolved ids.
    values = list(record.values())
    return str(values[0]) if values else str(id)

def get_page_soup(PMC):
    """Fetch ``base_article_url + PMC + '/'`` and return it parsed with 'html.parser'."""
    article_url = base_article_url + PMC + '/'
    return BeautifulSoup(requests.get(article_url).content, 'html.parser')

def get_captions(soup):
    """Return the caption texts found in ``soup``.

    A caption is the ``.text`` of ``findChild()`` for every <div> whose class
    list is exactly ``['caption']``; results keep the order of ``find_all``.
    """
    def is_caption_div(tag):
        return tag.name == 'div' and tag.get('class') == ['caption']

    captions = []
    for caption_div in soup.find_all(is_caption_div):
        captions.append(caption_div.findChild().text)
    return captions

def get_figure_numbers(soup):
    """Return the ``.text`` of every <a> whose class list is exactly ``['figpopup']``."""
    def is_figure_popup(tag):
        return tag.name == 'a' and tag.get('class') == ['figpopup']

    labels = []
    for anchor in soup.find_all(is_figure_popup):
        labels.append(anchor.text)
    return labels
    
def get_img_names(soup):
    """Return, for each <img class="tileshop">, its ``src`` from the first 'bin' onwards.

    Every ``src`` must contain 'bin'; otherwise the regex search yields ``None``
    and calling ``.group()`` on it raises ``AttributeError``.
    """
    sources = []
    for img in soup.find_all('img', class_ = 'tileshop'):
        sources.append(img['src'])

    bin_pattern = re.compile("(bin(.*))")
    return [bin_pattern.search(src).group() for src in sources]

def get_image(PMC, source, figure_number, id):
    """Download one figure of a PMC article into ``output/<id>/<figure_number>.jpg``.

    Parameters
    ----------
    PMC : str
        PMC id of the article; joined onto ``base_article_url``.
    source : str
        Path of the image relative to the article URL.
    figure_number : str
        Used verbatim as the file name (with '.jpg' appended).
    id : str or int
        Paper id naming the output sub-directory.
    """
    save_path = 'output/'+ str(id)
    try:
        os.makedirs(save_path)
    except FileExistsError:
        # Only an already-existing directory is expected here. Any other
        # OSError (e.g. a permission problem) now propagates instead of being
        # reported as "exists" and failing later at open().
        print(save_path + ' exists')

    # The context manager releases the streamed connection once the copy is done.
    with requests.get(base_article_url + PMC + '/' + source, stream=True) as response:
        with open(save_path + '/' + figure_number + '.jpg', 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
        
def take_imgs_from_site(id):
    """Download every figure of one paper whose caption mentions a pathway or network.

    The PubMed id is converted with ``pubmed_id_to_PMC``, the article page is
    scraped for image sources, figure labels and captions, and those three
    lists are walked in parallel (``zip`` stops at the shortest one). When no
    figure is downloaded the id is appended to ``no_pathway_papers``.

    Parameters
    ----------
    id : str or int
        PubMed id of the paper.
    """
    PMC = pubmed_id_to_PMC(id)
    soup = get_page_soup(PMC)

    captions = get_captions(soup)
    sources = get_img_names(soup)
    figure_numbers = get_figure_numbers(soup)

    pathway_found_counter = 0
    for source, figure_number, caption in zip(sources, figure_numbers, captions):
        # ('athway' or 'etwork') evaluates to just 'athway', so the second
        # keyword was never checked; each keyword is now tested on its own.
        # The leading letter is omitted so both capitalizations match.
        if 'athway' in caption or 'etwork' in caption:
            get_image(PMC, source, figure_number, id)
            pathway_found_counter += 1

    if pathway_found_counter == 0:
        no_pathway_papers.append(id)

    print('Paper id ' + str(id) + ' done')

def take_imgs_from_multiple_sites(ids):
    """Process a list of PubMed ids and then write the two report files.

    Entries whose string form is not purely numeric are skipped, though they
    still count towards the "ids left" progress message. After the loop,
    ``create_no_pathway_file`` and ``create_not_found_file`` are called.

    Parameters
    ----------
    ids : list
        PubMed ids; entries may be str, int or missing values such as NaN.
    """
    total = len(ids)
    for id_counter, id in enumerate(ids, start=1):
        # str() first: a bare int or a NaN float has no .isnumeric() and used
        # to abort the whole run with AttributeError.
        if str(id).isnumeric():
            take_imgs_from_site(id)
        print(str(total - id_counter) + ' ids left')

    create_no_pathway_file()
    create_not_found_file()
    print('All papers done')

In [39]:
# Load the spreadsheet of collected papers and take its 'Pubmed ID' column as a plain list.
df = pd.read_csv('Collected papers and classification proposal - Collected papers and classification proposal.csv')
# NOTE(review): take_imgs_from_multiple_sites calls `.isnumeric()` on each entry, so every
# value is presumably expected to be a str -- TODO confirm the column never holds ints or NaN.
ids = df['Pubmed ID'].tolist()

# Process every id; this also writes 'no_pathway.txt' and 'not_found_ids.txt' at the end.
take_imgs_from_multiple_sites(ids)

Paper id 32145363 done
142 ids left
Paper id 31226023 done
141 ids left
Paper id 32330414 done
140 ids left
Paper id 32511329 done
139 ids left
Paper id 15194747 done
138 ids left
Paper id 32132184 done
137 ids left
Paper id 31034780 done
136 ids left
Paper id 15331731 done
135 ids left
Paper id 16877062 done
134 ids left
Paper id 32225175 done
133 ids left
Paper id 32227760 done
132 ids left
131 ids left
Paper id 33899609 done
130 ids left
Paper id 33482803 done
129 ids left
Paper id 32265513 done
128 ids left
Paper id 32219363 done
127 ids left
Paper id 32976572 done
126 ids left
Paper id 33436497 done
125 ids left
Paper id 32353859 done
124 ids left
Paper id 32358202 done
123 ids left
Paper id 32346093 done
122 ids left
Paper id 32835247 done
121 ids left
Paper id 32227090 done
120 ids left
Paper id 32398875 done
119 ids left
output/32522207 exists
Paper id 32522207 done
118 ids left
Paper id 32835326 done
117 ids left
Paper id 32546811 done
116 ids left
output/28933406 exists
outpu

# Get figures for PMIDs that do not have a PMCID

In [32]:
# Base URL of PubMed pages; get_page_soup below appends the PubMed id and '/'.
# NOTE: this rebinds the `base_article_url` name that the PMC section also assigns.
base_article_url = 'https://pubmed.ncbi.nlm.nih.gov/'
# Rebound to an empty list; none of the functions defined in this section append to it.
not_found_ids = []
# Ids for which get_from_one_id downloaded no figure; written out by create_no_pathway_file.
no_pathway_papers = []

def create_no_pathway_file():
    """Write every entry of ``no_pathway_papers`` to 'no_pathway_from_no_pmcids.txt'.

    One id per line; the file is opened in 'w' mode, so it is overwritten.
    """
    with open('no_pathway_from_no_pmcids.txt', 'w') as report:
        report.writelines("%s\n" % paper_id for paper_id in no_pathway_papers)

def get_page_soup(pmid):
    """Fetch ``base_article_url + str(pmid) + '/'`` and return it parsed with 'html.parser'."""
    page_url = base_article_url + str(pmid) + '/'
    return BeautifulSoup(requests.get(page_url).content, 'html.parser')

def get_fulltext_link(soup):
    """Return the ``href`` of the first <a> whose class list is exactly ``['id-link']``.

    Raises ``IndexError`` when the page contains no such anchor.
    """
    def is_id_link(tag):
        return tag.name == 'a' and tag.get('class') == ['id-link']

    hrefs = []
    for anchor in soup.find_all(is_id_link):
        hrefs.append(anchor['href'])
    return hrefs[0]

def get_full_text_soup(links):
    """Fetch the URL given in ``links`` and return it parsed with 'html.parser'."""
    return BeautifulSoup(requests.get(links).content, 'html.parser')

def get_fig_captions_pmid(text_soup):
    """Return the <b> elements whose class list is exactly
    ``['c-article-section__figure-caption']``.

    The matched elements themselves are returned, not their text.
    """
    def is_figure_caption(tag):
        return tag.name == 'b' and tag.get('class') == ['c-article-section__figure-caption']

    return text_soup.find_all(is_figure_caption)
            
###############
def get_img_sources(soup):
    """Return the ``href`` of every <a class="c-article-section__figure-link"> in ``soup``.

    Earlier, commented-out variants of this function selected anchors with the
    classes 'ar-figure-viewer-modal figure scrollFig' (reading the child's
    ``src``) and 'highwire-figure-link highwire-figure-link-download'.
    """
    figure_links = soup.find_all('a', class_ = 'c-article-section__figure-link')

    hrefs = []
    for figure_link in figure_links:
        hrefs.append(figure_link['href'])
    return hrefs

def get_image(source, fake_fig_num, id, base_url='https://www.annualreviews.org/'):
    """Download one figure into ``output_noPMCIDs/<id>/<fake_fig_num><ext>``.

    Parameters
    ----------
    source : str
        Image location; it is appended to ``base_url`` to build the request URL.
    fake_fig_num : int or str
        Running figure number used as the file name.
    id : str or int
        Paper id naming the output sub-directory.
    base_url : str, optional
        Host prefix for ``source``. Defaults to the previously hard-coded value,
        so existing three-argument calls behave as before.
        NOTE(review): get_img_sources selects anchors by a
        'c-article-section__figure-link' class -- confirm the hrefs it returns
        really live under this default host.
    """
    save_path = 'output_noPMCIDs/'+ str(id)
    try:
        os.makedirs(save_path)
    except FileExistsError:
        # Only an already-existing directory is expected; other OSErrors
        # (e.g. permissions) now surface instead of being reported as "exists".
        print(save_path + ' exists')

    # The last four characters of `source` are used as the file extension,
    # which assumes the path ends in something like '.png' or '.jpg'.
    target = save_path + '/' + str(fake_fig_num) + source[-4:]

    # Context managers close both the connection and the file, even on error.
    with requests.get(base_url + source, stream=True) as response:
        with open(target, "wb") as out_file:
            out_file.write(response.content)
###############

def get_from_one_id(id):
    """Download the pathway/network figures of one paper that has no PMC id.

    The PubMed page is fetched, its first full-text link is followed, and the
    full-text page is scraped for figure captions and image links. Both lists
    are walked in parallel (``zip`` stops at the shortest) with a running
    figure number starting at 1. When nothing is downloaded the id is appended
    to ``no_pathway_papers``.

    Parameters
    ----------
    id : str or int
        PubMed id of the paper.
    """
    soup = get_page_soup(id)
    links = get_fulltext_link(soup)
    text_soup = get_full_text_soup(links)
    caption_codes = get_fig_captions_pmid(text_soup)
    img_sources = get_img_sources(text_soup)

    print(caption_codes)

    pathway_found_counter = 0
    for fake_fig_num, (source, caption_code) in enumerate(zip(img_sources, caption_codes), start=1):
        # caption_code is a parsed element, so `in` on it tested its children
        # rather than its text; compare against the extracted text instead.
        caption_text = caption_code.get_text()
        # ('athway' or 'etwork') evaluates to just 'athway'; test each keyword.
        if 'athway' in caption_text or 'etwork' in caption_text:
            # get_image takes (source, fake_fig_num, id); the old call passed
            # the caption as an extra argument and raised TypeError.
            get_image(source, fake_fig_num, id)
            pathway_found_counter += 1

    if pathway_found_counter == 0:
        no_pathway_papers.append(id)

    print('Paper id ' + str(id) + ' done')
    
def get_from_multi_ids(ids):
    """Run ``get_from_one_id`` on each id, report progress, then write the report file.

    After every id the number of ids still to process is printed; once the loop
    finishes ``create_no_pathway_file`` is called.
    """
    for processed, paper_id in enumerate(ids, start=1):
        get_from_one_id(paper_id)
        remaining = len(ids) - processed
        print(str(remaining) + ' ids left')

    create_no_pathway_file()

In [31]:
# Read the ids written by create_not_found_file. The `with` block closes the
# file handle, and blank lines are skipped so that no empty id is requested.
with open('not_found_ids.txt') as ids_file:
    ids = [line.strip() for line in ids_file if line.strip()]
get_from_multi_ids(ids)

[]
Paper id 31226023 done
24 ids left
[]
Paper id 32219363 done
23 ids left
[<b class="c-article-section__figure-caption" data-test="figure-caption-text" id="Fig1">Fig. 1: Dysregulated bronchoalveolar immune landscapes in patients with severe COVID-19 infection.</b>, <b class="c-article-section__figure-caption" data-test="figure-caption-text" id="Fig2">Fig. 2: T cell clonal expansion in BALF from patients with COVID-19.</b>]
Paper id 32398875 done
22 ids left
[]
Paper id 32546811 done
21 ids left
[<b class="c-article-section__figure-caption" data-test="figure-caption-text" id="Fig1">Fig. 1: Illustration of the experimental setup and the patient cohort used in this study.</b>, <b class="c-article-section__figure-caption" data-test="figure-caption-text" id="Fig2">Fig. 2: Identification, distribution and classification of cells from the upper respiratory tract.</b>, <b class="c-article-section__figure-caption" data-test="figure-caption-text" id="Fig3">Fig. 3: Inferred differentiation path