### This notebook downloads papers to a chosen location given their DOIs. It should be run right after obtaining a list of DOIs of papers similar to a seed paper (see get_similar_papers.ipynb).

In [68]:
import time
import bibtexparser
import requests
import urllib
import os
import urllib.request

from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium import webdriver
from urllib.parse import quote

import fitz  # PyMuPDF for PDF text extraction



In [69]:
# Chrome options shared by the Selenium sessions created from `options`.
options = webdriver.ChromeOptions()
# options.add_argument('--headless')

# Command-line switches, applied in the order listed.
_chrome_arguments = (
    'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    "enable-automation",
    "--window-size=1920,1080",
    "--no-sandbox",
    "--disable-extensions",
    "--dns-prefetch-disable",
    "--disable-gpu",
)
for _chrome_argument in _chrome_arguments:
    options.add_argument(_chrome_argument)

# Browser preferences that control where and how downloads are stored.
_download_prefs = {
    "download.default_directory": "/Users/rishikesh/Downloads/try_openalex_pdfs",  # Papers will be downloaded at this address
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,
}
options.add_experimental_option('prefs', _download_prefs)

# driver = webdriver.Chrome(options=options)

In [108]:
download_dir = '/Users/rishikesh/Downloads/try_openalex_pdfs'


def clear_download_directory(directory=None):
    """Delete every file in the download directory.

    Args:
        directory: Directory to clear. Defaults to the module-level
            ``download_dir`` when omitted.

    A directory that does not exist is treated as already empty, and
    sub-directories are left in place. A file that cannot be deleted is
    reported on stdout and skipped.
    """
    target_dir = download_dir if directory is None else directory
    if not os.path.isdir(target_dir):
        # Nothing has been downloaded yet, so there is nothing to clear.
        return

    for entry in os.listdir(target_dir):
        file_path = os.path.join(target_dir, entry)
        # os.remove cannot delete real directories; skip them instead of
        # printing an error for each one on every call.
        if os.path.isdir(file_path) and not os.path.islink(file_path):
            continue
        try:
            os.remove(file_path)
        except OSError as e:
            print(f"Error deleting file {file_path}: {e}")

def wait_for_download(directory=None, max_polls=20, poll_interval=1):
    """Poll a directory until a PDF file shows up and return its path.

    Args:
        directory: Directory to watch. Defaults to the module-level
            ``download_dir`` when omitted.
        max_polls: Maximum number of times the directory is listed.
        poll_interval: Seconds to sleep before the first listing and after
            each unsuccessful one.

    Returns:
        The path of the first entry whose name ends with ``.pdf``
        (case-insensitive), or ``None`` if none appears within ``max_polls``
        listings. A directory that does not exist yet is treated as empty.
    """
    target_dir = download_dir if directory is None else directory

    time.sleep(poll_interval)
    for _ in range(max_polls):
        try:
            entries = os.listdir(target_dir)
        except FileNotFoundError:
            # The directory may not exist yet; keep polling until it does.
            entries = []
        for entry in entries:
            if entry.lower().endswith('.pdf'):
                return os.path.join(target_dir, entry)
        time.sleep(poll_interval)
    return None


def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at ``file_path``.

    Any error raised while opening or reading the document is printed and
    whatever text was collected up to that point (possibly an empty string)
    is returned.
    """
    page_texts = []
    try:
        with fitz.open(file_path) as document:
            for page in document:
                page_texts.append(page.get_text("text"))
    except Exception as error:
        print(f"Error extracting text from {file_path}: {error}")
    return "".join(page_texts)

In [109]:
# ScienceDirect (Elsevier) credentials. The insttoken has to be requested from
# sciencedirect.com. Set the SCIENCE_DIRECT_API_KEY / SCIENCE_DIRECT_INSTTOKEN
# environment variables to override the fallback values below without editing
# this cell.
# NOTE(review): the fallback API key is committed in source; consider removing
# it and supplying the key through the environment only.
science_direct_api_key = os.environ.get("SCIENCE_DIRECT_API_KEY", "be3a3b1c4e6171897b5a922185dd9104")
science_direct_insttoken = os.environ.get("SCIENCE_DIRECT_INSTTOKEN", "")


In [110]:
def download_science_direct(doi, api_key=science_direct_api_key, insttoken=science_direct_insttoken):
    """Download an article PDF from the Elsevier content API and save it to disk.

    Args:
        doi: DOI of the article; ``/`` is replaced by ``_`` in the file name.
        api_key: Elsevier API key sent as the ``apiKey`` query parameter.
        insttoken: Optional institution token; appended to the request as the
            ``insttoken`` query parameter only when non-empty.

    Returns:
        The path of the saved PDF on success, ``None`` on any failure.
        Failures are reported on stdout and never raised.
    """
    try:
        url = f"https://api.elsevier.com/content/article/doi/{doi}?apiKey={api_key}&httpAccept=application%2Fpdf"
        if insttoken:
            url += f"&insttoken={insttoken}"

        # Close the connection deterministically and do not hang forever on
        # an unresponsive server.
        with urllib.request.urlopen(url, timeout=60) as response:
            pdf_content = response.read()

        directory = "/Users/rishikesh/Downloads/try_openalex_pdfs"
        os.makedirs(directory, exist_ok=True)

        filename = f"{directory}/{doi.replace('/', '_')}.pdf"
        with open(filename, 'wb') as f:
            f.write(pdf_content)
        return filename

    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} for DOI {doi}")
        if e.code == 500:
            print("Internal Server Error. The server encountered an unexpected condition.")
        elif e.code == 403:
            print("Forbidden. You may not have permission to access this resource. Check your API key and institution token.")
        elif e.code == 404:
            print("Not Found. The DOI may not exist or is not available in this format.")
        else:
            print("An HTTP error occurred:", e)

    except urllib.error.URLError as e:
        # Handle other URL errors (e.g., network issues)
        print(f"URLError: {e.reason} for DOI {doi}")

    except Exception as e:
        print(f"Failed to download PDF for DOI {doi}: {e}")

    return None


In [111]:
# Module-level bookkeeping shared by the download functions:
# `retrieved` collects DOIs once a download has been triggered for them,
# `not_retrieved` collects DOIs the download functions record as not fetched.
retrieved, not_retrieved = [], []

In [112]:
def institutional_login(driver):
    """Open IEEE Xplore's institutional sign-in and wait for a manual login.

    Loads the IEEE Xplore home page, clicks the 'Institutional Sign In' link,
    then blocks on ``input()`` until the user confirms in the console that the
    login (performed by hand in the browser window) is complete.
    """
    settle_seconds = 2  # fixed pause after each navigation step

    driver.get('https://ieeexplore.ieee.org/Xplore/home.jsp')
    time.sleep(settle_seconds)

    # Open the institutional sign-in dialog.
    driver.find_element(By.LINK_TEXT, 'Institutional Sign In').click()
    time.sleep(settle_seconds)

    # The university selection and credentials are entered manually from here.
    print("Please select your university and complete the login manually (including Duo authentication if required).")
    input("Press Enter here after you have fully logged in...")  # Wait for manual completion

# Function to check if the PDF link is accessible
def is_pdf_accessible(pdf_link):
    """Return True when a HEAD request to ``pdf_link`` reports a PDF body.

    Redirects are followed, because ``requests.head`` does not follow them by
    default and the headers of a redirect response do not describe the final
    document. The Content-Type comparison is case-insensitive.

    Returns:
        ``True`` if the final response's Content-Type contains ``pdf``;
        ``False`` otherwise, including when the request fails (the error is
        printed).
    """
    try:
        response = requests.head(pdf_link, timeout=5, allow_redirects=True)
    except requests.RequestException as e:
        print(f"Error checking URL {pdf_link}: {e}")
        return False

    content_type = response.headers.get('Content-Type', '')
    return 'pdf' in content_type.lower()

# Main function to download IEEE PDFs for multiple DOIs
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Download the IEEE Xplore PDF for a single DOI
def download_ieee_pdfs(driver, doi):
    """Navigate ``driver`` to the IEEE Xplore PDF endpoint for ``doi``.

    The DOI is resolved through doi.org. If the browser does not land on
    ieeexplore.ieee.org, nothing else happens. Otherwise the document number
    is taken from the last path segment of the landing URL and the
    ``stampPDF/getPDF.jsp`` URL for it is opened.

    This function does not append to ``retrieved``. If building or opening
    the PDF URL fails, the error is printed and the DOI is appended to the
    module-level ``not_retrieved`` list.
    """
    url = f"https://doi.org/{doi}"
    driver.get(url)
    time.sleep(1)

    current_url = driver.current_url
    if "ieeexplore.ieee.org" not in current_url:
        return

    try:
        # Drop any query string, fragment or trailing slash first so the last
        # path segment is the document number itself.
        landing_path = current_url.split("?")[0].split("#")[0].rstrip("/")
        doc_number = landing_path.split("/")[-1]
        pdf_link = f"https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber={doc_number}&ref="

        # Navigate to the PDF endpoint
        driver.get(pdf_link)
        time.sleep(1)  # Wait for the page to load

    except Exception as e:
        print(f"An error occurred while attempting to download the PDF for DOI {doi}: {e}")
        not_retrieved.append(doi)


In [124]:
# This function will automatically download a paper to a desired location given its DOI
# The download location can be defined in the second cell under options.add_experimental_option -> "download.default_directory"

def download_pdf(driver, doi, ieee_dois):
    """Try to download the PDF for ``doi`` and return its extracted text.

    The download directory is cleared, the DOI is resolved through doi.org,
    and a series of publisher-specific strategies (chosen by substring match
    on the landing URL) followed by generic link/form heuristics is used to
    trigger a PDF download. If any strategy reports a download, the function
    waits for a PDF to appear in the download directory, extracts its text,
    deletes the file and returns the text.

    Args:
        driver: Selenium WebDriver used for all navigation.
        doi: DOI of the paper.
        ieee_dois: List that receives ``doi`` when the landing page is on
            ieeexplore.ieee.org (mutated in place).

    Returns:
        The extracted text, or ``""`` when the page could not be opened, the
        publisher is skipped, no download was triggered, or no PDF appeared.

    Side effects:
        Appends ``doi`` to the module-level ``retrieved`` list each time a
        strategy triggers a download (possibly more than once), and to
        ``not_retrieved`` when no strategy triggered one. The early
        ``return ""`` paths do not record the DOI in ``not_retrieved``.
    """
    clear_download_directory()

    fetched = False

    url = "https://doi.org/" + doi

    try:
        driver.get(url)
    except Exception as e:
        print(f"Error accessing the URL: {e}")
        return ""

    time.sleep(1)

    current_url = driver.current_url

    # ScienceDirect: use the Elsevier API instead of the browser.
    # download_science_direct reports its own errors, so `fetched` is set
    # regardless and the generic heuristics further down still run as a backup.
    if "sciencedirect" in current_url:
        download_science_direct(doi=doi)
        retrieved.append(doi)
        fetched = True

    # Last path segment of the landing URL (without query string) and the DOI
    # split on "/" are used by the generic href heuristics below.
    url_find = current_url.split("/")[-1].split("?")[0]
    doi_split = doi.split("/")

    # journals.sagepub.com
    if "journals.sagepub.com" in current_url and not fetched:
        try:
            reader_href = f"https://journals.sagepub.com/doi/reader/{doi}"
            driver.get(reader_href)
            time.sleep(1)
            download_links = driver.find_elements(By.CSS_SELECTOR, "a[class*='btn--light btn format-download-btn download']")
            for link in download_links:
                href = link.get_attribute('href')
                if href:
                    driver.get(href)
                    time.sleep(1)
                    retrieved.append(doi)

                    # Only count the download when the link really serves a PDF.
                    try:
                        response = requests.head(href, timeout=5)
                        content_type = response.headers.get('Content-Type', '')
                        if 'pdf' in content_type:
                            fetched = True
                    except requests.RequestException as e:
                        print(f"Error checking URL {href}: {e}")
                        return ""
        except Exception:
            # Best effort: fall through to the remaining strategies.
            pass

    # Taylor & Francis: no download is attempted for this publisher.
    if "www.tandfonline.com" in current_url and not fetched:
        return ""

    # ACM DL
    if "dl.acm.org" in current_url and not fetched:
        pdf_link = f"https://dl.acm.org/doi/pdf/{doi}"
        driver.get(pdf_link)
        retrieved.append(doi)
        fetched = True
        time.sleep(1)

    # ieee
    if "ieeexplore.ieee.org" in current_url and not fetched:
        try:
            ieee_dois.append(doi)
            download_ieee_pdfs(driver, doi)
            retrieved.append(doi)
            fetched = True
        except Exception:
            pass

    # plos:
    if "journals.plos.org" in current_url and not fetched:
        try:
            url_parts = current_url.split("?")
            prefix = url_parts[0]
            query = url_parts[-1]

            url = f"{prefix}/file?{query}&type=printable"
            driver.get(url)
            time.sleep(1)
            retrieved.append(doi)
            fetched = True
        except Exception:
            pass

    # agupubs.onlinelibrary.wiley.com
    if "agupubs.onlinelibrary.wiley.com" in current_url and not fetched:
        try:
            href = f"https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/{doi}?download=true"
            driver.get(href)
            time.sleep(1)
            retrieved.append(doi)
            fetched = True
        except Exception:
            pass

    # www.mdpi.com
    if "www.mdpi.com" in current_url and not fetched:
        try:
            # Anchor whose class attribute contains "UD_ArticlePDF".
            link = driver.find_element(By.CSS_SELECTOR, 'a[class*="UD_ArticlePDF"]')
            href = link.get_attribute('href')
            driver.get(href)
            time.sleep(1)
            retrieved.append(doi)
            fetched = True
        except Exception:
            pass

    # AMS Publications
    if "journals.ametsoc.org" in current_url and not fetched:
        try:
            href = current_url.replace("view", "downloadpdf/view").replace(".xml", ".pdf")
            driver.get(href)
            time.sleep(1)
            retrieved.append(doi)
            fetched = True
        except Exception:
            pass

    # Now publisher
    if "nowpublishers" in current_url and not fetched:
        try:
            href = f"https://www.nowpublishers.com/article/Download/{url_find}"
            driver.get(href)
            time.sleep(1)
            retrieved.append(doi)
            fetched = True
        except Exception:
            pass

    # a general case for many paper websites (runs even if a strategy above
    # already triggered a download)
    try:
        pdf_links = driver.find_elements(By.CSS_SELECTOR, "a[class*='article-pdfLink']")
        for pdf_link in pdf_links:
            href = pdf_link.get_attribute('href')
            if href:
                driver.get(href)
                time.sleep(1)
                retrieved.append(doi)
                fetched = True
    except Exception:
        pass

    # www.degruyter.com
    if "degruyter.com" in current_url and not fetched:
        try:
            pdf_links = driver.find_elements(By.CSS_SELECTOR, "a[class*='ga_download_button_pdf_article downloadCompletePdfArticle downloadPdf btn btn-primary fw-bold py-2 w-100 vgwort-click']")
            for pdf_link in pdf_links:
                href = pdf_link.get_attribute('href')
                if href:
                    driver.get(href)
                    time.sleep(1)
                    retrieved.append(doi)
                    fetched = True
        except Exception:
            pass

    # academic.oup.com
    if "academic.oup.com" in current_url and not fetched:
        try:
            pdf_link = driver.find_element(By.CSS_SELECTOR, "a.al-link.pdf.article-pdfLink")
            href = pdf_link.get_attribute('href')
            if href:
                driver.get(href)
                time.sleep(1)
                retrieved.append(doi)
                fetched = True
        except Exception:
            pass

    # Substrings that commonly appear in the href of a PDF download link.
    href_conditions = [
        doi_split[-1] + ".pdf",
        doi_split[-1] + "-pdf",
        doi_split[-1] + "/pdf",
        "epdf/" + doi,
        "reader/" + doi,
        url_find + "/pdf",
        url_find + "/pdfft",
    ]

    # Search for href-based downloads
    for condition in href_conditions:
        try:
            # Find every anchor whose href contains the condition
            elements = driver.find_elements(By.XPATH, f"//a[contains(@href, '{condition}')]")
            for element in elements:
                href = element.get_attribute('href')
                driver.get(href)
                retrieved.append(doi)
                fetched = True
        except Exception:
            continue

    # Additional check for a condition that occurs in ScienceDirect papers: doi and "pdf" present without concatenation
    try:
        elements = driver.find_elements(By.XPATH, "//a[contains(@href, 'pdf')]")
        for element in elements:
            href = element.get_attribute('href')
            if (doi in href and "pdf" in href) or (url_find in href and "pdf" in href):
                driver.get(href)
                retrieved.append(doi)
                fetched = True
    except Exception:
        pass

    # Check if there is a form for PDF download
    try:
        form = driver.find_element(By.XPATH, "//form[contains(@class, 'ft-download-content__form ft-download-content__form--pdf js-ft-download-form')]")
        if form:
            form.submit()
            time.sleep(1)
            retrieved.append(doi)
            fetched = True
    except Exception:
        pass

    # Record the DOI as not retrieved only when no strategy triggered a download.
    if not fetched:
        not_retrieved.append(doi)
        return ""

    downloaded_file = wait_for_download()
    if not downloaded_file:
        return ""

    # Extract the text, then delete the PDF so the next call starts clean.
    text_content = extract_text_from_pdf(downloaded_file)
    os.remove(downloaded_file)
    return text_content
    

In [129]:
import json
from tqdm import tqdm

with open("OpenAlex_Text_Depth.json") as file:
    papers = json.load(file)

ieee_dois = []
idx = 0

driver = webdriver.Chrome(options=options)

institutional_login(driver)


def _save_papers(records, path="OpenAlex_Text_Depth.json"):
    """Write ``records`` to ``path`` as JSON without truncating the old file.

    The data goes to a temporary sibling file first and is then moved over
    ``path``, so an interruption in the middle of a save leaves the previous
    checkpoint intact.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as json_file:
        json.dump(records, json_file, indent=2)
    os.replace(tmp_path, path)


# Papers at 1-based positions up to this index are skipped (resume point of
# the crawl); a checkpoint is written every `_SAVE_EVERY` papers after that.
_RESUME_AFTER = 4200
_SAVE_EVERY = 100

try:
    for idx, paper in enumerate(tqdm(papers), start=1):
        if idx <= _RESUME_AFTER:
            continue

        # Only papers that have a DOI and no text yet are downloaded.
        if paper.get('doi', '') != '' and paper.get('text', '') == '':
            paper['text'] = download_pdf(driver, paper.get('doi', ''), ieee_dois)

        if idx % _SAVE_EVERY == 0:
            print('Saving')
            _save_papers(papers)
            print('\n')
            print(f'Count-{idx} : Saved')
finally:
    # Persist whatever was collected since the last checkpoint, including
    # when the loop is interrupted or ends between two checkpoints.
    _save_papers(papers)
# driver.quit()

# 2549 / 4800

Please select your university and complete the login manually (including Duo authentication if required).


 10%|▉         | 4299/45140 [03:36<10:20:21,  1.10it/s]

Saving


 10%|▉         | 4300/45140 [03:44<23:29:15,  2.07s/it]



Count-4300 : Saved


 10%|▉         | 4399/45140 [14:05<96:32:48,  8.53s/it] 

Saving


 10%|▉         | 4400/45140 [14:19<111:02:14,  9.81s/it]



Count-4400 : Saved


 10%|▉         | 4499/45140 [21:55<67:48:18,  6.01s/it] 

Saving


 10%|▉         | 4500/45140 [22:03<75:48:56,  6.72s/it]



Count-4500 : Saved


 10%|█         | 4599/45140 [33:28<61:19:28,  5.45s/it] 

Saving


 10%|█         | 4600/45140 [33:42<81:44:42,  7.26s/it]



Count-4600 : Saved


 10%|█         | 4699/45140 [42:42<41:44:07,  3.72s/it] 

Saving


 10%|█         | 4700/45140 [42:52<60:22:15,  5.37s/it]



Count-4700 : Saved


 10%|█         | 4737/45140 [47:08<55:01:57,  4.90s/it] 

HTTPError: 500 for DOI 10.1016/c2013-0-11671-6
Internal Server Error. The server encountered an unexpected condition.


 11%|█         | 4799/45140 [53:50<64:37:02,  5.77s/it] 

Saving


 11%|█         | 4800/45140 [54:00<73:09:38,  6.53s/it]



Count-4800 : Saved


 11%|█         | 4884/45140 [1:01:05<8:23:33,  1.33it/s]  


KeyboardInterrupt: 

In [130]:
# Reload the checkpoint from disk into `papers`.
with open("OpenAlex_Text_Depth.json") as checkpoint_file:
    papers = json.load(checkpoint_file)

In [119]:
len(ieee_dois)

171

In [120]:
# Print the index of the first paper whose DOI is in `ieee_dois` and that
# still has no extracted text. The DOIs are put in a set once so each
# membership test is a hash lookup rather than a scan of the list.
ieee_doi_set = set(ieee_dois)
for i, p in enumerate(papers):
    doi = p.get('doi')
    if doi in ieee_doi_set and p.get('text', '') == '':
        print(i)
        break

3261


In [35]:
papers[24]

{'title': 'A multigrid tutorial',
 'doi': '',
 'openalex_id': 'https://openalex.org/W1530872699',
 'authors': ['William L. Briggs'],
 'publication_date': '1987-01-01',
 'publish_year': 1987,
 'keywords': ['Multigrid method',
  'Computer science',
  'Nonlinear system',
  'Applied mathematics',
  'Bibliography',
  'Iterative method',
  'Index (typography)',
  'Mathematics',
  'Algorithm',
  'Physics',
  'Programming language',
  'Library science',
  'Quantum mechanics'],
 'abstract': 'Preface 1. Model problems 2. Basic iterative methods 3. Elements of multigrid 4. Implementation 5. Some theory 6. Nonlinear problems 7. Selected applications 8. Algebraic multigrid (AMG) 9. Multilevel adaptive methods 10. Finite elements Bibliography Index.',
 'global_link_openable': 'https://openalex.org/W1530872699',
 'citation_count': 1951,
 'publication': [],
 'references': []}

In [18]:
papers[247]

{'title': 'HOGgles: Visualizing Object Detection Features',
 'doi': '10.1109/iccv.2013.8',
 'openalex_id': 'https://openalex.org/W1982428585',
 'authors': ['Carl Vondrick',
  'Aditya Khosla',
  'Tomasz Malisiewicz',
  'Antonio Torralba'],
 'publication_date': '2013-12-01',
 'publish_year': 2013,
 'keywords': ['Computer science',
  'False positive paradox',
  'Object detection',
  'Feature (linguistics)',
  'Artificial intelligence',
  'Object (grammar)',
  'Detector',
  'Visualization',
  'Space (punctuation)',
  'Feature vector',
  'Computer vision',
  'Feature extraction',
  'Pattern recognition (psychology)',
  'Telecommunications',
  'Philosophy',
  'Linguistics',
  'Operating system'],
 'abstract': "We introduce algorithms to visualize feature spaces used by object detectors. The tools in this paper allow a human to put on 'HOG goggles' and perceive the visual world as a HOG based object detector sees it. We found that these visualizations allow us to analyze object detection syst

In [134]:
papers[7]

{'title': 'Unsupervised representation learning by discovering reliable image relations',
 'doi': '10.1016/j.patcog.2019.107107',
 'openalex_id': 'https://openalex.org/W3000050604',
 'authors': ['Timo Milbich', 'Omair Ghori', 'Ferran Diego', 'Björn Ommer'],
 'publication_date': '2020-01-17',
 'publish_year': 2020,
 'keywords': ['Artificial intelligence',
  'Pairwise comparison',
  'Pattern recognition (psychology)',
  'Computer science',
  'Inference',
  'Representation (politics)',
  'Divide and conquer algorithms',
  'Unsupervised learning',
  'Machine learning',
  'Feature learning',
  'Pascal (unit)',
  'Transitive relation',
  'Feature (linguistics)',
  'Mathematics',
  'Algorithm',
  'Politics',
  'Political science',
  'Law',
  'Linguistics',
  'Philosophy',
  'Combinatorics',
  'Programming language'],
 'abstract': 'Learning robust representations that allow to reliably establish relations between images is of paramount importance for virtually all of computer vision. Annotatin

In [133]:
papers[797]

{'title': 'A new approach to cross-modal multimedia retrieval',
 'doi': '10.1145/1873951.1873987',
 'openalex_id': 'https://openalex.org/W2106277773',
 'authors': ['Nikhil Rasiwasia',
  'José Costa Pereira',
  'Emanuele Coviello',
  'Gabriel Doyle',
  'Gert Lanckriet',
  'Roger Lévy',
  'Nuno Vasconcelos'],
 'publication_date': '2010-10-25',
 'publish_year': 2010,
 'keywords': ['Computer science',
  'Latent Dirichlet allocation',
  'Abstraction',
  'Information retrieval',
  'Modal',
  'Visual Word',
  'Canonical correlation',
  'Context (archaeology)',
  'Image retrieval',
  'Feature (linguistics)',
  'Task (project management)',
  'Artificial intelligence',
  'Scale-invariant feature transform',
  'Component (thermodynamics)',
  'Explicit semantic analysis',
  'Topic model',
  'Natural language processing',
  'Pattern recognition (psychology)',
  'Feature extraction',
  'Image (mathematics)',
  'Semantic computing',
  'Philosophy',
  'Linguistics',
  'Chemistry',
  'Biology',
  'Pale

In [131]:
# Collect the DOIs of all papers that already have non-empty extracted text,
# then report how many there are and which ones.
l = [paper.get('doi') for paper in papers[:] if paper.get('text', '') != '']
count = len(l)
print(count)
print(l)

2549
['10.48550/arxiv.1706.03762', '10.1109/cvpr.2016.90', '10.48550/arxiv.1406.2661', '10.48550/arxiv.1412.6980', '10.48550/arxiv.1804.02767', '10.48550/arxiv.1611.01578', '10.48550/arxiv.1803.03635', '10.1016/j.patcog.2019.107107', '10.1016/j.ijforecast.2016.01.001', '10.1145/3236009', '10.1145/3285029', '10.1109/cvpr.2017.690', '10.48550/arxiv.1905.11946', '10.48550/arxiv.1906.08237', '10.48550/arxiv.1907.11692', '10.48550/arxiv.2005.14165', '10.48550/arxiv.2103.14030', '10.48550/arxiv.2102.12092', '10.1038/s41586-021-03819-2', '10.48550/arxiv.1806.07366', '10.1109/cvpr.2019.00453', '10.1109/cvpr.2018.00727', '10.1109/cvpr.2017.424', '10.1109/cvpr.2019.00020', '10.1109/iccv.2015.169', '10.1109/iccv.2015.123', '10.1007/978-3-642-35289-8_14', '10.48550/arxiv.1502.03167', '10.1007/978-3-319-10602-1_48', '10.1109/cvpr.2015.7298965', '10.48550/arxiv.1207.0580', '10.1109/iccv.2015.135', '10.1109/cvpr.2015.7299173', '10.1109/tpami.2011.235', '10.1145/1141911.1142005', '10.1016/0893-6080(95

In [135]:
len(ieee_dois)

38

In [None]:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# Replace with your university name.
# NOTE(review): this value is not referenced by any code visible in this
# notebook; institutional_login() asks the user to pick the university
# manually in the browser.
university_name = 'University of Michigan - Ann Arbor'

# Function to handle the initial navigation to the Institutional Sign In page
def institutional_login(driver):
    """Open IEEE Xplore's institutional sign-in and wait for a manual login.

    Loads the IEEE Xplore home page, clicks the 'Institutional Sign In' link,
    then blocks on ``input()`` until the user confirms in the console that the
    login (performed by hand in the browser window) is complete.
    """
    settle_seconds = 2  # fixed pause after each navigation step

    driver.get('https://ieeexplore.ieee.org/Xplore/home.jsp')
    time.sleep(settle_seconds)

    # Open the institutional sign-in dialog.
    driver.find_element(By.LINK_TEXT, 'Institutional Sign In').click()
    time.sleep(settle_seconds)

    # The university selection and credentials are entered manually from here.
    print("Please select your university and complete the login manually (including Duo authentication if required).")
    input("Press Enter here after you have fully logged in...")  # Wait for manual completion

# Function to check if the PDF link is accessible
def is_pdf_accessible(pdf_link):
    """Return True when a HEAD request to ``pdf_link`` reports a PDF body.

    Redirects are followed, because ``requests.head`` does not follow them by
    default and the headers of a redirect response do not describe the final
    document. The Content-Type comparison is case-insensitive.

    Returns:
        ``True`` if the final response's Content-Type contains ``pdf``;
        ``False`` otherwise, including when the request fails (the error is
        printed).
    """
    try:
        response = requests.head(pdf_link, timeout=5, allow_redirects=True)
    except requests.RequestException as e:
        print(f"Error checking URL {pdf_link}: {e}")
        return False

    content_type = response.headers.get('Content-Type', '')
    return 'pdf' in content_type.lower()

# Main function to download IEEE PDFs for multiple DOIs
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Main function to download IEEE PDFs for one or more DOIs
def download_ieee_pdfs(driver, dois):
    """Trigger IEEE Xplore PDF downloads for each DOI in ``dois``.

    Args:
        driver: Selenium WebDriver used for navigation.
        dois: Iterable of DOI strings. A single DOI string is also accepted
            and treated as a one-element list, so callers that pass one DOI
            are not iterated character by character.

    For every DOI, the DOI is resolved through doi.org. DOIs that do not land
    on ieeexplore.ieee.org are skipped without being recorded. Otherwise the
    ``stampPDF/getPDF.jsp`` URL for the document number is opened and, if an
    'Open' button is present, it is clicked. The DOI is appended to the
    module-level ``retrieved`` list whether or not the button exists, and to
    ``not_retrieved`` when any other error occurs.
    """
    if isinstance(dois, str):
        dois = [dois]

    for doi in dois:
        # Navigate to the DOI link
        url = f"https://doi.org/{doi}"
        driver.get(url)
        time.sleep(1)

        current_url = driver.current_url
        if "ieeexplore.ieee.org" not in current_url:
            continue

        try:
            # Drop any query string, fragment or trailing slash first so the
            # last path segment is the document number itself.
            landing_path = current_url.split("?")[0].split("#")[0].rstrip("/")
            doc_number = landing_path.split("/")[-1]
            pdf_link = f"https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber={doc_number}&ref="
            print(f"Attempting to access the stamp page for DOI {doi}...")

            # Navigate to the PDF endpoint
            driver.get(pdf_link)
            time.sleep(2)  # Wait for the page to load

            # Look for the "Open" button on the page
            try:
                open_button = driver.find_element(By.XPATH, "//button[contains(text(), 'Open')]")
                open_button.click()
                print(f"Clicked 'Open' button to download PDF for DOI {doi}")
                retrieved.append(doi)
                time.sleep(3)  # Wait for download to start
            except NoSuchElementException:
                print(f"No 'Open' button found on page for DOI {doi}. PDF might have been downloaded automatically.")
                retrieved.append(doi)

        except Exception as e:
            print(f"An error occurred while attempting to download the PDF for DOI {doi}: {e}")
            not_retrieved.append(doi)

                
# Initialize WebDriver with download preferences
options = webdriver.ChromeOptions()
# options.add_argument('--headless')

# Command-line switches, applied in the order listed.
_chrome_arguments = (
    'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    "enable-automation",
    "--window-size=1920,1080",
    "--no-sandbox",
    "--disable-extensions",
    "--dns-prefetch-disable",
    "--disable-gpu",
)
for _chrome_argument in _chrome_arguments:
    options.add_argument(_chrome_argument)

# Browser preferences that control where and how downloads are stored.
_download_prefs = {
    "download.default_directory": "/Users/rishikesh/Downloads/try_2",  # Papers will be downloaded at this address
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,
}
options.add_experimental_option('prefs', _download_prefs)

driver = webdriver.Chrome(options=options)

# Perform the initial institutional login
institutional_login(driver)

# Download multiple PDFs without re-logging in
download_ieee_pdfs(driver, ieee_dois)

# Close the driver after completing all downloads
# driver.quit()


#### Sample usage:

In [None]:
print(f"Retrieved {len(retrieved)} papers. {len(not_retrieved)} papers unretrieved.")

In [None]:
retrieved

In [None]:
not_retrieved