In [1]:
import os

import elsapy
from elsapy.elsclient import ElsClient
from elsapy.elssearch import ElsSearch
from elsapy.elsdoc import FullDoc
from dotenv import load_dotenv
load_dotenv()

# Load your API key from the environment (load_dotenv() above also reads a local .env file).
# The original variable name "ELSAVIER_API_KEY" is checked first so existing setups keep
# working; the conventionally spelled "ELSEVIER_API_KEY" is accepted as a fallback.
API_KEY = os.getenv("ELSAVIER_API_KEY") or os.getenv("ELSEVIER_API_KEY")
if not API_KEY:
    # Fail here with a clear message instead of building a client without a key.
    raise RuntimeError(
        "No Elsevier API key found: set ELSAVIER_API_KEY (or ELSEVIER_API_KEY) "
        "in the environment or in a .env file."
    )
client = ElsClient(API_KEY)


In [2]:
# Scopus query string; presumably restricts hits to Ukrainian-language, open-access
# records — TODO confirm the LIMIT-TO clause is honoured by the API.
search_query = 'LANGUAGE ( ukrainian ) AND ( LIMIT-TO ( OA , "all" ) )'
# Build the search against the 'scopus' index.
search = ElsSearch(search_query, 'scopus')
# Run the search with the client created in the first cell.
# NOTE(review): execute() is called with its defaults only; the recorded count of 25 in
# the next cell looks like a single result page — confirm whether all pages are wanted.
search.execute(client)

In [3]:
# Number of entries currently held in search.results.
len(search.results)

25

In [8]:
# Create the 'pdfs' output folder used by the download cells; exist_ok=True makes
# re-running this cell harmless when the folder already exists.
os.makedirs('pdfs', exist_ok=True)

In [None]:
import requests
from scidownl import scihub_download

# Try to download a PDF (via scidownl / Sci-Hub) for every search hit that carries
# both a title and a DOI. Failures are reported and the loop moves on to the next hit.
for doc in search.results:
    raw_title = doc.get('dc:title')
    doi = doc.get('prism:doi')

    if not raw_title or not doi:
        # The lookup is DOI-based, so a record without a DOI cannot be fetched.
        # Build the label from whatever identifying field is present so this
        # message never refers to a title left over from a previous iteration.
        label = raw_title or doc.get('dc:identifier', '<unknown record>')
        print(f"Failed to retrieve full document for: {label}")
        continue

    title = raw_title.replace('/', '-')
    print(f"Title: {title}")

    # Name the file after the DOI rather than the title: titles can be very long
    # or contain characters that are not valid in file names.
    out_path = f"pdfs/{doi.replace('/', '_')}.pdf"
    try:
        scihub_download(doi, paper_type='doi', out=out_path)
    except Exception as exc:
        print(f"Failed to download PDF for: {title} (Error: {exc})")

In [None]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin


# Tags whose href/src attributes may point at (or lead to) a PDF.
_PDF_LINK_TAGS = ('a', 'button', 'iframe', 'object', 'embed', 'link', 'img')


def _is_pdf_response(response):
    """Return True when the response's Content-Type denotes a PDF.

    Header parameters (e.g. ``application/pdf; charset=binary``) and letter case
    are ignored, so servers that decorate the media type are still recognised.
    """
    content_type = response.headers.get('Content-Type') or ''
    return content_type.split(';', 1)[0].strip().lower() == 'application/pdf'


def _candidate_pdf_links(soup, page_url):
    """Collect absolute URLs of elements whose link target or text mentions "pdf".

    Relative targets are resolved against ``page_url``. Duplicates are dropped
    (first occurrence wins) so the same URL is not probed more than once.
    """
    links = []
    for tag in _PDF_LINK_TAGS:
        for element in soup.find_all(tag):
            target = element.get('href') or element.get('src')
            if target and ("pdf" in target.lower() or "pdf" in element.text.lower()):
                absolute = urljoin(page_url, target)
                if absolute not in links:
                    links.append(absolute)
    return links


def _resolve_pdf_link(candidates, headers, timeout):
    """Return the first candidate URL (or a link one page deeper) that serves a PDF.

    Each candidate is probed with HEAD, then GET; if it is an HTML page, its own
    "pdf" links are probed with HEAD. A network error on a candidate is reported
    and the search continues with the next candidate. Returns None if nothing
    serves a PDF.
    """
    for href in candidates:
        try:
            head_response = requests.head(href, headers=headers, allow_redirects=True, timeout=timeout)
            if _is_pdf_response(head_response):
                return href

            # If the head response is not a PDF, follow the link and check the content
            follow_response = requests.get(href, headers=headers, allow_redirects=True, timeout=timeout)
            if _is_pdf_response(follow_response):
                return href

            # Parse the follow response for additional potential PDF links;
            # resolve them against the final URL after redirects.
            follow_soup = BeautifulSoup(follow_response.text, 'html.parser')
            for inner_href in _candidate_pdf_links(follow_soup, follow_response.url):
                inner_head_response = requests.head(
                    inner_href, headers=headers, allow_redirects=True, timeout=timeout
                )
                if _is_pdf_response(inner_head_response):
                    return inner_href
        except requests.RequestException as e:
            print(f"Error checking link {href}: {e}")
    return None


def get_pdf_from_doi(doi, out_dir='pdfs', timeout=30):
    """Resolve a DOI through doi.org and try to download the article's PDF.

    The landing page is scraped for elements mentioning "pdf"; each candidate is
    probed until one responds with a PDF content type. On success the file is
    written to ``<out_dir>/<doi with '/' replaced by '_'>.pdf``. Every failure
    is reported with ``print`` and the function returns without raising.

    Args:
        doi: DOI string, e.g. ``"10.1000/xyz123"``.
        out_dir: Directory for the saved PDF; created if missing.
        timeout: Per-request timeout in seconds, so a stalled server cannot
            hang the notebook.

    Returns:
        None.
    """
    base_url = 'https://doi.org/'
    url = base_url + doi
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve the page for DOI: {doi} ({e})")
        return
    if response.status_code != 200:
        print(f"Failed to retrieve the page for DOI: {doi}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # doi.org redirects to the publisher, so relative links must be resolved
    # against the final URL (response.url), not the doi.org URL.
    candidates = _candidate_pdf_links(soup, response.url)
    pdf_link = _resolve_pdf_link(candidates, headers, timeout)

    if not pdf_link:
        print(f"No PDF link found for DOI: {doi}")
        return

    try:
        pdf_response = requests.get(pdf_link, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to download PDF for DOI: {doi} ({e})")
        return

    if pdf_response.status_code == 200 and _is_pdf_response(pdf_response):
        os.makedirs(out_dir, exist_ok=True)
        pdf_filename = os.path.join(out_dir, f"{doi.replace('/', '_')}.pdf")
        with open(pdf_filename, 'wb') as pdf_file:
            pdf_file.write(pdf_response.content)
        print(f"Downloaded PDF for DOI: {doi}")
    else:
        print(f"Failed to download PDF for DOI: {doi} (Status Code: {pdf_response.status_code})")


# Try the downloader on a single DOI.
get_pdf_from_doi("10.32918/nrs.2019.1(81).13")