In [1]:
import requests
import pprint
import os

In [2]:
# Relative path of the directory that downloaded PDFs are written to.
pdf_folder = "downloaded_pdfs"

In [3]:
# DOIs to process, one per line; splitting on whitespace yields a list of strings.
dois = """
10.1002/lno.11306
10.1002/ppp.2200
10.1016/j.geoderma.2021.115674
10.1021/acs.estlett.0c00748
10.1029/2022JG006889
10.1038/s41467-023-36515-y
10.1038/s41564-019-0449-y
10.1038/s41564-020-00861-0
10.1038/s41564-022-01266-x
10.1038/s41597-019-0132-4
10.1038/s41597-024-03069-7
10.1038/s41597-024-04013-5
10.1093/nar/gkab990
10.1101/2022.12.12.520098
10.1111/1462-2920.16314
10.1111/mec.16891
10.1111/nph.15662
10.1126/sciadv.adg7888
10.1128/mra.01080-23
10.1128/mra.01361-19
10.1128/msystems.00045-18
10.1128/msystems.00768-19
10.1186/s40168-020-00889-8
10.1371/journal.pone.0228165
10.3389/fsoil.2023.1120425
10.3390/microorganisms9020357
10.3897/tdwgproceedings.1.20637
10.5194/acp-23-15783-2023
""".split()

In [4]:
def get_pdf_url(doi, email="MAM@lbl.gov", timeout=30):
    """Looks up an open-access PDF URL for a DOI via the Unpaywall API.

    Args:
        doi: DOI to look up, e.g. "10.1038/s41597-019-0132-4".
        email: Contact address sent as the ``email`` query parameter of the
            Unpaywall request.
        timeout: Seconds to wait on the API before giving up, so that one
            stalled connection cannot hang the whole notebook.

    Returns:
        The ``url_for_pdf`` of ``best_oa_location`` when it has one; otherwise
        the first non-empty ``url_for_pdf`` among ``oa_locations``. None when
        the request fails, the status is not 200, the body is not valid JSON,
        or no location offers a PDF link.
    """
    api_url = f"https://api.unpaywall.org/v2/{doi}?email={email}"

    try:
        response = requests.get(api_url, timeout=timeout)
    except requests.RequestException as e:
        # Report and carry on, so one unreachable lookup does not abort a batch.
        print(f"Error fetching PDF URL for {doi}: {e}")
        return None

    if response.status_code != 200:
        return None

    try:
        data = response.json()
    except ValueError as e:  # body was not valid JSON
        print(f"Error fetching PDF URL for {doi}: {e}")
        return None

    # best_oa_location may be None, or may lack a PDF link that another
    # location provides, so fall back to the full list of locations.
    candidates = [data.get("best_oa_location")] + (data.get("oa_locations") or [])
    for location in candidates:
        if location and location.get("url_for_pdf"):
            return location["url_for_pdf"]

    return None  # Return None if no valid PDF URL is found

In [5]:
# Map each DOI to the PDF URL returned by get_pdf_url, announcing every lookup.
doi_to_url = {}
for doi in dois:
    print(f"Fetching PDF URL for {doi}")
    doi_to_url[doi] = get_pdf_url(doi)

Fetching PDF URL for 10.1002/lno.11306
Fetching PDF URL for 10.1002/ppp.2200
Fetching PDF URL for 10.1016/j.geoderma.2021.115674
Fetching PDF URL for 10.1021/acs.estlett.0c00748
Fetching PDF URL for 10.1029/2022JG006889
Fetching PDF URL for 10.1038/s41467-023-36515-y
Fetching PDF URL for 10.1038/s41564-019-0449-y
Fetching PDF URL for 10.1038/s41564-020-00861-0
Fetching PDF URL for 10.1038/s41564-022-01266-x
Fetching PDF URL for 10.1038/s41597-019-0132-4
Fetching PDF URL for 10.1038/s41597-024-03069-7
Fetching PDF URL for 10.1038/s41597-024-04013-5
Fetching PDF URL for 10.1093/nar/gkab990
Fetching PDF URL for 10.1101/2022.12.12.520098
Fetching PDF URL for 10.1111/1462-2920.16314
Fetching PDF URL for 10.1111/mec.16891
Fetching PDF URL for 10.1111/nph.15662
Fetching PDF URL for 10.1126/sciadv.adg7888
Fetching PDF URL for 10.1128/mra.01080-23
Fetching PDF URL for 10.1128/mra.01361-19
Fetching PDF URL for 10.1128/msystems.00045-18
Fetching PDF URL for 10.1128/msystems.00768-19
Fetching PDF 

In [6]:
# Pretty-print the DOI -> PDF URL mapping built above.
pprint.pprint(doi_to_url)

{'10.1002/lno.11306': 'https://aslopubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/lno.11306',
 '10.1002/ppp.2200': 'https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/ppp.2200',
 '10.1016/j.geoderma.2021.115674': None,
 '10.1021/acs.estlett.0c00748': None,
 '10.1029/2022JG006889': 'https://onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2022JG006889',
 '10.1038/s41467-023-36515-y': 'https://www.nature.com/articles/s41467-023-36515-y.pdf',
 '10.1038/s41564-019-0449-y': 'https://www.nature.com/articles/s41564-019-0449-y.pdf',
 '10.1038/s41564-020-00861-0': 'https://www.nature.com/articles/s41564-020-00861-0.pdf',
 '10.1038/s41564-022-01266-x': 'https://www.nature.com/articles/s41564-022-01266-x.pdf',
 '10.1038/s41597-019-0132-4': 'https://www.nature.com/articles/s41597-019-0132-4.pdf',
 '10.1038/s41597-024-03069-7': 'https://www.nature.com/articles/s41597-024-03069-7.pdf',
 '10.1038/s41597-024-04013-5': None,
 '10.1093/nar/gkab990': 'https://academic.oup.com/nar/article-pdf/50/D1/D8

In [7]:
# Create the folder that stores the PDFs.
# exist_ok=True keeps this cell from failing when the folder already exists.

os.makedirs(pdf_folder, exist_ok=True)

In [8]:
def download_pdf(pdf_url, doi, timeout=30):
    """Downloads a PDF from a given URL and saves it using the DOI as the filename.

    The file is streamed to ``<name>.pdf.part`` inside ``pdf_folder`` and only
    renamed to ``<name>.pdf`` once the transfer finished and the content looks
    like a PDF, so a failed download never leaves a truncated or non-PDF file
    behind and never overwrites an earlier good copy.

    Args:
        pdf_url: URL of the PDF; a falsy value (None, "") means there is
            nothing to download and the DOI is skipped.
        doi: DOI of the article; used to build the file name and in messages.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        None. Progress and failures are reported with print().
    """
    if not pdf_url:
        print(f"Skipping {doi} (No Open Access PDF found)")
        return

    # Generate a safe filename: keep letters, digits, ".", "_" and "-", and
    # replace everything else (slashes, colons, angle brackets, ...) with "_".
    safe_filename = "".join(c if c.isalnum() or c in "._-" else "_" for c in doi) + ".pdf"
    pdf_path = os.path.join(pdf_folder, safe_filename)
    partial_path = pdf_path + ".part"

    # Do not depend on another cell having created the folder already.
    os.makedirs(pdf_folder, exist_ok=True)

    try:
        # Stream to handle large files; the with-block releases the connection.
        with requests.get(pdf_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an error for bad responses

            with open(partial_path, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        pdf_file.write(chunk)

        # Publisher links sometimes answer 200 with an HTML landing/login page;
        # a real PDF carries the "%PDF-" marker near the start of the file.
        with open(partial_path, "rb") as pdf_file:
            looks_like_pdf = b"%PDF-" in pdf_file.read(1024)
        if not looks_like_pdf:
            print(f"Error downloading {doi}: response from {pdf_url} is not a PDF")
            return

        os.replace(partial_path, pdf_path)
        print(f"Downloaded: {doi} -> {pdf_path}")

    except requests.RequestException as e:
        print(f"Error downloading {doi}: {e}")
    finally:
        # Remove whatever is left of an unfinished or rejected download.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [None]:
# Walk the DOI -> URL mapping and hand each pair to download_pdf.
for doi, pdf_url in doi_to_url.items():
    print(f"Downloading PDF for {doi}")
    download_pdf(pdf_url, doi)
