In [1]:
import sys, re, os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
import time
import numpy as np
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [2]:
### MERGE THE TXT-DERIVED AND PDF-DERIVED REFERENCE LISTS INTO ONE TABLE

base_dir = "/Volumes/Samsung_SSD/levin-references/"
txt_references = f"{base_dir}levin-bibtex-sources.csv"
pdf_references = f"{base_dir}levin-pdf-references.csv"

# Read both CSVs with pandas' default parsing options.
df1, df2 = map(pd.read_csv, (txt_references, pdf_references))

# Stack the two tables under a fresh 0..n-1 index, then keep only the first
# row seen for each DOI value.
combined_df = pd.concat([df1, df2], ignore_index=True)
all_references = combined_df.drop_duplicates(subset=['DOI'], keep='first')
### ROWS BEFORE DE-DUPLICATION: 23751
### ROWS AFTER DE-DUPLICATION: 22783


In [3]:
### FIND DOWNLOAD URL
def find_unpaywall_url(doi, email="reedbndr@gmail.com", timeout=30):
    """Look up the open-access PDF URL for a DOI via the Unpaywall v2 API.

    Parameters
    ----------
    doi : str
        DOI to look up. Anything that is not a non-blank string (e.g. the
        NaN pandas uses for a missing value) is treated as "no DOI".
    email : str, optional
        Contact address sent to the API as the ``email`` query parameter.
    timeout : float, optional
        Seconds to wait for the API before ``requests`` raises, so a stalled
        connection cannot block a worker thread forever.

    Returns
    -------
    str
        The ``url_for_pdf`` of the record's ``best_oa_location``, or the
        string ``"NA"`` when the DOI is missing, the response status is not
        200 (any non-200 status, rate limiting included, is reported the
        same way), the record is not open access, or no PDF URL is listed.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors or timeouts.
    ValueError
        If a 200 response does not contain valid JSON.
    """
    # Without a usable DOI there is nothing to query; skip the network call.
    if not isinstance(doi, str) or not doi.strip():
        return "NA"

    base_url = "https://api.unpaywall.org/v2/"
    doi_endpoint = f"{base_url}{doi}?email={email}"
    response = requests.get(doi_endpoint, timeout=timeout)
    if response.status_code != 200:
        return "NA"

    data = response.json()
    if not data.get("is_oa"):
        return "NA"

    # `best_oa_location` may be absent or null even when `is_oa` is set;
    # fall back to an empty mapping instead of subscripting None.
    best_location = data.get("best_oa_location") or {}
    url = best_location.get("url_for_pdf")
    if url is None or url == "None":
        return "NA"
    return url

def process_citation_urls(all_references,
                          max_workers=20,
                          max_retries=3,
                          chunk_size=600,
                          output_path='levin-citation-downloads.csv'):
    """Resolve a download URL for every reference and save the result as CSV.

    The references are processed in consecutive chunks of ``chunk_size``
    rows. Within a chunk the DOIs are looked up in parallel with
    ``find_unpaywall_url``; each finished chunk is written to ``output_path``
    before the next one starts.

    Parameters
    ----------
    all_references : pandas.DataFrame
        Must provide the columns ``'Title'`` and ``'DOI'``.
    max_workers : int, optional
        Thread-pool size used for the lookups of one chunk.
    max_retries : int, optional
        Number of additional attempts per chunk after the first one fails.
        The budget applies to each chunk independently.
    chunk_size : int, optional
        Number of rows per chunk.
    output_path : str, optional
        CSV file that receives the columns ``Title``, ``DOI`` and ``URL``.

    Returns
    -------
    pandas.DataFrame
        ``output_path`` read back with ``pd.read_csv``, or an empty frame
        with the three output columns when no chunk was written.
    """
    ### CREATE CHUNKS TO PROCESS SERIALLY
    chunks = len(all_references) // chunk_size + (1 if len(all_references) % chunk_size else 0)

    # Track whether the file has been started by *this* run, rather than
    # assuming chunk 0 always succeeds: the first chunk that is actually
    # written truncates the file and emits the header.
    header_written = False

    ### LOOP CHUNKS
    for chunk in range(chunks):
        start_idx = chunk * chunk_size
        end_idx = start_idx + chunk_size
        chunk_dois = all_references["DOI"].iloc[start_idx:end_idx]
        chunk_titles = all_references['Title'].iloc[start_idx:end_idx]

        # Every chunk gets a fresh retry budget. A single shared counter
        # would stay exhausted after one bad chunk and skip all later ones.
        for attempt in range(max_retries + 1):
            try:
                all_urls = [None] * len(chunk_dois)

                # FIND URLS IN PARALLEL FOR CHUNK
                with ThreadPoolExecutor(max_workers=max_workers) as executor:
                    future_to_index = {
                        executor.submit(find_unpaywall_url, doi): idx
                        for idx, doi in enumerate(chunk_dois)
                    }
                    for future in tqdm(as_completed(future_to_index),
                                       total=len(future_to_index),
                                       desc=f'Fetching URLs for chunk {chunk+1}/{chunks}'):
                        # Store by submission position so the URL order
                        # matches the DOI order despite out-of-order completion.
                        all_urls[future_to_index[future]] = future.result()

                # CREATE DATAFRAME FOR THIS CHUNK
                chunk_df = pd.DataFrame({
                    'Title': chunk_titles.reset_index(drop=True),
                    'DOI': chunk_dois.reset_index(drop=True),
                    'URL': all_urls
                })

                ### WRITE TO CSV
                chunk_df.to_csv(output_path,
                                index=False,
                                mode='a' if header_written else 'w',
                                header=not header_written)
                header_written = True

                ### PAUSE BETWEEN CHUNKS (NOT NEEDED AFTER THE LAST ONE)
                if chunk < chunks - 1:
                    time.sleep(5)
                break

            except Exception as e:
                print(f"Error processing chunk {chunk + 1}: {e}")
                ### IF RATE-LIMITED, SLEEP AND TRY AGAIN
                if attempt < max_retries:
                    print(f"Taking a 30 second break and trying again...")
                    time.sleep(30)
        else:
            # All attempts failed: report it instead of dropping the chunk silently.
            print(f"Giving up on chunk {chunk + 1} after {max_retries + 1} attempts; "
                  f"its rows are missing from {output_path}")

    ### RETURN
    if not header_written:
        # Nothing was written in this run, so there is no fresh file to read.
        return pd.DataFrame(columns=['Title', 'DOI', 'URL'])
    final_df = pd.read_csv(output_path)
    return final_df


In [4]:
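# process_citation_urls(all_references)
### DONE: ALREADY RUN ONCE. THE RESULTS ARE SAVED IN 'levin-citation-downloads.csv', SO THE CALL IS LEFT COMMENTED OUT ###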

In [5]:
# Load the saved lookup results; pandas' default parsing turns the "NA"
# placeholder into a missing value, which is what `notna` tests for below.
compiled_citations = pd.read_csv('levin-citation-downloads.csv')

# Count the rows that carry a URL and express that as a share of all rows.
has_url = compiled_citations['URL'].notna()
not_na_count = has_url.sum()
open_access_pct = (not_na_count / len(compiled_citations))*100

print(f"Number of entries with a URL that is not 'NA': {not_na_count}")
print(f"% of all citations with Open Access PDF: {open_access_pct:.1f}")

Number of entries with a URL that is not 'NA': 9497
% of all citations with Open Access PDF: 41.7


In [6]:
### CREATE THE DOWNLOAD FILENAME
def clean_title(title):
    """Turn a citation title into a lowercase, filesystem-friendly stem.

    Punctuation is removed, every whitespace character (spaces as well as
    tabs and line breaks) becomes an underscore, and the result is cut to
    at most 50 characters and lowercased.

    Parameters
    ----------
    title : str
        The citation title. Non-string values (e.g. the NaN pandas uses for
        a missing title) are accepted and yield the fallback stem.

    Returns
    -------
    str
        The cleaned stem, or ``"untitled"`` when the title is missing or
        nothing is left after cleaning. Stems are not guaranteed to be
        unique: titles sharing their first 50 cleaned characters collide.
    """
    if not isinstance(title, str):
        return "untitled"
    # Keep only word characters and whitespace.
    title = re.sub(r'[^\w\s]', '', title)
    # One underscore per whitespace character. Replacing only ' ' would
    # leave tabs and newlines inside the filename.
    title = re.sub(r'\s', '_', title)
    title = title[:50]
    return title.lower() or "untitled"

def _filename_for(row):
    """Return '<cleaned title>.pdf' for a row with a URL, NaN otherwise."""
    if pd.isnull(row['URL']):
        return np.nan
    return clean_title(row['Title']) + '.pdf'

# Row-wise: only citations that have a download URL get a target filename.
compiled_citations['Filename'] = compiled_citations.apply(_filename_for, axis=1)


In [7]:
#########################
### DOWNLOAD THE PDFs ###
#########################
def download_pdf_from_url(download_url,
                          filename,
                          download_location="/Volumes/Samsung_SSD/levin-references/pdf_downloads/",
                          timeout=60):
    """Download one file and report the outcome as a status string.

    Parameters
    ----------
    download_url : str or None
        Address to fetch. ``None`` or a float NaN (the pandas missing-value
        marker) means "nothing to download".
    filename : str
        Name of the file to create inside ``download_location``.
    download_location : str, optional
        Existing directory that receives the file.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up, so a
        stalled server cannot block the calling thread forever.

    Returns
    -------
    str
        ``"None"`` when there is no URL, a ``[successful pdf download]``
        message on success, otherwise an error description. Download errors
        are reported through the return value rather than raised.
    """
    # Nothing to fetch: return at once, without a request or the pause below.
    if download_url is None or (isinstance(download_url, float) and download_url != download_url):
        return "None"

    ### EMULATE WEB-BROWSER REQUEST
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

    ### RETRY WITH EXPONENTIAL BACKOFF
    retry_strategy = Retry(
        total=3,
        status_forcelist=[403, 429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        backoff_factor=1
    )

    # Stream into a side file and rename only after the full body arrived,
    # so an interrupted transfer never leaves a truncated file under the
    # final name.
    target_path = os.path.join(download_location, filename)
    partial_path = f"{target_path}.part"

    ### ATTEMPT DOWNLOAD
    adapter = HTTPAdapter(max_retries=retry_strategy)
    with requests.Session() as session:
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        try:
            # The context manager releases the streamed connection even if
            # writing fails midway.
            with session.get(download_url, headers=headers, stream=True, timeout=timeout) as response:
                response.raise_for_status()

                with open(partial_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
            os.replace(partial_path, target_path)
            return f"[successful pdf download] -> `{filename}`"
        except requests.exceptions.HTTPError as e:
            return f"HTTP Error downloading {filename}: {e}"
        except Exception as e:
            return f"Error downloading {filename}: {e}"
        finally:
            # Remove a leftover partial file. After a successful rename it no
            # longer exists, which is the expected OSError ignored here.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            ### AVOID RATE-LIMITS
            time.sleep(3)
            
def download_pdfs(all_citations,
                  max_workers=20,
                  output_path='levin-citation-downloads-completed.csv'):
    """Download every citation that has a URL and record a status per row.

    Parameters
    ----------
    all_citations : pandas.DataFrame
        Must provide the columns ``'URL'`` and ``'Filename'``. The frame is
        copied; the caller's object is not modified.
    max_workers : int, optional
        Number of parallel download threads.
    output_path : str, optional
        CSV file the annotated table is written to (without the index).

    Returns
    -------
    pandas.DataFrame
        Copy of ``all_citations`` with an added ``'Download Status'``
        column: the string ``"None"`` for rows without a URL, otherwise the
        message returned by ``download_pdf_from_url`` or a description of
        the exception it raised.
    """
    all_citations = all_citations.copy()

    urls = all_citations['URL'].tolist()
    filenames = all_citations['Filename'].tolist()

    ### TRACK STATUS OF DOWNLOADS
    # Statuses are tracked by row position rather than index label, so the
    # result stays correct when the index contains duplicate labels.
    download_statuses = [None] * len(urls)
    pending = []
    for position, (url, filename) in enumerate(zip(urls, filenames)):
        if pd.isna(url):
            # Same status download_pdf_from_url reports for a missing URL,
            # without occupying a worker thread.
            download_statuses[position] = "None"
        elif pd.isna(filename):
            # Avoid writing the download to a file literally named "nan".
            download_statuses[position] = f"Error downloading: no filename for {url}"
        else:
            pending.append((position, url, filename))

    if pending:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            ### TRACK BY ROW POSITION
            future_to_position = {
                executor.submit(download_pdf_from_url, url, filename): position
                for position, url, filename in pending
            }

            ### DOWNLOAD PDFS
            progress = tqdm(as_completed(future_to_position), total=len(future_to_position), unit="file")
            for future in progress:
                position = future_to_position[future]
                try:
                    download_status = future.result()
                except Exception as e:
                    download_status = f"An exception occurred: {e}"
                download_statuses[position] = download_status
                progress.set_description(f"Processing index {all_citations.index[position]}")

    ### ADD STATUS TO DF
    all_citations['Download Status'] = download_statuses
    all_citations.to_csv(output_path, index=False)
    return all_citations




In [8]:
# download_pdfs(compiled_citations)

### ALREADY RUN ONCE (CALL LEFT COMMENTED OUT TO AVOID RE-DOWNLOADING): 6650 SUCCESSFUL DOWNLOADS ###

  0%|          | 0/22783 [00:00<?, ?file/s]

Unnamed: 0,Title,DOI,URL,Filename,Download Status
0,Immunotherapy: A new target for cancer cure (R...,10.3892/or.2023.8537,https://www.spandidos-publications.com/10.3892...,immunotherapy_a_new_target_for_cancer_cure_rev...,[successful pdf download] -> `immunotherapy_a_...
1,Embryos Assist Each Other’s Morphogenesis: cal...,10.31219/osf.io/ps987,,,
2,"Dormant cancer cells: programmed quiescence, s...",10.1007/s10555-022-10073-z,https://link.springer.com/content/pdf/10.1007/...,dormant_cancer_cells_programmed_quiescence_sen...,[successful pdf download] -> `dormant_cancer_c...
3,Bioelectricity in Developmental Patterning and...,10.3390/cells12081148,https://www.mdpi.com/2073-4409/12/8/1148/pdf?v...,bioelectricity_in_developmental_patterning_and...,[successful pdf download] -> `bioelectricity_i...
4,Cellular Competency during Development Alters ...,10.3390/e25010131,https://www.mdpi.com/1099-4300/25/1/131/pdf?ve...,cellular_competency_during_development_alters_...,[successful pdf download] -> `cellular_compete...
...,...,...,...,...,...
22778,Immune response and anti -inflammatory activ...,10.1016/j.vetimm.2021.110269,https://air.unimi.it/bitstream/2434/847327/4/%...,immune_response_and___anti_inflammatory_activi...,[successful pdf download] -> `immune_response_...
22779,An evaluation of the biological effects of thr...,10.1088/0952-4746/9/1/007,,,
22780,Spontaneous endogenous sup erweak luminescence...,10.1016/0006-3002(60)91616-4,,,
22781,The interpretation and use of the rotation of ...,10.1007/978-3-642-69186-7_19,,,


In [9]:
# Reload the table that download_pdfs saved (including its 'Download Status'
# column), so the results are available without re-running the downloads.
compiled_citations = pd.read_csv('levin-citation-downloads-completed.csv')

In [10]:
# Display the reloaded table as the cell output.
compiled_citations

Unnamed: 0,Title,DOI,URL,Filename,Download Status
0,Immunotherapy: A new target for cancer cure (R...,10.3892/or.2023.8537,https://www.spandidos-publications.com/10.3892...,immunotherapy_a_new_target_for_cancer_cure_rev...,[successful pdf download] -> `immunotherapy_a_...
1,Embryos Assist Each Other’s Morphogenesis: cal...,10.31219/osf.io/ps987,,,
2,"Dormant cancer cells: programmed quiescence, s...",10.1007/s10555-022-10073-z,https://link.springer.com/content/pdf/10.1007/...,dormant_cancer_cells_programmed_quiescence_sen...,[successful pdf download] -> `dormant_cancer_c...
3,Bioelectricity in Developmental Patterning and...,10.3390/cells12081148,https://www.mdpi.com/2073-4409/12/8/1148/pdf?v...,bioelectricity_in_developmental_patterning_and...,[successful pdf download] -> `bioelectricity_i...
4,Cellular Competency during Development Alters ...,10.3390/e25010131,https://www.mdpi.com/1099-4300/25/1/131/pdf?ve...,cellular_competency_during_development_alters_...,[successful pdf download] -> `cellular_compete...
...,...,...,...,...,...
22778,Immune response and anti -inflammatory activ...,10.1016/j.vetimm.2021.110269,https://air.unimi.it/bitstream/2434/847327/4/%...,immune_response_and___anti_inflammatory_activi...,[successful pdf download] -> `immune_response_...
22779,An evaluation of the biological effects of thr...,10.1088/0952-4746/9/1/007,,,
22780,Spontaneous endogenous sup erweak luminescence...,10.1016/0006-3002(60)91616-4,,,
22781,The interpretation and use of the rotation of ...,10.1007/978-3-642-69186-7_19,,,
