In [None]:
import os
import json
import requests
from pathlib import Path
from urllib.parse import urlparse

In [2]:
def clean_filename(name: str) -> str:
    invalid_chars = ':*?"<>|/'
    for char in invalid_chars:
        name = name.replace(char, '-')
    return name    

In [46]:
def empty_pdfs_folder(folder_path: str = "data/pdfs"):
    if not os.path.isdir(folder_path):
        print(f"Error: The path '{folder_path}' is not a valid folder.")
        return
    for pdf_file in os.listdir(folder_path):
        complete_path = os.path.join(folder_path, pdf_file)
        os.remove(complete_path)
    print(f"The folder '{folder_path}' has been emptied.")


def download_pdfs(json_path: str = "data/papers_data.json", dest_folder: str = "/pdfs"):
    base_dest_path = Path("../data") / dest_folder.strip('/')

    # Create new folder to save the PDFs if it doesn't already exist
    base_dest_path.mkdir(parents=True, exist_ok=True)
    # Empty the destination folder before downloading new PDFs
    empty_pdfs_folder("data" + dest_folder)
    
    # Load the JSON file
    with open(json_path, "r", encoding="utf-8") as j:
        paper_list = json.load(j)
    
    pdf_counter = 1
    total_pdf_number = len(paper_list)

    for i, paper in enumerate(paper_list):
        # Obtain URL and title for each paper (cleaning the title from invalid characters like ':')
        pdf_url = paper['PDF URL']
        pdf_title = clean_filename(paper['Title']) + ".pdf"

        # Download the paper PDF
        response = requests.get(pdf_url, stream=True)
        response.raise_for_status()  # Check if the request was successful

        # Stablish the path to save the PDF
        pdf_path = base_dest_path / pdf_title
    
        # Write the PDF content to a file
        with open(pdf_path, 'wb') as pdf_file:
            for chunk in response.iter_content(chunk_size=8192):
                pdf_file.write(chunk)

        paper['Local PDF Path'] = str(pdf_path)
        
        print(f"({pdf_counter}/{total_pdf_number}) {pdf_title} downloaded.")
        pdf_counter += 1

    with open(json_path, "w", encoding="utf-8") as j:
        json.dump(paper_list, j, indent=4)
    
    print(f"All {total_pdf_number} PDFs downloaded.")

In [47]:
download_pdfs() 

The folder 'data/pdfs' has been emptied.
(1/40) KG^2- Learning to Reason Science Exam Questions with Contextual Knowledge Graph Embeddings.pdf downloaded.
(2/40) Incorporating Literals into Knowledge Graph Embeddings.pdf downloaded.
(3/40) Adversarial Contrastive Estimation.pdf downloaded.
(4/40) KBGAN- Adversarial Learning for Knowledge Graph Embeddings.pdf downloaded.
(5/40) Convolutional 2D Knowledge Graph Embeddings.pdf downloaded.
(6/40) Answering Visual-Relational Queries in Web-Extracted Knowledge Graphs.pdf downloaded.
(7/40) Expeditious Generation of Knowledge Graph Embeddings.pdf downloaded.
(8/40) Learning Knowledge Graph Embeddings with Type Regularizer.pdf downloaded.
(9/40) Analysis of the Impact of Negative Sampling on Link Prediction in Knowledge Graphs.pdf downloaded.
(10/40) DeepPath- A Reinforcement Learning Method for Knowledge Graph Reasoning.pdf downloaded.
(11/40) Inducing Interpretability in Knowledge Graph Embeddings.pdf downloaded.
(12/40) Fast Linear Model fo

In [26]:
empty_pdfs_folder()

The folder 'data\pdfs' has been emptied.
