In [3]:
import requests
import json
import time
import random
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import socket

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options

In [7]:
class MDPI_MVP:
    '''
    Minimal crawler that stores the HTML page, the PDF and the registry row
    of every article in a slice of a pipe-separated registry CSV.

    The registry is expected to provide the columns `html_url` and `pdf_url`
    (the only ones this class reads).
    '''

    # Seconds to wait for the PDF server before giving up on a single request.
    REQUEST_TIMEOUT = 60.0

    def __init__(self,
                 i_start: int = 0,
                 i_delta: int = 8000,
                 crawl_delay: float = 8.0,
                 download_dir=None,
                 registry_path='./registry/mdpi_database.csv'):
        '''
        Load the registry and pick a shuffled slice of it.

        Parameters
        ----------
        i_start : int
            Positional index of the first registry row to process.
        i_delta : int
            Number of registry rows to process, starting at `i_start`.
        crawl_delay : float
            Seconds to sleep after each HTML page load.
        download_dir : str or Path, optional
            Existing directory that receives the `pdf`, `html` and `csv`
            sub-directories. When omitted, a host-specific default is chosen
            from the machine's hostname.
        registry_path : str or Path
            Location of the pipe-separated registry CSV.

        Raises
        ------
        AssertionError
            If the download directory does not exist.
        '''
        # store path
        if download_dir is not None:
            self.download_dir = Path(download_dir)
        elif 'lambda' in socket.gethostname():
            self.download_dir = Path('/homes/csiebenschuh/Projects/dataprep/data/mdpi')
        else:
            self.download_dir = Path('/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/raw_data/mdpi') # Polaris
        assert self.download_dir.is_dir(), f"Initializing `MDPI_MVP` failed as {self.download_dir} does not exist"

        self.crawl_delay = crawl_delay
        df = pd.read_csv(registry_path, sep='|')

        # subset, then shuffle the processing order (a single shuffle is sufficient)
        df_sub = df.iloc[i_start:i_start + i_delta]
        self.df_sub = df_sub.sample(frac=1).reset_index(drop=True)

    @staticmethod
    def _file_stem(html_url):
        '''
        Derive the output file stem from an article URL: everything after
        `www.mdpi.com/`, with `/` replaced by `.`.
        '''
        return str(html_url).split('www.mdpi.com/')[-1].replace('/', '.')

    def get_arxiv_articles_with_html(self,):
        '''
        Attempt to download the HTML page and the PDF of every selected row.

        For each row the HTML is fetched with headless Chrome and the PDF
        with `requests`. Files are written only when the PDF request returns
        status 200; in that case `<stem>.html`, `<stem>.pdf` and `<stem>.csv`
        (the registry row) are stored in the `html`, `pdf` and `csv`
        sub-directories of the download directory. A failed or non-200 PDF
        request is reported via `print` and the row is skipped.

        Raises
        ------
        AssertionError
            If the download directory is not a directory.
        '''

        # setup directories if needed
        download_dir = Path(self.download_dir)
        assert download_dir.is_dir(), "`download_dir` invalid directory path"

        pdf_path = download_dir / 'pdf'
        html_path = download_dir / 'html'
        csv_path = download_dir / 'csv'
        for sub_dir in (pdf_path, html_path, csv_path):
            sub_dir.mkdir(parents=True, exist_ok=True)

        # driver options
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)

        try:
            # loop entries
            for _, row in self.df_sub.iterrows():
                html_url = row['html_url']
                pdf_url = row['pdf_url']
                file_stem = self._file_stem(html_url)

                # HTML
                driver.get(html_url)
                html_content = driver.page_source

                # wait
                time.sleep(self.crawl_delay)

                # PDF -- a timeout keeps one stalled connection from blocking the crawl
                try:
                    pdf_response = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT)
                except requests.RequestException as err:
                    print(f'nothing written, request failed: {err}')
                    continue

                if pdf_response.status_code != 200:
                    print(f'nothing written, {pdf_response.status_code}')
                    continue

                # Save HTML content to file
                with open(html_path / f'{file_stem}.html', 'w', encoding='utf-8') as file:
                    file.write(html_content)

                # Save PDF content to file
                with open(pdf_path / f'{file_stem}.pdf', 'wb') as pdf_file:
                    pdf_file.write(pdf_response.content)

                # Meta
                row.to_csv(csv_path / f'{file_stem}.csv', sep='|')

                # wait again
                time.sleep(random.uniform(0.2, 1.5))
        finally:
            # always shut the browser down, even if the loop raised
            driver.quit()


In [8]:
# Build the crawler with its default arguments (i_start=0, i_delta=8000, crawl_delay=8.0).
m = MDPI_MVP()

In [10]:
# Run the crawl over the selected registry rows; HTML, PDF and CSV files are written under the download directory.
m.get_arxiv_articles_with_html()