In [1]:
from pypdf import PdfReader
import numpy as np
import pandas as pd
from google.cloud import storage
import os
from glob import glob
import json
import multiprocessing
from arxiv_public_datasets.arxiv_public_data.fulltext import convert_directory_parallel




In [2]:
total_cpu = multiprocessing.cpu_count()
print(total_cpu)

20


In [3]:
def create_folder(directory_path):
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

In [4]:
def download_many_blobs_with_transfer_manager(
    bucket_name, blob_names, destination_directory="", workers=8
):
    """Download blobs in a list by name, concurrently in a process pool.

    The filename of each blob once downloaded is derived from the blob name and
    the `destination_directory `parameter. For complete control of the filename
    of each blob, use transfer_manager.download_many() instead.

    Directories will be created automatically as needed to accommodate blob
    names that include slashes.
    """

    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"

    # The list of blob names to download. The names of each blobs will also
    # be the name of each destination file (use transfer_manager.download_many()
    # instead to control each destination file name). If there is a "/" in the
    # blob name, then corresponding directories will be created on download.
    # blob_names = ["myblob", "myblob2"]

    # The directory on your computer to which to download all of the files. This
    # string is prepended (with os.path.join()) to the name of each blob to form
    # the full path. Relative paths and absolute paths are both accepted. An
    # empty string means "the current working directory". Note that this
    # parameter allows accepts directory traversal ("../" etc.) and is not
    # intended for unsanitized end user input.
    # destination_directory = ""

    # The maximum number of processes to use for the operation. The performance
    # impact of this value depends on the use case, but smaller files usually
    # benefit from a higher number of processes. Each additional process occupies
    # some CPU and memory resources until finished. Threads can be used instead
    # of processes by passing `worker_type=transfer_manager.THREAD`.
    # workers=8

    from google.cloud.storage import Client, transfer_manager

    storage_client = Client()
    bucket = storage_client.bucket(bucket_name)

    results = transfer_manager.download_many_to_path(
        bucket, blob_names, destination_directory=destination_directory, max_workers=workers
    )

    for name, result in zip(blob_names, results):
        # The results list is either `None` or an exception for each blob in
        # the input list, in order.

        if isinstance(result, Exception):
            print("Failed to download {} due to exception: {}".format(name, result))
        else:
            print("Downloaded {} to {}.".format(name, destination_directory + name))

In [5]:
from google.cloud.storage import Client, transfer_manager

In [6]:
def download_bucket_with_transfer_manager(
    bucket_name, destination_directory="", workers=8, max_results=1000
):
    """Download all of the blobs in a bucket, concurrently in a process pool.

    The filename of each blob once downloaded is derived from the blob name and
    the `destination_directory `parameter. For complete control of the filename
    of each blob, use transfer_manager.download_many() instead.

    Directories will be created automatically as needed, for instance to
    accommodate blob names that include slashes.
    """

    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"

    # The directory on your computer to which to download all of the files. This
    # string is prepended (with os.path.join()) to the name of each blob to form
    # the full path. Relative paths and absolute paths are both accepted. An
    # empty string means "the current working directory". Note that this
    # parameter allows accepts directory traversal ("../" etc.) and is not
    # intended for unsanitized end user input.
    # destination_directory = ""

    # The maximum number of processes to use for the operation. The performance
    # impact of this value depends on the use case, but smaller files usually
    # benefit from a higher number of processes. Each additional process occupies
    # some CPU and memory resources until finished. Threads can be used instead
    # of processes by passing `worker_type=transfer_manager.THREAD`.
    # workers=8

    # The maximum number of results to fetch from bucket.list_blobs(). This
    # sample code fetches all of the blobs up to max_results and queues them all
    # for download at once. Though they will still be executed in batches up to
    # the processes limit, queueing them all at once can be taxing on system
    # memory if buckets are very large. Adjust max_results as needed for your
    # system environment, or set it to None if you are sure the bucket is not
    # too large to hold in memory easily.
    # max_results=1000

    from google.cloud.storage import Client, transfer_manager

    storage_client = Client()
    bucket = storage_client.bucket(bucket_name)

    blob_names = [blob.name for blob in bucket.list_blobs(max_results=max_results)]

    results = transfer_manager.download_many_to_path(
        bucket, blob_names, destination_directory=destination_directory, max_workers=workers
    )

    for name, result in zip(blob_names, results):
        # The results list is either `None` or an exception for each blob in
        # the input list, in order.

        if isinstance(result, Exception):
            print("Failed to download {} due to exception: {}".format(name, result))
        else:
            print("Downloaded {} to {}.".format(name, destination_directory + name))

In [7]:
def download_folder_transfer_manager(bucket_name, bucket_folder_name, local_folder_path, workers=8, max_results=10000):
    """Downloads a folder from the bucket."""

    ## Create the folder if it doesn't exist
    create_folder(local_folder_path)

    ## Create an anonymous client for the bucket
    storage_client = storage.Client.create_anonymous_client()

    ## Get the bucket and list the blobs
    bucket = storage_client.bucket(bucket_name)

    blob_names = [blob.name for blob in bucket.list_blobs(max_results=max_results, prefix=bucket_folder_name)]

    results = storage.transfer_manager.download_many_to_path(
        bucket, blob_names, destination_directory=local_folder_path, max_workers=workers
    )

    for name, result in zip(blob_names, results):
        # The results list is either `None` or an exception for each blob in
        # the input list, in order.

        if isinstance(result, Exception):
            print("Failed to download {} due to exception: {}".format(name, result))
        else:
            print("Downloaded {} to {}.".format(name, local_folder_path + name))

In [8]:
def download_folder_counted(bucket_name, folder_name, local_folder_path, count):
    """Downloads a folder from the bucket."""

    ## Create the folder if it doesn't exist
    create_folder(local_folder_path)

    ## Create an anonymous client for the bucket
    storage_client = storage.Client.create_anonymous_client()

    ## Get the bucket and list the blobs
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=folder_name)

    ## Initiate the counter
    tmp_count = 0

    ## Download the blobs
    for blob in blobs:
        
        ## Check if the counter is less than the count
        if tmp_count < count:

          ## Increment the counter
          tmp_count +=1

          ## Download the file
          filename = blob.name.replace('/', '_')
          blob.download_to_filename(f"{local_folder_path}/{filename}")
        ## If the counter is greater than the count, break the loop
        else:
          break


In [9]:
## Function to delete the original pdfs after they are converted to txt files
def delete_pdfs(directory_path):

    ## Get all pdf files
    pdf_files = glob(f"{directory_path}/*.pdf")
    
    ## Get all txt files
    txt_files = glob(f"{directory_path}/*.txt")
    
    ## Convert to a set for faster searching
    txt_files = set(txt_files)
    

    ## Remove pdf only if there is a corresponding txt file
    for pdf in pdf_files:
        
        ## Get the pdf name
        pdf_name = pdf.split('/')[-1].split('.')[0] + '.' + pdf.split('/')[-1].split('.')[1]
        
        ## Get the txt name
        txt_name = f"{directory_path}/{pdf_name}.txt"
        
        ## Check if txt file exists
        if txt_name in txt_files:
            
            ## Remove the pdf
            os.remove(pdf)

In [10]:
download_folder_transfer_manager(bucket_name='arxiv-dataset', bucket_folder_name=f'arxiv/arxiv/pdf/2301', local_folder_path='tmp_ds', max_results=10)

Downloaded arxiv/arxiv/pdf/2301/2301.00001v1.pdf to tmp_dsarxiv/arxiv/pdf/2301/2301.00001v1.pdf.
Downloaded arxiv/arxiv/pdf/2301/2301.00002v1.pdf to tmp_dsarxiv/arxiv/pdf/2301/2301.00002v1.pdf.
Downloaded arxiv/arxiv/pdf/2301/2301.00003v1.pdf to tmp_dsarxiv/arxiv/pdf/2301/2301.00003v1.pdf.
Downloaded arxiv/arxiv/pdf/2301/2301.00004v1.pdf to tmp_dsarxiv/arxiv/pdf/2301/2301.00004v1.pdf.
Downloaded arxiv/arxiv/pdf/2301/2301.00005v1.pdf to tmp_dsarxiv/arxiv/pdf/2301/2301.00005v1.pdf.
Downloaded arxiv/arxiv/pdf/2301/2301.00006v1.pdf to tmp_dsarxiv/arxiv/pdf/2301/2301.00006v1.pdf.
Downloaded arxiv/arxiv/pdf/2301/2301.00006v2.pdf to tmp_dsarxiv/arxiv/pdf/2301/2301.00006v2.pdf.
Downloaded arxiv/arxiv/pdf/2301/2301.00007v1.pdf to tmp_dsarxiv/arxiv/pdf/2301/2301.00007v1.pdf.
Downloaded arxiv/arxiv/pdf/2301/2301.00007v2.pdf to tmp_dsarxiv/arxiv/pdf/2301/2301.00007v2.pdf.
Downloaded arxiv/arxiv/pdf/2301/2301.00008v1.pdf to tmp_dsarxiv/arxiv/pdf/2301/2301.00008v1.pdf.


In [11]:
# ## Creating a list for the year and month
# yymm_list = np.arange(start=2301, stop=2313, step=1)
# yymm_list = [str(i) for i in yymm_list]
# print(yymm_list)

In [12]:
# ## loop to download the files, convert them to text and delete the pdfs
# for yymm in yymm_list:

#     local_folder_path = 'scientific_dataset_2023/{yymm}'
    
#     ## Download all the pdfs published on Arxiv in the year 20yy and month mm
#     download_folder(bucket_name='arxiv-dataset', folder_name=f'arxiv/arxiv/pdf/{yymm}', local_folder_path=local_folder_path)

#     ## Convert all the pdfs in the yymm directory to text
#     convert_directory_parallel(local_folder_path, total_cpu)

#     ## Delete them pdfs if they have been converted to txts
#     delete_pdfs(local_folder_path)