In [20]:
from datasets import load_dataset

# Load a single parquet shard (the 2007 file) from the Hub dataset repo.
# NOTE(review): verification_mode="no_checks" presumably skips integrity
# verification of the downloaded file — confirm against the datasets docs.
dataset = load_dataset(
    "bluuebunny/arxiv_metadata_by_year",
    data_files="data/arxiv_metadata_2007.parquet",
    verification_mode="no_checks",
)

In [24]:
# Display the DatasetDict summary: available splits, feature names, and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed'],
        num_rows: 42328
    })
})

In [23]:
# Materialize the 'train' split as a pandas DataFrame for tabular inspection.
df = dataset["train"].to_pandas()

# Preview the first rows of the frame as the cell's rich output.
df.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"b'[{""version"":""v1"",""created"":""Mon, 2 Apr 2007 ...",2008-11-26,"b'[[""Bal\\u00e1zs"",""C."",""""],[""Berger"",""E. L."",..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","b'[{""version"":""v1"",""created"":""Sat, 31 Mar 2007...",2008-12-13,"b'[[""Streinu"",""Ileana"",""""],[""Theran"",""Louis"",""..."
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"b'[{""version"":""v1"",""created"":""Sun, 1 Apr 2007 ...",2008-01-13,"b'[[""Pan"",""Hongjun"",""""]]'"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"b'[{""version"":""v1"",""created"":""Sat, 31 Mar 2007...",2007-05-23,"b'[[""Callan"",""David"",""""]]'"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"b'[{""version"":""v1"",""created"":""Mon, 2 Apr 2007 ...",2013-10-15,"b'[[""Abu-Shammala"",""Wael"",""""],[""Torchinsky"",""A..."


In [17]:
# Column dtypes and non-null counts. The memory figure printed here is a
# shallow estimate for object columns (pandas marks it with a trailing "+").
df.info()

# Shallow in-memory footprint in MiB (bytes / 1024**2). Only the last
# expression of a cell is displayed, so the intermediate memory_usage() calls
# that used to precede this line were dead work and have been removed.
# Pass deep=True to memory_usage() to also count the contents of object columns.
df.memory_usage().sum() / 1024**2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42328 entries, 0 to 42327
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              42328 non-null  object
 1   submitter       42328 non-null  object
 2   authors         42328 non-null  object
 3   title           42328 non-null  object
 4   comments        37317 non-null  object
 5   journal-ref     21920 non-null  object
 6   doi             25735 non-null  object
 7   report-no       4189 non-null   object
 8   categories      42328 non-null  object
 9   license         4135 non-null   object
 10  abstract        42328 non-null  object
 11  versions        42328 non-null  object
 12  update_date     42328 non-null  object
 13  authors_parsed  42328 non-null  object
dtypes: object(14)
memory usage: 4.5+ MB


4.521244049072266

In [None]:
print(df.shape)  # Dimensions of the DataFrame as (n_rows, n_columns)

In [None]:
import numpy as np
import os
from google.cloud.storage import Client, transfer_manager
from multiprocessing import Pool

In [None]:
## Function to create a folder if it doesn't exist
def create_folder(directory_path):
    """Create ``directory_path`` (and any missing parents) if it does not exist.

    Calling this for a directory that already exists is a no-op.

    Args:
        directory_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory_path`` exists but is not a directory.
    """
    # exist_ok=True folds the existence check into the creation call, so
    # another process or worker creating the same directory between an
    # exists() test and makedirs() can no longer make this raise.
    os.makedirs(directory_path, exist_ok=True)


In [None]:
def download_folder_transfer_manager(bucket_name, bucket_folder_name, local_folder_path, workers=None, max_results=10000):
    """Download the blobs under a bucket prefix into a local folder.

    The bucket is accessed through an anonymous client, the blobs whose names
    start with ``bucket_folder_name`` are listed, and they are fetched with
    the storage transfer manager. One status line is printed per blob.

    Args:
        bucket_name: Name of the bucket to read from.
        bucket_folder_name: Blob-name prefix selecting which blobs to download.
        local_folder_path: Local destination directory; created if missing.
        workers: Maximum number of transfer workers. ``None`` (the default)
            resolves to the machine's CPU count at call time.
        max_results: Upper bound on the number of blobs listed.

    Returns:
        None.
    """
    # Resolve the default lazily. The previous default expression
    # (``Pool()._processes``) spawned a process pool when the function was
    # defined, never closed it, and relied on a private attribute.
    if workers is None:
        workers = os.cpu_count() or 1

    ## Create the folder if it doesn't exist
    create_folder(local_folder_path)

    ## Create an anonymous client for the bucket
    storage_client = Client.create_anonymous_client()

    ## Get the bucket and list the blobs
    bucket = storage_client.bucket(bucket_name)

    blob_names = [blob.name for blob in bucket.list_blobs(max_results=max_results, prefix=bucket_folder_name)]

    results = transfer_manager.download_many_to_path(
        bucket, blob_names, destination_directory=local_folder_path, max_workers=workers
    )

    for name, result in zip(blob_names, results):
        # The results list is either `None` or an exception for each blob in
        # the input list, in order.

        if isinstance(result, Exception):
            print("Failed to download {} due to exception: {}".format(name, result))
        else:
            # Join with a path separator so the message shows a real path;
            # plain concatenation produced e.g. "tmp_dsarxiv/...".
            print("Downloaded {} to {}.".format(name, os.path.join(local_folder_path, name)))


In [None]:
# Month identifier (yymm form) that selects the bucket sub-folder to fetch.
yymm = 1901

# Fetch a small sample (at most 10 listed blobs) into ./tmp_ds.
download_folder_transfer_manager(
    bucket_name="arxiv-dataset",
    bucket_folder_name=f"arxiv/arxiv/pdf/{yymm}",
    local_folder_path="tmp_ds",
    max_results=10,
)

In [None]:
## Build the list of yymm values for every month from start_year through end_year (inclusive)
def create_yymm_list(start_year, end_year):
    """Return the yymm code of every month from start_year through end_year.

    Each entry is a four-character, zero-padded string: two digits of year
    followed by two digits of month (e.g. ``"0704"``, ``"1112"``). Entries
    are ordered chronologically.

    Args:
        start_year: First year to include. Only the last two digits are
            used, so ``7`` and ``2007`` both produce ``"07.."`` codes.
        end_year: Last year to include (inclusive).

    Returns:
        A list of ``str`` with 12 entries per year; empty when
        ``end_year < start_year``.
    """
    # Always format as a string. The earlier version returned str for years
    # 1-9 but int for everything else, and left year 0 unpadded.
    return [
        f"{year % 100:02d}{month:02d}"
        for year in range(start_year, end_year + 1)
        for month in range(1, 13)
    ]

# All twelve yymm codes for year 11, normalized to strings; the bare name on
# the last line makes the list the cell's displayed result.
yymm_list = list(map(str, create_yymm_list(11, 11)))
yymm_list

In [None]:
# Join the yymm codes into one '|'-separated string.
# NOTE(review): presumably intended as a regex alternation pattern — confirm intended use.
'|'.join(yymm_list)