In [2]:
from glob import glob
import pandas as pd
from datasets import load_dataset
import os
from huggingface_hub import HfApi




In [3]:
## Verify trimmed parquet files
# Gather every per-year parquet file and keep the paths in sorted order;
# the bare name on the last line displays the list as the cell output.
parquet_files = sorted(glob('arxiv_metadata_by_year/*.parquet'))
parquet_files

['arxiv_metadata_by_year\\arxiv_metadata_2007.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2008.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2009.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2010.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2011.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2012.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2013.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2014.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2015.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2016.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2017.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2018.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2019.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2020.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2021.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2022.parquet',
 'arxiv_metadata_by_year\\arxiv_metadata_2023.parquet']

In [None]:
## Write an inspection report for every parquet file to a single text file.
# For each path in `parquet_files` the report contains: shape, head/tail/random
# rows, dtypes, describe(), null counts, unique counts, column index, the
# DataFrame.info() summary and memory usage (per column, total bytes, total MiB).
#
# encoding is pinned to UTF-8: without it open() uses the platform default
# codec, which can raise UnicodeEncodeError if the frames contain characters
# that codec cannot represent.
with open('arxiv_metadata_by_year/arxiv_metadata.txt', 'w', encoding='utf-8') as f:
    for file in parquet_files:

        df = pd.read_parquet(file)

        f.write('#'*80 + '\n')
        f.write(f'{file} has {df.shape[0]} rows and {df.shape[1]} columns\n')
        f.write('Printing first 5 rows of the dataframe:\n')
        f.write(f'{df.head()}\n\n')
        f.write('Printing last 5 rows of the dataframe:\n')
        f.write(f'{df.tail()}\n\n')
        f.write('Printing random 5 rows of the dataframe:\n')
        # sample(5) raises ValueError on frames with fewer than 5 rows,
        # so cap the sample size at the number of available rows.
        f.write(f'{df.sample(min(5, len(df)))}\n\n')
        f.write('Printing data types of the dataframe:\n')
        f.write(f'{df.dtypes}\n\n')
        f.write('Printing summary of the dataframe:\n')
        f.write(f'{df.describe()}\n\n')
        f.write('Printing missing values of the dataframe:\n')
        f.write(f'{df.isnull().sum()}\n\n')
        f.write('Printing unique values of the dataframe:\n')
        f.write(f'{df.nunique()}\n\n')
        f.write('Printing columns of the dataframe:\n')
        f.write(f'{df.columns}\n\n')
        f.write('Printing memory usage of the dataframe:\n')
        # DataFrame.info() prints to stdout and returns None, so formatting its
        # return value would write the literal text "None". Passing buf=f sends
        # the summary into the report file instead.
        df.info(buf=f)
        f.write('\n\n')
        # Per-column bytes, then total bytes, then total in MiB.
        f.write(f'{df.memory_usage()}\n\n')
        f.write(f'{df.memory_usage().sum()}\n\n')
        f.write(f'{df.memory_usage().sum() / 1024**2}\n\n')
        f.write('#'*80 + '\n')

In [None]:
## Setup the Hugging Face API
# The token is taken from the HUGGINGFACE_TOKEN environment variable so it is
# never hardcoded in the notebook; the lookup yields None when it is unset.
access_token = os.getenv("HUGGINGFACE_TOKEN")
api = HfApi(token=access_token)

## Verify the API
# Keep the whoami() result in `user` and display it as the cell output.
user = api.whoami()
user


In [7]:
# Local folder to upload, target repository id, and destination path in the repo.
folder_path = "arxiv_metadata_by_year"
repo_id = "bluuebunny/arxiv_metadata_by_year"  # Replace with your details
subfolder = "data"  # Optional subfolder within the repository

# Upload all files within the folder to the specified repository
# NOTE(review): no filename filter is passed, so any non-parquet file in the
# folder (e.g. a generated text report) is presumably uploaded too - confirm.
api.upload_folder(
    folder_path=folder_path,
    repo_id=repo_id,
    repo_type="dataset",
    path_in_repo=subfolder,
)


arxiv_metadata_2007.parquet:   0%|          | 0.00/29.8M [00:00<?, ?B/s]
[A


[A[A[A

[A[A



[A[A[A[A
[A

arxiv_metadata_2007.parquet:   0%|          | 16.4k/29.8M [00:00<15:43, 31.6kB/s]



arxiv_metadata_2007.parquet:   2%|▏         | 639k/29.8M [00:00<00:21, 1.37MB/s] 

[A[A
[A



[A[A[A[A
[A



arxiv_metadata_2007.parquet:   5%|▌         | 1.56M/29.8M [00:00<00:09, 3.12MB/s]

[A[A



arxiv_metadata_2007.parquet:   7%|▋         | 2.21M/29.8M [00:00<00:07, 3.70MB/s]
[A

[A[A
arxiv_metadata_2007.parquet:  18%|█▊        | 5.37M/29.8M [00:01<00:02, 9.22MB/s]
[A



[A[A[A[A

[A[A



[A[A[A[A

[A[A
[A



[A[A[A[A

[A[A

arxiv_metadata_2007.parquet:  22%|██▏       | 6.54M/29.8M [00:01<00:04, 5.06MB/s]
arxiv_metadata_2007.parquet:  25%|██▍       | 7.41M/29.8M [00:01<00:04, 5.55MB/s]
[A

arxiv_metadata_2007.parquet:  28%|██▊       | 8.40M/29.8M [00:01<00:03, 6.38MB/s]
[A

arxiv_metadata_2007.parquet:  32%|███▏      | 9.50M/29.8M [00:01<00:02, 7.

CommitInfo(commit_url='https://huggingface.co/datasets/bluuebunny/arxiv_metadata_by_year/commit/11e29059fad6cde300c3c1de74234e2e42df84e4', commit_message='Upload folder using huggingface_hub', commit_description='', oid='11e29059fad6cde300c3c1de74234e2e42df84e4', pr_url=None, pr_revision=None, pr_num=None)