**Resources:**
- [Kaggle Arxiv dataset](https://www.kaggle.com/datasets/Cornell-University/arxiv)
- [Tooling repo](https://github.com/mattbierbaum/arxiv-public-datasets)

# Downloading Arxiv PDFs

## Using Arxiv metadata

In [None]:
from google.colab import drive
import zipfile
import pandas as pd
import json

In [None]:
# Mount Google Drive, copy the zipped metadata snapshot from Drive into the
# local working directory, then flush and unmount Drive again.
drive.mount('/content/drive/')
!cp "/content/drive/MyDrive/[Personal-route]" "arxiv-metadata-oai-snapshot.json.zip"
drive.flush_and_unmount()

Mounted at /content/drive/


In [None]:
# Open the archive first, then get a binary file handle on the JSON member.
# Keeping the archive in its own variable lets it be closed explicitly later.
snapshot_archive = zipfile.ZipFile("arxiv-metadata-oai-snapshot.json.zip")
json_file = snapshot_archive.open("arxiv-metadata-oai-snapshot.json")

In [None]:
# Read only a bounded preview instead of the whole decompressed snapshot:
# `json_file.read()` with no size pulls the entire member into RAM, which is
# the likely reason this approach kept failing on Colab.
# NOTE(review): assumes the snapshot is newline-delimited JSON (one record per
# line) so that each readline() returns a complete record -- confirm.
PREVIEW_LINES = 50
data = b"".join(json_file.readline() for _ in range(PREVIEW_LINES))

In [None]:
# Preview the first 20000 characters without decoding the whole buffer.
# A UTF-8 character is at most 4 bytes, so the first 80000 bytes always contain
# the first 20000 characters; errors="ignore" only drops a character that the
# byte slice may have cut in half at the very end.
data[:4 * 20000].decode("utf-8", errors="ignore")[:20000]

This approach keeps failing, so the next section fetches the papers from Google Cloud Storage instead.

## Using Google Cloud Storage

In [None]:
from random import sample
from google.cloud import storage
from google.colab import auth

In [None]:
# List the paper categories (top-level folders under arxiv/ in the bucket)
!gsutil ls gs://arxiv-dataset/arxiv/

gs://arxiv-dataset/arxiv/acc-phys/
gs://arxiv-dataset/arxiv/adap-org/
gs://arxiv-dataset/arxiv/alg-geom/
gs://arxiv-dataset/arxiv/ao-sci/
gs://arxiv-dataset/arxiv/arxiv/
gs://arxiv-dataset/arxiv/astro-ph/
gs://arxiv-dataset/arxiv/atom-ph/
gs://arxiv-dataset/arxiv/bayes-an/
gs://arxiv-dataset/arxiv/chao-dyn/
gs://arxiv-dataset/arxiv/chem-ph/
gs://arxiv-dataset/arxiv/cmp-lg/
gs://arxiv-dataset/arxiv/comp-gas/
gs://arxiv-dataset/arxiv/cond-mat/
gs://arxiv-dataset/arxiv/cs/
gs://arxiv-dataset/arxiv/dg-ga/
gs://arxiv-dataset/arxiv/funct-an/
gs://arxiv-dataset/arxiv/gr-qc/
gs://arxiv-dataset/arxiv/hep-ex/
gs://arxiv-dataset/arxiv/hep-lat/
gs://arxiv-dataset/arxiv/hep-ph/
gs://arxiv-dataset/arxiv/hep-th/
gs://arxiv-dataset/arxiv/math-ph/
gs://arxiv-dataset/arxiv/math/
gs://arxiv-dataset/arxiv/mtrl-th/
gs://arxiv-dataset/arxiv/nlin/
gs://arxiv-dataset/arxiv/nucl-ex/
gs://arxiv-dataset/arxiv/nucl-th/
gs://arxiv-dataset/arxiv/patt-sol/
gs://arxiv-dataset/arxiv/physics/
gs://arxiv-dataset/arxiv/p

In [None]:
# List every object matching cs/pdf/**/*.pdf in the bucket and save the
# listing (one gs:// URL per line) to files.txt
!gsutil ls gs://arxiv-dataset/arxiv/cs/pdf/**/*.pdf > files.txt

In [None]:
# Load the saved listing into a list with one entry per line.
with open("files.txt", "r") as fp:
  files = fp.read().splitlines()

# Report how many objects the listing contains.
print(f"Total files: {len(files)}")

Total files: 9882


Let's take a random sample of 500 files from it

In [None]:
from random import Random

# Draw the sample from a dedicated, seeded generator so that re-running the
# notebook selects the same files, and clamp the size so a listing shorter
# than SAMPLE_SIZE does not raise ValueError.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42
sample_files = Random(SAMPLE_SEED).sample(files, min(SAMPLE_SIZE, len(files)))

In [None]:
# Persist the sampled URLs, one per line, so they can be piped to gsutil.
with open("sample_files.txt", "w") as fp:
  fp.writelines(f"{file_}\n" for file_ in sample_files)

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive/ so the sampled PDFs can be copied into it.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!cat sample_files.txt | gsutil -m cp -I "/content/drive/MyDrive/[Personal-route]

Copying gs://arxiv-dataset/arxiv/cs/pdf/0605/0605141v1.pdf...
/ [0 files][    0.0 B/360.6 KiB]                                                Copying gs://arxiv-dataset/arxiv/cs/pdf/0507/0507012v1.pdf...
/ [0 files][    0.0 B/726.8 KiB]                                                Copying gs://arxiv-dataset/arxiv/cs/pdf/0405/0405006v1.pdf...
/ [0 files][    0.0 B/911.6 KiB]                                                Copying gs://arxiv-dataset/arxiv/cs/pdf/0310/0310004v1.pdf...
/ [0 files][    0.0 B/  1.0 MiB]                                                Copying gs://arxiv-dataset/arxiv/cs/pdf/0506/0506045v1.pdf...
Copying gs://arxiv-dataset/arxiv/cs/pdf/0607/0607135v1.pdf...
Copying gs://arxiv-dataset/arxiv/cs/pdf/0607/0607104v1.pdf...
Copying gs://arxiv-dataset/arxiv/cs/pdf/0703/0703018v9.pdf...
Copying gs://arxiv-dataset/arxiv/cs/pdf/0607/0607120v1.pdf...
Copying gs://arxiv-dataset/arxiv/cs/pdf/0508/0508037v2.pdf...
Copying gs://arxiv-dataset/arxiv/cs/pdf/0306/0306054v1.p

In [None]:
# Flush pending writes to Google Drive and unmount it.
drive.flush_and_unmount()

# Extracting plain text

In [None]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20211012-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 24.5 MB/s 
Collecting cryptography
  Downloading cryptography-36.0.2-cp36-abi3-manylinux_2_24_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 40.9 MB/s 
Installing collected packages: cryptography, pdfminer.six
Successfully installed cryptography-36.0.2 pdfminer.six-20211012


In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive again for the text-extraction step; force_remount=True
# asks Colab to remount even if the drive is still mounted from an earlier cell.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
from pdfminer.high_level import extract_text
import os

In [None]:
# Smoke test: extract the plain text of one PDF and save it next to the corpus.
# NOTE(review): both paths are redacted placeholders here; make sure txt_file
# points at a different file than pdf_file, or the PDF will be overwritten.
pdf_file = "/content/drive/MyDrive/[Personal-route]"
txt_file = "/content/drive/MyDrive/[Personal-route]"

text = extract_text(pdf_file)
# Write-only mode is enough, and an explicit UTF-8 encoding keeps non-ASCII
# characters from the paper (math symbols, accented names) from failing on
# platforms whose default encoding is not UTF-8.
with open(txt_file, "w", encoding="utf-8") as f:
  f.write(text)

In [None]:
# Convert every PDF in pdf_files_dir to a plain-text file of the same base
# name in txt_files_dir.
pdf_files_dir = "/content/drive/MyDrive/[Personal-route]"
txt_files_dir = "/content/drive/MyDrive/[Personal-route]"

# Make sure the output folder exists before writing into it.
os.makedirs(txt_files_dir, exist_ok=True)

# Sorted for a deterministic processing order across runs.
for filename in sorted(os.listdir(pdf_files_dir)):
  stem, extension = os.path.splitext(filename)
  # Skip anything that is not a PDF (e.g. previously written .txt files when
  # both folders are the same, or stray files in the Drive folder).
  if extension.lower() != ".pdf":
    continue

  pdf_file = os.path.join(pdf_files_dir, filename)
  # Build the output name from the stem so only the extension changes, even if
  # ".pdf" also appears elsewhere in the path.
  txt_file = os.path.join(txt_files_dir, f"{stem}.txt")

  text = extract_text(pdf_file)
  # Explicit UTF-8 so non-ASCII characters in the papers are written reliably.
  with open(txt_file, "w", encoding="utf-8") as f:
    f.write(text)

