In [1]:
from pathlib import Path
import json
from tqdm.auto import tqdm
import re

ARXIV_DIR = Path("/n/data1/hms/dbmi/zaklab/arXiv")

In [2]:
METADATA_FP = ARXIV_DIR / "arxiv-metadata-oai-snapshot.json"

In [3]:
# with open(METADATA_FP, mode='r') as f:
#     metadata = json.load(f)

In [4]:
SEARCH_TERMS = (
    ["AUC", "AUROC", "Area under the receiver operating characteristic", "ROC", "receiver operating characteristic"],
    ["APR", "AUPRC", "Area under the precision recall curve", "Average precision", "PRC", "Precision recall curve"],
)

def query(text: str, text_queries=SEARCH_TERMS):
    match text_queries:
        case list() as or_queries:
            for q in or_queries:
                if query(text, q):
                    return True
            return False
        case str() as q:
            return re.search(r"(?:\W|^)" + q + r"(?:\W|$)", text, flags=re.I)
        case tuple() as and_queries:
            for q in and_queries:
                if not query(text, q):
                    return False
            return True
        case _:
            raise TypeError(f"Can only accept lists (or), tuples (and), and strings (queries). Got {type(text_queries)}")

## Parse HTML

In [None]:
import xml.etree.ElementTree as ET

In [None]:
HTML_FPS = list((ARXIV_DIR / "arxiv").glob("*/html/**/*.html"))

In [None]:
def check_HTML(fp: Path, text_queries):
    try:
        text = fp.read_text(encoding="utf-8", errors="ignore").lower()
        return text, query(text, text_queries)
        
    except UnicodeDecodeError as e:
        print(f"Failed to parse {fp}: {e}")
        return None, False


    # try:
    #     doc = ET.parse(fp)
    # except ET.ParseError as e:
    #     return fp.read_text()

In [8]:
for html_fp in tqdm(HTML_FPS):
    doc, include = check_HTML(html_fp, SEARCH_TERMS)
    if include: break

  0%|          | 0/3001 [00:00<?, ?it/s]

## Parse PDFs

In [None]:
PDF_FPS = list((ARXIV_DIR / "arxiv").glob("*/pdf/**/*.pdf"))

In [6]:
from pypdf import PdfReader

In [7]:
def read_pdf(fp: Path) -> list[str]:
    reader = PdfReader(fp)
    number_of_pages = len(reader.pages)
    return [reader.pages[i].extract_text() for i in range(len(reader.pages))]

In [12]:
for pdf_fp in tqdm(PDF_FPS):
    pgs = read_pdf(pdf_fp)
    doc = '\n'.join(pgs)
    if query(doc, SEARCH_TERMS):
        print(pdf_fp, "Found 1!")
        break

  0%|          | 0/1370153 [00:00<?, ?it/s]

PdfReadError: Missed the stop code in LZWDecode!

## Parse txt

In [5]:
TEXT_FPS = list((ARXIV_DIR / "arxiv_as_txt").glob("*/pdf/**/*.txt"))

In [6]:
scanned_fps_fp = ARXIV_DIR / "scanned_txts.txt"
has_terms_fp = ARXIV_DIR / "has_terms.txt"

if scanned_fps_fp.is_file():
    already_scanned = set(scanned_fps_fp.read_text().split('\n'))
    texts_to_scan = list(set(TEXT_FPS) - already_scanned)
else:
    texts_to_scan = TEXT_FPS

In [7]:
for txt_fp in tqdm(texts_to_scan):
    doc = txt_fp.read_text().lower()
    if query(doc, SEARCH_TERMS):
        print(txt_fp)
        with open(has_terms_fp, mode='a') as f:
            f.write(str(txt_fp.resolve()) + '\n')

  0%|          | 0/1370141 [00:00<?, ?it/s]

/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1612/1612.07025v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1501/1501.06545v2.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1506/1506.02565v3.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/0804/0804.2097v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1701/1701.08816v4.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1001/1001.3355v3.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1602/1602.01107v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1604/1604.07339v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1604/1604.08570v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1304/1304.1063v2.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1701/1701.06236v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1504/1504.06074v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1211/1211.3680v1.txt
/n/data1/hms/dbm

In [8]:
with open(scanned_fps_fp, mode='w') as f:
    f.write('\n'.join(str(x.resolve()) for x in TEXT_FPS))