In [2]:
from pathlib import Path
import copy
import json
import pickle
from tqdm.auto import tqdm
import re
from openai import OpenAI

import tiktoken
# Tokenizer that tiktoken associates with the "gpt-4" model name; n_tokens() uses it to count tokens.
enc = tiktoken.encoding_for_model("gpt-4")

# Root directory that holds every input list and the result cache used by this notebook.
ARXIV_DIR = Path("/n/data1/hms/dbmi/zaklab/arXiv")
# Pickle cache of earlier runs: loaded near the top of the notebook, rewritten at the end.
OPENAI_RESULTS_FP = ARXIV_DIR.joinpath("openai_results.pkl")
# Text file with one candidate file path per line (the documents to check).
has_terms_fp = ARXIV_DIR.joinpath("has_terms.txt")

In [3]:
# Line-oriented text file; its line count is reported below as the number of processed files.
scanned_fps_fp = ARXIV_DIR.joinpath("scanned_txts.txt")

In [5]:
# Count only non-blank lines. A plain split('\n') also counts the empty string
# that follows a trailing newline (and any blank lines), which overstates the
# total; the has_terms.txt loader filters blanks the same way.
n_files = sum(1 for line in scanned_fps_fp.read_text().split('\n') if line.strip())
print(f"Processed {n_files} files total.")

Processed 863282 files total.


In [2]:
SEARCH_TERMS = (
    ["AUC", "AUROC", "Area under the receiver operating characteristic", "ROC", "receiver operating characteristic"],
    ["APR", "AUPRC", "Area under the precision recall curve", "Average precision", "PRC", "Precision recall curve"],
)

def query(text: str, text_queries=SEARCH_TERMS):
    match text_queries:
        case list() as or_queries:
            for q in or_queries:
                if query(text, q):
                    return True
            return False
        case str() as q:
            return re.search(r"(?:\W|^)" + q + r"(?:\W|$)", text, flags=re.I)
        case tuple() as and_queries:
            for q in and_queries:
                if not query(text, q):
                    return False
            return True
        case _:
            raise TypeError(f"Can only accept lists (or), tuples (and), and strings (queries). Got {type(text_queries)}")

In [3]:
# has_terms.txt holds one candidate file path per line: trim surrounding
# whitespace from each line and discard the ones that end up empty.
with_terms_fps = list(filter(None, map(str.strip, has_terms_fp.read_text().split('\n'))))
print(f"Loaded {len(with_terms_fps)} files to check.")

Loaded 1686 files to check.


In [4]:
# Resume support: pull in whatever an earlier run saved so later cells can skip
# files that were already handled. With no cache on disk, start from empty lists.
if not OPENAI_RESULTS_FP.is_file():
    old_with_terms_fps, old_with_terms_chunk, old_with_terms_openai, old_final_docs = [], [], [], []
else:
    # NOTE: pickle.load executes arbitrary code embedded in the file; only load
    # a cache that this notebook itself wrote.
    with OPENAI_RESULTS_FP.open(mode='rb') as f:
        old_results = pickle.load(f)
    old_with_terms_fps = old_results["with_terms_fps"]
    old_with_terms_chunk = old_results["with_terms_chunk"]
    old_with_terms_openai = old_results["with_terms_openai"]
    old_final_docs = old_results["final_docs"]

### Pre-filter by chunks

In [14]:
def check_text_detailed(fp: Path, chunk_size: int, offset: int) -> list[tuple[int, str]]:
    """Find the sliding word-windows of a document that satisfy ``query``.

    The file is read, lower-cased and split on whitespace. A window of up to
    ``chunk_size`` words is taken every ``offset`` words (so consecutive
    windows overlap when ``offset < chunk_size``), re-joined with single
    spaces, and tested with ``query`` using its default search terms.

    Args:
        fp: Text file to scan.
        chunk_size: Maximum number of words per window; windows that start
            near the end of the document are shorter.
        offset: Step, in words, between consecutive window starts.

    Returns:
        ``(start_word_index, chunk_text)`` for every matching window, in
        document order; an empty list when nothing matches. (The previous
        ``list[str]`` annotation did not describe this.)
    """
    as_words = fp.read_text().lower().split()
    windows = (
        (st, ' '.join(as_words[st:st + chunk_size]))
        for st in range(0, len(as_words), offset)
    )
    return [(st, chunk) for st, chunk in windows if query(chunk)]

In [15]:
# Start from the chunk matches cached by a previous run; only files that the
# previous run did not see are scanned below.
with_terms_localized = copy.deepcopy(old_with_terms_chunk)

# Set lookup keeps the "already handled?" test constant-time per file instead
# of re-scanning the whole list of old paths on every iteration.
already_processed = set(old_with_terms_fps)

for fp in tqdm(with_terms_fps):
    if fp in already_processed: continue
    # 512-word windows taken every 128 words.
    chunks = check_text_detailed(Path(fp), 512, 128)
    if chunks:
        with_terms_localized.append((fp, chunks))

  0%|          | 0/1686 [00:00<?, ?it/s]

In [19]:
# Size of with_terms_localized: matches carried over from the cache plus documents matched in this run.
print(f"Found total of {len(with_terms_localized)} documents with relevant chunks.")

Found total of 516 documents with relevant chunks.


In [31]:
def n_tokens(doc: str) -> int:
    """Return how many tokens the module-level ``enc`` encoder produces for ``doc``."""
    token_ids = enc.encode(doc)
    return len(token_ids)

MODELS = {
    "GPT-4 Turbo": (0.01, 0.03, 128000),
    "GPT-3.5 Turbo": (0.001, 0.002, 16000),
    "GPT-4": (0.03, 0.06, 8192),
}

def profile_cost(fp: str | Path) -> dict[str, int]:
    n = n_tokens((Path(fp) if type(fp) is str else fp).read_text())
    
    cost_dict = {k: in_c*n + 100*out_c for k, (in_c, out_c, _) in MODELS.items()}
    return cost_dict

### Query with GPT-3.5

In [23]:
# System prompt shared by both model passes in this notebook (it is passed as
# `system_prompt` to the chunk-level and to the full-document check). It asks
# for a JSON object with a 'claims' list; the callers read only the "claims"
# key of the parsed reply. The text is sent to the model verbatim, so any edit
# here changes model behaviour.
SYSTEM_PROMPT = (
    "You are an expert in machine learning and scientific literature review. "
    "For each chunk of a published paper (which may have typos, misspellings, and odd characters as a result of conversion from PDF), "
    "return a JSON object that states whether or not the paper makes any claim that the area under the precision recall curve (AUPRC) "
    "is superior as a general performance metric to the area under the receiver operating characteristic (AUROC) in an ML setting. "
    "A paper claiming that a model performs better under AUPRC vs. AUROC is *not* an example of this; instead a paper claiming that AUPRC "
    "should be used instead of AUROC in cases of class imbalance is an example of this metric commentary. "
    "Respond with format {'claims': [{'claim': DESCRIPTION OF CLAIM, 'evidence': SUBSTRING FROM INPUT STATING CLAIM}, ...]}. "
    "If the paper makes no claims, leave the 'claims' key in the JSON object empty."
)

In [24]:
def check_with_openai(
    chunks: list[tuple[int, str]], model: str, system_prompt: str
) -> list:
    """Send each chunk to a chat model and pool the claims it reports.

    One request is made per chunk, asking for a JSON-object reply. The reply
    is parsed and, if it has a "claims" key, its entries are added to the
    pooled result.

    Args:
        chunks: ``(start_index, chunk_text)`` pairs; only the text is sent.
        model: Model name passed to the chat-completions endpoint.
        system_prompt: System message sent with every chunk.

    Returns:
        All claim entries from all chunks that were processed successfully,
        in chunk order. A chunk whose request or parsing fails contributes
        nothing (best-effort), and the failure is printed.
    """
    client = OpenAI()

    responses = []
    for st_idx, chunk in chunks:
        try:
            response = client.chat.completions.create(
                model=model,
                response_format={ "type": "json_object" },
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": chunk},
                ]
            )
            as_json = json.loads(response.choices[0].message.content)
            if "claims" in as_json: responses.extend(as_json["claims"])
        except Exception as e:
            # Catch Exception rather than using a bare except, so Ctrl+C
            # (KeyboardInterrupt) and SystemExit still stop a long run, and
            # report what went wrong instead of hiding it.
            print(f"Failed on chunk at offset {st_idx}: {e}")
            continue
    return responses

In [27]:
# Chunk-level model pass. Begin with the results cached by an earlier run and
# query the model only for documents that run had not already covered.
with_terms_openai = copy.deepcopy(old_with_terms_openai)
for fp, chunks in tqdm(with_terms_localized):
    if fp not in old_with_terms_fps:
        openai_chunks = check_with_openai(
            chunks,
            model="gpt-3.5-turbo-1106",
            system_prompt=SYSTEM_PROMPT,
        )
        # Keep (and echo) only documents for which at least one claim came back.
        if openai_chunks:
            print(fp)
            with_terms_openai.append((fp, chunks, openai_chunks))

  0%|          | 0/516 [00:00<?, ?it/s]

/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1609/1609.04392v2.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1609/1609.03536v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1609/1609.09430v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1609/1609.04392v6.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1609/1609.04392v4.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1603/1603.09114v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1504/1504.03106v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1012/1012.0930v2.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1110/1110.4198v3.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1110/1110.4198v2.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1205/1205.6986v2.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1703/1703.04213v1.txt
/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1703/1703.04213v2.txt
/n/data1/hms/dbm

In [29]:
# Number of documents retained after the chunk-level model pass (cached entries included).
len(with_terms_openai)

147

### Validate with GPT-4

In [40]:
def check_doc_with_openai(
    doc: str, model: str, system_prompt: str
):
    """Ask a chat model for the claims made by one whole document.

    Args:
        doc: Full document text, sent as the user message.
        model: Model name passed to the chat-completions endpoint.
        system_prompt: System message sent with the document.

    Returns:
        The value under the reply's "claims" key, ``[]`` when the parsed reply
        has no such key, or ``None`` when the request or the JSON parsing
        raised (the error is printed).
    """
    client = OpenAI()
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": doc},
    ]

    try:
        completion = client.chat.completions.create(
            model=model,
            response_format={"type": "json_object"},
            messages=conversation,
        )
        parsed = json.loads(completion.choices[0].message.content)
        return parsed["claims"] if "claims" in parsed else []
    except Exception as e:
        print(f"Failed with {e}")
        return None


In [41]:
# Full-document validation pass. Begin with the documents confirmed by an
# earlier run and re-check only those that run had not already covered.
final_docs = copy.deepcopy(old_final_docs)
for fp, chunks, openai_chunks in tqdm(with_terms_openai):
    if fp not in old_with_terms_fps:
        final_doc_response_claims = check_doc_with_openai(
            Path(fp).read_text(),
            model="gpt-4-1106-preview",
            system_prompt=SYSTEM_PROMPT,
        )
        # None (the request failed) and [] (no claims) are both falsy, so both are dropped.
        if final_doc_response_claims:
            final_docs.append((fp, final_doc_response_claims))

print(f"Found {len(final_docs)} relevant final documents.")

  0%|          | 0/147 [00:00<?, ?it/s]

Found 2 relevant final documents.


In [42]:
# Record every path handled in this run or any earlier one. Saving only the
# current list would drop paths that have since left has_terms.txt while their
# cached results stay in the lists below; the next run would then treat those
# cached entries as unprocessed, re-query them, and append duplicates.
processed_fps = list(dict.fromkeys([*old_with_terms_fps, *with_terms_fps]))

# Write to a sibling temp file and rename it over the real cache, so an
# interrupted dump cannot leave a truncated pickle that breaks the next load.
tmp_results_fp = OPENAI_RESULTS_FP.with_name(OPENAI_RESULTS_FP.name + ".tmp")
with open(tmp_results_fp, mode='wb') as f:
    pickle.dump({
        "with_terms_fps": processed_fps,
        "with_terms_chunk": with_terms_localized,
        "with_terms_openai": with_terms_openai,
        "final_docs": final_docs,
    }, f)
tmp_results_fp.replace(OPENAI_RESULTS_FP)

In [43]:
# Display the (file path, claims) pairs that passed the full-document check.
final_docs

[('/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1310/1310.5103v1.txt',
  [{'claim': 'AP places more emphasis on the initial part of the ROC curve and addresses criticisms of the AUC',
    'evidence': 'for the AUC, stamina and momentum are equally important, whereas for the AP, momentum is more important'}]),
 ('/n/data1/hms/dbmi/zaklab/arXiv/arxiv_as_txt/arxiv/pdf/1206/1206.4667v1.txt',
  [{'claim': 'AUPRC is preferred to AUROC in situations of large class skew',
    'evidence': 'In particular, PR analysis is preferred to ROC analysis when there is a large skew in the class distribution.'}])]