## Read Data and Extract DOI Links

In [4]:
!pip install -U marker-pdf[full]
!pip install pymupdf 
!pip install vllm
!pip install logits-processor-zoo==0.1.10
!pip install triton==3.2.0

Collecting marker-pdf[full]
  Downloading marker_pdf-1.8.0-py3-none-any.whl.metadata (29 kB)
Collecting Pillow<11.0.0,>=10.1.0 (from marker-pdf[full])
  Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Collecting anthropic<0.47.0,>=0.46.0 (from marker-pdf[full])
  Downloading anthropic-0.46.0-py3-none-any.whl.metadata (23 kB)
Collecting click<9.0.0,>=8.2.0 (from marker-pdf[full])
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting ebooklib<0.19,>=0.18 (from marker-pdf[full])
  Downloading EbookLib-0.18.tar.gz (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.5/115.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting filetype<2.0.0,>=1.2.0 (from marker-pdf[full])
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting ftfy<7.0.0,>=6.1.1 (from marker-pdf[full])
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Co

In [1]:
import os

# Switch between a local workstation run (True) and the Kaggle kernel (False).
LOCAL = False

if not LOCAL:
    # Disabled: copy the marker models and font into place on Kaggle.
    # !mkdir -p /root/.cache/datalab/models/
    # !mkdir -p /usr/local/lib/python3.11/dist-packages/static/fonts/
    # !cp -r /kaggle/input/marker-models/marker_models/* /root/.cache/datalab/models/
    # !cp /kaggle/input/marker-models/GoNotoCurrent-Regular.ttf /usr/local/lib/python3.11/dist-packages/static/fonts/
    import sys

    # Extend the import path before the `utils` import below.
    sys.path.append('/kaggle/input/mdc-tools')
from utils import *

# vLLM V1 does not currently accept logits processors, so V1 is switched off.
# https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#deprecated-features
os.environ["VLLM_USE_V1"] = "0"

# Location of the training labels for the selected environment.
labels_dir = (
    '/root/autodl-tmp/train_labels.csv'
    if LOCAL
    else "/kaggle/input/make-data-count-finding-data-references/train_labels.csv"
)

print(labels_dir)

/kaggle/input/make-data-count-finding-data-references/train_labels.csv


In [None]:
# Disabled cell: every line below is commented out, so running it does nothing.
# When enabled on Kaggle it would run `marker_chunk_convert` over the test PDFs
# (competition rerun) or the train PDFs (otherwise), writing to
# /kaggle/working/pdf_parsed, then release GPU memory and print the amount of
# memory still allocated.
# if not LOCAL:
#     os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
#     if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#         !NUM_DEVICES=2 NUM_WORKERS=4 marker_chunk_convert /kaggle/input/make-data-count-finding-data-references/test/PDF /kaggle/working/pdf_parsed
#     else:
#         !NUM_DEVICES=2 NUM_WORKERS=4 marker_chunk_convert /kaggle/input/make-data-count-finding-data-references/train/PDF /kaggle/working/pdf_parsed
#     import torch
#     from IPython.display import clear_output
#     try:
#         del converter
#     except NameError:
#         pass
#     torch.cuda.empty_cache()
#     import gc
#     gc.collect()
#     clear_output()
#     print(f"当前显存占用: {torch.cuda.memory_allocated()/1024**2:.2f} MB")

In [6]:

import re
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
import pickle
import vllm
import torch

# Step 1: choose where the PDFs are read from and where parsed output lives.
# Local runs use the local training set; on Kaggle the test PDFs are used
# during a competition rerun and the training PDFs otherwise.
if LOCAL:
    pdf_directory = "/root/autodl-tmp/train/PDF"
    md_directory = "./train_parsed/"
elif os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    pdf_directory = "/kaggle/input/make-data-count-finding-data-references/test/PDF"
    md_directory = "/kaggle/working/pdf_parsed"
else:
    pdf_directory = "/kaggle/input/make-data-count-finding-data-references/train/PDF"
    md_directory = "/kaggle/working/pdf_parsed"

# Compiled regular expressions, keyed by a short name for the identifier family
# they detect. Each pattern has at most one capturing group, so `findall`
# returns plain strings.
patterns_to_find = {
    # 1. Literature and books
    # 'doi': re.compile(r'\b(10\.\d{4,}/[-._;()/:A-Z0-9]*[A-Z0-9])', re.IGNORECASE),
    'doi': re.compile(
        r'(?:\b|(?<!https://))'  # intended to rule out other https prefixes
        r'(?:https://doi\.org/)?'  # optional https://doi.org/ prefix (not captured)
        r'(10\.\d{4,}/(?:[-._;/:A-Z0-9]*(?:\([-._;/:A-Z0-9]*\)[-._;/:A-Z0-9]*)*[A-Z0-9]|[-._;/:A-Z0-9]*[A-Z0-9]))', 
        re.IGNORECASE
    ),
    # 'doi': re.compile(r'10\.\d{4}'),
    # ISBN pattern; extend as needed
    'isbn': re.compile(r'\b(?:ISBN(?:-1[03])?:? )?(?=[0-9X]{10}$|(?=(?:[0-9]+[- ]){3})[- 0-9X]{13}$|97[89][0-9]{10}$|(?=(?:[0-9]+[- ]){4})[- 0-9]{17}$)(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]\b'),

    # 2. Bioinformatics database IDs (matched by exact prefix)
    'kegg_like_id': re.compile(r'\b[A-Z]\d{5}\b'),
    # 'gisaid_id' used to be defined twice in this dict; the later entry (no
    # trailing \b) silently replaced the earlier one. Only that effective
    # pattern is kept, at the key's original position.
    'gisaid_id': re.compile(r'\b(EPI_ISL_\d+|EPI\d+)'),
    'prj_id': re.compile(r'\b(PRJ[ED]?[A-Z]+\d+)\b'),
    'chembl_id': re.compile(r'\b(CHEMBL\d+)\b'),
    'interpro_id': re.compile(r'\b(IPR\d{5,})\b'),
    'pfam_id': re.compile(r'\b(PF\d{5,})\b'),
    'geo_id': re.compile(r'\b(GSE\d+)\b'),
    'sra_id': re.compile(r'\b((?:SR|ER|DR)[RX]\d{7})\b'), # SRR/SRX, ERR/ERX, DRR/DRX followed by 7 digits
    'srp_id': re.compile(r'\b(SRP\d{6})\b'), # SRP followed by 6 digits
    'empiar_id': re.compile(r'\b(EMPIAR-\d{4,7})\b'),
    'ensembl_id': re.compile(r'\b(ENS[A-Z0-9]+)\b'),
    'refseq_id': re.compile(r'\b((?:NM|NC|NR|NP|NG|NT|XM|XR|XP|WP|NW|NZ|CP|AP|AC|KX)_?\d+(?:\.\d+)?)\b'),
    'biosample_id': re.compile(r'\b(SAMN\d+)\b'),
    'proteomexchange_id': re.compile(r'\b(PXD\d+)\b'),
    'cellosaurus_id': re.compile(r'\b(CVCL_[A-Z0-9]+)\b'),
    'dbsnp_id': re.compile(r'\b(rs\d+)\b'),
    
    'arrayexpress_id': re.compile(r'\b(E-[A-Z]{3,5}-\d+)\b'), # ArrayExpress accessions
    'hpa_id': re.compile(r'\b((?:HPA|CAB)\d+)\b'), # covers both HPA and CAB prefixes
    # 3. Format-based matching
    # 'pdb_id': re.compile(r'\b(?=[0-9][a-zA-Z0-9]{3}\b)(?=.*[a-zA-Z])[a-zA-Z0-9]{4}\b'), # 4 alphanumerics, first is a digit
    'uniprot_id': re.compile(r'\b([A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9]|[OPQ][0-9][A-Z0-9]{3}[0-9])\b'), # 6 alphanumerics

    'cath_id': re.compile(r'\b3\.(?:[1-9]\d{0,2}|0)\.(?:[1-9]\d{0,2}|0)\.(?:[1-9]\d{0,2}|0)\b'),
}
# Collected results: one (article_id, chunk_text, identifier, pattern_name)
# tuple per identifier found in a text chunk.
chunks = []

# NOTE(review): the next six names are not read anywhere in this cell; they are
# kept only in case another cell refers to them.
text_span_len = 100
chunk_size = 200 
overlap = 50
chunks_raw = []
found_items = []
j = 0  # number of PDF files processed

# Character-length settings handed to smart_chunker. They do not change between
# files, so they are set once instead of on every iteration.
min_char_len = 100
max_char_len = 500
overlap_char_len = 50

# List the directory once so the progress bar and the loop see the same entries.
pdf_filenames = os.listdir(pdf_directory)

for filename in tqdm(pdf_filenames, total=len(pdf_filenames)):
    if not filename.endswith(".pdf"):
        continue
    j += 1
    pdf_path = os.path.join(pdf_directory, filename)

    # Article id = file name without its trailing extension. splitext strips
    # only the final ".pdf", unlike split(".pdf")[0], which would cut at the
    # first occurrence anywhere in the name.
    article_id = os.path.splitext(filename)[0]

    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() + "\n" for page in doc)

    # Normalise the raw text: trim, collapse blank lines, unescape "\_".
    text = text.strip()
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\\_', '_', text)
    text = remove_references_section(text)

    text_chunks = smart_chunker(text, min_char_len, max_char_len, overlap_char_len)

    # Text before the first "_" of the article id; DOIs containing it are
    # skipped below (presumably to drop the article's own DOI — verify).
    own_id_prefix = article_id.split('_')[0]

    for chunk in text_chunks:
        for pattern_name, compiled_pattern in patterns_to_find.items():
            matches = compiled_pattern.findall(chunk)
            # dict.fromkeys removes duplicates within the chunk while keeping
            # first-seen order, so the output order is the same on every run
            # (iterating a set of strings is not).
            for match in dict.fromkeys(matches):
                if pattern_name == 'doi':
                    if own_id_prefix in match:
                        continue
                    # Always assign result_value here so a value left over from
                    # a previous match can never be appended by mistake.
                    result_value = match.lower()
                    if not result_value.startswith('https://doi.org/'):
                        result_value = 'https://doi.org/' + result_value
                else:
                    result_value = match

                chunks.append((
                    article_id,     # article id
                    chunk,          # text chunk containing the match
                    result_value,   # normalised identifier
                    pattern_name    # name of the pattern that matched
                ))

# Report how many matches were collected in total.
print(f"\n在所有文件中总共找到了 {len(chunks)} 个匹配项。")

# Print a few of the DOI results as a sample (disabled).
#for item in chunks[:20]:
#    if not item[2].startswith('https://doi.org/'):
#        continue
#    print(f"ID: {item[0]}, 文本: '{item[1]}', 匹配结果: {item[2]}")

  0%|          | 0/524 [00:00<?, ?it/s]

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: uns

## Load LLM

In [None]:
from transformers import AutoTokenizer

# Checkpoint directory for the selected environment.
# Alternative local checkpoint: "/root/autodl-tmp/Qwen3-32B-AWQ"
model_path = (
    "/root/autodl-tmp/Qwen2.5-32B-Insturct-AWQ"
    if LOCAL
    else "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
)

# The tokenizer is loaded separately from the engine; it is used later to
# render chat prompts.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# tokenizer = llm.get_tokenizer()

# AWQ-quantised model in half precision, sharded across every visible GPU.
llm = vllm.LLM(
    model_path,
    quantization='awq',
    dtype="half",
    tensor_parallel_size=torch.cuda.device_count(),
    gpu_memory_utilization=0.90,
    max_model_len=2048,
    enforce_eager=True,
    enable_prefix_caching=True,
    trust_remote_code=True,
    disable_log_stats=True,
)

## Ask LLM to extract DOI links

## Ask LLM to classify DOI links
Use the `MultipleChoiceLogitsProcessor` from logits-processor-zoo to force the LLM to choose among the allowed classes.

In [None]:
# Load the ground-truth labels and ignore rows whose type is 'Missing'.
label_df = pd.read_csv(labels_dir)
label_df = label_df.loc[label_df['type'] != 'Missing'].reset_index(drop=True)

# Compare identifiers as plain strings, pooled over all articles.
true_values = set(label_df['dataset_id'].astype(str).unique())
detected_values = {str(identifier) for _, _, identifier, _ in chunks}
# print(f"\n在所有文件中总共找到了 {len(detected_values)} 个匹配项。")

# Detected but not labelled, and labelled but not detected.
extra_detections = detected_values.difference(true_values)
missing_detections = true_values.difference(detected_values)

# Report the counts and up to 10 examples of each kind.
print(f"多出的检测结果数量: {len(extra_detections)}")
if len(extra_detections) > 0:
    print("多出的内容示例:", list(extra_detections)[:10])

print(f"\n缺少的检测结果数量: {len(missing_detections)}")
if len(missing_detections) > 0:
    print("缺少的内容示例:", list(missing_detections)[:10])

In [None]:
# System prompt: asks the model to answer with a single letter A, B or C.
SYS_PROMPT = """
You are an expert academic data curator. You will be given a piece of academic text and a specific identifier (e.g., a DOI, a database accession number like GSE12345, a UniProt ID, etc.) found within that text.
Your task is to analyze the context of this identifier within the text and classify the data it refers to.
Please classify the data as one of the following:
A) Primary: if the data was newly generated by the authors specifically for the current study. (e.g., "The raw sequencing data for this study have been deposited to SRA under accession SRP987654.")
B) Secondary: if the data was reused, re-analyzed, or derived from a prior, different study. (e.g., "We downloaded and re-analyzed the dataset from Smith et al. (2020), accession GSE12345.")
C) None: if the identifier is a bibliographic citation to another paper (like in a reference list), is mentioned in passing without being a core dataset for this study, or does not refer to research data at all.
Respond with only the single capital letter: A, B, or C.
"""


def _render_prompt(academic_text, ref):
    """Render the chat-template prompt asking the model to classify `ref` within `academic_text`."""
    user_content = f"Academic Text:\n\"\"\"\n{academic_text}\n\"\"\"\n\nBased on the text above, classify the data associated with this specific identifier: {ref}.\n The classification result is"
    conversation = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": user_content},
    ]
    return tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
        enable_thinking=False,
    )


# One prompt per collected (article_id, chunk text, identifier, pattern name) tuple.
prompts = [
    _render_prompt(chunk_text, identifier)
    for _article, chunk_text, identifier, _pattern in chunks
]
print(len(prompts[0]))

# Restrict the single generated token to the three answer letters.
mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=["A", "B", "C"])

# Generate exactly one token per prompt and request as many log-probabilities
# as there are choices.
sampling_params = vllm.SamplingParams(
    seed=0,
    skip_special_tokens=True,
    max_tokens=1,
    logits_processors=[mclp],
    logprobs=len(mclp.choices),
)

outputs = llm.generate(prompts, sampling_params, use_tqdm=True)

In [None]:
# Multipliers applied to the log-probabilities of "A" (Primary) and
# "B" (Secondary) before taking the argmax. Log-probabilities are <= 0, so a
# factor above 1 pushes a class further down and a factor below 1 pulls it
# toward 0; these values therefore weigh against "A" and in favour of "B".
# NOTE(review): the values look hand-tuned — confirm before changing them.
PRIMARY_LOGPROB_SCALE = 50
SECONDARY_LOGPROB_SCALE = 0.01

# For each request, map decoded token -> log-probability of the first (and
# only) generated token.
logprobs = [
    {lp.decoded_token: lp.logprob for lp in output.outputs[0].logprobs[0].values()}
    for output in outputs
]

# Columns are always A, B, C in that order. A choice missing from the returned
# log-probabilities becomes -inf instead of raising KeyError or leaving a NaN,
# which np.argmax would otherwise select as the maximum.
logit_matrix = (
    pd.DataFrame(logprobs)
    .reindex(columns=["A", "B", "C"])
    .fillna(-np.inf)
    .to_numpy(dtype=float, copy=True)
)
logit_matrix[:, 0] = logit_matrix[:, 0] * PRIMARY_LOGPROB_SCALE
logit_matrix[:, 1] = logit_matrix[:, 1] * SECONDARY_LOGPROB_SCALE

# Column index -> submitted label; None marks rows that are dropped later.
choices = ["Primary", "Secondary", None]
print(logit_matrix[100:110])
answers = [choices[pick] for pick in np.argmax(logit_matrix, axis=1)]
print(answers[100:110])

## Prepare Submission

In [None]:
# One row per collected identifier, paired with its predicted class.
sub_df = pd.DataFrame({
    "article_id": [record[0] for record in chunks],
    "dataset_id": [record[2] for record in chunks],
    # "dataset_id" is intentionally not lower-cased here.
    "type": answers,
})

# Drop rows with no class, then keep one row per (article_id, dataset_id).
# Sorting on "type" first means the alphabetically smallest class is the one
# kept for each pair.
sub_df = (
    sub_df
    .dropna(subset=["type"])
    .sort_values(by=["article_id", "dataset_id", "type"], ascending=True)
    .drop_duplicates(subset=["article_id", "dataset_id"], keep="first")
    .reset_index(drop=True)
)

sub_df["row_id"] = range(len(sub_df))
sub_df.to_csv(
    "submission.csv",
    index=False,
    columns=["row_id", "article_id", "dataset_id", "type"],
)

sub_df["type"].value_counts()

## Evaluate validation score

In [None]:
def f1_score(tp, fp, fn):
    """Return the F1 score 2*tp / (2*tp + fp + fn), or 0.0 when that denominator is zero.

    Args:
        tp: number of true positives.
        fp: number of false positives.
        fn: number of false negatives.
    """
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        return 0.0
    return 2 * tp / denominator

# Score the written submission against the training labels. This is skipped
# during a competition rerun.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):

    try:
        pred_df = pd.read_csv("submission.csv")
        label_df = pd.read_csv(labels_dir)
        label_df = label_df.loc[label_df['type'] != 'Missing'].reset_index(drop=True)

        # True positives: rows where article, identifier and class all agree.
        hits_df = label_df.merge(pred_df, on=["article_id", "dataset_id", "type"])
        tp = len(hits_df)

        # False positives: predicted rows with no matching label.
        fp = len(pred_df) - tp

        # False negatives: labelled rows with no matching prediction.
        fn = len(label_df) - tp

        # Precision and recall, each 0 when its denominator is not positive.
        predicted_total = tp + fp
        labelled_total = tp + fn
        precision = tp / predicted_total if predicted_total > 0 else 0
        recall = tp / labelled_total if labelled_total > 0 else 0
        f1 = f1_score(tp, fp, fn)

        # --- Human-readable report ---
        print("--- 核心指标 ---")
        print(f"TP (真正例): {tp:<5} | 说明：模型正确识别的目标数量。")
        print(f"FP (假正例): {fp:<5} | 说明：模型识别错了，把不是目标的当成了目标 (误报)。")
        print(f"FN (假反例): {fn:<5} | 说明：模型没能识别出来，遗漏了本该找到的目标 (漏报)。")
        print("\n--- 性能评估 ---")
        print(f"精确率 (Precision): {precision:.2%} | 回答：在所有模型认为是目标的预测中，有多少是真的？(查准率)")
        print(f"召回率 (Recall)   : {recall:.2%} | 回答：在所有真实的目标中，模型找到了多少？(查全率)")
        print(f"F1 Score         : {f1:.3f}    | 说明：精确率和召回率的调和平均值，综合评价指标。")

        # Pick a single diagnostic sentence from the error profile.
        print("\n--- 简单诊断 ---")
        if precision < 0.5 and recall < 0.5:
            verdict = "结论：模型性能较差，精确率和召回率都低。建议从数据和特征工程入手进行大的改进。"
        elif fp > fn:
            verdict = f"结论：误报(FP={fp})多于漏报(FN={fn})，模型倾向于“过于激进”。可以尝试提高预测的置信度门槛。"
        elif fn > fp:
            verdict = f"结论：漏报(FN={fn})多于误报(FP={fp})，模型倾向于“过于保守”。这是最常见的情况，说明模型识别目标的能力有待加强。"
        else:
            verdict = "结论：模型在误报和漏报之间取得了相对平衡，可以根据F1分数判断总体性能。"
        print(verdict)

    except FileNotFoundError:
        # One of the two CSV files could not be opened.
        print("错误：无法找到 'submission.csv' 或 'train_labels.csv' 文件。请检查文件路径是否正确。")
    except Exception as e:
        # Best-effort evaluation: report any other failure instead of raising.
        print(f"在评估过程中发生了一个错误: {e}")
        
        