## Installs / Imports

In [1]:
# Installs

!pip install --no-index --find-links=/kaggle/input/latest-mdc-whls/whls pymupdf transformers accelerate

Looking in links: /kaggle/input/latest-mdc-whls/whls
Processing /kaggle/input/latest-mdc-whls/whls/pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl
Processing /kaggle/input/latest-mdc-whls/whls/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (from torch>=2.0.0->accelerate)
Processing /kaggle/input/latest-mdc-whls/whls/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (from torch>=2.0.0->accelerate)
Processing /kaggle/input/latest-mdc-whls/whls/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (from torch>=2.0.0->accelerate)
Processing /kaggle/input/latest-mdc-whls/whls/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (from torch>=2.0.0->accelerate)
Processing /kaggle/input/latest-mdc-whls/whls/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (from torch>=2.0.0->accelerate)
Processing /kaggle/input/latest-mdc-whls/whls/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (from torch>=2.0.0->acc

In [2]:
# Imports

import os 
from pathlib import Path
import re
from collections import Counter
import nltk

import pandas as pd
import numpy as np

# PDF parsing
import fitz

# plotting
import matplotlib.pyplot as plt

# Sentence tokenizer
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# load LLM
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm.auto import tqdm
import torch

import transformers
transformers.logging.set_verbosity_error() # hide non critical warnings 


import random

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
2025-08-06 15:30:15.084989: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754494215.265732      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754494215.318149      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Filepaths

In [3]:
# File paths

train_path = '/kaggle/input/make-data-count-finding-data-references/train'
train_pdf_path = '/kaggle/input/make-data-count-finding-data-references/train/PDF'
train_xml_path = '/kaggle/input/make-data-count-finding-data-references/train/XML'

train_labels_path = '/kaggle/input/make-data-count-finding-data-references/train_labels.csv'

test_path = '/kaggle/input/make-data-count-finding-data-references/test'
test_pdf_path = '/kaggle/input/make-data-count-finding-data-references/test/PDF'
test_xml_path = '/kaggle/input/make-data-count-finding-data-references/test/XML'

sample_submission_path = '/kaggle/input/make-data-count-finding-data-references/sample_submission.csv'

df_labels = pd.read_csv(train_labels_path)

## Regex Extraction 

In [4]:
def extract_contexts(pdf_directory, file_list, verbose=True):

    chunks = []    # Stores (article_id, chunk) around DOI
    chunks2 = []   # Stores (article_id, chunk, accession_id) for accessions
    text_span_len = 500

    # --- DOI PATTERN ---
    re_doi = re.compile(r"10\.\d{4,9}/[-._;()/:A-Z0-9]+", re.IGNORECASE)
    # --- ACCESSION/ID PATTERNS ---
    accession_regexes = [
        re.compile(r"GSE\d+|SR[APRX]\d+|PRJ[NAED][A-Z]?\d+|E-[A-Z]+-\d+", re.IGNORECASE),  # GEO, SRA, PRJ, ArrayExpress
        re.compile(r"IPR\d{6}|PF\d{5}|EMPIAR-\d{5}|EMD-\d{4,5}", re.IGNORECASE),            # InterPro, Pfam, EMPIAR, EMDB
        re.compile(r"CHEMBL\d+|CVCL_[A-Z0-9]{4}|CID:\d+", re.IGNORECASE),                   # Chembl, Cellosaurus, PubChem CID
        re.compile(r"ENS[A-Z]{0,6}[GT]\d{11}|ENSG\d{11}", re.IGNORECASE),                   # Ensemble Gene/Transcript/IDs
        re.compile(r"N[MC]_\d+(?:\.\d+)?|rs\d+|XM_\d+|XP_\d+", re.IGNORECASE),              # GenBank/RefSeq/rs IDs
        re.compile(r"(?:uniprot:)?(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9][A-Z][A-Z0-9]{2}[0-9])", re.IGNORECASE), # UniProt
        re.compile(r"EPI(?:_ISL_)?\d+|GISAID", re.IGNORECASE),                              # GISAID
        re.compile(r"PXD\d{6}|SAM[ND]\d+|ERR\d+|DRR\d+|MSV\d+", re.IGNORECASE),             # ProteomeXchange, SRA, jPOST
        re.compile(r"GDS\d+|GPL\d+|GSM\d+", re.IGNORECASE),                                 # GEO Datasets, Platforms, Samples
        re.compile(r"E-[A-Z]+-\d+", re.IGNORECASE)                                          # ArrayExpress
    ]

    def remove_references_section(text):
        lines = text.split('\n')
        cut_index = -1
        for i in range(len(lines) - 1, max(0, int(len(lines) * 0.3)), -1):
            line = lines[i].strip()
            obvious_patterns = [
                r'^REFERENCES?$', r'^\d+\.?\s+REFERENCES?$', r'^\d+\.?\s+References?$',
                r'^References?:?$', r'^BIBLIOGRAPHY$', r'^\d+\.?\s+BIBLIOGRAPHY$',
                r'^\d+\.?\s+Bibliography$', r'^Bibliography:?$', r'^Literature\s+Cited$', r'^Works\s+Cited$'
            ]
            if any(re.match(pattern, line, re.IGNORECASE) for pattern in obvious_patterns):
                following_lines = lines[i+1:i+4]
                has_citations = False
                for follow_line in following_lines:
                    if follow_line.strip():
                        if (re.search(r'\(\d{4}\)', follow_line)
                            or re.search(r'\d{4}\.', follow_line)
                            or 'doi:' in follow_line.lower()
                            or ' et al' in follow_line.lower()):
                            has_citations = True
                            break
                if has_citations or i >= len(lines) - 3:
                    cut_index = i
                    break
        if cut_index != -1:
            return '\n'.join(lines[:cut_index]).strip()
        return text.strip()

    empty_or_error_pdfs = 0
    short_text_pdfs = 0
    total_files = 0

    if verbose:
        print(f"Found {len(file_list)} PDF files in directory:")
        for fn in file_list:
            print(f"  {fn}")

    for filename in tqdm(file_list, total=len(file_list)):
        if not filename.endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_directory, filename)
        article_id = filename.split(".pdf")[0]
        total_files += 1

        try:
            doc = fitz.open(pdf_path)
            n_pages = len(doc)
            text = ""
            for page in doc:
                page_text = page.get_text()
                text += page_text + "\n"
            doc.close()
            if verbose:
                print(f"{filename}: {n_pages} pages, {len(text)} chars")
        except Exception as e:
            if verbose:
                print(f"Could not process {filename}: {e}")
            empty_or_error_pdfs += 1
            continue

        if len(text.strip()) < 800:
            short_text_pdfs += 1
            if verbose:
                print(f"  [{filename}] WARNING: very short text: {len(text)} chars")
                print(f"  Sample: {text[:200]}")
        text = remove_references_section(text)

        # --- DOI Extraction ---
        doi_chunk_count = 0
        for match in re_doi.finditer(text):
            if match.group() in article_id:
                continue
            doi_chunk_count += 1
            chunk = text[max(0, match.start() - text_span_len): match.end() + text_span_len]
            chunks.append((article_id, chunk))
        if verbose:
            print(f"  -> Found {doi_chunk_count} DOI matches (before filtering).")

        # --- Accession Extraction ---
        acc_chunk_count = 0
        for acc_regex in accession_regexes:
            for match in acc_regex.finditer(text):
                acc_id = match.group()
                acc_chunk_count += 1
                chunk = text[max(0, match.start() - text_span_len): match.end() + text_span_len]
                chunks2.append((article_id, chunk, acc_id))
        if verbose:
            print(f"  -> Found {acc_chunk_count} accession/other ID matches.")

    if verbose:
        print("="*60)
        print(f"DOI chunks (contexts): {len(chunks)}")
        print(f"Accession/other ID chunks: {len(chunks2)}")
        print(f"Skipped or errored PDFs: {empty_or_error_pdfs}")
        print(f"Short (<800 chars) but readable PDFs: {short_text_pdfs}")
        print(f"Total processed PDFs: {total_files}")
        print("="*60)
        if len(chunks):
            print("Sample DOI extraction chunk (first 1):\n", chunks[0])
        if len(chunks2):
            print("Sample accession extraction chunk (first 1):\n", chunks2[0])

    return chunks, chunks2


In [5]:
# 1. Get the directory containing test PDFs
pdf_directory = "/kaggle/input/make-data-count-finding-data-references/test/PDF"

# 2. Get a list of all PDF filenames in that directory
file_list = [f for f in os.listdir(pdf_directory) if f.endswith(".pdf")]

# 3. Call your extract_contexts function
chunks, chunks2 = extract_contexts(pdf_directory, file_list, verbose=True)


Found 30 PDF files in directory:
  10.1002_cssc.202201821.pdf
  10.1002_ecs2.4619.pdf
  10.1002_ece3.9627.pdf
  10.1002_ejic.201900904.pdf
  10.1002_2017jc013030.pdf
  10.1002_ece3.3985.pdf
  10.1002_ejoc.202000139.pdf
  10.1002_chem.202001668.pdf
  10.1002_ece3.5395.pdf
  10.1002_ece3.6784.pdf
  10.1002_anie.202007717.pdf
  10.1002_chem.201903120.pdf
  10.1002_ece3.6303.pdf
  10.1002_chem.201902131.pdf
  10.1002_ecs2.1280.pdf
  10.1002_anie.201916483.pdf
  10.1002_esp.5058.pdf
  10.1002_ece3.5260.pdf
  10.1002_nafm.10870.pdf
  10.1002_ejoc.202000916.pdf
  10.1002_ece3.6144.pdf
  10.1002_ece3.961.pdf
  10.1002_anie.202005531.pdf
  10.1002_mp.14424.pdf
  10.1002_esp.5090.pdf
  10.1002_ece3.4466.pdf
  10.1002_chem.202001412.pdf
  10.1002_chem.202000235.pdf
  10.1002_chem.202003167.pdf
  10.1007_jhep07(2018)134.pdf


  0%|          | 0/30 [00:00<?, ?it/s]

10.1002_cssc.202201821.pdf: 13 pages, 70766 chars
  -> Found 29 DOI matches (before filtering).
  -> Found 0 accession/other ID matches.
10.1002_ecs2.4619.pdf: 14 pages, 66745 chars
  -> Found 12 DOI matches (before filtering).
  -> Found 0 accession/other ID matches.
10.1002_ece3.9627.pdf: 15 pages, 77933 chars
  -> Found 15 DOI matches (before filtering).
  -> Found 0 accession/other ID matches.
10.1002_ejic.201900904.pdf: 8 pages, 47466 chars
  -> Found 3 DOI matches (before filtering).
  -> Found 8 accession/other ID matches.
10.1002_2017jc013030.pdf: 22 pages, 98217 chars
  -> Found 17 DOI matches (before filtering).
  -> Found 0 accession/other ID matches.
10.1002_ece3.3985.pdf: 9 pages, 49173 chars
  -> Found 6 DOI matches (before filtering).
  -> Found 0 accession/other ID matches.
10.1002_ejoc.202000139.pdf: 11 pages, 70549 chars
  -> Found 22 DOI matches (before filtering).
  -> Found 1 accession/other ID matches.
10.1002_chem.202001668.pdf: 6 pages, 30884 chars
  -> Found 7 

## Load LLM

In [6]:
# Load LLM

from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_PATH = "/kaggle/input/qwen2.5/transformers/3b-instruct/1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype="auto",
    local_files_only=True
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## LLM DOI / Accession Extraction

In [7]:
def run_llm_candidate_extraction(
    chunks, model, tokenizer, 
    max_new_tokens=64,
    show_samples=5,
    mode="doi" # or "accession"
):
    """
    Run LLM candidate extraction & validation for DOI or accession candidates.

    Args:
        chunks: 
            If mode=="doi": list of (article_id, context_window)
            If mode=="accession": list of (article_id, context_window, accession_id)
        model: HuggingFace (or similar) LLM.
        tokenizer: LLM tokenizer.
        max_new_tokens: LLM output cap.
        show_samples: Number of diagnostic printouts.
        mode: 'doi' or 'accession' -- determines prompt and chunk structure.
    Returns:
        prompts: list of string prompts sent to model.
        responses: list of model outputs.
    """

    if mode == "doi":
        SYS_PROMPT = """
You are an expert at identifying research data citations in academic papers.
Your task is to determine if a DOI citation in the given text refers specifically to research data, datasets, or data repositories.
Only respond with either a full normalized DOI URL starting with "https://doi.org/" or the word "Irrelevant" (without quotes).
Do NOT include any other text or explanation.
If there is no DOI related to research data, respond with exactly "Irrelevant".
If multiple DOIs refer to research data, return any one of them.
"""
    elif mode == "accession":
        SYS_PROMPT = """
Classify the following research data citation by Accession ID in the context of the academic text.

A) Primary: Data was newly GENERATED or COLLECTED in THIS study (the paper in question).
B) Secondary: Data is REUSED or TAKEN from a PUBLIC database/repository, previous study, or archive.
C) None: Not research data, or accession is not used in this study.

Respond with ONLY one letter: A, B, or C. Do NOT explain your answer.
"""
    else:
        raise ValueError("mode must be 'doi' or 'accession'")

    llm_prompts = []
    if mode == "doi":
        for article_id, context_window in chunks:
            messages = [
                {"role": "system", "content": SYS_PROMPT},
                {"role": "user", "content": context_window}
            ]
            prompt = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=False,
            )
            llm_prompts.append(prompt)
    elif mode == "accession":
        for article_id, context_window, accession_id in chunks:
            messages = [
                {"role": "system", "content": SYS_PROMPT},
                {"role": "user", "content": f"Accession ID: {accession_id}\n\nContext:\n{context_window}\n\nOnly respond with a single letter: A, B, or C."}
            ]
            prompt = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=False,
            )
            llm_prompts.append(prompt)

    responses = []
    for prompt in tqdm(llm_prompts, desc=f"LLM {mode} extraction"):
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id
            )
        generated = output_ids[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(generated, skip_special_tokens=True).strip()
        responses.append(response)

    # Print a few sample outputs for inspection
    print("="*60)
    for idx, sample in enumerate(responses[:show_samples]):
        if mode == "doi":
            article_id, context = chunks[idx]
            print(f"\nSAMPLE {idx+1} (DOI)")
            print("Article:", article_id)
            print("Response:", sample)
            print("-" * 40)
        elif mode == "accession":
            article_id, context, accession_id = chunks[idx]
            print(f"\nSAMPLE {idx+1} (Accession)")
            print("Article:", article_id)
            print("Accession ID:", accession_id)
            print("Response:", sample)
            print("-" * 40)
    print("="*60)

    return llm_prompts, responses


In [8]:
doi_prompts, doi_responses = run_llm_candidate_extraction(
    chunks, model, tokenizer, max_new_tokens=64, show_samples=5, mode="doi"
)
acc_prompts, acc_responses = run_llm_candidate_extraction(
    chunks2, model, tokenizer, max_new_tokens=5, show_samples=5, mode="accession"
)


LLM doi extraction:   0%|          | 0/308 [00:00<?, ?it/s]


SAMPLE 1 (DOI)
Article: 10.1002_cssc.202201821
Response: Irrelevant
----------------------------------------

SAMPLE 2 (DOI)
Article: 10.1002_cssc.202201821
Response: Irrelevant
----------------------------------------

SAMPLE 3 (DOI)
Article: 10.1002_cssc.202201821
Response: Irrelevant
----------------------------------------

SAMPLE 4 (DOI)
Article: 10.1002_cssc.202201821
Response: Irrelevant
----------------------------------------

SAMPLE 5 (DOI)
Article: 10.1002_cssc.202201821
Response: Irrelevant
----------------------------------------


LLM accession extraction:   0%|          | 0/46 [00:00<?, ?it/s]


SAMPLE 1 (Accession)
Article: 10.1002_ejic.201900904
Accession ID: H2NEt2
Response: C
----------------------------------------

SAMPLE 2 (Accession)
Article: 10.1002_ejic.201900904
Accession ID: l2N6S2
Response: C
----------------------------------------

SAMPLE 3 (Accession)
Article: 10.1002_ejic.201900904
Accession ID: F3N6S2
Response: C
----------------------------------------

SAMPLE 4 (Accession)
Article: 10.1002_ejic.201900904
Accession ID: F3N6S2
Response: C
----------------------------------------

SAMPLE 5 (Accession)
Article: 10.1002_ejic.201900904
Accession ID: F3ClN6
Response: C
----------------------------------------


## Cleaning

In [9]:
# -- Minimal clean function for LLM DOI extraction --
def minimal_clean_llm_doi_responses(val_chunks, doi_responses):
    """
    Keeps all DOI candidates where LLM gave a non-empty, non-irrelevant answer.
    No dedup, no prefix cleaning.
    Returns a list of dicts with article_id, context_window, llm_response.
    """
    results = []
    for (article_id, context), resp in zip(val_chunks, doi_responses):
        if not resp or resp.strip().lower() == "irrelevant":
            continue
        results.append({
            "article_id": article_id,
            "context_window": context,
            "llm_response": resp
        })
    return results

# -- Minimal clean function for LLM accession extraction --
def minimal_clean_llm_accession_responses(val_chunks2, acc_responses):
    """
    Keeps all accession candidates where LLM responded with a non-empty answer.
    Returns a list of dicts with article_id, context_window, accession_id, llm_response.
    """
    results = []
    for (article_id, context, accession_id), resp in zip(val_chunks2, acc_responses):
        if not resp or resp.strip() == "":
            continue
        results.append({
            "article_id": article_id,
            "context_window": context,
            "accession_id": accession_id,
            "llm_response": resp
        })
    return results


In [10]:
# Minimal cleaning for DOIs
cleaned_doi_candidates = minimal_clean_llm_doi_responses(chunks, doi_responses)

# Minimal cleaning for accessions
cleaned_accession_candidates = minimal_clean_llm_accession_responses(chunks2, acc_responses)

# Optionally, convert to DataFrame for inspection

cleaned_doi_df = pd.DataFrame(cleaned_doi_candidates)
cleaned_accession_df = pd.DataFrame(cleaned_accession_candidates)

print("Cleaned DOI candidates:", cleaned_doi_df.shape)
print("Cleaned accession candidates:", cleaned_accession_df.shape)
print(cleaned_doi_df.head(2))
print(cleaned_accession_df.head(2))


Cleaned DOI candidates: (15, 3)
Cleaned accession candidates: (46, 4)
          article_id                                     context_window  \
0  10.1002_ecs2.4619  aniscalco, Yadi Galindo Salazar, Chelsea Steel...   
1  10.1002_ece3.9627  versity of Queensland, the National Geographic...   

                              llm_response  
0          https://doi.org/10.25349/D9QW5X  
1  https://doi.org/10.5061/dryad.b8gtht7h3  
               article_id                                     context_window  \
0  10.1002_ejic.201900904  one signal, indicating the\npurity of the comp...   
1  10.1002_ejic.201900904  42.4 and 41.8 (two s, CH2CH3 and N+H-CH3), 40....   

  accession_id llm_response  
0       H2NEt2            C  
1       l2N6S2            C  


## LLM Classify DOIs

In [11]:
def classify_doi_candidates_with_llm(
    df_dois_llm, tokenizer, model, max_new_tokens=1, show_head=5
):
    """
    Classifies (DOI, context_window) rows as A/B/C with LLM, attaches column 'classification'.

    Args:
        df_dois_llm: DataFrame with ['article_id', 'dataset_id', 'context_window']
        tokenizer:   Chat/instruction tokenizer for your LLM
        model:       Your LLM model
        max_new_tokens: Max tokens from LLM per response (should be 1 for A/B/C)
        show_head:   How many samples to print from classified DataFrame

    Returns: DataFrame with extra 'classification' column (A/B/C)
    """

    SYS_PROMPT_CLASSIFY_DOI = """
You are given a piece of academic text. Your task is to classify the data associated with the given DOI.

Classify the data as:

A) Primary: if the data was generated specifically for this study.
B) Secondary: if the data was reused or derived from prior work.
C) None: if the DOI does not refer to research data, or is unrelated.

Respond with only one letter: A, B, or C.
"""

    prompts = []
    for _, row in df_dois_llm.iterrows():
        messages = [
            {"role": "system", "content": SYS_PROMPT_CLASSIFY_DOI},
            {"role": "user", "content": f"DOI: {row['llm_response']}\n\nAcademic text/context:\n{row['context_window']}\n\nOnly respond with a single letter: A, B, or C."}
        ]
        prompt = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )
        prompts.append(prompt)

    responses_class = []
    for prompt in tqdm(prompts, desc="LLM (A/B/C) DOI Classification"):
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                temperature=0.0,
                pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id
            )
        generated = output_ids[0][inputs["input_ids"].shape[1]:]
        resp = tokenizer.decode(generated, skip_special_tokens=True).strip().upper()
        # Accept only A, B, or C; fallback to C if not clear
        if resp and resp[0] in "ABC":
            responses_class.append(resp[0])
        else:
            responses_class.append("C")

    df_out = df_dois_llm.copy()
    df_out['classification'] = responses_class

    print(df_out['classification'].value_counts())
    if show_head:
        print(df_out.head(show_head))

    return df_out


In [12]:
classified_dois = classify_doi_candidates_with_llm(
    cleaned_doi_df, tokenizer, model, max_new_tokens=1, show_head=5
)


LLM (A/B/C) DOI Classification:   0%|          | 0/15 [00:00<?, ?it/s]

classification
A    8
B    7
Name: count, dtype: int64
          article_id                                     context_window  \
0  10.1002_ecs2.4619  aniscalco, Yadi Galindo Salazar, Chelsea Steel...   
1  10.1002_ece3.9627  versity of Queensland, the National Geographic...   
2  10.1002_ece3.9627  an Research Council Discovery Early Career Awa...   
3  10.1002_ece3.5395  Christiane \nNüsslein‐Volhard Stiftung.\nCONFL...   
4  10.1002_ece3.5395  FLICT OF INTEREST\nNone declared.\nAUTHORS' CO...   

                              llm_response classification  
0          https://doi.org/10.25349/D9QW5X              B  
1  https://doi.org/10.5061/dryad.b8gtht7h3              B  
2  https://doi.org/10.5061/dryad.b8gtht7h3              A  
3   https://doi.org/10.5441/001/1.v1cs4nn0              B  
4   https://doi.org/10.5441/001/1.v1cs4nn0              B  


## Map, Clean, and Merge

In [13]:
# --- CLEAN AND MAP DOIs ---
# classified_dois columns: ['article_id', 'context_window', 'llm_response', 'classification']

doi_df = pd.DataFrame()
doi_df["article_id"] = classified_dois["article_id"]
doi_df["dataset_id"] = classified_dois["llm_response"].astype(str).str.lower().str.strip()
type_map = {"A": "Primary", "B": "Secondary", "C": None}
doi_df["type"] = classified_dois["classification"].map(type_map)
doi_df = doi_df[doi_df["type"].notnull()].reset_index(drop=True)
doi_df = doi_df[doi_df["dataset_id"] != "irrelevant"]  # In notebook style, filter "irrelevant"
# Optional: remove any other nonsense values as needed

# --- CLEAN AND MAP ACCESSIONS ---
# cleaned_accession_df columns: ['article_id', 'context_window', 'accession_id', 'classification']
acc_df = pd.DataFrame()
acc_df["article_id"] = cleaned_accession_df["article_id"]
acc_df["dataset_id"] = cleaned_accession_df["accession_id"].astype(str).str.upper().str.strip()
acc_df["type"] = cleaned_accession_df["classification"].map(type_map)
acc_df = acc_df[acc_df["type"].notnull()].reset_index(drop=True)

# --- MERGE, DEDUPLICATE, FINALIZE ---
final_results = pd.concat([doi_df, acc_df], ignore_index=True)
final_results = final_results.drop_duplicates(subset=["article_id", "dataset_id"], keep="first")
final_results = final_results[final_results["type"].isin(["Primary", "Secondary"])]  # just in case
final_results = final_results.reset_index(drop=True)
final_results["row_id"] = range(len(final_results))

# --- (Optional) Save for submission or show head ---
print(final_results.head())
print(final_results["type"].value_counts())


KeyError: 'classification'

# Submit

In [None]:
print("Final submission stats:")
print(final_results["type"].value_counts())
print(f"Total entries: {len(final_results)}")
print(final_results.dtypes)
print(final_results.isnull().sum())
print(final_results.shape)

In [None]:
final_results.to_csv('/kaggle/working/submission.csv', index=False)