In [1]:
from pathlib import Path
import json
from collections import defaultdict
from typing import Any, Dict, Optional

import pandas as pd
from datasets import Dataset, DatasetDict

# Citation Search Result Coverage and HF Dataset Prep

This notebook loads the aggregated search results, reports coverage statistics for each external search provider (OpenAlex, Matilda, Wikidata, OpenCitations), and prepares a Hugging Face DatasetDict with one split per provider.

In [2]:


# Input files: one combined results file for the main providers and a separate
# results file for OpenCitations.
DATA_PATH = Path("results_20251020_124218_limitNone_openalex_wikidata_matilda.json")
OPEN_CITATION_DATA_PATH = Path("results_20251022_161614_limitNone_opencitations.json")

# JSON is UTF-8 encoded; pass the encoding explicitly so the files are not
# decoded with a platform-dependent default (reference strings contain
# non-ASCII author names).
with DATA_PATH.open(encoding="utf-8") as f:
    records = json.load(f)
print(f"Loaded {len(records)} reference records from {DATA_PATH}")

# Load OpenCitations records (separate file)
with OPEN_CITATION_DATA_PATH.open(encoding="utf-8") as f:
    records_opencitations = json.load(f)
print(f"Loaded {len(records_opencitations)} reference records from {OPEN_CITATION_DATA_PATH}")

sources = sorted({record.get("source", "unknown") for record in records})
print(f"Reference sources (main): {sources}")
providers_main = ["openalex", "matilda", "wikidata"]

# Normalise the OMID stored on each OpenCitations record to its last two path
# segments, e.g. "br/06210459208".
# NOTE(review): this rewrites the record-level "ids" mapping, while the
# dataset-building code reads top_result["ids"] - confirm which one carries the OMID.
for record in records_opencitations:
    # `or {}` also covers an explicit null "ids" value, not just a missing key.
    ids = record.get("ids") or {}
    omid = ids.get("omid")
    if omid:
        parts = omid.rstrip("/").split("/")
        # An OMID with fewer than two segments is left untouched.
        if len(parts) >= 2:
            ids["omid"] = "/".join(parts[-2:])
    record["ids"] = ids
providers_opencitations = ["opencitations"]

Loaded 759 reference records from results_20251020_124218_limitNone_openalex_wikidata_matilda.json
Loaded 759 reference records from results_20251022_161614_limitNone_opencitations.json
Reference sources (main): ['cex', 'excite', 'linkedbook']


In [3]:
def summarize_provider(records: list[dict], provider: str) -> Dict[str, Optional[float]]:
    """Compute coverage statistics of one search provider over all records.

    Args:
        records: Reference records; each may carry
            ``record["search_results"][provider]["metadata_search"]``.
        provider: Key of the provider inside ``search_results``.

    Returns:
        A dict with, for each of ``success``, ``top_result``, ``is_match`` and
        ``year_match``, a ``*_count`` and a ``*_rate`` (count divided by the
        total number of records, ``0.0`` when ``records`` is empty), plus
        ``mean_title_similarity`` and ``mean_author_similarity`` (``None`` when
        no top result reported the respective similarity).
    """
    total_records = len(records)
    success_count = 0
    top_result_count = 0
    match_count = 0
    year_match_count = 0
    title_sims: list[float] = []
    author_sims: list[float] = []

    for record in records:
        # `or {}` at both levels: a key that is present but null must be
        # treated like a missing key instead of raising AttributeError.
        provider_results = (record.get("search_results") or {}).get(provider) or {}
        metadata = provider_results.get("metadata_search")
        if not metadata:
            continue
        if metadata.get("success"):
            success_count += 1
        top_result = metadata.get("top_result")
        if not top_result:
            continue
        top_result_count += 1
        if top_result.get("is_match"):
            match_count += 1
        match_details = top_result.get("match_details") or {}
        title_sim = match_details.get("title_similarity")
        author_sim = match_details.get("author_similarity")
        if title_sim is not None:
            title_sims.append(float(title_sim))
        if author_sim is not None:
            author_sims.append(float(author_sim))
        if match_details.get("year_match"):
            year_match_count += 1

    def _rate(count: int) -> float:
        # Rates are relative to ALL records, not only those with a top result.
        return count / total_records if total_records else 0.0

    mean_title_sim = float(pd.Series(title_sims).mean()) if title_sims else None
    mean_author_sim = float(pd.Series(author_sims).mean()) if author_sims else None

    return {
        "success_count": success_count,
        "success_rate": _rate(success_count),
        "top_result_count": top_result_count,
        "top_result_rate": _rate(top_result_count),
        "is_match_count": match_count,
        "is_match_rate": _rate(match_count),
        "year_match_count": year_match_count,
        "year_match_rate": _rate(year_match_count),
        "mean_title_similarity": mean_title_sim,
        "mean_author_similarity": mean_author_sim,
    }

# Statistics for the providers stored in the combined results file.
summary_main = dict(
    (name, summarize_provider(records, name)) for name in providers_main
)
# OpenCitations statistics come from its own, separate records list.
summary_oc = dict(opencitations=summarize_provider(records_opencitations, "opencitations"))
summary = summary_main | summary_oc
# One row per provider, one column per statistic.
stats_df = pd.DataFrame(summary).transpose()
display(stats_df)

Unnamed: 0,success_count,success_rate,top_result_count,top_result_rate,is_match_count,is_match_rate,year_match_count,year_match_rate,mean_title_similarity,mean_author_similarity
openalex,757.0,0.997365,593.0,0.781291,383.0,0.504611,385.0,0.507246,87.416526,61.976391
matilda,757.0,0.997365,421.0,0.554677,314.0,0.413702,280.0,0.368906,99.337292,67.581948
wikidata,757.0,0.997365,161.0,0.212121,124.0,0.163373,124.0,0.163373,93.428571,59.15528
opencitations,757.0,0.997365,116.0,0.152833,88.0,0.115942,89.0,0.11726,99.827586,0.0


## Build Hugging Face dataset

We expand every reference across the available providers and materialize a DatasetDict where each split key corresponds to a search provider (`openalex`, `matilda`, `wikidata`, `opencitations`).

In [10]:
def extract_openalex_id(raw_id: Optional[str]) -> Optional[str]:
    """Return the last path segment of ``raw_id``, ignoring trailing slashes.

    Returns ``None`` when ``raw_id`` is ``None`` or empty.
    """
    if raw_id:
        return raw_id.rstrip("/").rsplit("/", 1)[-1]
    return None


def build_matched_id(provider: str, ids: Dict[str, Any]) -> Optional[str]:
    """Pick the provider-specific identifier out of a top result's ``ids`` mapping.

    - ``openalex``: last path segment of ``ids["openalex_id"]``.
    - ``matilda`` / ``wikidata``: ``ids["matilda_id"]`` / ``ids["wikidata_id"]`` as stored.
    - ``opencitations``: last two path segments of ``ids["omid"]``
      (e.g. ``"br/06210459208"``); ``None`` if the OMID has fewer than two segments.

    Any other provider, or a missing identifier, yields ``None``.
    """
    direct_keys = {"matilda": "matilda_id", "wikidata": "wikidata_id"}
    if provider in direct_keys:
        return ids.get(direct_keys[provider])
    if provider == "openalex":
        return extract_openalex_id(ids.get("openalex_id"))
    if provider != "opencitations":
        return None

    omid = ids.get("omid")
    if not omid:
        return None
    segments = omid.rstrip("/").split("/")
    return "/".join(segments[-2:]) if len(segments) >= 2 else None


def build_matched_link(provider: str, matched_id: Optional[str]) -> Optional[str]:
    """Build a URL for the matched record, or the provider's landing page.

    With a truthy ``matched_id`` and a known provider, the record URL is
    returned. Without an id, the provider's generic search/landing page is
    returned. Unknown providers yield ``None``.
    """
    fallback_pages = {
        "openalex": "https://openalex.org/",
        "matilda": "https://matilda.science/?l=en",
        "wikidata": "https://www.wikidata.org/wiki/Wikidata:Main_Page",
        "opencitations": "https://sparql.opencitations.net/",
    }
    if not matched_id:
        return fallback_pages.get(provider)

    # OpenAlex is the only provider whose id is lower-cased in the URL.
    if provider == "openalex":
        return "https://openalex.org/works?zoom=" + matched_id.lower()

    record_prefixes = {
        "matilda": "https://matilda.science/work/",
        "wikidata": "https://www.wikidata.org/wiki/",
        "opencitations": "https://api.opencitations.net/meta/v1/metadata/omid:",
    }
    prefix = record_prefixes.get(provider)
    if prefix is not None:
        return f"{prefix}{matched_id}"
    return fallback_pages.get(provider)


def resolve_is_match(metadata: Optional[Dict[str, Any]]) -> bool:
    """Decide whether the top result of a metadata search counts as a match.

    An explicit ``top_result["is_match"]`` wins. When it is absent, the result
    is a match if ``match_details["title_similarity"]`` is at least 90.
    Missing metadata, top result or similarity yields ``False``.
    """
    top_result = (metadata or {}).get("top_result") or {}
    explicit_flag = top_result.get("is_match")
    if explicit_flag is None:
        # Fall back to the title-similarity threshold.
        similarity = (top_result.get("match_details") or {}).get("title_similarity")
        return similarity is not None and float(similarity) >= 90.0
    return bool(explicit_flag)


def summarize_top_result(top_result: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Reduce a top result to title, first_author, year and journal.

    All four keys are always present; a missing value (or a missing
    ``top_result``) is reported as ``None``.
    """
    source = top_result if top_result else {}
    summary: Dict[str, Any] = {}
    for key in ("title", "first_author", "year", "journal"):
        summary[key] = source.get(key)
    return summary



def matched_result_to_text(m: Optional[Dict[str, Any]]) -> str:
    """Render a matched-result summary as ``"key: value | key: value"``.

    Only truthy values among title, first_author, year and journal are
    included, in that order. An empty or missing summary yields ``""``.
    """
    if not m:
        return ""
    pairs = ((key, m.get(key)) for key in ("title", "first_author", "year", "journal"))
    return " | ".join(f"{key}: {value}" for key, value in pairs if value)


def _build_split_row(record: Dict[str, Any], provider: str) -> Dict[str, Any]:
    """Build the dataset row describing ``provider``'s top result for one reference.

    Every row has the same keys regardless of provider, so all splits share one
    schema. Missing search results produce a row with ``matched_id`` set to
    ``"Not Found"``, an empty DOI and an all-``None`` ``matched_result``.
    """
    # `or {}` at both levels so a null "search_results" or a null provider
    # entry is handled like a missing one.
    provider_results = (record.get("search_results") or {}).get(provider) or {}
    metadata = provider_results.get("metadata_search")
    top_result = metadata.get("top_result") if metadata else None
    ids = (top_result or {}).get("ids") or {}
    matched_id = build_matched_id(provider, ids)
    return {
        "ref_id": record.get("ref_id"),
        "original_ref_string": record.get("original_string"),
        "matched_id": matched_id or "Not Found",
        "matched_doi": ids.get("doi") or "",
        # Store as dict/JSON for consistent schema across splits
        "matched_result": summarize_top_result(top_result),
        "is_match_by_similarity": resolve_is_match(metadata),
        "matched_link": build_matched_link(provider, matched_id),
    }


split_rows: dict[str, list[dict[str, Any]]] = defaultdict(list)

# Build splits for the main providers from the combined records file
for record in records:
    for provider in providers_main:
        split_rows[provider].append(_build_split_row(record, provider))

# Build split for OpenCitations from its separate records file
for record in records_opencitations:
    split_rows["opencitations"].append(_build_split_row(record, "opencitations"))


# One split per provider; split order follows insertion order of split_rows.
dataset_dict = DatasetDict({
    split: Dataset.from_list(rows)
    for split, rows in split_rows.items()
})


dataset_dict

DatasetDict({
    openalex: Dataset({
        features: ['ref_id', 'original_ref_string', 'matched_id', 'matched_doi', 'matched_result', 'is_match_by_similarity', 'matched_link'],
        num_rows: 759
    })
    matilda: Dataset({
        features: ['ref_id', 'original_ref_string', 'matched_id', 'matched_doi', 'matched_result', 'is_match_by_similarity', 'matched_link'],
        num_rows: 759
    })
    wikidata: Dataset({
        features: ['ref_id', 'original_ref_string', 'matched_id', 'matched_doi', 'matched_result', 'is_match_by_similarity', 'matched_link'],
        num_rows: 759
    })
    opencitations: Dataset({
        features: ['ref_id', 'original_ref_string', 'matched_id', 'matched_doi', 'matched_result', 'is_match_by_similarity', 'matched_link'],
        num_rows: 759
    })
})

In [11]:
# Show the overall structure, then the first five rows of the first split.
print(dataset_dict)
preview_split = next(iter(dataset_dict))
print(f"Preview split: {preview_split}")
dataset_dict[preview_split][:5]


DatasetDict({
    openalex: Dataset({
        features: ['ref_id', 'original_ref_string', 'matched_id', 'matched_doi', 'matched_result', 'is_match_by_similarity', 'matched_link'],
        num_rows: 759
    })
    matilda: Dataset({
        features: ['ref_id', 'original_ref_string', 'matched_id', 'matched_doi', 'matched_result', 'is_match_by_similarity', 'matched_link'],
        num_rows: 759
    })
    wikidata: Dataset({
        features: ['ref_id', 'original_ref_string', 'matched_id', 'matched_doi', 'matched_result', 'is_match_by_similarity', 'matched_link'],
        num_rows: 759
    })
    opencitations: Dataset({
        features: ['ref_id', 'original_ref_string', 'matched_id', 'matched_doi', 'matched_result', 'is_match_by_similarity', 'matched_link'],
        num_rows: 759
    })
})
Preview split: openalex


{'ref_id': ['cex_BIO-GEN-MOL_9_1',
  'cex_BIO-GEN-MOL_9_0',
  'cex_BIO-GEN-MOL_9_3',
  'cex_BIO-GEN-MOL_9_2',
  'cex_BIO-GEN-MOL_9_4'],
 'original_ref_string': ['L R Leddy, R E Holmes. Chondrosarcoma of bone. Cancer Treat Res. 2014. Vol. 162',
  'A Y Giuffrida, J E Burgueno, L G Koniaris, J C Gutierrez, R Duncan, S P Scully. Chondrosarcoma in the United States (1973 to 2003): an analysis of 2890 cases from the SEER database. J Bone Joint Surg Am. 2009. Vol. 91',
  'E Lhuissier, C Bazille, J Aury-Landas, N Girard, J Pontin, M Boittin, K Boumediene, C Baugé. Identification of an easy to use 3D culture model to investigate invasion and anticancer drug response in chondrosarcomas. BMC Cancer. 2017. Vol. 17',
  'N Girard, C Bazille, E Lhuissier, H Benateau, A Llombart-Bosch, K Boumediene, C Baugé. 3-Deazaneplanocin A (DZNep), an Inhibitor of the Histone Methyltransferase EZH2, Induces Apoptosis and Reduces Cell Migration in Chondrosarcoma Cells. PloS One. 2014. Vol. 9. pp. e98176',
  'J Pup

In [12]:
import os
from huggingface_hub import login

# Target dataset repository on the Hugging Face Hub.
HF_REPO_ID = "yurui983/citation_linking"

# Authenticate with the token from the environment (None if HF_TOKEN is unset).
login(token=os.environ.get("HF_TOKEN"))

# Publish every provider split; matched_result is stored as dict/JSON (not string)
# for better type consistency.
dataset_dict.push_to_hub(HF_REPO_ID, private=False)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########|  148kB /  148kB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########|  243kB /  243kB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########| 98.5kB / 98.5kB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  33%|###3      | 52.7kB /  159kB            

README.md:   0%|          | 0.00/881 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/yurui983/citation_linking/commit/198d81017ef579553969ff004655bcbd87302b14', commit_message='Upload dataset', commit_description='', oid='198d81017ef579553969ff004655bcbd87302b14', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/yurui983/citation_linking', endpoint='https://huggingface.co', repo_type='dataset', repo_id='yurui983/citation_linking'), pr_revision=None, pr_num=None)

## Push to Argilla for annotation

This section creates one Argilla dataset per provider split and uploads records for human annotation. Each record will include:

- the original reference string
- the matched id and a link to the matched result
- a compact matched result summary (title, first_author, year, journal)

The annotation questions are:
1) Is the matched result correct? (true/false)
2) If incorrect, provide the correct ID (text input).
3) Does this reference have no match in this provider? (true/false)

The connection settings for your Argilla deployment are read from the `ARGILLA_API_URL` and `ARGILLA_API_TOKEN` environment variables.

In [None]:
# Install argilla if not already installed (uncomment to run in notebook).
# %pip targets the environment of the running kernel.
# %pip install -U argilla


In [13]:
# Markdown guidelines for annotators; passed as `guidelines=` to rg.Settings
# in the Argilla setup cell.
# NOTE(review): the text refers to a "No match" answer, while the corresponding
# question is named "This reference has no match" in the settings - confirm
# annotators can map one to the other.
guidelines_markdown = """
# Annotation Guidelines: Citation Linking

## What the fields mean
- ref_id: Internal identifier of the reference.
- original_ref_string: The raw reference text extracted from the document.
- matched_id: Provider-specific ID of the candidate match (or "Not Found").
  - OpenAlex: W1234567890 (last segment of /works/W…)
  - Wikidata: Q12345
  - Matilda: last path segment in /work/<id>
  - OpenCitations: OMID last path segment (e.g., br/06210459208)
- matched_doi: DOI of the candidate match (if available).
- matched_result: Compact summary of the candidate (title, first_author, year, journal).
- is_match_by_similarity: Model’s heuristic guess (for context only).
- matched_link: Link to view the candidate on the provider site.

## What you need to do
1) Check if the candidate is the same work as the reference.
   - Compare title, first author surname, year, and DOI (if present).
   - Use matched_link to verify details on the provider page.
2) If the candidate is incorrect OR empty (matched_id = "Not Found" or no details shown), find the correct record in the same provider and paste its ID in correct_id.
3) If you believe no record exists for this reference in this provider, select “No match”.

## How to answer
- Candidate is correct:
  - is_match_correct = true
  - No match = false
  - correct_id = (leave blank)
- Candidate is incorrect but correct record exists:
  - is_match_correct = false
  - No match = false
  - correct_id = provider-specific ID (OpenAlex W…, Wikidata Q…, Matilda work id, OpenCitations OMID br/…)
- No record in this provider:
  - is_match_correct = false
  - No match = true
  - correct_id = (leave blank)

Notes:
- Minor formatting/casing differences are fine; it must be the same work.
- Provide only the ID (not a URL).
"""

In [14]:


import os
from getpass import getpass

from datasets import load_dataset

import argilla as rg

# The deployment URL is not a secret, so a default is kept; override it with
# the ARGILLA_API_URL environment variable.
ARGILLA_API_URL = os.environ.get("ARGILLA_API_URL") or 'https://argilla-route-graphia-app1-staging.apps.bst2.paas.psnc.pl/'
# Never fall back to a hardcoded API key: read it from the environment and
# prompt (without echo) when it is not set.
ARGILLA_API_TOKEN = os.environ.get("ARGILLA_API_TOKEN") or getpass("Argilla API token: ")

client = rg.Argilla(api_key=ARGILLA_API_TOKEN, api_url=ARGILLA_API_URL)

# Dataset schema shared by every provider split. The fields mirror the columns
# of the Hugging Face dataset:
# 'ref_id', 'original_ref_string', 'matched_id', 'matched_doi', 'matched_result',
# 'is_match_by_similarity', 'matched_link'
settings = rg.Settings(
    fields=[
        rg.TextField(name="ref_id", title="Reference ID", description="The ID of the original reference."),
        rg.TextField(name="original_ref_string", title="Reference String", description="The original reference string extracted from the document."),
        rg.TextField(name="matched_id", title="Matched ID", description="The ID of the matched record from the external provider."),
        rg.TextField(name="matched_doi", title="Matched DOI", description="The DOI of the matched record from the external provider."),
        # Rendered as markdown: the upload cell sends a fenced JSON block.
        rg.TextField(name="matched_result", title="Matched Result", description="The result of the matching process.", use_markdown=True),
        rg.TextField(name="is_match_by_similarity", title="Is Match by Similarity", description="Indicates if the match was found by similarity."),
        rg.TextField(name="matched_link", title="Matched Link", use_markdown=True, description="The link to the matched record from the external provider."),
    ],
    guidelines=guidelines_markdown,
    questions=[
        rg.LabelQuestion(
            name="is_match_correct",
            title="Is the matched result correct?",
            labels=["true", "false"],
            description="Select 'true' if the candidate matched record corresponds to the same publication as the reference (matching title, first author and year). Otherwise select 'false'.",
            required=True,
        ),
        rg.TextQuestion(
            name="correct_id",
            title="If incorrect, provide the correct ID (only id)",
            description="When the candidate is incorrect, type the provider-specific identifier of the correct match (e.g., OpenAlex id like W1234567890). Leave blank if 'true'.",
            required=False,
        ),
        rg.LabelQuestion(
            name="This reference has no match",
            title="Does this reference have no match?",
            labels=["true", "false"],
            description="Select 'true' if the reference has no match in the this provider. Otherwise select 'false'.",
            required=True,
        ),
    ],
)

# Helper to format matched_result into a readable string
from typing import Optional, Dict, Any


def matched_result_to_markdown_json(m: Optional[Dict[str, Any]]) -> str:
    """Render a matched-result summary as a fenced ```json markdown block.

    Args:
        m: Summary mapping (title, first_author, year, journal) or ``None``.

    Returns:
        The pretty-printed JSON wrapped in a ```json fence, or ``""`` when
        ``m`` is ``None`` or empty.
    """
    if not m:
        return ""

    # ensure_ascii=False keeps accented characters readable for annotators
    # instead of emitting \uXXXX escapes.
    json_str = '```json\n' + json.dumps(m, indent=2, ensure_ascii=False) + '\n```'
    return json_str

# Define the Argilla dataset schema (fields/questions)
# from argilla.client.models import TextField, TextAreaField, LabelQuestion, ResponseSchema, FieldSchema

def matched_link_to_markdown(m: Optional[str]) -> str:
    """Turn a URL into a markdown link whose label is the URL itself.

    Returns ``""`` when ``m`` is ``None`` or empty.
    """
    return "[{0}]({0})".format(m) if m else ""

# Use the local dataset_dict built in the "Build Hugging Face dataset" section.
# If you want to load from hub instead, use: load_dataset("yurui983/citation_linking")
dataset_for_argilla = dataset_dict

for split_name, ds in dataset_for_argilla.items():

    # Only the OpenCitations split is uploaded by this cell; change or remove
    # this filter to upload the other provider splits.
    if split_name != 'opencitations':
        continue

    dataset_name = f"citation_linking_{split_name}"

    # NOTE(review): presumably create() fails when a dataset with this name
    # already exists - confirm, and delete the old dataset before re-running.
    print("Creating dataset:", dataset_name)

    dataset = rg.Dataset(name=dataset_name, settings=settings)
    dataset.create()

    records_to_log = []
    for ex in ds:
        matched_raw = ex.get("matched_result") or {}
        # If matched_result is a dict with the summary fields, convert to text
        matched_text = (
            matched_result_to_markdown_json(matched_raw)
            if isinstance(matched_raw, dict)
            else str(matched_raw or "")
        )

        is_match_val = ex.get("is_match_by_similarity")
        # The field is a TextField, so the value is sent as a string. Every
        # branch assigns is_match_str: leaving it unset for non-bool values
        # would raise NameError on the first record or silently reuse the
        # previous record's value.
        if isinstance(is_match_val, bool):
            is_match_str = "true" if is_match_val else "false"
        else:
            is_match_str = "" if is_match_val is None else str(is_match_val)

        record = {
            "ref_id": ex.get("ref_id") or "",
            "original_ref_string": ex.get("original_ref_string") or "",
            "matched_id": ex.get("matched_id") or "Not Found",
            "matched_doi": ex.get("matched_doi") or "",
            "matched_result": matched_text,
            "is_match_by_similarity": is_match_str,
            # The field is markdown-enabled, so send an explicit markdown link.
            "matched_link": matched_link_to_markdown(ex.get("matched_link")),
            # include ref_id as metadata so annotators / export scripts can trace back
            "metadata": {"ref_id": ex.get("ref_id")},
        }
        records_to_log.append(record)

    # Log records (batch)
    dataset.records.log(records=records_to_log)

Creating dataset: citation_linking_opencitations


Sending records...: 3batch [00:06,  2.30s/batch]                    
