In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
import numpy as np
import json
import os
import re
import time

In [2]:
# Load data from JSON files
# The directory can be overridden with the FINETUNE_DATA_DIR environment
# variable; when it is unset the original hard-coded location is used.
data_dir = os.environ.get(
    "FINETUNE_DATA_DIR",
    "/Users/alex/docs/code/Odoma/citation_index/benchmarks/finetune",
)


def _load_split(split_name):
    """Load ``finetuning_<split_name>.json`` from ``data_dir`` and report its size.

    Args:
        split_name: File-name suffix of the split, e.g. ``"train_single"``.

    Returns:
        The parsed JSON content of the file.
    """
    # Explicit UTF-8: without it open() falls back to the platform's locale
    # encoding, which can fail on non-ASCII text on non-UTF-8 systems.
    with open(f"{data_dir}/finetuning_{split_name}.json", "r", encoding="utf-8") as f:
        examples = json.load(f)
    print(f"✓ Loaded {split_name}: {len(examples):,} examples")
    return examples


print("Loading JSON files...")
train_single = _load_split("train_single")
valid_single = _load_split("valid_single")
train_group = _load_split("train_group")
valid_group = _load_split("valid_group")

print(f"\nTotal examples: {len(train_single) + len(valid_single) + len(train_group) + len(valid_group):,}")



Loading JSON files...
✓ Loaded train_single: 25,772 examples
✓ Loaded valid_single: 1,235 examples
✓ Loaded train_group: 725 examples
✓ Loaded valid_group: 31 examples

Total examples: 27,763


In [3]:
# Statistics: Distribution by source
# For every split, tally the "source" field of its examples (examples
# without that field are counted under "unknown") and print each source's
# count and percentage of the split.
print("=" * 60, "DISTRIBUTION BY SOURCE", "=" * 60, sep="\n")

splits_by_name = {
    "train_single": train_single,
    "valid_single": valid_single,
    "train_group": train_group,
    "valid_group": valid_group,
}

for split_name, split_data in splits_by_name.items():
    n_examples = len(split_data)

    tally = {}
    for label in (example.get("source", "unknown") for example in split_data):
        tally[label] = tally.get(label, 0) + 1

    print(f"\n{split_name} ({n_examples:,} examples):")
    # Rows are ordered by source name.
    for label in sorted(tally):
        n_hits = tally[label]
        share = (n_hits / n_examples) * 100
        print(f"  {label:12s}: {n_hits:6,} ({share:5.1f}%)")



DISTRIBUTION BY SOURCE

train_single (25,772 examples):
  cex         :    496 (  1.9%)
  excite      :    661 (  2.6%)
  linkedbook  : 24,615 ( 95.5%)

valid_single (1,235 examples):
  cex         :    173 ( 14.0%)
  excite      :      7 (  0.6%)
  linkedbook  :  1,055 ( 85.4%)

train_group (725 examples):
  cex         :     11 (  1.5%)
  excite      :     33 (  4.6%)
  linkedbook  :    681 ( 93.9%)

valid_group (31 examples):
  cex         :      5 ( 16.1%)
  excite      :      2 (  6.5%)
  linkedbook  :     24 ( 77.4%)


In [4]:
# Statistics: Language distribution (single dataset only)
# Codes are tallied exactly as stored, so case variants of the same code
# (e.g. "EN" and "en") show up as separate rows. Examples without a
# "language" field are counted under "unknown".
print("=" * 60, "LANGUAGE DISTRIBUTION (Single Dataset)", "=" * 60, sep="\n")

single_splits = (("train_single", train_single), ("valid_single", valid_single))

for split_name, split_data in single_splits:
    n_examples = len(split_data)

    tally = {}
    for code in (example.get("language", "unknown") for example in split_data):
        tally[code] = tally.get(code, 0) + 1

    print(f"\n{split_name} ({n_examples:,} examples):")
    # Most frequent first; equal counts keep the order in which they were first seen.
    ranked = sorted(tally.items(), key=lambda pair: -pair[1])
    for code, n_hits in ranked:
        share = (n_hits / n_examples) * 100
        print(f"  {code:12s}: {n_hits:6,} ({share:5.1f}%)")



LANGUAGE DISTRIBUTION (Single Dataset)

train_single (25,772 examples):
  IT          : 18,256 ( 70.8%)
  EN          :  1,920 (  7.4%)
  FR          :  1,712 (  6.6%)
  DE          :  1,141 (  4.4%)
  ES          :    801 (  3.1%)
  en          :    606 (  2.4%)
  de          :    551 (  2.1%)
  PT          :    452 (  1.8%)
  NL          :    301 (  1.2%)
  UNKNOWN     :     32 (  0.1%)

valid_single (1,235 examples):
  IT          :    842 ( 68.2%)
  en          :    173 ( 14.0%)
  EN          :     88 (  7.1%)
  DE          :     71 (  5.7%)
  FR          :     46 (  3.7%)
  de          :      7 (  0.6%)
  ES          :      4 (  0.3%)
  NL          :      2 (  0.2%)
  PT          :      2 (  0.2%)


In [5]:
# Statistics: Reference count distribution (group dataset only)
# Summarises the "ref_count" field of every group example; examples that
# lack the field contribute 0.
print("=" * 60, "REFERENCE COUNT DISTRIBUTION (Group Dataset)", "=" * 60, sep="\n")

group_splits = (("train_group", train_group), ("valid_group", valid_group))

for split_name, split_data in group_splits:
    refs_per_example = [example.get("ref_count", 0) for example in split_data]

    print(f"\n{split_name} ({len(split_data):,} examples):")
    print(f"  Min refs:    {min(refs_per_example):,}")
    print(f"  Max refs:    {max(refs_per_example):,}")
    print(f"  Mean refs:   {np.mean(refs_per_example):.1f}")
    print(f"  Median refs: {np.median(refs_per_example):.0f}")
    print(f"  Total refs:  {sum(refs_per_example):,}")



REFERENCE COUNT DISTRIBUTION (Group Dataset)

train_group (725 examples):
  Min refs:    1
  Max refs:    200
  Mean refs:   35.7
  Median refs: 30
  Total refs:  25,893

valid_group (31 examples):
  Min refs:    1
  Max refs:    156
  Mean refs:   44.7
  Median refs: 32
  Total refs:  1,387


In [6]:
# Statistics: Message length (approximate token count)
# An example's length is the summed character count of the "content" of all
# its messages; the token estimate is the mean character count divided by 4.
print("=" * 60, "MESSAGE LENGTH STATISTICS (character counts)", "=" * 60, sep="\n")

splits_by_name = {
    "train_single": train_single,
    "valid_single": valid_single,
    "train_group": train_group,
    "valid_group": valid_group,
}

for split_name, split_data in splits_by_name.items():
    chars_per_example = [
        sum(len(message["content"]) for message in example["messages"])
        for example in split_data
    ]

    print(f"\n{split_name} ({len(split_data):,} examples):")
    print(f"  Min chars:    {min(chars_per_example):,}")
    print(f"  Max chars:    {max(chars_per_example):,}")
    mean_chars = np.mean(chars_per_example)
    print(f"  Mean chars:   {mean_chars:,.0f}")
    print(f"  Median chars: {np.median(chars_per_example):,.0f}")
    print(f"  Est. tokens:  {mean_chars / 4:.0f} (mean)")



MESSAGE LENGTH STATISTICS (character counts)

train_single (25,772 examples):
  Min chars:    908
  Max chars:    2,608
  Mean chars:   1,357
  Median chars: 1,360
  Est. tokens:  339 (mean)

valid_single (1,235 examples):
  Min chars:    989
  Max chars:    2,245
  Mean chars:   1,425
  Median chars: 1,428
  Est. tokens:  356 (mean)

train_group (725 examples):
  Min chars:    1,005
  Max chars:    155,319
  Mean chars:   17,039
  Median chars: 12,355
  Est. tokens:  4260 (mean)

valid_group (31 examples):
  Min chars:    1,762
  Max chars:    140,624
  Mean chars:   31,334
  Median chars: 19,037
  Est. tokens:  7834 (mean)


In [7]:
# Show sample examples
# Prints the first example of each training split. Entries 1 and 2 of the
# example's "messages" list are shown as the user message and the assistant
# response, each truncated to the stated number of characters.
print("=" * 60, "SAMPLE EXAMPLES", "=" * 60, sep="\n")

# Single example
print("\n--- SINGLE REFERENCE EXAMPLE ---")
sample_single = train_single[0]
print(f"Source: {sample_single.get('source')}")
print(f"Language: {sample_single.get('language', 'N/A')}")
single_messages = sample_single['messages']
print("\nUser message (first 200 chars):")
single_user_preview = single_messages[1]['content'][:200]
print(single_user_preview + "...")
print("\nAssistant response (first 300 chars):")
single_assistant_preview = single_messages[2]['content'][:300]
print(single_assistant_preview + "...")

# Group example
print("\n\n--- GROUP REFERENCE EXAMPLE ---")
sample_group = train_group[0]
print(f"Source: {sample_group.get('source')}")
print(f"Ref count: {sample_group.get('ref_count')}")
group_messages = sample_group['messages']
print("\nUser message (first 300 chars):")
group_user_preview = group_messages[1]['content'][:300]
print(group_user_preview + "...")
print("\nAssistant response (first 400 chars):")
group_assistant_preview = group_messages[2]['content'][:400]
print(group_assistant_preview + "...")



SAMPLE EXAMPLES

--- SINGLE REFERENCE EXAMPLE ---
Source: linkedbook
Language: IT

User message (first 200 chars):
Goal: From the Input Text, detect all bibliographic references and return only this JSON object:

```json
{
  "references": [
    {
      "reference": {
        "authors": [
          {"first_name": "...

Assistant response (first 300 chars):
{"references": [{"reference": {"authors": ["ROMANIN."], "full_title": "Storia Documentata di Venezia, Venezia", "journal_title": "", "volume": "IH,", "issue": "", "pages": "pp. 300 - 301.", "publication_date": "1853,", "publisher": "", "publication_place": ""}}]}...


--- GROUP REFERENCE EXAMPLE ---
Source: linkedbook
Ref count: 25

User message (first 300 chars):
Convert these references to the following JSON structure:

{
    "references": [
        {
            "reference": {
                "authors": [list of author objects | "original string"],
                "full_title": "string",
                "journal_title": "string",
 

In [8]:
# Normalize features across all datasets (section banner)
print("=" * 60, "NORMALIZING DATASET FEATURES", "=" * 60, sep="\n")

def normalize_single(item):
    """Return a copy of a single-reference item using the group field layout.

    The result has every key of ``item`` except ``language``, plus:

    * ``ref_count`` set to 1;
    * ``languages`` set to a one-element list holding the item's ``language``,
      or an empty list when the item has no ``language`` key.

    The input dict is not modified; values are shared with it, not deep-copied.
    """
    # Drop the scalar language field while copying everything else.
    normalized = {key: value for key, value in item.items() if key != 'language'}
    # Single references always have ref_count = 1
    normalized['ref_count'] = 1
    # Wrap the single language string in a list (empty when absent)
    normalized['languages'] = [item['language']] if 'language' in item else []
    return normalized


print("Normalizing single datasets...")
# Apply normalize_single to every example of both single-reference splits.
train_single_normalized, valid_single_normalized = (
    list(map(normalize_single, split_items))
    for split_items in (train_single, valid_single)
)

print(
    "✓ All datasets normalized with consistent schema",
    "  Fields: messages, source, languages, ref_count",
    sep="\n",
)



NORMALIZING DATASET FEATURES
Normalizing single datasets...
✓ All datasets normalized with consistent schema
  Fields: messages, source, languages, ref_count


In [9]:
# Create HuggingFace DatasetDict with normalized data
print("=" * 60, "CREATING HUGGINGFACE DATASET", "=" * 60, sep="\n")

# Single splits use their normalized form; group splits are used as loaded.
records_by_split = {
    "train_single": train_single_normalized,
    "valid_single": valid_single_normalized,
    "train_group": train_group,
    "valid_group": valid_group,
}

# One Dataset per split, built from its list of example dicts.
dataset_dict = DatasetDict(
    {split: Dataset.from_list(records) for split, records in records_by_split.items()}
)

print("\nDataset created successfully!")
print(dataset_dict)
print("\n✓ All splits have the same feature schema")


CREATING HUGGINGFACE DATASET

Dataset created successfully!
DatasetDict({
    train_single: Dataset({
        features: ['messages', 'source', 'ref_count', 'languages'],
        num_rows: 25772
    })
    valid_single: Dataset({
        features: ['messages', 'source', 'ref_count', 'languages'],
        num_rows: 1235
    })
    train_group: Dataset({
        features: ['messages', 'source', 'ref_count', 'languages'],
        num_rows: 725
    })
    valid_group: Dataset({
        features: ['messages', 'source', 'ref_count', 'languages'],
        num_rows: 31
    })
})

✓ All splits have the same feature schema


In [10]:
# Push to HuggingFace Hub
print("=" * 60)
print("PUSHING TO HUGGINGFACE HUB")
print("=" * 60)

# Repository name. It has no "<user>/" prefix; per the `datasets`
# documentation such an id is created under the logged-in user's namespace.
repo_name = "reference-parsing-lora"

# Push to hub with an explicit commit message
print(f"\nPushing dataset to: {repo_name}")
print("This may take a few minutes...\n")

commit_info = dataset_dict.push_to_hub(
    repo_name,
    private=False,  # Set to True if you want a private dataset
    commit_message="Upload with normalized schema across all splits"
)

# A URL built from the bare repo_name lacks the namespace and therefore does
# not point at the uploaded dataset. Prefer the repository URL reported by
# the push itself.
# NOTE(review): recent `datasets` releases return a commit-info object with a
# `repo_url` attribute — confirm for the installed version. When it is
# missing (e.g. push_to_hub returned None) the previous URL is printed.
dataset_url = getattr(commit_info, "repo_url", None) or f"https://huggingface.co/datasets/{repo_name}"

print("\n✓ Dataset successfully pushed to HuggingFace Hub!")
print(f"View at: {dataset_url}")



PUSHING TO HUGGINGFACE HUB

Pushing dataset to: reference-parsing-lora
This may take a few minutes...



Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   9%|8         |  582kB / 6.48MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########|  272kB /  272kB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  96%|#########6| 4.19MB / 4.36MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########|  397kB /  397kB            

README.md: 0.00B [00:00, ?B/s]


✓ Dataset successfully pushed to HuggingFace Hub!
View at: https://huggingface.co/datasets/reference-parsing-lora


In [11]:
# Load no-PDF group datasets, merge with single, and push to HuggingFace Hub
# Relies on data_dir, train_single_normalized and valid_single_normalized
# defined by earlier cells.
print("=" * 60)
print("LOADING NO-PDF VERSION, MERGING, AND PUSHING")
print("=" * 60)

# Load no-PDF group datasets
# Explicit UTF-8: without it open() falls back to the platform's locale
# encoding, which can fail on non-ASCII text on non-UTF-8 systems.
with open(f"{data_dir}/finetuning_train_group_nopdf.json", "r", encoding="utf-8") as f:
    train_group_nopdf = json.load(f)
print(f" Loaded train_group_nopdf: {len(train_group_nopdf):,} examples")

with open(f"{data_dir}/finetuning_valid_group_nopdf.json", "r", encoding="utf-8") as f:
    valid_group_nopdf = json.load(f)
print(f"  Loaded valid_group_nopdf: {len(valid_group_nopdf):,} examples")

# Merge with single normalized datasets
# New lists are built (single examples first, then group examples) so the
# normalized single lists themselves are left untouched.
train_mix_nopdf = [*train_single_normalized, *train_group_nopdf]
print(f" train: {len(train_mix_nopdf):,} examples (single + group_nopdf)")

valid_mix_nopdf = [*valid_single_normalized, *valid_group_nopdf]
print(f"  valid: {len(valid_mix_nopdf):,} examples (single + group_nopdf)")

# Create HuggingFace Dataset
dataset_dict_nopdf_mix = DatasetDict({
    "train": Dataset.from_list(train_mix_nopdf),
    "valid": Dataset.from_list(valid_mix_nopdf)
})
print(f" {len(train_mix_nopdf) + len(valid_mix_nopdf):,} total examples")

# Push to HuggingFace Hub
# Note: this rebinds repo_name, which an earlier cell also assigns.
repo_name = "yurui983/reference-parsing-lora-mix"
print(f"   Repository: {repo_name}")

dataset_dict_nopdf_mix.push_to_hub(
    repo_name,
    private=False,
    commit_message="Update with no-PDF version: single + group_nopdf (references only, no full PDF text)"
)

print(f"\n✓ Successfully pushed to HuggingFace Hub!")
print(f"   View at: https://huggingface.co/datasets/{repo_name}")


LOADING NO-PDF VERSION, MERGING, AND PUSHING
 Loaded train_group_nopdf: 725 examples
  Loaded valid_group_nopdf: 31 examples
 train: 26,497 examples (single + group_nopdf)
  valid: 1,266 examples (single + group_nopdf)
 27,763 total examples
   Repository: yurui983/reference-parsing-lora-mix


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  97%|#########7| 9.43MB / 9.68MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  46%|####6     |  209kB /  451kB            

README.md:   0%|          | 0.00/562 [00:00<?, ?B/s]


✓ Successfully pushed to HuggingFace Hub!
   View at: https://huggingface.co/datasets/yurui983/reference-parsing-lora-mix
