In [2]:
import pandas as pd
import json

import pandas as pd

qrels_train = pd.read_csv("data/PAR/qrels_train.tsv", sep="\t")
qrels_test = pd.read_csv("data/PAR/qrels_test.tsv", sep="\t")
qrels_dev = pd.read_csv("data/PAR/qrels_dev.tsv", sep="\t")

print(qrels_train.head())

print(qrels_train["score"].unique())

print(qrels_train.shape)
print(qrels_test.shape)
print(qrels_dev.shape)

    query-id  corpus-id  score
0  7665777-1   32320506      1
1  7665777-1   32293716      1
2  7665777-1   23219649      1
3  7665777-1   30339549      1
4  7665777-1   17470624      1
[1 2]
(1978118, 3)
(77147, 3)
(73394, 3)


In [None]:
# Make new corpus
# 50k train 
# top 30 articles from BM25 retrieval results for each query as negative samples

# 1. Randomly select 50k articles from the corpus and queries
# 2. Fore each query, 30 irrelevant articles(documents)


In [43]:
# Sample 50k unique articles (distinct corpus-id) from qrels_train.tsv
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Read the original qrels_train.tsv
qrels_train = pd.read_csv("data/PAR/qrels_train.tsv", sep="\t")
qrels_dev = pd.read_csv("data/PAR/qrels_dev.tsv", sep="\t")
qrels_test = pd.read_csv("data/PAR/qrels_test.tsv", sep="\t")

# Get unique document IDs available in qrels
unique_docs_train = qrels_train['corpus-id'].unique()
unique_docs_dev = qrels_dev['corpus-id'].unique()
unique_docs_test = qrels_test['corpus-id'].unique()
print(f"Unique documents in qrels_train: {len(unique_docs_train)}")
print(f"Unique documents in qrels_dev: {len(unique_docs_dev)}")
print(f"Unique documents in qrels_test: {len(unique_docs_test)}")

# Decide sample size (don't request more than exist)
n_to_sample_train = min(80000, len(unique_docs_train))
n_to_sample_dev = min(10000, len(unique_docs_dev))
n_to_sample_test = min(10000, len(unique_docs_test))

# Sample unique document IDs without replacement
sampled_docs_train = np.random.choice(unique_docs_train, size=n_to_sample_train, replace=False)
sampled_docs_dev = np.random.choice(unique_docs_dev, size=n_to_sample_dev, replace=False)
sampled_docs_test = np.random.choice(unique_docs_test, size=n_to_sample_test, replace=False)

# Keep all qrels rows that reference the sampled documents
sampled_qrels_train = qrels_train[qrels_train['corpus-id'].isin(sampled_docs_train)]
sampled_qrels_dev = qrels_dev[qrels_dev['corpus-id'].isin(sampled_docs_dev)]
sampled_qrels_test = qrels_test[qrels_test['corpus-id'].isin(sampled_docs_test)]

# Save only the original query pair + scores in fixed column order
cols = ['query-id', 'corpus-id', 'score']
output_path_train = "data/PAR/80k_qrels_train.tsv"
output_path_dev = "data/PAR/10k_qrels_dev.tsv"
output_path_test = "data/PAR/10k_qrels_test.tsv"

sampled_qrels_train[cols].to_csv(output_path_train, sep='\t', index=False)
sampled_qrels_dev[cols].to_csv(output_path_dev, sep='\t', index=False)
sampled_qrels_test[cols].to_csv(output_path_test, sep='\t', index=False)

print(f"Sampled {n_to_sample_train} unique documents and saved {sampled_qrels_train.shape[0]} rows to {output_path_train}")
print(f"Sampled {n_to_sample_dev} unique documents and saved {sampled_qrels_dev.shape[0]} rows to {output_path_dev}")
print(f"Sampled {n_to_sample_test} unique documents and saved {sampled_qrels_test.shape[0]} rows to {output_path_test}")

# Optional: save the list of sampled document ids for reproducibility
# pd.Series(sampled_docs).to_csv('data/PAR/50k_sampled_doc_ids.txt', index=False, header=False)
# print("Saved sampled document ids to data/PAR/50k_sampled_doc_ids.txt")

# Check if we maintained the same columns and structure
# print("\nFirst few rows of sampled data:")
# print(sampled_qrels_train.head())

# Check unique queries and documents in the sample
print(f"\nNumber of unique queries: {sampled_qrels_train['query-id'].nunique()}")
print(f"Number of unique documents in saved file: {sampled_qrels_train['corpus-id'].nunique()}")

print(f"\nNumber of unique queries: {sampled_qrels_dev['query-id'].nunique()}")
print(f"Number of unique documents in saved file: {sampled_qrels_dev['corpus-id'].nunique()}")

print(f"\nNumber of unique queries: {sampled_qrels_test['query-id'].nunique()}")
print(f"Number of unique documents in saved file: {sampled_qrels_test['corpus-id'].nunique()}")



Unique documents in qrels_train: 879083
Unique documents in qrels_dev: 55265
Unique documents in qrels_test: 56304
Sampled 80000 unique documents and saved 180770 rows to data/PAR/80k_qrels_train.tsv
Sampled 10000 unique documents and saved 13440 rows to data/PAR/10k_qrels_dev.tsv
Sampled 10000 unique documents and saved 13666 rows to data/PAR/10k_qrels_test.tsv

Number of unique queries: 95914
Number of unique documents in saved file: 80000

Number of unique queries: 4841
Number of unique documents in saved file: 10000

Number of unique queries: 4868
Number of unique documents in saved file: 10000


In [48]:
# Combine unique corpus IDs from train/dev/test and extract documents
import pandas as pd
import json
import logging
from datetime import datetime
from pathlib import Path

# Set up logging
log_path = "data/PAR/corpus_extraction.log"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    handlers=[
        logging.FileHandler(log_path),
        logging.StreamHandler()  # Also print to console
    ]
)
logger = logging.getLogger(__name__)

# Log start time
logger.info("Starting corpus extraction process")

# Read and combine unique corpus IDs from all qrels files
logger.info("Reading qrels files...")

train_50k = pd.read_csv("data/PAR/80k_qrels_train.tsv", sep="\t")
dev = pd.read_csv("data/PAR/10k_qrels_dev.tsv", sep="\t")
test = pd.read_csv("data/PAR/10k_qrels_test.tsv", sep="\t")

# Get unique corpus IDs from each file
train_ids = set(train_50k['corpus-id'].unique())
dev_ids = set(dev['corpus-id'].unique())
test_ids = set(test['corpus-id'].unique())

logger.info(f"Found {len(train_ids)} unique documents in train")
logger.info(f"Found {len(dev_ids)} unique documents in dev")
logger.info(f"Found {len(test_ids)} unique documents in test")

# Combine all unique IDs
all_ids = train_ids.union(dev_ids, test_ids)
logger.info(f"Total unique documents needed: {len(all_ids)}")

# Convert IDs to strings for matching (in case they're numeric)
all_ids = {str(id) for id in all_ids}

# Read and filter corpus.jsonl
logger.info("Reading corpus.jsonl and extracting required documents...")

output_docs = []
total_docs = 0
matched_docs = 0

with open("data/PAR/corpus.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        total_docs += 1
        if total_docs % 1000000 == 0:  # Log progress every million documents
            logger.info(f"Processed {total_docs:,} documents...")
        
        if line.strip():
            doc = json.loads(line)
            if str(doc['_id']) in all_ids:
                output_docs.append(doc)
                matched_docs += 1

logger.info(f"Finished processing {total_docs:,} documents")
logger.info(f"Found {matched_docs:,} matching documents")

# Save filtered corpus
output_path = "data/PAR/sampled_corpus.jsonl"
logger.info(f"Saving matched documents to {output_path}")

with open(output_path, 'w', encoding='utf-8') as f:
    for doc in output_docs:
        f.write(json.dumps(doc, ensure_ascii=False) + '\n')

logger.info("✅ Extraction complete!")
logger.info(f"Full log saved to {log_path}")

# Print summary statistics
print("\nSummary:")
print(f"Total documents processed: {total_docs:,}")
print(f"Documents in train_50k: {len(train_ids):,}")
print(f"Documents in dev: {len(dev_ids):,}")
print(f"Documents in test: {len(test_ids):,}")
print(f"Total unique documents needed: {len(all_ids):,}")
print(f"Documents extracted and saved: {matched_docs:,}")
print(f"\nDetailed log saved to: {log_path}")

2025-11-01 14:37:23,469 - Starting corpus extraction process
2025-11-01 14:37:23,471 - Reading qrels files...
2025-11-01 14:37:23,586 - Found 80000 unique documents in train
2025-11-01 14:37:23,586 - Found 10000 unique documents in dev
2025-11-01 14:37:23,587 - Found 10000 unique documents in test
2025-11-01 14:37:23,600 - Total unique documents needed: 98687
2025-11-01 14:37:23,628 - Reading corpus.jsonl and extracting required documents...
2025-11-01 14:37:29,393 - Processed 1,000,000 documents...
2025-11-01 14:37:34,872 - Processed 2,000,000 documents...
2025-11-01 14:37:40,363 - Processed 3,000,000 documents...
2025-11-01 14:37:45,830 - Processed 4,000,000 documents...
2025-11-01 14:37:51,317 - Processed 5,000,000 documents...
2025-11-01 14:37:56,778 - Processed 6,000,000 documents...
2025-11-01 14:38:02,278 - Processed 7,000,000 documents...
2025-11-01 14:38:07,851 - Processed 8,000,000 documents...
2025-11-01 14:38:13,314 - Processed 9,000,000 documents...
2025-11-01 14:38:18,630


Summary:
Total documents processed: 11,713,201
Documents in train_50k: 80,000
Documents in dev: 10,000
Documents in test: 10,000
Total unique documents needed: 98,687
Documents extracted and saved: 98,687

Detailed log saved to: data/PAR/corpus_extraction.log


In [49]:
#!/usr/bin/env python3
import json
import random
from pathlib import Path
import logging

# Config
SAMPLED_CORPUS = Path("data/PAR/sampled_corpus.jsonl")       # existing corpus (98,687)
IRRELEVANT = Path("data/PAR/irrelevant.jsonl")               # large irrelevant pool
OUTPUT = Path("data/PAR/sampled_corpus_200k.jsonl")          # new combined output
ADDED_IDS = Path("data/PAR/added_irrelevant_ids.txt")
LOG_PATH = Path("data/PAR/irrelevant_sampling.log")
TARGET_TOTAL = 200_000
RANDOM_SEED = 42

# Logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.FileHandler(LOG_PATH), logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

def load_existing_docs(path):
    docs = []
    ids = set()
    logger.info(f"Loading existing sampled corpus from {path}")
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            doc = json.loads(line)
            docs.append(doc)
            ids.add(str(doc["_id"]))
    logger.info(f"Loaded {len(docs):,} documents; unique IDs: {len(ids):,}")
    return docs, ids

def reservoir_sample_irrelevant(irrelevant_path, exclude_ids, k, seed=None):
    """
    Reservoir-sample k documents from irrelevant_path while skipping docs whose id is in exclude_ids.
    Returns list of sampled doc objects (length <= k if not enough candidates).
    """
    if k <= 0:
        return []
    if seed is not None:
        random.seed(seed)

    reservoir = []
    count_candidates = 0
    logger.info(f"Starting reservoir sampling from {irrelevant_path} for k={k} (excluding {len(exclude_ids):,} ids)")
    with irrelevant_path.open("r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                doc = json.loads(line)
            except Exception as e:
                logger.warning(f"Skipping invalid JSON at line {line_no}: {e}")
                continue
            doc_id = str(doc.get("_id") or doc.get("id") or "")
            if not doc_id or doc_id in exclude_ids:
                continue
            count_candidates += 1
            if len(reservoir) < k:
                reservoir.append(doc)
            else:
                # replace with decreasing probability
                j = random.randint(0, count_candidates - 1)
                if j < k:
                    reservoir[j] = doc
            # optional progress log
            if count_candidates % 100000 == 0:
                logger.info(f"Seen {count_candidates:,} eligible irrelevant docs so far (processed {line_no:,} lines)")
    logger.info(f"Reservoir sampling finished: seen {count_candidates:,} eligible candidates, sampled {len(reservoir):,}")
    return reservoir

def main():
    # 1) load existing sampled corpus
    existing_docs, existing_ids = load_existing_docs(SAMPLED_CORPUS)
    current_count = len(existing_docs)
    need = max(0, TARGET_TOTAL - current_count)
    logger.info(f"Current sampled corpus size: {current_count:,}. Need {need:,} more documents to reach {TARGET_TOTAL:,}.")

    if need == 0:
        logger.info("Target already reached; copying original file to output.")
        # just copy
        with OUTPUT.open("w", encoding="utf-8") as out_f, SAMPLED_CORPUS.open("r", encoding="utf-8") as in_f:
            for line in in_f:
                out_f.write(line)
        return

    # 2) reservoir sample from irrelevant.jsonl
    sampled_new = reservoir_sample_irrelevant(IRRELEVANT, existing_ids, need, seed=RANDOM_SEED)

    # 3) merge and write output
    total_written = 0
    logger.info(f"Writing combined file to {OUTPUT}")
    with OUTPUT.open("w", encoding="utf-8") as out_f:
        for doc in existing_docs:
            out_f.write(json.dumps(doc, ensure_ascii=False) + "\n")
            total_written += 1
        for doc in sampled_new:
            out_f.write(json.dumps(doc, ensure_ascii=False) + "\n")
            total_written += 1

    logger.info(f"Wrote {total_written:,} documents to {OUTPUT} (existing: {len(existing_docs):,}, added: {len(sampled_new):,})")

    # 4) save added IDs
    with ADDED_IDS.open("w", encoding="utf-8") as idf:
        for doc in sampled_new:
            idf.write(str(doc["_id"]) + "\n")
    logger.info(f"Saved added IDs ({len(sampled_new):,}) to {ADDED_IDS}")

    # 5) final summary
    print()
    print(f"Existing docs: {len(existing_docs):,}")
    print(f"Added docs: {len(sampled_new):,}")
    print(f"Final total written: {total_written:,}")
    print(f"Output file: {OUTPUT}")
    print(f"Log: {LOG_PATH}")
    logger.info("Done.")

if __name__ == "__main__":
    main()

2025-11-01 14:50:45,085 - Loading existing sampled corpus from data/PAR/sampled_corpus.jsonl
2025-11-01 14:50:45,677 - Loaded 98,687 documents; unique IDs: 98,687
2025-11-01 14:50:45,677 - Current sampled corpus size: 98,687. Need 101,313 more documents to reach 200,000.
2025-11-01 14:50:45,678 - Starting reservoir sampling from data/PAR/irrelevant.jsonl for k=101313 (excluding 98,687 ids)
2025-11-01 14:50:46,311 - Seen 100,000 eligible irrelevant docs so far (processed 100,000 lines)
2025-11-01 14:50:47,106 - Seen 200,000 eligible irrelevant docs so far (processed 200,000 lines)
2025-11-01 14:50:47,859 - Seen 300,000 eligible irrelevant docs so far (processed 300,000 lines)
2025-11-01 14:50:48,608 - Seen 400,000 eligible irrelevant docs so far (processed 400,000 lines)
2025-11-01 14:50:49,285 - Seen 500,000 eligible irrelevant docs so far (processed 500,000 lines)
2025-11-01 14:50:50,043 - Seen 600,000 eligible irrelevant docs so far (processed 600,000 lines)
2025-11-01 14:50:50,736 -


Existing docs: 98,687
Added docs: 101,313
Final total written: 200,000
Output file: data/PAR/sampled_corpus_200k.jsonl
Log: data/PAR/irrelevant_sampling.log


In [50]:
with open("data/PAR/sampled_corpus_200k.jsonl", 'r') as data:
    lines = data.readlines()
    print(f"Total documents in new sampled corpus: {len(lines)}")

Total documents in new sampled corpus: 200000


---

# Draft

In [40]:
# Load the sampled 50k qrels and count per query how many 'score' == 2
import pandas as pd

train_50k = pd.read_csv("data/PAR/qrels_dev.tsv", sep="\t")

# Filter rows where score == 2, then group by query-id and count
score2_counts = train_50k[train_50k['score'] == 2].groupby('query-id').size().rename('count_score_2')

# Show top rows
print(score2_counts.head())

# Summary stats
print(f"Total queries in TEST: {train_50k['query-id'].nunique()}")
print(f"Total articles: {train_50k['corpus-id'].nunique()}")
print(f"Total articles with score==2: {train_50k[train_50k['score'] == 2].count()}")
print(f"Queries with at least one score==2: {score2_counts.shape[0]}")
print(f"Total rows with score==2: {score2_counts.sum()}")

# Save counts to file
# score2_counts.to_csv('data/PAR/train_50k_score2_counts.tsv', sep='\t', header=True)
# print("Saved per-query score==2 counts to data/PAR/train_50k_score2_counts.tsv")

query-id
1208875-1    1
1208875-2    1
1262727-1    2
1373662-1    1
1373662-2    1
Name: count_score_2, dtype: int64
Total queries in TEST: 5900
Total articles: 55265
Total articles with score==2: query-id     2800
corpus-id    2800
score        2800
dtype: int64
Queries with at least one score==2: 2109
Total rows with score==2: 2800


In [42]:
# Load the sampled 50k qrels and count per query how many 'score' == 2
import pandas as pd

train_50k = pd.read_csv("data/PAR/qrels_test.tsv", sep="\t")

# Filter rows where score == 2, then group by query-id and count
score2_counts = train_50k[train_50k['score'] == 2].groupby('query-id').size().rename('count_score_2')

# Show top rows
print(score2_counts.head())

# Summary stats
print(f"Total queries in TEST: {train_50k['query-id'].nunique()}")
print(f"Total articles: {train_50k['corpus-id'].nunique()}")
print(f"Total articles with score==2: {train_50k[train_50k['score'] == 2].count()}")
print(f"Queries with at least one score==2: {score2_counts.shape[0]}")
print(f"Total rows with score==2: {score2_counts.sum()}")
print(len(train_50k))

# Save counts to file
# score2_counts.to_csv('data/PAR/train_50k_score2_counts.tsv', sep='\t', header=True)
# print("Saved per-query score==2 counts to data/PAR/train_50k_score2_counts.tsv")

query-id
1181818-1    1
1192792-1    1
1208925-1    1
1386678-1    1
1403799-1    1
Name: count_score_2, dtype: int64
Total queries in TEST: 5937
Total articles: 56304
Total articles with score==2: query-id     3088
corpus-id    3088
score        3088
dtype: int64
Queries with at least one score==2: 2150
Total rows with score==2: 3088
77147


In [47]:
# Load the sampled 50k qrels and count per query how many 'score' == 2
import pandas as pd

train_50k = pd.read_csv("data/PAR/qrels_train.tsv", sep="\t")

# Filter rows where score == 2, then group by query-id and count
score2_counts = train_50k[train_50k['score'] == 2].groupby('query-id').size().rename('count_score_2')

# Show top rows
print(score2_counts.head())

# Summary stats
print(f"Total queries in sample: {train_50k['query-id'].nunique()}")
print(f"Total articles with score==2: {train_50k[train_50k['score'] == 2].count()}")
print(f"Queries with at least one score==2: {score2_counts.shape[0]}")
print(f"Total rows with score==2: {score2_counts.sum()}")
print(f"Total articles:{train_50k['corpus-id'].shape[0]}")

# Save counts to file
# score2_counts.to_csv('data/PAR/train_50k_score2_counts.tsv', sep='\t', header=True)
# print("Saved per-query score==2 counts to data/PAR/train_50k_score2_counts.tsv")

query-id
1065068-1    1
1079861-1    1
1079877-1    1
1079877-2    1
1079877-3    1
Name: count_score_2, dtype: int64
Total queries in sample: 154518
Total articles with score==2: query-id     74886
corpus-id    74886
score        74886
dtype: int64
Queries with at least one score==2: 55561
Total rows with score==2: 74886
Total articles:1978118


---

In [32]:
# Load the sampled 50k qrels and count per query how many 'score' == 2
import pandas as pd

train_50k = pd.read_csv("data/PAR/50k_qrels_train.tsv", sep="\t")

# Filter rows where score == 2, then group by query-id and count
score2_counts = train_50k[train_50k['score'] == 2].groupby('query-id').size().rename('count_score_2')

# Show top rows
print(score2_counts.head())

# Summary stats
print(f"Total queries in sample: {train_50k['query-id'].nunique()}")
print(f"Total articles with score==2: {train_50k[train_50k['score'] == 2].count()}")
print(f"Queries with at least one score==2: {score2_counts.shape[0]}")
print(f"Total rows with score==2: {score2_counts.sum()}")

# Save counts to file
# score2_counts.to_csv('data/PAR/train_50k_score2_counts.tsv', sep='\t', header=True)
# print("Saved per-query score==2 counts to data/PAR/train_50k_score2_counts.tsv")

query-id
1087841-1    1
1087841-2    1
1087841-3    1
1087841-4    1
1087841-5    1
Name: count_score_2, dtype: int64
Total queries in sample: 72381
Total articles with score==2: query-id     4286
corpus-id    4286
score        4286
dtype: int64
Queries with at least one score==2: 4197
Total rows with score==2: 4286


In [28]:
train_50k[train_50k['score'] == 2].count()

query-id     74886
corpus-id    74886
score        74886
dtype: int64