In [1]:
import json

def load_hits_from_qrels_queries_corpus(qrels_file, queries_file, corpus_file=None):
    """Join a qrels file with its queries (and optionally a corpus) into per-query hit lists.

    Args:
        qrels_file: Path to a tab-separated file whose rows hold exactly three
            fields: query id, document id, score. Blank lines and lines that
            start with "query-id" (the header) are skipped.
        queries_file: Path handed to ``load_qids_to_queries``.
        corpus_file: Optional path handed to ``load_pids_to_passages``. When it
            is None no corpus is loaded and every hit's 'content' is None.

    Returns:
        A list with one dict per query id found in the qrels file:
        ``{'query': <query text>, 'hits': [<hit>, ...]}`` where each hit is
        ``{'qid', 'docid', 'score', 'content'}``. Queries are ordered by the
        integer left after removing "test"/"train"/"dev" from the qid; qids
        that do not reduce to an integer are placed after those, in string
        order. Hits are ordered by descending score (ties keep file order).

    Raises:
        KeyError: If a qid from the qrels is missing from the queries, or a
            docid is missing from the loaded corpus.
        ValueError: If a row does not have exactly three tab-separated fields
            or its score is not a valid float.
    """
    print(f"Loading qids from '{queries_file}'")
    queries = load_qids_to_queries(queries_file)

    # Only announce and load the corpus when one was actually supplied, so the
    # log never claims to be loading from 'None'.
    corpus = None
    if corpus_file is not None:
        print(f"Loading corpus from '{corpus_file}'")
        corpus = load_pids_to_passages(corpus_file)

    # Group the qrels rows by query id, attaching query text and passage content.
    results = {}
    with open(qrels_file, 'r', encoding='utf-8') as f:
        for line in f:
            row = line.strip()
            # Skip blank lines (e.g. a trailing newline) and the header row.
            if not row or row.startswith("query-id"):
                continue

            qid, docid, score = row.split('\t')
            score = float(score)

            # Create the query entry the first time this qid is seen.
            if qid not in results:
                results[qid] = {'query': queries[qid], 'hits': []}

            results[qid]['hits'].append({
                'qid': qid,
                'docid': docid,
                'score': score,
                'content': corpus[docid] if corpus is not None else None,
            })

    # Order the queries by qid and each query's hits by descending score.
    # reverse=True keeps sorted() stable, so equal scores stay in file order.
    rank_results = []
    for qid in sorted(results, key=_qid_sort_key):
        rank_results.append({
            'query': results[qid]['query'],
            'hits': sorted(results[qid]['hits'], key=lambda hit: hit['score'], reverse=True),
        })

    return rank_results


def _qid_sort_key(qid):
    """Sort key for qids: numeric ids first by value, every other id afterwards by string."""
    stripped = qid.replace("test", "").replace("train", "").replace("dev", "")
    try:
        # Empty third field keeps ties between equal numbers in insertion order.
        return (0, int(stripped), "")
    except ValueError:
        # Non-numeric ids no longer abort the whole load; they sort last.
        return (1, 0, qid)

def load_qids_to_queries(queries_file):
    """Load a JSON Lines queries file into a ``{query id: query text}`` dict.

    Args:
        queries_file: Path to a file with one JSON object per line; each
            object must provide the keys "_id" and "text".

    Returns:
        dict mapping each record's "_id" to its "text". If an id occurs more
        than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "_id" or "text".
    """
    queries = {}
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere.
    with open(queries_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line at end of file);
            # json.loads would otherwise raise on them.
            if not line.strip():
                continue
            record = json.loads(line)
            queries[record["_id"]] = record["text"]
    return queries

def load_pids_to_passages(corpus_file):
    """Load a JSON Lines corpus file into a ``{passage id: passage text}`` dict.

    Args:
        corpus_file: Path to a file with one JSON object per line; each object
            must provide "_id" and "text" and may provide "title".

    Returns:
        dict mapping each record's "_id" to its passage. The passage is
        ``title + "\\n" + text`` when the title is present and not just
        whitespace, otherwise it is the text alone.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "_id" or "text".
    """
    corpus = {}
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere.
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line at end of file);
            # json.loads would otherwise raise on them.
            if not line.strip():
                continue
            data = json.loads(line)

            title = data.get("title", "")
            text = data["text"]
            # Prefix the title only when it carries visible content.
            has_title = bool(title and title.strip())
            corpus[data["_id"]] = title + "\n" + text if has_title else text
    return corpus

def load_qid_to_pid_to_score(qrels_file):
    """Load a tab-separated qrels file into a nested ``{qid: {pid: score}}`` dict.

    Args:
        qrels_file: Path to a file whose rows hold exactly three tab-separated
            fields: query id, passage id, score. Blank lines and lines that
            start with "query-id" (the header) are skipped.

    Returns:
        dict mapping each query id to a dict of passage id -> float score. If
        a (qid, pid) pair occurs more than once, the last score wins.

    Raises:
        ValueError: If a row does not have exactly three tab-separated fields
            or its score is not a valid float.
    """
    qid_to_pid_to_score = {}
    with open(qrels_file, 'r', encoding='utf-8') as f:
        for line in f:
            row = line.strip()
            # Skip blank lines (e.g. a trailing newline) and the header row;
            # a blank line would otherwise break the three-way unpacking below.
            if not row or row.startswith("query-id"):
                continue

            qid, pid, score = row.split('\t')
            qid_to_pid_to_score.setdefault(qid, {})[pid] = float(score)
    return qid_to_pid_to_score


In [30]:

from torch.utils.data import Dataset

class RawTextPairDataset(Dataset):
    """Flat dataset of raw-text (query, passage) pairs, one item per qrels hit."""

    def __init__(self, qrels_path, queries_path, corpus_path):
        """Load the ranked hits and flatten them into a single list.

        Every hit dict produced by ``load_hits_from_qrels_queries_corpus`` gets
        its query text stored under 'query', then all hits are collected into
        ``self.hits`` in the order the loader returned them.
        """
        per_query = load_hits_from_qrels_queries_corpus(qrels_path, queries_path, corpus_path)

        flattened = []
        for entry in per_query:
            query_text = entry['query']
            for hit in entry['hits']:
                # Each hit carries its own reference to the query text.
                hit['query'] = query_text
                flattened.append(hit)

        self.hits = flattened

    def __len__(self):
        """Return the number of stored hits."""
        return len(self.hits)

    def __getitem__(self, idx):
        """Return the hit at ``idx`` as a dict with keys qid, query, pid, passage."""
        record = self.hits[idx]
        qid, query, pid, passage = (
            record[field] for field in ('qid', 'query', 'docid', 'content')
        )
        return {"qid": qid, "query": query, "pid": pid, "passage": passage}

    def collate_fn(self, batch):
        """Turn a list of samples into a dict of parallel lists (no tensor conversion)."""
        # Output key -> per-sample key, in the order the batch dict is built.
        columns = {
            "qids": 'qid',
            "queries": 'query',
            "pids": 'pid',
            "passages": 'passage',
        }
        return {
            batch_key: [sample[sample_key] for sample in batch]
            for batch_key, sample_key in columns.items()
        }

In [15]:
# Input files under ../data/nq, given relative to the notebook's working directory.
queries_path = "../data/nq/queries.jsonl"
corpus_path = "../data/nq/corpus.jsonl"
qrels_path = "../data/nq/qrels/test.tsv"

# Building the dataset reads all three files up front; the loader prints
# which queries and corpus files it is reading.
dataset = RawTextPairDataset(qrels_path, queries_path, corpus_path)

Loading qids from '../data/nq/queries.jsonl'
Loading corpus from '../data/nq/corpus.jsonl'


In [28]:
# Number of flattened hits, i.e. (query, passage) pairs, held by the dataset.
len(dataset)

4201

In [29]:
# Spot-check a single sample near the end of the dataset.
dataset[4199]

{'qid': 'test3450',
 'query': 'where does the great outdoors movie take place',
 'pid': 'doc117663',
 'passage': 'The Great Outdoors (film)\nChicagoan Chester "Chet" Ripley, his wife, Connie, and their two sons, Buckley "Buck" and Ben, are on vacation at a lake resort in Pechoggin, Wisconsin during the summer. All is going as planned until Connie\'s sister, Kate, Kate\'s investment broker husband, Roman Craig, and their twin daughters, Mara and Cara, crash the vacation.'}

In [25]:
from torch.utils.data import DataLoader
# Batch two samples at a time; the dataset's own collate_fn returns each field
# as a plain Python list rather than a tensor.
dataloader = DataLoader(dataset, batch_size=2, collate_fn=dataset.collate_fn)

In [27]:
# Print the first 10 batches as a sanity check, then stop iterating.
for i, batch in enumerate(dataloader):
    if i == 10:
        break
    
    print(batch)

{'qids': ['test0', 'test0'], 'queries': ['what is non controlling interest on balance sheet', 'what is non controlling interest on balance sheet'], 'pids': ['doc0', 'doc1'], 'passages': ["Minority interest\nIn accounting, minority interest (or non-controlling interest) is the portion of a subsidiary corporation's stock that is not owned by the parent corporation. The magnitude of the minority interest in the subsidiary company is generally less than 50% of outstanding shares, or the corporation would generally cease to be a subsidiary of the parent.[1]", 'Minority interest\nIt is, however, possible (such as through special voting rights) for a controlling interest requiring consolidation to be achieved without exceeding 50% ownership, depending on the accounting standards being employed. Minority interest belongs to other investors and is reported on the consolidated balance sheet of the owning company to reflect the claim on assets belonging to other, non-controlling shareholders. Als