In [327]:
import json

def load_corpus(file_path: str) -> dict[str, dict]:
    """
    Load corpus data from a JSONL file (one JSON object per line).

    Each object must carry an '_id' key; that key is removed from the record
    and used as the dictionary key. Blank lines are ignored.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.

    Returns:
        Dictionary mapping document IDs to the remaining document fields.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no '_id' field.
    """
    corpus = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A trailing or stray empty line must not abort the whole load.
            if not line.strip():
                continue
            doc = json.loads(line)
            # Named doc_id (not `id`) so the builtin is not shadowed.
            doc_id = doc.pop('_id')
            corpus[doc_id] = doc
    return corpus

def load_queries(file_path: str) -> dict[str, dict]:
    """
    Load query data from a JSONL file (one JSON object per line).

    Each object must carry an '_id' key; that key is removed from the record
    and used as the dictionary key. Blank lines are ignored.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.

    Returns:
        Dictionary mapping query IDs to the remaining query fields.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no '_id' field.
    """
    queries = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A trailing or stray empty line must not abort the whole load.
            if not line.strip():
                continue
            query = json.loads(line)
            # Named query_id (not `id`) so the builtin is not shadowed.
            query_id = query.pop('_id')
            queries[query_id] = query
    return queries
    

def load_qrels(file_path: str) -> dict[str, dict[str, int]]:
    """
    Load relevance judgments from a tab-separated file.

    The first line is treated as a header and skipped. Every following
    non-blank line must hold exactly three tab-separated fields:
    query ID, document ID and an integer score.

    Args:
        file_path: Path to the UTF-8 encoded TSV file.

    Returns:
        Dictionary mapping query IDs to {document ID: integer score}.

    Raises:
        ValueError: if a non-blank row does not have three fields or the
            score is not an integer.
    """
    qrels = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        next(f, None)  # Skip header line (tolerates a completely empty file)
        for line in f:
            row = line.strip()
            # Blank / trailing lines would otherwise fail the 3-way unpack.
            if not row:
                continue
            query_id, doc_id, score = row.split('\t')
            qrels.setdefault(query_id, {})[doc_id] = int(score)
    return qrels

def load_test(file_path: str) -> dict[str, dict[str, str]]:
    """
    Load the (unlabelled) test candidate pairs from a tab-separated file.

    The first line is treated as a header and skipped. Each following
    non-blank line must start with a query ID and a document ID; any further
    columns (e.g. a score placeholder) are ignored.

    Args:
        file_path: Path to the UTF-8 encoded TSV file.

    Returns:
        Dictionary mapping query IDs to {document ID: "#"}, where "#" is a
        placeholder standing in for the unknown relevance label.

    Raises:
        ValueError: if a non-blank row has fewer than two fields.
    """
    test_set = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        next(f, None)  # Skip header line (tolerates a completely empty file)
        for line in f:
            row = line.strip()
            if not row:
                continue
            # strip() removes a trailing tab, so a row whose score column is
            # empty yields only two fields; accept that as well as 3+ fields.
            query_id, doc_id, *_ = row.split('\t')
            test_set.setdefault(query_id, {})[doc_id] = "#"
    return test_set

In [None]:
# Load the dataset: documents and queries come from JSONL files, the
# validation relevance judgments from a TSV file. Paths are relative to the
# notebook's working directory.
print("Loading dataset...")
corpus = load_corpus('data/corpus.jsonl')
queries = load_queries('data/queries.jsonl')
qrels_valid = load_qrels('data/valid.tsv')


# Size summary; the last count is the number of distinct queries that have at
# least one judgment, not the number of judgment rows.
print(f"Loaded {len(corpus)} documents in corpus")
print(f"Loaded {len(queries)} queries")
print(f"Loaded relevance for {len(qrels_valid)} queries (dataset)")

In [None]:
def preprocessing(corpus, queries):
    """
    Flatten document and query records into plain strings.

    Args:
        corpus: Mapping of document ID to a record; its 'title' and 'text'
            fields are used, each defaulting to '' when absent.
        queries: Mapping of query ID to a record; its 'text' field is used,
            defaulting to '' when absent.

    Returns:
        (documents, queries): two dictionaries keyed like the inputs.
        Documents become "<title>\\n<text>", queries become "<text>".
    """
    flat_docs = {
        doc_id: f"{record.get('title', '')}\n{record.get('text', '')}"
        for doc_id, record in corpus.items()
    }
    flat_queries = {
        query_id: f"{record.get('text', '')}"
        for query_id, record in queries.items()
    }
    return flat_docs, flat_queries

# Plain-string views of the corpus and queries, keyed by the same IDs; these
# are what the vectorizer and the sentence encoder consume below.
text_corpus, text_queries = preprocessing(corpus, queries)

In [None]:
# Per-query summary of the validation judgments. The positive count is the sum
# of the scores, so it equals the number of relevant documents only if scores
# are 0/1 (assumed here -- TODO confirm against the dataset).
# The loop variable is not called `id`: at notebook scope that would rebind the
# builtin for the rest of the kernel session.
for qid in qrels_valid.keys():
    num_positive_match = sum(qrels_valid[qid].values())
    num_negative_match = len(qrels_valid[qid]) - num_positive_match
    print(f"Query ID: {qid}, positive Docs: {num_positive_match}, negative Docs: {num_negative_match}")

## Encodage des documents

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def filter_corpus(corpus: dict[str, dict], filter: str = 'title') -> dict[str, object]:
    """
    Project every document record onto a single field.

    Args:
        corpus: Mapping of document ID to document record.
        filter: Name of the field to keep for each document (default: 'title').
            The parameter name shadows the builtin `filter` inside this
            function; it is kept so existing keyword calls still work.

    Returns:
        Mapping of document ID to the value stored under `filter` in that
        document (not to a nested record).

    Raises:
        KeyError: if any document lacks the requested field.
    """
    return {doc_id: doc[filter] for doc_id, doc in corpus.items()}

# Fit a bag-of-words vocabulary on the flattened documents and build the
# document-term count matrix. Row order follows the iteration order of
# text_corpus, which later cells rely on to map row indices back to IDs.
count_vectorizer = CountVectorizer()
X_corpus = count_vectorizer.fit_transform(text_corpus.values())
X_corpus.shape

In [None]:
import pandas as pd
from scipy.sparse import find

def print_feats(v, features, top_n = 30):
    """
    Tabulate the highest-valued non-zero entries of a single-row vector.

    Despite the name nothing is printed; a DataFrame is returned so the
    notebook can render it.

    Args:
        v: Row vector (sparse or dense matrix) accepted by scipy.sparse.find.
        features: Sequence mapping a column index to its feature name.
        top_n: Maximum number of rows in the result.

    Returns:
        DataFrame with columns "word" and "value", sorted by value in
        descending order (ties keep their column order).
    """
    # find() yields (row indices, column indices, values) of non-zero entries;
    # the row indices are irrelevant for a single-row input.
    _, col_ids, weights = find(v)
    ranked = sorted(zip(col_ids, weights), key=lambda pair: pair[1], reverse=True)[:top_n]
    words = [features[col] for col, _ in ranked]
    scores = [weight for _, weight in ranked]
    return pd.DataFrame({"word": words, "value": scores})

# Most frequent vocabulary terms in the first document (row 0 of X_corpus).
print_feats(X_corpus[0], count_vectorizer.get_feature_names_out())

In [None]:
import matplotlib.pyplot as plt

# Summing the count matrix over axis 0 (documents) gives corpus-wide term
# totals; keep the 50 largest and show them as a bar chart.
df = print_feats(X_corpus.sum(axis=0), count_vectorizer.get_feature_names_out(), top_n=50)
plt.figure(figsize=(12, 6))
plt.bar(df["word"], df["value"])
plt.xticks(rotation=90)  # vertical labels so 50 words stay readable
plt.xlabel("Word")
plt.ylabel("Value")
plt.title("Top Word Values")
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Pick two document rows at random and compare their bag-of-words vectors.
# np.random.randint excludes its upper bound, so the bound is the row count
# itself; using `shape[0] - 1` would make the last document unreachable.
# NOTE(review): no seed is set, so the pair changes on every run.
ids = np.random.randint(0, X_corpus.shape[0], size=2)
similary = cosine_similarity(X_corpus[ids[0]], X_corpus[ids[1]])

# Row indices map back to texts through the iteration order of text_corpus.
print(f"Document 1:\n {list(text_corpus.values())[ids[0].item()]}")
print(f"Document 2:\n {list(text_corpus.values())[ids[1].item()]}")
print(f"similary: {similary.item()*100:.3f}%")

In [None]:
# Project the queries into the vocabulary fitted on the corpus (transform
# only, no refit), so query and document vectors share the same columns.
X_queries = count_vectorizer.transform(text_queries.values())
X_queries.shape

In [None]:
def query_document_similarity(query_vector, document_vectors, top_k: int = 10):
    """
    Rank documents by cosine similarity to a single query.

    Args:
        query_vector: One-row matrix holding the query, in any form accepted
            by `cosine_similarity`. Only a single query is supported.
        document_vectors: Matrix with one row per document, same columns as
            the query.
        top_k: Maximum number of documents to return.

    Returns:
        (top_indices, top_scores): two 1-D arrays of equal length
        (at most `top_k`), ordered from most to least similar. They stay 1-D
        even when only one result is returned, so callers can always iterate.
    """
    # Flatten the (1, n_documents) similarity matrix to a 1-D score vector.
    # reshape(-1) is used instead of squeeze() because squeeze() collapses a
    # single-document result to a 0-d array.
    similarities = np.asarray(cosine_similarity(query_vector, document_vectors)).reshape(-1)

    # Keep the top_k best scores and their indices (descending order).
    top_indices = np.argsort(-similarities)[:top_k]
    top_scores = similarities[top_indices]
    return top_indices, top_scores

In [None]:
import random

def simple_browser(query, corpus, vectorizer, top_k=10):
    """
    Retrieve the documents most similar to a query using a text vectorizer.

    Note that the vectorizer is (re)fitted on `corpus` on every call.

    Args:
        query: Single-entry mapping of query ID to query text.
        corpus: Mapping of document ID to document text.
        vectorizer: Object exposing `fit_transform` and `transform`.
        top_k: Maximum number of documents to return.

    Returns:
        Dictionary mapping document ID to similarity score (as float), in
        decreasing order of similarity.
    """
    doc_matrix = vectorizer.fit_transform(corpus.values())
    query_matrix = vectorizer.transform(query.values())
    best_rows, best_scores = query_document_similarity(query_matrix, doc_matrix, top_k=top_k)

    # Matrix rows follow the iteration order of `corpus`, so a row index can
    # be translated back to its document ID positionally.
    doc_ids = list(corpus.keys())
    return {doc_ids[row]: float(score) for row, score in zip(best_rows, best_scores)}

# Try the bag-of-words retriever on one randomly chosen query.
# NOTE(review): no seed is set, so the chosen query differs between runs.
random_id = random.choice(list(text_queries.keys()))
text_query = {random_id: text_queries[random_id]}

results = simple_browser(text_query, text_corpus, count_vectorizer, top_k=10)
results

In [None]:
import numpy as np

def save_embeddings(encoder, texts: dict[str, str], file_path: str, device='cpu'):
    """
    Encode a collection of texts and store IDs and vectors in one .npz file.

    Args:
        encoder: Object whose `encode` accepts a list of strings together with
            the `device`, `convert_to_numpy` and `show_progress_bar` keywords.
        texts: Mapping of ID to text; IDs and vectors are saved in this order.
        file_path: Destination handed to `np.savez`.
        device: Device name forwarded to the encoder.

    The archive holds two arrays: 'ids' and 'vectors'.
    """
    keys, sentences = list(texts.keys()), list(texts.values())
    embedded = encoder.encode(sentences, device=device, convert_to_numpy=True, show_progress_bar=True)
    np.savez(file_path, ids=np.array(keys), vectors=embedded)

def load_embeddings(file_path: str):
    """
    Load embeddings from an .npz archive holding 'ids' and 'vectors' arrays.

    Args:
        file_path: Path to the .npz archive.

    Returns:
        Dictionary mapping each ID (converted to a plain `str`) to its vector,
        pairing the two arrays positionally.
    """
    # np.load keeps the archive's file handle open until the returned object
    # is closed; the context manager releases it as soon as the arrays have
    # been read into the dictionary.
    with np.load(file_path) as data:
        return {str(doc_id): vector for doc_id, vector in zip(data['ids'], data['vectors'])}

In [None]:
from sentence_transformers import SentenceTransformer

# NOTE(review): 'mps' presumably selects the Apple-GPU backend; this cell will
# likely fail on machines without it -- confirm and consider 'cpu' as fallback.
device = 'mps'
encoder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Encode every document and query once and persist the vectors, so later cells
# can reload them from disk. This cell recomputes everything each time it runs.
save_embeddings(encoder, text_corpus, 'data/corpus_embeddings.npz', device=device)
save_embeddings(encoder, text_queries, 'data/queries_embeddings.npz', device=device)

In [None]:
from sentence_transformers import SentenceTransformer

# The encoder is instantiated again here so this cell can run without the
# encoding cell; it is needed later for its `similarity` method.
# NOTE(review): 'mps' presumably selects the Apple-GPU backend -- confirm it is
# available on the target machine.
device = 'mps'
encoder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Reload the vectors written by save_embeddings as {id: vector} dictionaries.
corpus_embeddings = load_embeddings('data/corpus_embeddings.npz')
queries_embeddings = load_embeddings('data/queries_embeddings.npz')

In [294]:
def softmax(x, T=1.0):
    """
    Temperature-scaled softmax over all elements of `x`.

    Args:
        x: Array-like of scores.
        T: Temperature; scores are divided by it before exponentiation.

    Returns:
        Float array of the same shape as `x` whose entries sum to 1.
    """
    scaled = np.asarray(x, dtype=float) / T
    # Subtracting the maximum leaves the result unchanged but keeps exp()
    # from overflowing on large scores.
    weights = np.exp(scaled - scaled.max())
    return weights / np.sum(weights)

def browser(
    query_embedding: dict[str, np.ndarray],
    corpus_embeddings: dict[str, np.ndarray],
    encoder,
    ppr: dict[str, float] = None,
    beta: float = 0.0,
    temperature: float = 2.0,
    top_k: int = 10,
):
    """
    Rank candidate documents for one query, optionally blending in
    personalized-PageRank scores.

    Args:
        query_embedding: Single-entry mapping of query ID to its vector; only
            the first value is used.
        corpus_embeddings: Mapping of candidate document ID to its vector.
        encoder: Object whose `similarity(query, corpus)` result supports
            `.squeeze().numpy()` (presumably a torch tensor -- confirm).
        ppr: Optional mapping of document ID to a PageRank score; documents
            missing from it count as 0.0. When None, raw similarities are
            returned.
        beta: Weight of the PPR term; the similarity term gets `1 - beta`.
        temperature: Softmax temperature applied to both score vectors before
            blending (only used when `ppr` is given).
        top_k: Maximum number of documents to return.

    Returns:
        Dictionary mapping document ID to its final score, ordered from best
        to worst. Empty when there are no candidates.
    """
    corpus_ids = list(corpus_embeddings.keys())
    if not corpus_ids:
        # Nothing to rank; avoid calling the encoder on an empty array.
        return {}

    query_vector = np.array(list(query_embedding.values())[0])
    corpus_vectors = np.array([corpus_embeddings[cid] for cid in corpus_ids])

    similarities = encoder.similarity(query_vector, corpus_vectors)
    # squeeze() collapses a single-candidate result to a 0-d array, which can
    # not be indexed below; atleast_1d restores the 1-D score vector.
    similarities = np.atleast_1d(similarities.squeeze().numpy())

    if ppr is not None:
        ppr_scores = np.array([ppr.get(cid, 0.0) for cid in corpus_ids], dtype=float)
        # Both signals are turned into distributions over the candidates so
        # they live on the same scale before the convex combination.
        similarities = softmax(similarities, T=temperature)
        ppr_scores = softmax(ppr_scores, T=temperature)
        final_scores = similarities * (1 - beta) + ppr_scores * beta
    else:
        final_scores = similarities

    top_indices = np.argsort(-final_scores)[:top_k]
    results = {corpus_ids[idx]: final_scores[idx].item() for idx in top_indices}
    return results

# Try the embedding-based retriever on one randomly chosen query, ranking the
# whole corpus by raw similarity (ppr=None disables the PageRank blend).
query_id = random.sample(list(queries_embeddings.keys()), 1)[0]
query_embedding = {query_id: queries_embeddings[query_id]}

results = browser(query_embedding, corpus_embeddings, encoder, ppr=None, top_k=10)
results

{'31900b62fabf7da87573e93e473dd72cc68f24fa': 0.6516915559768677,
 '5ee0e0656af301131a6a4a945113d91cf60575ae': 0.637022078037262,
 'da7ce9b241b076da2b24bd44ea463ba1f0074ac8': 0.6248031854629517,
 'dbcade6e8f05655e55e0f7b9a58088855814cab0': 0.6099755764007568,
 '540c90efdf4c2a5c1f63e69cdd56fcbc79291f94': 0.6024110317230225,
 '1f979f28a267522126acd8569ec2e3b964a7f656': 0.5241543650627136,
 'a7d70bdfd81c27203eab5fc331602494c0ec64f5': 0.5157453417778015,
 '559e2ae733f4d231e2739dbc6d8d528af6feddf3': 0.5080769658088684,
 '655ba8fbbde231371203fa6b389bf6084660bf3e': 0.5029830932617188,
 '8f64de9c6e3c52222896280c9fb19ff6c0a504ea': 0.5005195736885071}

In [295]:
import networkx as nx

def build_citation_graph(queries, docs):
    """
    Build a directed citation graph over queries and corpus documents.

    Args:
        queries: Mapping of query ID to a record with a "metadata" dict.
        docs: Mapping of document ID to a record with a "metadata" dict.
            On an ID collision the entry from `docs` wins.

    Returns:
        A DiGraph with one node per ID (the metadata entries become node
        attributes) and an edge u -> v whenever u lists v under
        metadata["references"] and v is itself a known ID.
    """
    # Merge both collections into a single lookup table.
    papers = dict(queries)
    papers.update(docs)

    graph = nx.DiGraph()

    # First pass: register every paper as a node, so node order follows the
    # merged table regardless of which edges are added later.
    for paper_id, record in papers.items():
        graph.add_node(paper_id, **record["metadata"])

    # Second pass: add the citation edges.
    for citing_id, record in papers.items():
        for cited_id in record["metadata"].get("references", []):
            # Skip references that point outside the known papers.
            if cited_id not in papers:
                continue
            graph.add_edge(citing_id, cited_id)  # citing_id --> cited_id

    return graph

In [296]:
# Build the citation graph over queries and corpus documents together and
# report its size (node count = distinct IDs, edge count = resolved citations).
G = build_citation_graph(queries, corpus)
print("Nombre de documents:", G.number_of_nodes())
print("Nombre de liens de citation:", G.number_of_edges())

Nombre de documents: 26657
Nombre de liens de citation: 24637


In [297]:
# Ten most-cited papers: in-degree counts incoming citation edges.
sorted(G.in_degree(), key=lambda x: x[1], reverse=True)[:10]

[('272216c1f097706721096669d85b2843c23fa77d', 182),
 ('2c03df8b48bf3fa39054345bafabfeff15bfd11d', 175),
 ('061356704ec86334dbbc073985375fe13cd39088', 167),
 ('0b3cfbf79d50dae4a16584533227bb728e3522aa', 162),
 ('cbcd9f32b526397f88d18163875d04255e72137f', 133),
 ('1e56ed3d2c855f848ffd91baa90f661772a279e1', 117),
 ('39dba6f22d72853561a4ed684be265e179a39e4f', 113),
 ('14318685b5959b51d0f1e3db34643eb2855dc6d9', 110),
 ('0825788b9b5a18e3dfea5b0af123b5e939a4f564', 107),
 ('722fcc35def20cfcca3ada76c8dd7a585d6de386', 106)]

In [337]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from tqdm.notebook import tqdm

def get_all_ppr(G, queries, alpha=0.85):
    """
    Compute one personalized PageRank vector per query.

    Args:
        G: Graph handed to `nx.pagerank`; every query ID is used as a
            personalization key.
        queries: Mapping whose keys are the query IDs (values are unused).
        alpha: Damping factor forwarded to `nx.pagerank`.

    Returns:
        Dictionary mapping each query ID to the result of `nx.pagerank` run
        with all personalization weight on that query's node.
    """
    return {
        seed_id: nx.pagerank(G, alpha=alpha, personalization={seed_id: 1.0})
        for seed_id in tqdm(queries.keys())
    }

def test_browser(
        browser,
        qrels_valid,
        queries_embeddings,
        corpus_embeddings,
        encoder,
        ppr_dict,
        beta=0.9,
        temperature=2.0,
):
    preds = []
    targets = []

    for query_id in tqdm(qrels_valid.keys()):
        q_embedding = {query_id: queries_embeddings[query_id]}
        c_embedding = {doc_id: corpus_embeddings[doc_id] for doc_id in qrels_valid[query_id].keys()}
        ppr = ppr_dict[query_id]

        results = browser(q_embedding, c_embedding, encoder, ppr, beta=beta, temperature=temperature, top_k=30)
        threshold = len(qrels_valid[query_id]) - 25
        results = {k: (1 if i < threshold else 0) for i, k in enumerate(results.keys())}

        preds.extend(list(dict(sorted(results.items())).values()))
        targets.extend(list(dict(sorted(qrels_valid[query_id].items())).values()))

    y_pred = np.array(preds)
    y_true = np.array(targets)

    metrics = {
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "auroc": roc_auc_score(y_true, y_pred),
    }
    return metrics


In [299]:
# One personalized PageRank vector per validation query. Only the keys of
# qrels_valid (the query IDs) are used; runs one PageRank per query.
ppr_dict = get_all_ppr(G, qrels_valid, alpha=0.9)

  0%|          | 0/700 [00:00<?, ?it/s]

In [338]:
# Validation run. With beta=0.99 the blended score is 99% PPR and 1% text
# similarity (see the weighting inside `browser`).
metrics = test_browser(
    browser, 
    qrels_valid,
    queries_embeddings,
    corpus_embeddings,
    encoder,
    ppr_dict,
    beta=0.99,
    temperature=3.0,
)
metrics

  0%|          | 0/700 [00:00<?, ?it/s]

{'precision': 0.9626086956521739,
 'recall': 0.9626086956521739,
 'f1_score': 0.9626086956521739,
 'auroc': 0.9776186335403727}

In [332]:
# Load the unlabelled test pairs and report how many candidate documents each
# query has. The loop variable is not called `id`: at notebook scope that
# would rebind the builtin for the rest of the kernel session.
test_queries = load_test('data/test_final.tsv')
for qid in test_queries.keys():
    length = len(test_queries[qid])
    print(f"Query ID: {qid}, length: {length}")


Query ID: 7b901e88e8a4afcc4c60c52833820156525f4aed, length: 30
Query ID: 8aba3628ad8c9cec11b2b518a96b883bda8b3cbb, length: 30
Query ID: 39d428fd8c6b73ed070921a856f03c2c5b5377ba, length: 30
Query ID: bcdd5670761de0087d2d2bb0388da697b0d4348c, length: 30
Query ID: 9c824df69c7c6fc350d2981bed00b6df6ffb33ad, length: 30
Query ID: fd4537b92ab9fa7c653e9e5b9c4f815914a498c0, length: 30
Query ID: 30c9a7660281ad8e4538ff9beb20282c74fac810, length: 30
Query ID: 66ef0f611b2b5b22dd3eb13a144c0e7d3286623a, length: 30
Query ID: 7a59595f1859b72761b34892be8b6dc43f71d01e, length: 30
Query ID: 46c87eae824a442323147a845e285167f283dd08, length: 30
Query ID: 5aadc803228b70c3cc6b31e332770d47d7fb1e6e, length: 30
Query ID: 8c872ecd87945e71fcd9fa1b6cb1133cfe805bf2, length: 29
Query ID: 9003fb79e7848ced3be975c3d87a9348a4b8d377, length: 29
Query ID: 8384094ce1b342da9eabd2ec939e9bb16ca7ff5c, length: 30
Query ID: 9759c425008506dac507ed26057febd9cab822b8, length: 30
Query ID: 23fa7b866a1b1fee7bb71c8b5a9235cca7120bbc, len

In [None]:
# One personalized PageRank vector per test query (same alpha as validation).
ppr_dict_test = get_all_ppr(G, test_queries, alpha=0.9)

In [339]:
def predict_browser(
    browser,
    test_set,
    queries_embeddings,
    corpus_embeddings,
    encoder,
    ppr_dict: dict[str, dict[str, float]] = None,
    beta: float = 0.0,
    temperature: float = 2.0,
    num_negatives: int = 25,
):
    """
    Predict a 0/1 relevance label for every candidate of every test query.

    For each query the candidates listed in `test_set` are ranked with
    `browser`; the best `len(candidates) - num_negatives` receive label 1,
    all others label 0.

    Args:
        browser: Ranking callable with the signature of `browser` above.
        test_set: Mapping query ID -> {document ID: anything}; only the keys
            are used.
        queries_embeddings: Mapping query ID -> vector.
        corpus_embeddings: Mapping document ID -> vector.
        encoder: Forwarded to `browser`.
        ppr_dict: Optional mapping query ID -> PPR scores. When None, `browser`
            is called with `ppr=None`.
        beta: Forwarded to `browser`.
        temperature: Forwarded to `browser`.
        num_negatives: Assumed number of non-relevant candidates per query
            (25 reproduces the previously hard-coded value).

    Returns:
        Mapping query ID -> {document ID: 0 or 1}, documents in ranked order.
    """
    preds = {}
    for query_id in tqdm(test_set.keys()):
        candidates = test_set[query_id]
        q_embedding = {query_id: queries_embeddings[query_id]}
        c_embedding = {doc_id: corpus_embeddings[doc_id] for doc_id in candidates.keys()}
        # The parameter defaults to None, so it must not be subscripted blindly.
        ppr = ppr_dict[query_id] if ppr_dict is not None else None

        # Rank ALL candidates so none is silently dropped from the output.
        ranked = browser(q_embedding, c_embedding, encoder, ppr, beta=beta,
                         temperature=temperature, top_k=len(c_embedding))
        threshold = max(len(candidates) - num_negatives, 0)
        preds[query_id] = {k: (1 if i < threshold else 0) for i, k in enumerate(ranked.keys())}
    return preds

In [345]:
def save_predictions(preds: dict[str, dict[str, int]], test_set: dict[str, dict[str, int]], file_path: str):
    """
    Write predictions as tab-separated rows, in the order of `test_set`.

    Args:
        preds: Mapping query ID -> {document ID: predicted label}.
        test_set: Mapping query ID -> {document ID: anything}; it defines
            which (query, document) pairs are written and in what order.
        file_path: Destination file (overwritten, UTF-8).

    The file starts with the header "query_id<TAB>corpus_id<TAB>score".
    Documents without a prediction are written with score 0.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        out.write("query_id\tcorpus_id\tscore\n")
        out.writelines(
            f"{query_id}\t{doc_id}\t{preds[query_id].get(doc_id, 0)}\n"
            for query_id, candidates in test_set.items()
            for doc_id in candidates.keys()
        )

In [346]:
# Label the test pairs with the same beta/temperature used on validation.
preds = predict_browser(
    browser,
    test_queries,
    queries_embeddings,
    corpus_embeddings,
    encoder,
    ppr_dict_test,
    beta=0.99,
    temperature=3.0,
)


# NOTE(review): save_predictions writes tab-separated rows although the file
# name ends in .csv -- confirm the extension expected by the consumer.
save_predictions(preds, test_queries, 'data/test_predictions.csv')

  0%|          | 0/300 [00:00<?, ?it/s]