In [None]:
!pip install datasets
!pip install --upgrade sentence-transformers
!pip install langchain_experimental

import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
from sentence_transformers import CrossEncoder
from torch.utils.data import DataLoader
import torch
from datasets import Dataset
from datasets import load_dataset
from sentence_transformers.util import mine_hard_negatives
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder.losses import BinaryCrossEntropyLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainer
from sentence_transformers.cross_encoder import CrossEncoderTrainingArguments
import os
from collections import defaultdict
from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [3]:
PATH_COLLECTION_DATA = '../X_Data/subtask4b_collection_data.pkl'
PATH_QUERY_TRAIN_DATA = '../X_Data/subtask4b_query_tweets_train.tsv' #MODIFY PATH
PATH_QUERY_DEV_DATA = '../X_Data/subtask4b_query_tweets_dev.tsv' #MODIFY PATH
PATH_QUERY_TRAIN_BM25 = '../X_Data/df_train_bm25_50.csv' #MODIFY PATH
PATH_QUERY_DEV_BM25 = '../X_Data/df_dev_bm25_50.csv' #MODIFY PATH

df_collection = pd.read_pickle(PATH_COLLECTION_DATA)
df_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep = '\t')
df_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep = '\t')
df_train_bm25 = pd.read_csv(PATH_QUERY_TRAIN_BM25, sep = ',')
df_dev_bm25 = pd.read_csv(PATH_QUERY_DEV_BM25, sep = ',')

df_dev_bm25["bm25_topk"] = df_dev_bm25["bm25_topk"].apply(ast.literal_eval)
df_train_bm25["bm25_topk"] = df_train_bm25["bm25_topk"].apply(ast.literal_eval)

In [None]:
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L12-v2"
EMBEDDING_MODEL_NAME = "sentence-transformers/static-retrieval-mrl-en-v1"
TRAIN_BATCH_SIZE = 32
NUM_EPOCHS = 3
NUM_HARD_NEGATIVES = 3
os.environ["WANDB_DISABLED"] = "true"

In [None]:
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
text_splitter = SemanticChunker(
    embeddings=embedding_model,
    breakpoint_threshold_type="gradient",
    breakpoint_threshold_amount=0.3
)

def semantic_chunking(text):
    documents = text_splitter.create_documents([text])
    chunks = [doc.page_content for doc in documents]
    return chunks

In [None]:
queries, documents, labels = [], [], []

for row in tqdm(df_train_bm25.itertuples(), total=len(df_train_bm25)):
    matched = df_collection[df_collection['cord_uid'] == row.cord_uid]

    queries.append(row.tweet_text)
    documents.append(matched['abstract'].iloc[0])
    labels.append(1.0)

full_dataset = Dataset.from_dict({
    "query": queries,
    "answer": documents,
    "label": labels
})

dataset = full_dataset.train_test_split(test_size=1000, seed=12)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

100%|██████████| 12853/12853 [00:14<00:00, 908.27it/s]


In [None]:
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

hard_train_dataset = mine_hard_negatives(
    train_dataset,
    embedding_model,
    num_negatives=NUM_HARD_NEGATIVES,
    relative_margin=0.1,
    max_score=0.8,
    range_min=1,
    range_max=2000,
    sampling_strategy="top",
    output_format='labeled-pair',
    batch_size=128
)

Found 11843 unique queries out of 11853 total queries.
Found an average of 1.001 positives per query.


Batches:   0%|          | 0/52 [00:00<?, ?it/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Metric       Positive       Negative     Difference
Count          11,853         34,177               
Mean           0.5195         0.4491         0.0842
Median         0.5362         0.4560         0.0639
Std            0.1639         0.1216         0.0630
Min           -0.0981         0.0285        -0.2449
25%            0.4158         0.3675         0.0463
50%            0.5362         0.4560         0.0639
75%            0.6411         0.5380         0.0981
Max            0.9577         0.7882         0.6012
Skipped 2,687,081 potential negatives (11.33%) due to the relative_margin of 0.1.
Could not find enough negatives for 1382 samples (3.89%). Consider adjusting the range_max, range_min, relative_margin and max_score parameters if you'd like to find more valid negatives.


In [None]:
query, document, labels = [], [], []

for row in tqdm(hard_train_dataset):
    doc_chunks = semantic_chunking(row['answer'])
    for chunk in doc_chunks:
        query.append(row['query'])
        document.append(chunk)
        labels.append(row['label'])

hard_train_dataset = Dataset.from_dict({
    "query": query,
    "document": document,
    "label": labels
})

100%|██████████| 46000/46000 [03:11<00:00, 239.74it/s]


In [None]:
hard_eval_dataset = mine_hard_negatives(
    eval_dataset,
    embedding_model,
    num_negatives=NUM_HARD_NEGATIVES,
    relative_margin=0.1,
    max_score=0.8,
    range_min=1,
    range_max=250,
    sampling_strategy="top",
    batch_size=128,
    output_format='n-tuple'
)

reranking_evaluator = CrossEncoderRerankingEvaluator(
    samples=[{
        "query": sample["query"],
        "positive": [sample["answer"]],
        "negative": [sample[col] for col in hard_eval_dataset.column_names[2:]],
    } for sample in hard_eval_dataset],
    batch_size=TRAIN_BATCH_SIZE
)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Metric       Positive       Negative     Difference
Count           1,000          2,844               
Mean           0.5174         0.4193         0.1161
Median         0.5309         0.4238         0.0847
Std            0.1652         0.1141         0.0876
Min           -0.0474        -0.0157        -0.0012
25%            0.4085         0.3414         0.0539
50%            0.5310         0.4238         0.0848
75%            0.6416         0.4989         0.1540
Max            0.9473         0.7379         0.5897
Skipped 32,181 potential negatives (12.82%) due to the relative_margin of 0.1.
Could not find enough negatives for 156 samples (5.20%). Consider adjusting the range_max, range_min, relative_margin and max_score parameters if you'd like to find more valid negatives.


In [None]:
model = CrossEncoder(MODEL_NAME, device='cuda')
loss = BinaryCrossEntropyLoss(model=model, pos_weight=torch.tensor(NUM_HARD_NEGATIVES))

args = CrossEncoderTrainingArguments(
    output_dir="models",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=TRAIN_BATCH_SIZE,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_mrr@10',
    eval_strategy="steps",
    eval_steps=5000,
    save_steps=5000,
    seed=12
)

trainer = CrossEncoderTrainer(
    model=model,
    args=args,
    train_dataset=hard_train_dataset,
    loss=loss,
    evaluator=reranking_evaluator,
)

trainer.train()

results = reranking_evaluator(model)
print(results)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Map,Mrr@10,Ndcg@10
5000,0.5075,No log,0.954553,0.954553,0.966229
10000,0.4228,No log,0.958421,0.958421,0.969151
15000,0.3228,No log,0.952971,0.952971,0.96512
20000,0.2998,No log,0.965014,0.965014,0.974034
25000,0.2792,No log,0.960883,0.960883,0.970984
30000,0.2188,No log,0.957894,0.957894,0.968787
35000,0.2214,No log,0.966772,0.966772,0.975365
40000,0.2016,No log,0.966069,0.966069,0.974838


{'map': 0.9667721518987342, 'mrr@10': 0.9667721518987342, 'ndcg@10': 0.9753649591641104}


In [None]:
#model.half()

collection_dict = df_collection.set_index('cord_uid')['abstract'].to_dict()

pairs = []
query_indices = []
uid_mappings = []

for idx, row in tqdm(enumerate(df_dev_bm25.itertuples())):
    query = row.tweet_text
    candidate_uids = row.bm25_topk

    for uid in candidate_uids:
        abstract = collection_dict[uid]
        chunks = semantic_chunking(abstract)

        for chunk in chunks:
            pairs.append([query, chunk])
            query_indices.append(idx)
            uid_mappings.append(uid)


all_scores = model.predict(pairs)

query_results = [defaultdict(float) for _ in range(len(df_dev_bm25))]

for idx, uid, score in zip(query_indices, uid_mappings, all_scores):
    query_results[idx][uid] = max(query_results[idx][uid], score)

reranked_uids = []

for idx in range(len(df_dev_bm25)):
    max_scores = query_results[idx]
    sorted_uids = sorted(max_scores.items(), key=lambda x: x[1], reverse=True)
    reranked_uids.append([uid for uid, _ in sorted_uids])

df_dev_bm25['reranked_topk'] = reranked_uids

1400it [05:21,  4.36it/s]


In [None]:
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        #performances.append(data["in_topx"].mean())
        d_performance[k] = data["in_topx"].mean()
    return d_performance

results_dev = get_performance_mrr(df_dev_bm25, 'cord_uid', 'bm25_topk')
results_dev_reranked = get_performance_mrr(df_dev_bm25, 'cord_uid', 'reranked_topk')

print(f"BM25 Results: {results_dev}")
print(f"Reranked Results: {results_dev_reranked}")

BM25 Results: {1: np.float64(0.5057142857142857), 5: np.float64(0.5522738095238094), 10: np.float64(0.557658163265306)}
Reranked Results: {1: np.float64(0.5542857142857143), 5: np.float64(0.6044285714285714), 10: np.float64(0.6088543083900227)}
