In [None]:
!pip install datasets
!pip install --upgrade sentence-transformers
!pip install langchain_experimental

In [61]:
import pandas as pd
import numpy as np
import ast
import json
from tqdm import tqdm
from sentence_transformers import CrossEncoder
from torch.utils.data import DataLoader
import torch
from datasets import Dataset
from datasets import load_dataset
from sentence_transformers.util import mine_hard_negatives
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder.losses import BinaryCrossEntropyLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainer
from sentence_transformers.cross_encoder import CrossEncoderTrainingArguments
import os
from collections import defaultdict
from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings

In [62]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [63]:
# Input files. Adjust the paths marked "MODIFY PATH" to your local layout.
PATH_COLLECTION_DATA = 'subtask4b_collection_data.pkl'
PATH_QUERY_TRAIN_DATA = 'subtask4b_query_tweets_train.tsv' #MODIFY PATH
# Fixed: the dev path previously pointed at the *train* TSV (copy-paste slip).
PATH_QUERY_DEV_DATA = 'subtask4b_query_tweets_dev.tsv' #MODIFY PATH
PATH_QUERY_TRAIN_BM25 = 'df_train_bm25_50.csv' #MODIFY PATH
PATH_QUERY_DEV_BM25 = 'df_dev_bm25_50.csv' #MODIFY PATH
PATH_QUERY_TRAIN_GRANITE = 'granite_top75_train.json' #MODIFY PATH
PATH_QUERY_DEV_GRANITE = 'granite_top75_dev.json' #MODIFY PATH

# SECURITY: unpickling can execute arbitrary code. Only load this file if it
# comes from a source you trust.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

# Raw query files, loaded for reference only. The frames built below do not
# merge them in; presumably the BM25 CSVs already carry `tweet_text` - verify
# against your files.
df_train_queries = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep = '\t')
df_dev_queries = pd.read_csv(PATH_QUERY_DEV_DATA, sep = '\t')

# BM25 candidate lists; `bm25_topk` is stored as the string repr of a list,
# so it is parsed back into a real Python list.
df_train_bm25 = pd.read_csv(PATH_QUERY_TRAIN_BM25, sep = ',')
df_dev_bm25 = pd.read_csv(PATH_QUERY_DEV_BM25, sep = ',')

df_dev_bm25["bm25_topk"] = df_dev_bm25["bm25_topk"].apply(ast.literal_eval)
df_train_bm25["bm25_topk"] = df_train_bm25["bm25_topk"].apply(ast.literal_eval)

# Granite retrieval results, joined to the other frames on the `tweet` column.
df_train_granite = pd.read_json(PATH_QUERY_TRAIN_GRANITE)
df_dev_granite = pd.read_json(PATH_QUERY_DEV_GRANITE)

# Final working frames: Granite results joined with the BM25 frame of the SAME
# split on the tweet id.
# Fixed: the train frame used to be joined with the dev BM25 frame.
# The earlier granite + query-tweet merges were removed: their results were
# immediately overwritten by these two statements and never used.
df_train = pd.merge(df_train_granite, df_train_bm25, left_on='tweet', right_on='post_id', how='left').drop(columns='post_id')
df_dev = pd.merge(df_dev_granite, df_dev_bm25, left_on='tweet', right_on='post_id', how='left').drop(columns='post_id')

In [64]:
# Preview the first rows of the merged dev frame to check its columns.
df_dev.head()

Unnamed: 0,tweet,gold_paper,retrieved,tweet_text,cord_uid,bm25_topk
0,16,3qvh482o,"[jrqlhjsm, hg3xpej0, vccct6hq, mamtxi9v, styav...",covid recovery: this study from the usa reveal...,3qvh482o,"[25aj8rj5, gatxuwz7, 59up4v56, styavbvi, 6sy80..."
1,69,r58aohnu,"[r58aohnu, mm4kgvt1, qtzhfnr6, sjkni2uc, kiq6x...","""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, p0kg6dyz, 9dlaaye8, iu1d9i57, d06np..."
2,73,sts48u9i,"[6hnts5l2, ujq9mxk7, gruir7aw, zhh2c89o, 21lbb...",I recall early on reading that researchers who...,sts48u9i,"[tz2shoso, o877uul1, m1sf159a, sgo76prc, gruir..."
3,93,3sr2exq9,"[3sr2exq9, u43jmpyx, 8hvve871, 121p2shq, h7n8w...",You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, hgpiig0g, sv48gjkk, k0f4cwig, ihgxt..."
4,96,ybwwmyqy,"[ybwwmyqy, lzddnb8j, ierqfgo5, qh6rif48, sxx3y...",Resistance to antifungal medications is a grow...,ybwwmyqy,"[lzddnb8j, ouvq2wpq, sxx3yid9, vabb2f26, y9fqa..."


In [6]:
# Directory the cross-encoder is loaded from (passed to CrossEncoder below).
MODEL_DIRECTORY = './reranker/'
# Embedding model name handed to HuggingFaceEmbeddings for the semantic chunker.
EMBEDDING_MODEL_NAME = "sentence-transformers/static-retrieval-mrl-en-v1"

In [7]:
# Embedding model that is passed to the SemanticChunker below.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
# Splitter shared by semantic_chunking().
# NOTE(review): with breakpoint_threshold_type="gradient" the amount appears to
# be interpreted as a percentile by langchain_experimental, so 0.3 would split
# at almost every sentence boundary - confirm this value is intended.
text_splitter = SemanticChunker(
    embeddings=embedding_model,
    breakpoint_threshold_type="gradient",
    breakpoint_threshold_amount=0.3
)

def semantic_chunking(text):
    """Split ``text`` with the module-level ``text_splitter``.

    Args:
        text: The text to split (here: a paper abstract).

    Returns:
        A list with the ``page_content`` string of every document produced by
        ``text_splitter.create_documents``.
    """
    return [piece.page_content for piece in text_splitter.create_documents([text])]

  embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


In [9]:
# Load the cross-encoder stored under MODEL_DIRECTORY onto the selected device.
model = CrossEncoder(MODEL_DIRECTORY, device=device)

In [10]:
# cord_uid -> abstract lookup for the whole paper collection.
collection_dict = df_collection.set_index('cord_uid')['abstract'].to_dict()

# Only the first N retrieved candidates of each query are passed to the reranker.
MAX_CANDIDATES_PER_QUERY = 25

# Three parallel lists, one entry per (query, chunk) pair:
pairs = []            # [tweet text, abstract chunk] - input for the cross-encoder
query_indices = []    # positional index of the query row in df_dev
uid_mappings = []     # cord_uid of the paper the chunk was taken from

# Chunking depends only on the abstract, so every paper is chunked once and
# the result is reused for all queries that retrieved the same paper.
chunk_cache = {}

for idx, row in tqdm(enumerate(df_dev.itertuples()), desc="Processing rows", total=len(df_dev)):
    query = row.tweet_text
    candidate_uids = row.retrieved

    for uid in candidate_uids[:MAX_CANDIDATES_PER_QUERY]:
        chunks = chunk_cache.get(uid)
        if chunks is None:
            chunks = semantic_chunking(collection_dict[uid])
            chunk_cache[uid] = chunks

        for chunk in chunks:
            pairs.append([query, chunk])
            query_indices.append(idx)
            uid_mappings.append(uid)

Processing rows: 100%|██████████| 1400/1400 [02:13<00:00, 10.49it/s]


In [12]:
# Score every (tweet, chunk) pair with the cross-encoder.
all_scores = model.predict(pairs)

# One {cord_uid: best chunk score} mapping per dev query (positional index).
# The running maximum starts at -inf rather than 0.0 so that papers whose
# chunks all receive negative scores (e.g. raw logits) keep their real score
# instead of being clamped to 0 and tied with each other.
NEG_INF = float("-inf")
query_results = [{} for _ in range(len(df_dev))]

for idx, uid, score in zip(query_indices, uid_mappings, all_scores):
    best_for_query = query_results[idx]
    best_for_query[uid] = max(best_for_query.get(uid, NEG_INF), score)

# Order each query's papers by their best chunk score, highest first.
reranked_uids = []

for idx in range(len(df_dev)):
    max_scores = query_results[idx]
    sorted_uids = sorted(max_scores.items(), key=lambda x: x[1], reverse=True)
    reranked_uids.append([uid for uid, _ in sorted_uids])

df_dev['reranked'] = reranked_uids

In [51]:

def reciprocal_rank_fusion(list1, list2, k=40, alpha=0.5, top_n=5):
    """
    Fuse two ranked lists with weighted Reciprocal Rank Fusion (RRF).

    Every id receives ``alpha / (k + rank)`` from ``list1`` and
    ``(1 - alpha) / (k + rank)`` from ``list2``; the contributions are summed.
    ``rank`` is 0-based here (the first item has rank 0), so ``k`` must be
    positive to avoid a division by zero for the first item.

    Args:
        list1: First ranking, best item first.
        list2: Second ranking, best item first.
        k: Smoothing constant added to the rank.
        alpha: Weight of ``list1``; ``list2`` is weighted with ``1 - alpha``.
        top_n: Number of fused ids to return. ``None`` returns all of them.
            Defaults to 5.

    Returns:
        The fused ids ordered by descending score, cut to ``top_n`` entries.
        Ties keep the order in which the ids were first encountered.
    """
    beta = 1 - alpha
    scores = defaultdict(float)

    for weight, ranking in ((alpha, list1), (beta, list2)):
        for rank, uid in enumerate(ranking):
            scores[uid] += weight / (k + rank)

    # sorted() is stable, so equal scores keep their insertion order.
    ranked_uids = sorted(scores, key=scores.get, reverse=True)
    return ranked_uids if top_n is None else ranked_uids[:top_n]

In [52]:
def get_performance_mrr(data, col_gold, col_pred, list_k=(1, 5, 10)):
    """Compute MRR@k of a prediction column against a gold column.

    For each row the reciprocal rank is ``1 / position`` (1-based) of the gold
    id within the first ``k`` predictions, or 0 when it is not among them.

    Args:
        data: DataFrame holding both columns. It is not modified.
        col_gold: Name of the column with the single gold id per row.
        col_pred: Name of the column with the ranked prediction list per row.
        list_k: Cut-offs to evaluate.

    Returns:
        Dict mapping each ``k`` to the mean reciprocal rank over all rows
        (NaN when ``data`` has no rows).
    """
    gold_and_preds = list(zip(data[col_gold], data[col_pred]))

    d_performance = {}
    for k in list_k:
        reciprocal_ranks = []
        for gold, preds in gold_and_preds:
            top_k = list(preds[:k])
            reciprocal_ranks.append(1 / (top_k.index(gold) + 1) if gold in top_k else 0)
        d_performance[k] = pd.Series(reciprocal_ranks, dtype=float).mean()
    return d_performance

In [53]:
def matrix_parameter_tuning(df, a, b):
    """Grid-search the RRF parameters ``k`` and ``alpha`` by MRR@5.

    For every combination the ``bm25_topk`` and ``reranked`` lists of each row
    are fused with ``reciprocal_rank_fusion`` and scored against
    ``gold_paper``.

    Args:
        df: DataFrame with the columns ``bm25_topk``, ``reranked`` and
            ``gold_paper``. It is not modified.
        a: Iterable of candidate values for ``k``.
        b: Iterable of candidate values for ``alpha``.

    Returns:
        ``(best, best_values)``: the highest MRR@5 found and the ``(k, alpha)``
        pair that produced it. The first combination reaching the best score
        wins. ``(-1, (-1, -1))`` is returned when no combination is evaluated.
    """
    best = - 1
    best_values = (-1, -1)

    # Loop-invariant inputs, extracted once.
    bm25_lists = list(df["bm25_topk"])
    reranked_lists = list(df["reranked"])
    gold_only = df[["gold_paper"]]

    for k in tqdm(a):
        for alpha in b:
            fused = [
                reciprocal_rank_fusion(bm25, reranked, k=k, alpha=alpha)
                for bm25, reranked in zip(bm25_lists, reranked_lists)
            ]
            # Score the frame that was passed in (previously the global
            # df_dev was evaluated regardless of the `df` argument).
            # Only the cut-off that is actually used is computed.
            scored = gold_only.assign(result=fused)
            res = get_performance_mrr(scored, "gold_paper", "result", list_k=[5])[5]
            if res > best:
                best = res
                best_values = (k, alpha)
    return best, best_values

In [54]:
# Coarse grid: k in 5, 10, ..., 75 and alpha in 0.05, 0.10, ..., 1.00.
smoothing_paramters = [*range(5, 76, 5)]
alpha_values = [step / 100 for step in range(5, 101, 5)]

best, best_values = matrix_parameter_tuning(
    df_dev,
    smoothing_paramters,
    alpha_values,
)

print(f"Best: {best}, with values {best_values}")

100%|██████████| 15/15 [00:19<00:00,  1.33s/it]

Best: 0.6578333333333333, with values (5, 0.45)





In [55]:
# Fine grid around the best coarse result: k in 1..9, alpha in 0.41..0.49.
close_sp = list(range(1, 10))
# Fixed: the sixth value was 0.56 (typo), which skipped 0.46 and put a value
# outside the intended neighbourhood into the grid.
close_alpha = [0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49]

local_best, local_best_values = matrix_parameter_tuning(df_dev, close_sp, close_alpha)
print(f"Best: {local_best}, with values {local_best_values}")

100%|██████████| 9/9 [00:05<00:00,  1.63it/s]

Best: 0.6583214285714285, with values (6, 0.43)





In [58]:
def _fuse_row(row):
    """Fuse one row's BM25 and reranked lists with the chosen RRF parameters."""
    return reciprocal_rank_fusion(row["bm25_topk"], row["reranked"], k=6, alpha=0.43)


# Store the fused ranking of every dev query in the "hybrid" column.
df_dev["hybrid"] = df_dev.apply(_fuse_row, axis=1)

In [59]:
# MRR@{1, 5, 10} of every ranking column against the gold paper.
# The "hybrid" lists hold at most 5 ids (reciprocal_rank_fusion returns the
# top 5 by default), so its MRR@10 cannot exceed its MRR@5.
results_bm25, results_granite, results_reranked, results_hybrid = (
    get_performance_mrr(df_dev, "gold_paper", ranking_col)
    for ranking_col in ("bm25_topk", "retrieved", "reranked", "hybrid")
)


In [60]:
# Report the MRR dictionaries of all four rankings, one per line.
for label, mrr_by_k in [
    ("BM25 Results:", results_bm25),
    ("Granite Results:", results_granite),
    ("Reranked Results:", results_reranked),
    ("Hybrid Results:", results_hybrid),
]:
    print(label, mrr_by_k)

BM25 Results: {1: 0.5057142857142857, 5: 0.5522738095238094, 10: 0.557658163265306}
Granite Results: {1: 0.5257142857142857, 5: 0.5813214285714287, 10: 0.5882225056689343}
Reranked Results: {1: 0.5742857142857143, 5: 0.628904761904762, 10: 0.6347264739229024}
Hybrid Results: {1: 0.605, 5: 0.6583214285714285, 10: 0.6583214285714285}
