In [None]:
!pip install datasets
!pip install --upgrade sentence-transformers
!pip install langchain_experimental

In [1]:
import pandas as pd
import numpy as np
import ast
import json
from tqdm import tqdm
from sentence_transformers import CrossEncoder
from torch.utils.data import DataLoader
import torch
from datasets import Dataset
from datasets import load_dataset
from sentence_transformers.util import mine_hard_negatives
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder.losses import BinaryCrossEntropyLoss
from sentence_transformers.cross_encoder import CrossEncoderTrainer
from sentence_transformers.cross_encoder import CrossEncoderTrainingArguments
import os
from collections import defaultdict
from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings

In [2]:
# Use the GPU when PyTorch reports one as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Input files, addressed relative to the notebook's working directory.
PATH_COLLECTION_DATA = '../X_Data/subtask4b_collection_data.pkl'
PATH_QUERY_TEST = '../X_Data/subtask4b_query_tweets_test.tsv'
PATH_BM25 = '../X_Data/bm25_test.tsv'
PATH_GRANITE = '../X_Data/granite_test.tsv'

# NOTE: unpickling can execute arbitrary code -- only load collection files you trust.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

# The three tab-separated inputs: raw test queries, BM25 candidates, granite candidates.
df_test_raw, df_bm25, df_granite = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (PATH_QUERY_TEST, PATH_BM25, PATH_GRANITE)
)

# The candidate columns are read as strings; parse them into Python literals
# (presumably lists of cord_uids, as the preview below suggests).
df_bm25["bm25_topk"] = df_bm25["bm25_topk"].map(ast.literal_eval)
df_granite["retrieved"] = df_granite["retrieved"].map(ast.literal_eval)

# Attach both candidate lists to every test query (inner joins on the shared keys).
merge_keys = ["post_id", "tweet_text"]
df_test = df_test_raw.merge(df_bm25, on=merge_keys).merge(df_granite, on=merge_keys)

df_test.head()

Unnamed: 0.1,post_id,tweet_text,bm25_topk,Unnamed: 0,retrieved
0,1,A recent research study published yesterday cl...,"[8fkzc445, qgwu9fsk, bttme4wn, j0bu0upi, jqwox...",0,"[8zufbeuz, tpic8ddl, 5hei9fac, j0bu0upi, jzosd..."
1,2,"""We should track the long-term effects of thes...","[evf9nz05, ynaxwnlp, 5vp2r2bd, 65n6p550, y8puo...",1,"[evf9nz05, 3swdnn29, pyz2tnhk, i0swl50w, ikacd..."
2,3,"the agony of ""long haul"" covid-19 symptoms.","[l4u01fzk, y6jw3gws, ls4qfvwq, qvuuhkg6, 6gm8k...",2,"[m3m2n3fw, 8qdcls1k, jbs3d5xo, o4vvlmr4, v53e7..."
3,4,Home and online monitoring and assessment of b...,"[ru2ty1y9, bnkggl84, wabd3b9z, 5gshj480, rpxw3...",3,"[ru2ty1y9, wabd3b9z, zmk8bbcd, kkbkh4yi, r4q0z..."
4,5,"it may be a long one, folks! to avoid exceedin...","[f5p37j7g, x9rv72dl, 32gnw4sv, n9zqc1gm, yoiq6...",4,"[f5p37j7g, l8sozg8v, mz1bof2x, nzat41wu, kvu0h..."


In [None]:
# Name of the embedding model passed to HuggingFaceEmbeddings for semantic chunking.
EMBEDDING_MODEL_NAME = "sentence-transformers/static-retrieval-mrl-en-v1"

# Local directory with the reranker model files (too big to upload, so they
# have to be placed there manually).
MODEL_DIRECTORY = "./reranker/"

In [24]:
# Embedding backend the chunker uses to compare neighbouring pieces of text.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Semantic splitter using the "gradient" breakpoint strategy.
# NOTE(review): for "gradient" the threshold amount looks like it is interpreted
# as a percentile -- confirm that 0.3 is the intended value against the
# SemanticChunker documentation before changing it.
text_splitter = SemanticChunker(
    breakpoint_threshold_type="gradient",
    breakpoint_threshold_amount=0.3,
    embeddings=embedding_model,
)

def semantic_chunking(text):
    """Split ``text`` with the module-level ``text_splitter``.

    Args:
        text: The string to split (here: a paper abstract).

    Returns:
        list: The ``page_content`` of every document produced by
        ``text_splitter.create_documents([text])``, in the order returned.
    """
    return [
        document.page_content
        for document in text_splitter.create_documents([text])
    ]

In [25]:
# Load the cross-encoder reranker from the local MODEL_DIRECTORY and place it
# on the device selected earlier (GPU if available, otherwise CPU).
model = CrossEncoder(MODEL_DIRECTORY, device=device)

In [26]:
# Look-up table: cord_uid -> abstract text.
collection_dict = df_collection.set_index('cord_uid')['abstract'].to_dict()

# Only the first RERANK_TOP_K retrieved candidates of each query are rescored.
RERANK_TOP_K = 25

# Three parallel lists with one entry per (query, chunk) pair:
#   pairs[i]         -> [tweet text, abstract chunk] fed to the cross-encoder
#   query_indices[i] -> positional row index of the query in df_test
#   uid_mappings[i]  -> cord_uid of the document the chunk came from
pairs = []
query_indices = []
uid_mappings = []

# The chunks of an abstract depend only on the abstract, not on the query, so
# each document is chunked once and reused for every query that retrieved it
# (assumes the splitter is deterministic, which makes the pairs identical to
# re-chunking every time).
chunk_cache = {}

for idx, row in tqdm(enumerate(df_test.itertuples()), desc="Processing rows", total=len(df_test)):
    query = row.tweet_text

    for uid in row.retrieved[:RERANK_TOP_K]:
        if uid not in chunk_cache:
            # A uid missing from the collection still raises KeyError here.
            chunk_cache[uid] = semantic_chunking(collection_dict[uid])

        for chunk in chunk_cache[uid]:
            pairs.append([query, chunk])
            query_indices.append(idx)
            uid_mappings.append(uid)

Processing rows: 100%|██████████| 1446/1446 [02:22<00:00, 10.14it/s]


In [27]:
# Score every (query, chunk) pair with the cross-encoder in a single call.
all_scores = model.predict(pairs)

# Per query: cord_uid -> best score over all chunks of that document.
# Plain dicts are used on purpose: there is no artificial 0.0 starting value,
# so the stored score is the true maximum even when the model returns negative
# values (e.g. raw logits), which a 0.0 floor would collapse into ties.
query_results = [{} for _ in range(len(df_test))]

for idx, uid, score in zip(query_indices, uid_mappings, all_scores):
    best_scores = query_results[idx]
    if uid not in best_scores or score > best_scores[uid]:
        best_scores[uid] = score

# Order each query's candidates by descending best-chunk score. The sort is
# stable, so documents with equal scores keep their retrieval order.
reranked_uids = [
    [uid for uid, _ in sorted(max_scores.items(), key=lambda x: x[1], reverse=True)]
    for max_scores in query_results
]

df_test['reranked'] = reranked_uids

In [28]:
def reciprocal_rank_fusion(list1, list2, k=40, alpha=0.5, top_n=5):
    """
    Fuse two ranked lists with weighted Reciprocal Rank Fusion (RRF).

    Every item receives ``alpha / (k + rank)`` for its position in ``list1``
    and ``(1 - alpha) / (k + rank)`` for its position in ``list2``; the
    contributions are summed per item. Ranks are 0-based, so the first item
    of a list contributes ``weight / k``.

    Args:
        list1: First ranked list of hashable ids, best first.
        list2: Second ranked list of hashable ids, best first.
        k: Smoothing constant added to the rank.
        alpha: Weight of ``list1``; ``list2`` gets ``1 - alpha``.
        top_n: Number of fused ids to return. ``None`` returns all of them.
            Defaults to 5.

    Returns:
        list: Ids ordered by descending fused score, truncated to ``top_n``.
        Ids with equal scores keep the order in which they were first seen
        (``list1`` before ``list2``).

    Raises:
        ZeroDivisionError: If ``k`` is 0 and either list is non-empty.
    """
    beta = 1 - alpha
    scores = defaultdict(float)

    # Accumulate the weighted reciprocal-rank contribution of each list.
    for weight, ranking in ((alpha, list1), (beta, list2)):
        for rank, uid in enumerate(ranking):
            scores[uid] += weight / (k + rank)

    # Highest fused score first; sorted() is stable, so ties keep insertion order.
    sorted_uids = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [uid for uid, _ in sorted_uids[:top_n]]

In [None]:
def get_performance_mrr(data, col_gold, col_pred, list_k=(1, 5, 10)):
    """Compute the mean reciprocal rank (MRR@k) for several cut-offs.

    Args:
        data: DataFrame with one row per query.
        col_gold: Name of the column holding the gold id of each query.
        col_pred: Name of the column holding the ranked predictions
            (a sliceable sequence, best first).
        list_k: Cut-offs to evaluate. Defaults to ``(1, 5, 10)``.

    Returns:
        dict: Maps each ``k`` to the mean over all rows of ``1 / rank`` of
        the gold id within the first ``k`` predictions (0 if it is absent).

    Side effects:
        Writes (or overwrites) the column ``"in_topx"`` in ``data`` with the
        per-row reciprocal ranks of the last evaluated ``k``. This is kept
        for backward compatibility with existing callers.
    """
    def _reciprocal_rank(row, k):
        # Materialise the top-k slice once; list() also makes .index available
        # for non-list sequences.
        top_k = list(row[col_pred][:k])
        gold = row[col_gold]
        return 1 / (top_k.index(gold) + 1) if gold in top_k else 0

    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda row: _reciprocal_rank(row, k), axis=1)
        d_performance[k] = data["in_topx"].mean()
    return d_performance

In [None]:
# Fuse the BM25 ranking with the cross-encoder ranking, row by row.
# k and alpha obtained through grid search
df_test["hybrid"] = pd.Series(
    [
        reciprocal_rank_fusion(bm25_ranking, reranked_ranking, k=6, alpha=0.43)
        for bm25_ranking, reranked_ranking in zip(df_test["bm25_topk"], df_test["reranked"])
    ],
    index=df_test.index,
)

In [None]:
# Export the fused ranking as the submission file: the "hybrid" column is
# written under the name "preds" next to "post_id", tab-separated, no index.
submission = df_test.rename(columns={"hybrid": "preds"})
submission = submission[["post_id", "preds"]]
submission.to_csv("../submissions/hybrid_predictions_test.tsv", index=None, sep="\t")