### This notebook computes the top 75 results for each query tweet, ranked by cosine similarity over Granite embeddings

#### It is meant to be used to generate the granite results for the hybrid approach

In [6]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import json
from tqdm import tqdm
import torch

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Sentence-embedding model shared by every encode() call in this notebook.
model = SentenceTransformer('ibm-granite/granite-embedding-278m-multilingual', device=device)

# Tab-separated query file; the "tweet_text" column is what gets embedded later.
PATH_QUERY_TEST = "../X_Data/subtask4b_query_tweets_test.tsv"
df_test = pd.read_csv(PATH_QUERY_TEST, delimiter="\t")

In [7]:
# Embed every query tweet. The vectors are requested normalised, so a plain
# dot product between them and other normalised vectors is a cosine similarity.
query_texts = df_test["tweet_text"].to_list()
query_embeddings = model.encode(
    query_texts,
    normalize_embeddings=True,
    convert_to_tensor=True,
)

RuntimeError: HIP error: invalid device function
HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing AMD_SERIALIZE_KERNEL=3
Compile with `TORCH_USE_HIP_DSA` to enable device-side assertions.


In [None]:
# JSON file holding the document collection that the tweets are matched against.
PATH_CORD = "../X_Data/docs.json"
df_collection = pd.read_json(path_or_buf=PATH_CORD)

In [None]:
# Embed the "text" column of the collection with the same settings as the
# queries (normalised vectors, returned as a tensor).
collection_texts = df_collection["text"].to_list()
collection_embeddings = model.encode(
    collection_texts,
    normalize_embeddings=True,
    convert_to_tensor=True,
)

In [None]:
# Query-by-document score matrix: row i holds the scores of query i against
# every collection entry. Both sides were encoded with normalize_embeddings=True,
# so each dot product is a cosine similarity.
similarity = query_embeddings @ collection_embeddings.T

In [None]:
# Pull the id and text columns out as arrays so they can be indexed by
# position alongside the columns of the similarity matrix.
chunk_uids, chunk_texts = (df_collection[column].values for column in ('cord_uid', 'text'))

In [None]:
# Number of distinct documents (cord_uids) kept per query tweet.
TOP_K = 75

# One entry per query tweet, in the row order of `similarity`. Each entry is a
# list of (cord_uid, score, best_matching_chunk_text), best score first.
top_results = []

for sim_row in tqdm(similarity, desc="Tweet", total=len(similarity)):
    # Convert the whole row to Python floats in a single call. The previous
    # per-element `.item()` produced the same values but paid one
    # tensor-to-Python conversion for every (query, document) pair.
    row_scores = sim_row.tolist()

    best_scores = {}  # cord_uid -> (score, text)
    for uid, text, score_val in zip(chunk_uids, chunk_texts, row_scores):
        previous = best_scores.get(uid)
        # Strict '>' keeps the first chunk seen when two chunks of one uid tie.
        if previous is None or score_val > previous[0]:
            best_scores[uid] = (score_val, text)

    # Keep the TOP_K documents ranked by their best matching chunk score.
    top_docs = sorted(best_scores.items(), key=lambda x: x[1][0], reverse=True)[:TOP_K]

    # Format: [(cord_uid, score, best_matching_chunk_text), ...]
    top_results.append([(uid, score, text) for uid, (score, text) in top_docs])

Tweet: 100%|██████████| 1446/1446 [18:47<00:00,  1.28it/s]


In [None]:
# Keep only the cord_uid (first field of each result triple) for every query,
# preserving the ranking order, and attach the lists as a new column.
df_test["retrieved"] = [[hit[0] for hit in hits] for hits in top_results]

In [None]:
# Write the query frame, including the "retrieved" column, as a tab-separated file.
# NOTE(review): the index is written too (pandas default) — confirm downstream readers expect it.
df_test.to_csv(path_or_buf='../X_Data/granite_test.tsv', sep='\t')