In [13]:
import os
import gc
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from sklearn.metrics.pairwise import cosine_similarity
import pyarrow.parquet as pq
from typing import Tuple, Generator, Dict, Text
import math
import time

# Set random seed for reproducibility
# Both TensorFlow's and NumPy's global RNGs are seeded with the same value.
SEED = 11925939
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Configure TensorFlow for optimal GPU usage
# NOTE(review): the variable below is a PyTorch allocator setting, not a
# TensorFlow one; presumably it targets the PyTorch-backed SentenceTransformer
# imported above -- confirm it is still honoured when set after that import.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # If using PyTorch elsewhere
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Request on-demand GPU memory growth for every visible GPU.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"{len(gpus)} GPU(s) Available: {gpus}")
    except RuntimeError as e:
        # The error is only reported; the notebook continues with default settings.
        print(e)
else:
    print("No GPUs found. Using CPU.")

print("TensorFlow version:", tf.__version__)

# Pickled product collection; read with pd.read_pickle in the data-loading cell.
PCOLLECTION_FILE_PATH = '../p_collection.pkl'
# Weight paths of the SYNGET variants; each is passed to `model.load_weights`
# in its own evaluation cell further down.
SYNGET_U = 'models/model-u/weights'
SYNGET_06 = 'models/model-0.6/weights'
SYNGET_07 = 'models/model-0.7/weights'
SYNGET_08 = 'models/model-0.8v2/weights'
SYNGET_08_AC = 'models/model-0.8v2_all_candidates/weights'

# Tab-separated inputs: query id -> query text, and the test relevance judgements.
QUERY_TO_QUERY = '../qid2query.tsv'
QREL_TEST = '../QREL/2023test.qrel'

No GPUs found. Using CPU.
TensorFlow version: 2.15.0


In [2]:
class QueryModel(tf.keras.Model):
    """Query tower: maps an input embedding to an L2-normalised 128-d vector."""

    def __init__(self):
        super().__init__()
        # ELU hidden layer -> linear projection -> L2 normalisation on the last axis.
        # The attribute name and layer order are kept so saved weights still load.
        self.dense_layers = tf.keras.Sequential([
            tf.keras.layers.Dense(128, activation='elu'),
            tf.keras.layers.Dense(128),
            tf.keras.layers.Lambda(
                lambda tensor: tf.keras.backend.l2_normalize(tensor, axis=-1)
            ),
        ])

    def call(self, inputs):
        """Run `inputs` through the dense stack and return the normalised result."""
        encoded = self.dense_layers(inputs)
        return encoded


class ProductModel(tf.keras.Model):
    """Product tower: maps an input embedding to an L2-normalised 128-d vector."""

    def __init__(self):
        super().__init__()
        # ELU hidden layer -> linear projection -> L2 normalisation on the last axis.
        # The attribute name and layer order are kept so saved weights still load.
        self.dense_layers = tf.keras.Sequential([
            tf.keras.layers.Dense(128, activation='elu'),
            tf.keras.layers.Dense(128),
            tf.keras.layers.Lambda(
                lambda tensor: tf.keras.backend.l2_normalize(tensor, axis=-1)
            ),
        ])

    def call(self, inputs):
        """Run `inputs` through the dense stack and return the normalised result."""
        encoded = self.dense_layers(inputs)
        return encoded


class TwoTowerModel(tfrs.models.Model):
    """Two-tower retrieval model combining a query tower and a product tower.

    `candidates` is a dataset of product inputs; it is batched, passed through
    the product tower and handed to the FactorizedTopK metric of the task.
    """

    def __init__(self, candidates):
        super().__init__()
        self.query_model = QueryModel()
        self.product_model = ProductModel()
        # Candidate embeddings used by the top-K metric come from the product tower.
        embedded_candidates = candidates.batch(128).map(self.product_model)
        top_k_metric = tfrs.metrics.FactorizedTopK(candidates=embedded_candidates)
        self.task = tfrs.tasks.Retrieval(metrics=top_k_metric)

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        """Return the `(query_embeddings, product_embeddings)` pair for `features`."""
        encoded_queries = self.query_model(features["query_embedding"])
        encoded_products = self.product_model(features["product_embedding"])
        return encoded_queries, encoded_products

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False):
        """Compute the retrieval task loss; metrics are evaluated only when not training."""
        encoded_queries = self.query_model(features["query_embedding"])
        encoded_products = self.product_model(features["product_embedding"])
        want_metrics = not training
        return self.task(encoded_queries, encoded_products, compute_metrics=want_metrics)

In [3]:
def produce_ground_truth(qid: int, qrel: pd.DataFrame, collection: pd.DataFrame) -> dict:
    """Build the ideal ranking for one query.

    Args:
        qid: Query id to look up in `qrel['qid']`.
        qrel: Relevance judgements with columns `qid`, `docid` and `relevance_score`.
        collection: Product collection; only judgements whose `docid` occurs in
            `collection['id']` are kept.

    Returns:
        A dict mapping `docid` to `relevance_score`, with entries inserted in
        descending order of relevance. Empty if the query has no judged
        documents in the collection.
    """
    in_collection = qrel['docid'].isin(collection['id'].values)
    judged = qrel[(qrel['qid'] == qid) & in_collection]
    # Highest relevance first, so the dict's insertion order is the ideal ranking.
    judged = judged.sort_values(by='relevance_score', ascending=False)
    return judged.set_index('docid')['relevance_score'].to_dict()

def produce_y_pred(topk: pd.DataFrame, y_true) -> dict:
    """Map each ranked document id to its ground-truth relevance.

    The ids are taken from `topk['id']` in row order and looked up in `y_true`;
    the resulting dict keeps that order.
    """
    ranked_ids = topk['id'].values
    relevance = pd.Series(
        [y_true[doc_id] for doc_id in ranked_ids],
        index=pd.Index(ranked_ids, name='docid'),
        name='relevance_score',
    )
    return relevance.to_dict()
        
def normalized_discounted_cumulative_gain(temp_set, p=10):
    """Compute NDCG@p for a ranked mapping of document -> relevance.

    Args:
        temp_set: Mapping whose values are relevance scores, iterated in
            ranked (predicted) order.
        p: Rank cutoff. Accumulation stops once position `p` is reached; if
            that position is never reached, all entries are used.

    Returns:
        DCG divided by the ideal DCG (same values sorted descending), rounded
        to 5 decimals. Returns 0.0 when the ideal DCG is 0 (empty input or no
        relevant document) instead of raising ZeroDivisionError.
    """
    def _dcg(gains):
        # Discounted cumulative gain over at most the first `p` positions.
        total = 0
        for pos, gain in enumerate(gains, start=1):
            total += gain / math.log2(pos + 1)
            if pos == p:
                break
        return total

    ranked_gains = list(temp_set.values())
    ideal = _dcg(sorted(ranked_gains, reverse=True))
    if ideal == 0:
        # Nothing relevant to rank: define NDCG as 0 rather than dividing by zero.
        return 0.0
    return round(_dcg(ranked_gains) / ideal, 5)

def precision_at_k(predicted_dict, ideal_dict, k):
    """Fraction of the first `k` predicted docids that are relevant.

    A docid counts as relevant when its score in `ideal_dict` is greater than
    zero; docids missing from `ideal_dict` count as not relevant. The
    denominator is always `k`.
    """
    hits = 0
    for docid in list(predicted_dict.keys())[:k]:
        if ideal_dict.get(docid, 0) > 0:
            hits += 1
    return hits / k


def recall_at_k(predicted_dict, ideal_dict, k):
    """Share of all relevant documents that appear in the first `k` predictions.

    A document is relevant when its score in `ideal_dict` is greater than zero.
    Returns 0 when `ideal_dict` contains no relevant document.
    """
    retrieved = list(predicted_dict.keys())[:k]
    total_relevant = sum(1 for score in ideal_dict.values() if score > 0)
    hits = sum(1 for docid in retrieved if ideal_dict.get(docid, 0) > 0)
    if total_relevant > 0:
        return hits / total_relevant
    return 0

def evaluate_two_tower(query_df: pd.DataFrame, qrel_df: pd.DataFrame, model,
                       collection: pd.DataFrame, sentence_transformer: SentenceTransformer,
                       k=10) -> Tuple[float, float, float]:
    """Evaluate a two-tower model by re-ranking each query's judged candidates.

    For every row of `query_df` the judged documents present in `collection`
    are indexed with a brute-force top-K layer built from the model's towers,
    the query text is encoded with `sentence_transformer`, and all candidates
    are ranked by the returned score.

    Args:
        query_df: Queries with `qid` and `text` columns.
        qrel_df: Relevance judgements (see `produce_ground_truth`).
        model: Object exposing `query_model` and `product_model`.
        collection: Products with `id` and `product_embedding` columns.
        sentence_transformer: Encoder used to embed the query text.
        k: Cutoff for precision and recall. NDCG is computed with the default
            cutoff of `normalized_discounted_cumulative_gain`, independent of `k`.

    Returns:
        `(mean NDCG, mean precision@k, mean recall@k)` over the evaluated
        queries. Queries with missing text or without any judged candidate in
        the collection are skipped.
    """
    ndcg = []
    precision = []
    recall = []
    for _, row in query_df.iterrows():
        # pd.isna also catches None / pd.NA / non-singleton NaN, which the
        # identity test `is np.nan` silently let through.
        if pd.isna(row.text):
            continue
        # optimal ranking
        y_true = produce_ground_truth(int(row.qid), qrel_df, collection)
        # get the candidates
        df_candidates = collection[collection['id'].isin(y_true.keys())].copy()
        if df_candidates.empty:
            # np.stack raises on an empty sequence; there is nothing to rank.
            continue
        candidates = tf.data.Dataset.from_tensor_slices(np.stack(df_candidates['product_embedding'].values))
        # build index
        brute_force = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
        brute_force.index_from_dataset(candidates.batch(128).map(model.product_model))
        # rank every candidate (k here is the candidate count, not the metric cutoff)
        query_embedding = np.array(sentence_transformer.encode([str(row.text)]))
        scores, indices = brute_force(query_embedding, k=df_candidates.shape[0])
        indices = indices.numpy().flatten()
        scores = scores.numpy().flatten()
        topk_df = df_candidates.iloc[indices].copy()
        topk_df['score'] = scores
        topk_df = topk_df.sort_values(by='score', ascending=False).reset_index(drop=True)
        y_pred = produce_y_pred(topk_df, y_true)
        ndcg.append(normalized_discounted_cumulative_gain(y_pred))
        precision.append(precision_at_k(y_pred, y_true, k))
        recall.append(recall_at_k(y_pred, y_true, k))

    return np.mean(ndcg), np.mean(precision), np.mean(recall)


def evaluate_gte(query_df: pd.DataFrame, qrel_df: pd.DataFrame,
                 collection: pd.DataFrame, sentence_transformer: SentenceTransformer,
                 k=10) -> Tuple[float, float, float]:
    """Evaluate the raw sentence-transformer embeddings as a retrieval baseline.

    For every row of `query_df` the judged documents present in `collection`
    are ranked by cosine similarity between the encoded query text and the
    stored `product_embedding` vectors.

    Args:
        query_df: Queries with `qid` and `text` columns.
        qrel_df: Relevance judgements (see `produce_ground_truth`).
        collection: Products with `id` and `product_embedding` columns.
        sentence_transformer: Encoder used to embed the query text.
        k: Cutoff for precision and recall. NDCG is computed with the default
            cutoff of `normalized_discounted_cumulative_gain`, independent of `k`.

    Returns:
        `(mean NDCG, mean precision@k, mean recall@k)` over the evaluated
        queries. Queries with missing text or without any judged candidate in
        the collection are skipped.
    """
    ndcg = []
    precision = []
    recall = []
    for _, row in query_df.iterrows():
        # pd.isna also catches None / pd.NA / non-singleton NaN, which the
        # identity test `is np.nan` silently let through.
        if pd.isna(row.text):
            continue
        # optimal ranking
        y_true = produce_ground_truth(int(row.qid), qrel_df, collection)
        # get the candidates
        df_candidates = collection[collection['id'].isin(y_true.keys())].copy()
        if df_candidates.empty:
            # np.stack raises on an empty sequence; there is nothing to rank.
            continue
        product_embeddings = np.stack(df_candidates['product_embedding'].values)
        query_embedding = np.array(sentence_transformer.encode([str(row.text)]))
        df_candidates['score'] = cosine_similarity(query_embedding, product_embeddings).flatten()
        topk_df = df_candidates.sort_values(by='score', ascending=False).reset_index(drop=True)
        y_pred = produce_y_pred(topk_df, y_true)
        ndcg.append(normalized_discounted_cumulative_gain(y_pred))
        precision.append(precision_at_k(y_pred, y_true, k))
        recall.append(recall_at_k(y_pred, y_true, k))

    return np.mean(ndcg), np.mean(precision), np.mean(recall)

In [4]:
# Product collection with a `product_embedding` column.
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted pickles.
collection = pd.read_pickle(PCOLLECTION_FILE_PATH)
candidates = tf.data.Dataset.from_tensor_slices(
    np.stack(collection['product_embedding'].values)
)

# Headerless, tab-separated query and relevance-judgement files.
qid2query_df = pd.read_csv(QUERY_TO_QUERY, sep='\t', header=None, names=['qid', 'text'])
qrel_df = pd.read_csv(
    QREL_TEST,
    sep='\t',
    header=None,
    names=['qid', '0', 'docid', 'relevance_score'],
)

# Keep only the query ids that occur in both files.
common_qids = set(qrel_df['qid']) & set(qid2query_df['qid'])
qrel_df = qrel_df[qrel_df['qid'].isin(common_qids)]
qid2query_df = qid2query_df[qid2query_df['qid'].isin(common_qids)]

# NOTE(review): trust_remote_code=True runs model code downloaded from the hub.
sentence_transformer = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
print(f'Testing the model with {qid2query_df.shape} queries on {qrel_df.shape} pairs')

Testing the model with (186, 2) queries on (115490, 4) pairs


## Evaluate the pretrained GTE model embeddings
As a second baseline, we evaluate the retrieval performance of the [GTE_large](https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5) model on our specific dataset. This model ranks 32nd on the overall [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard), 21st on retrieval tasks, and 51st on reranking tasks (as of 12.05.2024). For reference, BM25s ranks 182nd and [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) ranks 139th.

In [7]:
# Report the GTE baseline at cutoffs 5 and 10. The NDCG label stays "@10"
# because the NDCG helper is called with its default cutoff regardless of k.
for k in (5, 10):
    score = evaluate_gte(qid2query_df, qrel_df, collection, sentence_transformer, k=k)
    print(f'NDCG@10: {score[0]}')
    print(f"Precision@{k}: {score[1]}")
    print(f"Recall@{k}: {score[2]}")
    if k == 5:
        # Separator between the two reports.
        print('\n')

NDCG@10: 0.6173832795698925
Precision@5: 0.6795698924731183
Recall@5: 0.1788642520648675


NDCG@10: 0.6173832795698925
Precision@10: 0.5973118279569892
Recall@10: 0.2615271195428554


## Evaluate SYNGET Models
This model is now fine-tuned on our synthetic positive pairs using one ELU layer followed by a linear layer.

### SYNGET-U

In [8]:
# Load the SYNGET-U weights and report metrics at cutoffs 5 and 10.
# The NDCG label stays "@10" because NDCG uses its default cutoff regardless of k.
model = TwoTowerModel(candidates)
model.load_weights(SYNGET_U)
for k in (5, 10):
    score = evaluate_two_tower(qid2query_df, qrel_df, model, collection, sentence_transformer, k=k)
    print(f'NDCG@10: {score[0]}')
    print(f"Precision@{k}: {score[1]}")
    print(f"Recall@{k}: {score[2]}")
    if k == 5:
        # Separator between the two reports.
        print('\n')

NDCG@10: 0.1653624193548387
Precision@5: 0.23978494623655916
Recall@5: 0.029447296198577153


NDCG@10: 0.1653624193548387
Precision@10: 0.24301075268817207
Recall@10: 0.05284969649480427


### SYNGET-0.6

In [9]:
# Load the SYNGET-0.6 weights and report metrics at cutoffs 5 and 10.
# The NDCG label stays "@10" because NDCG uses its default cutoff regardless of k.
model = TwoTowerModel(candidates)
model.load_weights(SYNGET_06)
for k in (5, 10):
    score = evaluate_two_tower(qid2query_df, qrel_df, model, collection, sentence_transformer, k=k)
    print(f'NDCG@10: {score[0]}')
    print(f"Precision@{k}: {score[1]}")
    print(f"Recall@{k}: {score[2]}")
    if k == 5:
        # Separator between the two reports.
        print('\n')

NDCG@10: 0.16914865591397849
Precision@5: 0.26344086021505375
Recall@5: 0.030466103954099443


NDCG@10: 0.16914865591397849
Precision@10: 0.25967741935483873
Recall@10: 0.05712073493564846


### SYNGET-0.7

In [10]:
# Load the SYNGET-0.7 weights and report metrics at cutoffs 5 and 10.
# The NDCG label stays "@10" because NDCG uses its default cutoff regardless of k.
model = TwoTowerModel(candidates)
model.load_weights(SYNGET_07)
for k in (5, 10):
    score = evaluate_two_tower(qid2query_df, qrel_df, model, collection, sentence_transformer, k=k)
    print(f'NDCG@10: {score[0]}')
    print(f"Precision@{k}: {score[1]}")
    print(f"Recall@{k}: {score[2]}")
    if k == 5:
        # Separator between the two reports.
        print('\n')

NDCG@10: 0.21971370967741932
Precision@5: 0.32903225806451614
Recall@5: 0.039629984038986085


NDCG@10: 0.21971370967741932
Precision@10: 0.31451612903225806
Recall@10: 0.07222742633021043


### SYNGET-0.8

In [14]:
# Load the SYNGET-0.8 weights and report metrics at cutoffs 5 and 10.
# The NDCG label stays "@10" because NDCG uses its default cutoff regardless of k.
model = TwoTowerModel(candidates)
model.load_weights(SYNGET_08)
for k in (5, 10):
    score = evaluate_two_tower(qid2query_df, qrel_df, model, collection, sentence_transformer, k=k)
    print(f'NDCG@10: {score[0]}')
    print(f"Precision@{k}: {score[1]}")
    print(f"Recall@{k}: {score[2]}")
    if k == 5:
        # Separator between the two reports.
        print('\n')

NDCG@10: 0.6597959677419355
Precision@5: 0.710752688172043
Recall@5: 0.1700878859727594


NDCG@10: 0.6597959677419355
Precision@10: 0.6349462365591397
Recall@10: 0.2531176025286843


### SYNGET-0.8 AC

In [15]:
# Load the SYNGET-0.8 AC weights and report metrics at cutoffs 5 and 10.
# The NDCG label stays "@10" because NDCG uses its default cutoff regardless of k.
model = TwoTowerModel(candidates)
model.load_weights(SYNGET_08_AC)
for k in (5, 10):
    score = evaluate_two_tower(qid2query_df, qrel_df, model, collection, sentence_transformer, k=k)
    print(f'NDCG@10: {score[0]}')
    print(f"Precision@{k}: {score[1]}")
    print(f"Recall@{k}: {score[2]}")
    if k == 5:
        # Separator between the two reports.
        print('\n')

NDCG@10: 0.5642075806451613
Precision@5: 0.6258064516129033
Recall@5: 0.12341903977208206


NDCG@10: 0.5642075806451613
Precision@10: 0.5725806451612904
Recall@10: 0.20759569236918432
