In [1]:
#%pip install -U tensorflow[and-cuda] tensorflow_recommenders sentence_transformers tf_keras

In [2]:
import os
import gc
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from sklearn.metrics.pairwise import cosine_similarity
import pyarrow.parquet as pq
from typing import Tuple, Generator, Dict, Text
import math
import time

# Seed TensorFlow and NumPy so repeated runs are comparable
SEED = 11925939
tf.random.set_seed(SEED)
np.random.seed(SEED)
# When True the product embeddings are re-encoded and the cached pickle is overwritten
GENRATE_EMBEDDINGS = True

# Enable memory growth on every visible GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"{len(gpus)} GPU(s) Available: {gpus}")
    except RuntimeError as e:
        # Report the problem but keep the notebook running
        print(e)
else:
    print("No GPUs found. Using CPU.")

print("TensorFlow version:", tf.__version__)

# Input / cache locations (relative to the notebook's working directory)
PCOLLECTION_FILE_PATH = '../p_collection.pkl'
ALL_MINI_L6_EMBEDDINGS = 'all-MiniLM-L6-v2.pkl'
QUERY_TO_QUERY = '../qid2query.tsv'
QREL_TEST = '../QREL/2023test.qrel'

# Report the PyTorch build that sentence-transformers will run on
import torch
print(torch.__version__)           # PyTorch version
print(torch.cuda.is_available())    # Should be True
print(torch.version.cuda)

# Make sure PYTORCH_CUDA_ALLOC_CONF is absent from the environment before the
# SentenceTransformer model is created further down.
# NOTE(review): this cell used to set the variable to "expandable_segments:True"
# and then pop it again; only the pop had a lasting effect, so the set was
# dropped. This presumes PyTorch had not read the variable in between - confirm.
os.environ.pop("PYTORCH_CUDA_ALLOC_CONF", None)

def encode_batch(batch):
    """Encode ``batch`` with the module-level ``sentence_transformer`` model.

    The model is looked up when the function is called, so it only has to be
    assigned before the first call, not before this definition.
    """
    encoded = sentence_transformer.encode(batch)
    return encoded


sentence_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# NOTE: unpickling can execute arbitrary code - only load collection files you trust
collection = pd.read_pickle(PCOLLECTION_FILE_PATH)

# Re-encode when explicitly requested, and also when the cache file is missing:
# the evaluation further down reads ALL_MINI_L6_EMBEDDINGS unconditionally.
if GENRATE_EMBEDDINGS or not os.path.exists(ALL_MINI_L6_EMBEDDINGS):
    start = time.time()
    product_texts = collection['product_text'].values
    # np.asarray avoids copying the result again when encode() already returns
    # an ndarray (presumably its default - see the encode() documentation)
    embeddings = np.asarray(sentence_transformer.encode(product_texts))

    # One row per product, in the row order of `collection`, with a 0..n-1 index;
    # each cell holds that product's embedding vector
    df = pd.DataFrame(index=range(len(embeddings)))
    df['product_embeddings'] = list(embeddings)
    df.to_pickle(ALL_MINI_L6_EMBEDDINGS)
    print(f'Generating {len(df)} embeddings took {time.time() - start} seconds')
    del df
    


def produce_ground_truth(qid: int, qrel: pd.DataFrame, collection: pd.DataFrame) -> dict:
    """Return the relevance judgements of one query as ``{docid: relevance_score}``.

    Args:
        qid: Query id matched against ``qrel['qid']``.
        qrel: Judgements with ``qid``, ``docid`` and ``relevance_score`` columns.
        collection: Product table; judgements whose ``docid`` does not occur in
            ``collection['id']`` are dropped.

    Returns:
        A dict mapping docid to relevance score, with entries inserted in
        descending order of relevance score.
    """
    in_collection = qrel['docid'].isin(collection['id'].values)
    judged = qrel[(qrel['qid'] == qid) & in_collection]
    ranked = judged.sort_values(by='relevance_score', ascending=False)
    return ranked.set_index('docid')['relevance_score'].to_dict()

def produce_y_pred(topk: pd.DataFrame, y_true) -> dict:
    """Map each ranked document to its true relevance score, keeping rank order.

    Args:
        topk: Ranked results; the ``id`` column is read top to bottom.
        y_true: Mapping docid -> relevance score used for the lookup.

    Returns:
        A dict ``{docid: relevance_score}`` whose insertion order follows the
        row order of ``topk``.

    Raises:
        KeyError: if an id in ``topk`` has no entry in ``y_true``.
    """
    # A plain dict preserves the ranking order; no temporary DataFrame is needed.
    # tolist() turns NumPy scalars into native Python keys.
    ranked_ids = topk['id'].values.tolist()
    return {doc_id: y_true[doc_id] for doc_id in ranked_ids}
        
def normalized_discounted_cumulative_gain(temp_set, p=10):
    """Compute NDCG at cutoff ``p`` for a ranked ``{docid: relevance}`` mapping.

    The values of ``temp_set`` are read in insertion order as the predicted
    ranking; the ideal ranking is the same values sorted in descending order.
    Each gain is the raw relevance value divided by ``log2(position + 1)``.

    Args:
        temp_set: Mapping whose values are relevance scores in predicted order.
        p: Number of leading positions to score. Only a position equal to
            ``p`` stops the accumulation, so a cutoff that is never reached
            scores the whole list.

    Returns:
        DCG / IDCG rounded to 5 decimals, or ``0.0`` when the ideal DCG is 0
        (empty input or no positive relevance), which previously raised
        ``ZeroDivisionError``.
    """
    def _dcg(gains):
        # Discounted cumulative gain over at most the first `p` positions
        total = 0
        for pos, gain in enumerate(gains, start=1):
            total += gain / math.log2(pos + 1)
            if pos == p:
                break
        return total

    predicted_gains = list(temp_set.values())
    idc_gain = _dcg(sorted(predicted_gains, reverse=True))
    if idc_gain == 0:
        # Nothing relevant to rank: define NDCG as 0 instead of dividing by zero
        return 0.0
    return round(_dcg(predicted_gains) / idc_gain, 5)

def precision_at_k(predicted_dict, ideal_dict, k):
    """Fraction of the first ``k`` predicted docids that are relevant.

    A docid counts as relevant when ``ideal_dict`` holds a score above 0 for
    it; docids missing from ``ideal_dict`` count as not relevant. The
    denominator is always ``k``, even if fewer than ``k`` predictions exist.
    """
    hits = 0
    for docid in list(predicted_dict.keys())[:k]:
        if ideal_dict.get(docid, 0) > 0:
            hits += 1
    return hits / k


def recall_at_k(predicted_dict, ideal_dict, k):
    """Share of all relevant documents that appear in the first ``k`` predictions.

    Relevance is read from ``ideal_dict`` (score above 0). Returns ``0`` when
    ``ideal_dict`` contains no relevant document at all.
    """
    leading_docids = list(predicted_dict.keys())[:k]
    relevant_total = len([score for score in ideal_dict.values() if score > 0])
    if not relevant_total > 0:
        return 0
    hits = len([docid for docid in leading_docids if ideal_dict.get(docid, 0) > 0])
    return hits / relevant_total

def evaluate_two_tower(query_df: pd.DataFrame, qrel_df: pd.DataFrame, model,
                       collection: pd.DataFrame, sentence_transformer: SentenceTransformer,
                       k=10) -> Tuple[float, float, float]:
    """Evaluate a two-tower model by re-ranking each query's judged products.

    For every query, the judged products present in ``collection`` are indexed
    with a brute-force top-k layer built from ``model.query_model`` and
    ``model.product_model``. All of them are then ranked for the encoded query
    text and scored against the judgements.

    Args:
        query_df: Queries with ``qid`` and ``text`` columns.
        qrel_df: Relevance judgements passed to ``produce_ground_truth``.
        model: Object exposing ``query_model`` and ``product_model``.
        collection: Products with ``id`` and ``product_embedding`` columns.
        sentence_transformer: Encoder used to embed the query text.
        k: Cutoff for precision and recall. NDCG is computed with the default
            cutoff of ``normalized_discounted_cumulative_gain``.

    Returns:
        ``(mean NDCG, mean precision@k, mean recall@k)`` over the scored queries.
    """
    ndcg = []
    precision = []
    recall = []
    for _, row in query_df.iterrows():
        # pd.isna recognises every missing-value marker; an identity test
        # against np.nan only matches that one particular object
        if pd.isna(row.text):
            continue
        # optimal ranking
        y_true = produce_ground_truth(int(row.qid), qrel_df, collection)
        # get the candidates
        df_candidates = collection[collection['id'].isin(y_true.keys())].copy()
        if df_candidates.empty:
            # No judged product is in the collection: nothing to rank, and
            # np.stack would fail on an empty sequence
            continue
        candidates = tf.data.Dataset.from_tensor_slices(np.stack(df_candidates['product_embedding'].values))
        # build index
        brute_force = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
        brute_force.index_from_dataset(candidates.batch(128).map(model.product_model))
        # rank every candidate, not only the first k
        query_embedding = np.array(sentence_transformer.encode([str(row.text)]))
        scores, indices = brute_force(query_embedding, k=df_candidates.shape[0])
        indices = indices.numpy().flatten()
        scores = scores.numpy().flatten()
        # indices are positions within the indexed candidates, hence iloc
        topk_df = df_candidates.iloc[indices].copy()
        topk_df['score'] = scores
        topk_df = topk_df.sort_values(by='score', ascending=False).reset_index(drop=True)
        y_pred = produce_y_pred(topk_df, y_true)
        ndcg.append(normalized_discounted_cumulative_gain(y_pred))
        precision.append(precision_at_k(y_pred, y_true, k))
        recall.append(recall_at_k(y_pred, y_true, k))

    return np.mean(ndcg), np.mean(precision), np.mean(recall)


def evaluate_gte(query_df: pd.DataFrame, qrel_df: pd.DataFrame,
                 collection: pd.DataFrame, sentence_transformer: SentenceTransformer,
                 k=10) -> Tuple[float, float, float]:
    """Evaluate plain embedding similarity by re-ranking each query's judged products.

    For every query, the judged products present in ``collection`` are ranked
    by cosine similarity between the encoded query text and the stored product
    embeddings, then scored against the judgements.

    Args:
        query_df: Queries with ``qid`` and ``text`` columns.
        qrel_df: Relevance judgements passed to ``produce_ground_truth``.
        collection: Products with ``id`` and ``product_embedding`` columns.
        sentence_transformer: Encoder used to embed the query text.
        k: Cutoff for precision and recall. NDCG is computed with the default
            cutoff of ``normalized_discounted_cumulative_gain``.

    Returns:
        ``(mean NDCG, mean precision@k, mean recall@k)`` over the scored queries.
    """
    ndcg = []
    precision = []
    recall = []
    for _, row in query_df.iterrows():
        # pd.isna recognises every missing-value marker; an identity test
        # against np.nan only matches that one particular object
        if pd.isna(row.text):
            continue
        # optimal ranking
        y_true = produce_ground_truth(int(row.qid), qrel_df, collection)
        # get the candidates
        df_candidates = collection[collection['id'].isin(y_true.keys())].copy()
        if df_candidates.empty:
            # No judged product is in the collection: nothing to rank, and
            # np.stack would fail on an empty sequence
            continue
        product_embeddings = np.stack(df_candidates['product_embedding'].values)
        query_embedding = np.array(sentence_transformer.encode([str(row.text)]))
        # one query row against all candidates -> flatten to one score per candidate
        df_candidates['score'] = cosine_similarity(query_embedding, product_embeddings).flatten()
        topk_df = df_candidates.sort_values(by='score', ascending=False).reset_index(drop=True)
        y_pred = produce_y_pred(topk_df, y_true)
        ndcg.append(normalized_discounted_cumulative_gain(y_pred))
        precision.append(precision_at_k(y_pred, y_true, k))
        recall.append(recall_at_k(y_pred, y_true, k))

    return np.mean(ndcg), np.mean(precision), np.mean(recall)



# Load the cached embeddings. NOTE: unpickling can execute arbitrary code -
# only load cache files produced by this notebook.
embeddings = pd.read_pickle(ALL_MINI_L6_EMBEDDINGS)
if len(embeddings) != len(collection):
    raise ValueError(
        f'{ALL_MINI_L6_EMBEDDINGS} holds {len(embeddings)} embeddings but the '
        f'collection has {len(collection)} rows; regenerate the embeddings'
    )
# The cache is written in the row order of `collection` with a fresh 0..n-1
# index, so attach by position; index-aligned assignment would silently
# misalign if `collection` carries any other index.
collection['product_embedding'] = embeddings['product_embeddings'].to_numpy()

qid2query_df = pd.read_csv(QUERY_TO_QUERY, sep='\t', names=['qid', 'text'], header=None)
qrel_df = pd.read_csv(QREL_TEST, sep='\t', names=['qid', '0', 'docid', 'relevance_score'], header=None)
# Keep only the queries that have both a text and at least one judgement
common_qids = set(qrel_df['qid']).intersection(set(qid2query_df['qid']))
qrel_df = qrel_df[qrel_df['qid'].isin(common_qids)]
qid2query_df = qid2query_df[qid2query_df['qid'].isin(common_qids)]

# Precision/recall cutoffs to report. NDCG is always computed at the default
# cutoff of 10 inside evaluate_gte, hence the fixed "NDCG@10" label.
EVAL_CUTOFFS = (5, 10)
for position, k in enumerate(EVAL_CUTOFFS):
    if position:
        # blank separator between consecutive reports
        print('\n')
    score = evaluate_gte(qid2query_df, qrel_df, collection, sentence_transformer, k=k)
    print(f'NDCG@10: {score[0]}')
    print(f"Precision@{k}: {score[1]}")
    print(f"Recall@{k}: {score[2]}")

2025-02-13 14:15:09.492489: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-13 14:15:09.514013: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739452509.529267 1657621 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739452509.534086 1657621 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-13 14:15:09.551595: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

2 GPU(s) Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
TensorFlow version: 2.18.0
2.4.0+cu121
True
12.1
Generating 980974 embeddings took 911.9814088344574 seconds
NDCG@10: 0.5841745161290323
Precision@5: 0.6473118279569893
Recall@5: 0.16199503960159423


NDCG@10: 0.5841745161290323
Precision@10: 0.5763440860215053
Recall@10: 0.24305364129403814
