# Importing libraries

In [4]:
import os

import numpy as np
from itertools import product

from collections import Counter
from tqdm import tqdm
import numpy as np


import torch
import faiss

In [5]:
res = faiss.StandardGpuResources()  # use a single GPU

# Helper functions to generate recall@k, precision@k, and mean average precision@k results

In [6]:
def recall_at_k(actual, predicted, k):
    recall_list = []
    for index, values in enumerate(actual):
        act_set = set(actual[index].tolist())
        pred_set = set(predicted[index].tolist()[:k])
        """
        if k < len(act_set):
            result = round(len(act_set & pred_set) / k, 2)
        else:
            result = round(len(act_set & pred_set) / float(len(act_set)), 2)
        """
        result = round(len(act_set & pred_set) / float(len(act_set)), 2)
        recall_list.append(result)
    return recall_list

def precision_at_k(y_true, y_pred, k=12):
    """ Computes Precision at k for one sample
    
    Parameters
    __________
    y_true: np.array
            Array of correct recommendations (Order doesn't matter)
    y_pred: np.array
            Array of predicted recommendations (Order does matter)
    k: int, optional
       Maximum number of predicted recommendations
            
    Returns
    _______
    score: double
           Precision at k
    """    
    precision_list = []
    for index, value in enumerate(y_true):
        intersection = np.intersect1d(y_true[index], y_pred[index][:k])
        precision_list.append(len(intersection) / k)
    return precision_list

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [7]:
def generate_results_for_untrained_models(model="style", mode="most"):
    # declutr
    train_embedding_filename = "untrained_" + model + "data_train.pt"
    train_label_filename = "untrained_" + model + "labels_train.pt"

    test_embedding_filename = "untrained_" + model + "data_test.pt"
    test_label_filename = "untrained_" + model + "labels_test.pt"

    train_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_embedding_filename), map_location=torch.device('cpu'))
    train_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_label_filename), map_location=torch.device('cpu')).detach().numpy()

    test_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_embedding_filename), map_location=torch.device('cpu'))
    test_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_label_filename), map_location=torch.device('cpu')).detach().numpy()

    all_embeddings = torch.cat([torch.tensor(train_embeddings), torch.tensor(test_embeddings)])
    all_labels = torch.cat([torch.tensor(train_labels), torch.tensor(test_labels)])
    
    for percent in [1.0]:
        print("*"*50)
        nr_vendors = int(torch.unique(all_labels).shape[0] * percent)
        if mode == "most":
            print("most active % vendors:", percent * 100)
            vendors = [vendor[0] for vendor in Counter(all_labels.tolist()).most_common()[:nr_vendors]]
        else:
            print("least active % vendors:", percent * 100)
            vendors = [vendor[0] for vendor in Counter(all_labels.tolist()).most_common()[-nr_vendors:]]
            
        vendorindex = [[i for i, x in enumerate(test_labels) if x == vendor] for vendor in vendors]
        
        vendor_embeddings, vendor_labels = ([] for i in range(2))
        for index, vendor in enumerate(vendorindex):
            for vendor_index in vendor:
                vendor_embeddings.append(test_embeddings[vendor_index])
                vendor_labels.append(vendors[index])
        
        vendor_embeddings = torch.tensor(vendor_embeddings)
        vendor_labels = torch.tensor(vendor_labels)
        
        dim = train_embeddings.shape[1]
        nb = train_embeddings.shape[0]
        nq = train_embeddings.shape[0]

        index = faiss.IndexFlatIP(dim)
        # make it a flat GPU index
        gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
        gpu_index_flat.add(train_embeddings) # add vectors to the index

        k = 100
        D, I = gpu_index_flat.search(vendor_embeddings, k)

        true_label_list, predicted_label_list = ([] for i in range(2))
        for index, rank_indices in enumerate(I):
            temp_predicted_list, temp_actual_list = ([] for i in range(2))
            label = vendor_labels[index]
            predicted_label_list.append(np.array(rank_indices))
            true_label_list.append(np.where(train_labels == int(label))[0])

        # print("-"*50)
        print("========== PRECISION@K ==========")
        for i in [1, 3, 5, 10, 20, 25, 50, 100]:
            print(i , np.mean(precision_at_k(true_label_list, predicted_label_list, k=i)), np.std(precision_at_k(true_label_list, predicted_label_list, k=i)))

        print("========== RECALL@K ==========")
        for i in [1, 3, 5, 10, 20, 25, 50, 100]:
            print(i , np.mean(recall_at_k(true_label_list, predicted_label_list, k=i)), np.std(recall_at_k(true_label_list, predicted_label_list, k=i)))
            
        print("========== MAP@K ==========")
        true_label_list, predicted_label_list = ([] for i in range(2))
        for index, rank_indices in enumerate(I):
            temp_predicted_list, temp_actual_list = ([] for i in range(2))
            temp_actual_list.append(vendor_labels[index])
            for rank in rank_indices:
                temp_predicted_list.append(train_labels[rank])

            predicted_label_list.append(temp_predicted_list)
            true_label_list.append(temp_actual_list)

        for i in [1, 3, 5, 10, 20, 25, 50, 100]:
            print(i , mapk(true_label_list, predicted_label_list, k=i), np.std([apk(a,p,k) for a,p in zip(actual, predicted)]))

In [8]:
def generate_results_for_trained_models(model="style", mode="most"):
    # declutr
    train_embedding_filename = "trained_traindata_" + model + "_mean.pt"
    train_label_filename = "trained_trainlabels_" + model + "_mean.pt"

    test_embedding_filename = "trained_testdata_" + model + "_mean.pt"
    test_label_filename = "trained_testlabels_" + model + "_mean.pt"

    train_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_embedding_filename), map_location=torch.device('cpu'))
    train_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_label_filename), map_location=torch.device('cpu')).detach().numpy()

    test_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_embedding_filename), map_location=torch.device('cpu'))
    test_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_label_filename), map_location=torch.device('cpu')).detach().numpy()

    all_embeddings = torch.cat([torch.tensor(train_embeddings), torch.tensor(test_embeddings)])
    all_labels = torch.cat([torch.tensor(train_labels), torch.tensor(test_labels)])

    
    for percent in [1.0]:
        print("*"*50)
        nr_vendors = int(torch.unique(all_labels).shape[0] * percent)
        if mode == "most":
            print("most active % vendors:", percent * 100)
            vendors = [vendor[0] for vendor in Counter(all_labels.tolist()).most_common()[:nr_vendors]]
        else:
            print("least active % vendors:", percent * 100)
            vendors = [vendor[0] for vendor in Counter(all_labels.tolist()).most_common()[-nr_vendors:]]
            
        vendorindex = [[i for i, x in enumerate(test_labels) if x == vendor] for vendor in vendors]
        
        vendor_embeddings, vendor_labels = ([] for i in range(2))
        for index, vendor in enumerate(vendorindex):
            for vendor_index in vendor:
                vendor_embeddings.append(test_embeddings[vendor_index])
                vendor_labels.append(vendors[index])
        
        vendor_embeddings = torch.stack(vendor_embeddings)
        vendor_labels = torch.tensor(vendor_labels)
        
        dim = train_embeddings.shape[1]
        nb = train_embeddings.shape[0]
        nq = train_embeddings.shape[0]

        index = faiss.IndexFlatIP(dim)
        # make it a flat GPU index
        gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
        gpu_index_flat.add(train_embeddings) # add vectors to the index

        k = 100
        D, I = gpu_index_flat.search(vendor_embeddings, k)

        true_label_list, predicted_label_list = ([] for i in range(2))
        for index, rank_indices in enumerate(I):
            temp_predicted_list, temp_actual_list = ([] for i in range(2))
            label = vendor_labels[index]
            predicted_label_list.append(np.array(rank_indices))
            true_label_list.append(np.where(train_labels == int(label))[0])

        # print("-"*50)
        print("========== PRECISION@K ==========")
        for i in [1, 3, 5, 10, 20, 25, 50, 100]:
            print(i , np.mean(precision_at_k(true_label_list, predicted_label_list, k=i)))

        print("========== RECALL@K ==========")
        for i in [1, 3, 5, 10, 20, 25, 50, 100]:
            print(i , np.mean(recall_at_k(true_label_list, predicted_label_list, k=i)), np.std(recall_at_k(true_label_list, predicted_label_list, k=i)))
            
        print("========== MAP@K ==========")
        true_label_list, predicted_label_list = ([] for i in range(2))
        for index, rank_indices in enumerate(I):
            temp_predicted_list, temp_actual_list = ([] for i in range(2))
            temp_actual_list.append(vendor_labels[index])
            for rank in rank_indices:
                temp_predicted_list.append(train_labels[rank])

            predicted_label_list.append(temp_predicted_list)
            true_label_list.append(temp_actual_list)

        # print("-"*50)
        for i in [1, 3, 5, 10, 20, 25, 50, 100]:
            print(i , mapk(true_label_list, predicted_label_list, k=i))

In [16]:
def generate_results_for_models(model="style", mode="most", model_type="trained"):
    
    if model_type == "trained":
        train_embedding_filename = "trained_traindata_" + model + "_mean.pt"
        train_label_filename = "trained_trainlabels_" + model + "_mean.pt"

        test_embedding_filename = "trained_testdata_" + model + "_mean.pt"
        test_label_filename = "trained_testlabels_" + model + "_mean.pt"
    else:
        train_embedding_filename = "untrained_" + model + "data_train.pt"
        train_label_filename = "untrained_" + model + "labels_train.pt"

        test_embedding_filename = "untrained_" + model + "data_test.pt"
        test_label_filename = "untrained_" + model + "labels_test.pt"

    train_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_embedding_filename), map_location=torch.device('cpu'))
    train_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_label_filename), map_location=torch.device('cpu')).detach().numpy()

    test_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_embedding_filename), map_location=torch.device('cpu'))
    test_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_label_filename), map_location=torch.device('cpu')).detach().numpy()

    all_embeddings = torch.cat([torch.tensor(train_embeddings), torch.tensor(test_embeddings)])
    all_labels = torch.cat([torch.tensor(train_labels), torch.tensor(test_labels)])
    
    for percent in [1.0]:
        print("*"*50)
        nr_vendors = int(torch.unique(all_labels).shape[0] * percent)
        if mode == "most":
            print("most active % vendors:", percent * 100)
            vendors = [vendor[0] for vendor in Counter(all_labels.tolist()).most_common()[:nr_vendors]]
        else:
            print("least active % vendors:", percent * 100)
            vendors = [vendor[0] for vendor in Counter(all_labels.tolist()).most_common()[-nr_vendors:]]
            
        vendorindex = [[i for i, x in enumerate(test_labels) if x == vendor] for vendor in vendors]
        
        vendor_embeddings, vendor_labels = ([] for i in range(2))
        for index, vendor in enumerate(vendorindex):
            for vendor_index in vendor:
                vendor_embeddings.append(test_embeddings[vendor_index])
                vendor_labels.append(vendors[index])
        
        if model_type == "trained":
            vendor_embeddings = torch.stack(vendor_embeddings)
            vendor_labels = torch.tensor(vendor_labels)
        else:
            vendor_embeddings = torch.tensor(vendor_embeddings)
            vendor_labels = torch.tensor(vendor_labels)
        
        dim = train_embeddings.shape[1]
        nb = train_embeddings.shape[0]
        nq = train_embeddings.shape[0]

        index = faiss.IndexFlatIP(dim)
        # make it a flat GPU index
        gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
        gpu_index_flat.add(train_embeddings) # add vectors to the index

        k = 100
        D, I = gpu_index_flat.search(vendor_embeddings, k)

        true_label_list, predicted_label_list = ([] for i in range(2))
        for index, rank_indices in enumerate(I):
            temp_predicted_list, temp_actual_list = ([] for i in range(2))
            label = vendor_labels[index]
            predicted_label_list.append(np.array(rank_indices))
            true_label_list.append(np.where(train_labels == int(label))[0])

        # print("-"*50)
        print("========== PRECISION@K ==========")
        for i in [1, 3, 5, 10, 20, 25, 50, 100]:
            print(i , np.mean(precision_at_k(true_label_list, predicted_label_list, k=i)), np.std(precision_at_k(true_label_list, predicted_label_list, k=i)))

        print("========== RECALL@K ==========")
        for i in [1, 3, 5, 10, 20, 25, 50, 100]:
            print(i , np.mean(recall_at_k(true_label_list, predicted_label_list, k=i)), np.std(recall_at_k(true_label_list, predicted_label_list, k=i)))
            
        print("========== MAP@K ==========")
        true_label_list, predicted_label_list = ([] for i in range(2))
        for index, rank_indices in enumerate(I):
            temp_predicted_list, temp_actual_list = ([] for i in range(2))
            temp_actual_list.append(vendor_labels[index])
            for rank in rank_indices:
                temp_predicted_list.append(train_labels[rank])

            predicted_label_list.append(temp_predicted_list)
            true_label_list.append(temp_actual_list)

        for i in [1, 3, 5, 10, 20, 25, 50, 100]:
            print(i , mapk(true_label_list, predicted_label_list, k=i), np.std([apk(a,p,i) for a,p in zip(true_label_list, predicted_label_list)]))

# Trained and Un-trained DeCLUTr model

In [None]:
generate_results_for_models(model="declutr", mode="most", model_type="untrained")

In [None]:
generate_results_for_models(model="declutr", mode="most", model_type="trained")

# Trained and Un-trained Style Representation Model

In [None]:
generate_results_for_models(model="style", mode="most", model_type="untrained")

In [None]:
generate_results_for_models(model="style", mode="most", model_type="trained")

# R-Precision score

In [7]:
from tqdm import tqdm

In [8]:
train_embedding_filename = "trained_traindata_" + "declutr" + "_mean.pt"
train_label_filename = "trained_trainlabels_" + "declutr" + "_mean.pt"

test_embedding_filename = "trained_testdata_" + "declutr" + "_mean.pt"
test_label_filename = "trained_testlabels_" + "declutr" + "_mean.pt"


train_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_embedding_filename), map_location=torch.device('cpu'))
train_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_label_filename), map_location=torch.device('cpu')).detach().numpy()

test_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_embedding_filename), map_location=torch.device('cpu'))
test_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_label_filename), map_location=torch.device('cpu')).detach().numpy()

In [27]:
dim = train_embeddings.shape[1]
nb = train_embeddings.shape[0]
nq = train_embeddings.shape[0]

index = faiss.IndexFlatIP(dim)
# make it a flat GPU index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
gpu_index_flat.add(train_embeddings) # add vectors to the index

In [28]:
K = 1500

In [29]:
vendor_dict = {}
unique_vendors = torch.unique(torch.tensor(test_labels))

pbar = tqdm(total=unique_vendors.shape[0])
for vendor_id in unique_vendors:
    vendor_id = int(vendor_id)
    # train_dataset
    train_adsidx = [i for i, x in enumerate(train_labels) if x == vendor_id]
    vendor_dict[vendor_id] = len(train_adsidx)
    pbar.update(1)
pbar.close()

100%|██████████| 4487/4487 [00:57<00:00, 78.16it/s] 


In [30]:
r_precision_score = {}

pbar = tqdm(total=unique_vendors.shape[0])
# Iterating through all unique vendors in the test dataset
for vendor_id in unique_vendors:
    vendor_id = int(vendor_id)
    
    # Collecting one vendor at a time from the test dataset
    test_adsidx = [i for i, x in enumerate(test_labels) if x == vendor_id]
    test_vendor_embeddings, test_vendor_labels = ([] for i in range(2))
    for _, ad_index in enumerate(test_adsidx):
        test_vendor_embeddings.append(test_embeddings[ad_index])
        test_vendor_labels.append(vendor_id)
    
    test_vendor_embeddings = torch.stack(test_vendor_embeddings)
    test_vendor_labels = torch.tensor(test_vendor_labels)

    # Performing K-Clustering for X relevant ads
    D, I = gpu_index_flat.search(test_vendor_embeddings, vendor_dict[vendor_id])
    
    true_label_list, predicted_label_list = ([] for i in range(2))
    for index, rank_indices in enumerate(I):
        temp_predicted_list, temp_actual_list = ([] for i in range(2))
        label = test_vendor_labels[index]
        
        predicted_label_list.append(np.array(rank_indices))
        true_label_list.append(np.where(train_labels == int(label))[0])
    
    # print(predicted_label_list)
    # print(true_label_list)
    r_precision_score[vendor_id] = np.mean(recall_at_k(true_label_list, predicted_label_list, vendor_dict[vendor_id]))
    pbar.update(1)
pbar.close()

100%|██████████| 4487/4487 [00:28<00:00, 159.18it/s]


In [10]:
def generate_rprecision_results_for_models(model="style", model_type="trained"):
    if model_type == "trained":
        train_embedding_filename = "trained_traindata_" + model + "_mean.pt"
        train_label_filename = "trained_trainlabels_" + model + "_mean.pt"

        test_embedding_filename = "trained_testdata_" + model + "_mean.pt"
        test_label_filename = "trained_testlabels_" + model + "_mean.pt"
    else:
        train_embedding_filename = "untrained_" + model + "data_train.pt"
        train_label_filename = "untrained_" + model + "labels_train.pt"

        test_embedding_filename = "untrained_" + model + "data_test.pt"
        test_label_filename = "untrained_" + model + "labels_test.pt"

    train_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_embedding_filename), map_location=torch.device('cpu'))
    train_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", train_label_filename), map_location=torch.device('cpu')).detach().numpy()
    
    test_embeddings = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_embedding_filename), map_location=torch.device('cpu'))
    test_labels = torch.load(os.path.join(os.getcwd(), "../pickled/embeddings/", test_label_filename), map_location=torch.device('cpu')).detach().numpy()
    
    dim = train_embeddings.shape[1]
    nb = train_embeddings.shape[0]
    nq = train_embeddings.shape[0]

    index = faiss.IndexFlatIP(dim)
    # make it a flat GPU index
    gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
    gpu_index_flat.add(train_embeddings) # add vectors to the index
    
    # The highest number of instances a vendor has in the training dataset is 1400
    K = 1500
    
    vendor_dict = {}
    unique_vendors = torch.unique(torch.tensor(test_labels))

    pbar = tqdm(total=unique_vendors.shape[0])
    for vendor_id in unique_vendors:
        vendor_id = int(vendor_id)
        # train_dataset
        train_adsidx = [i for i, x in enumerate(train_labels) if x == vendor_id]
        vendor_dict[vendor_id] = len(train_adsidx)
        pbar.update(1)
    pbar.close()
    
    r_precision_score = {}

    pbar = tqdm(total=unique_vendors.shape[0])
    # Iterating through all unique vendors in the test dataset
    for vendor_id in unique_vendors:
        vendor_id = int(vendor_id)

        # Collecting one vendor at a time from the test dataset
        test_adsidx = [i for i, x in enumerate(test_labels) if x == vendor_id]
        test_vendor_embeddings, test_vendor_labels = ([] for i in range(2))
        for _, ad_index in enumerate(test_adsidx):
            test_vendor_embeddings.append(test_embeddings[ad_index])
            test_vendor_labels.append(vendor_id)

        if model_type == "trained":
            test_vendor_embeddings = torch.stack(test_vendor_embeddings)
            test_vendor_labels = torch.tensor(test_vendor_labels)
        else:
            test_vendor_embeddings = torch.tensor(test_vendor_embeddings)
            test_vendor_labels = torch.tensor(test_vendor_labels)

        # Performing K-Clustering for X relevant ads
        D, I = gpu_index_flat.search(test_vendor_embeddings, vendor_dict[vendor_id])

        true_label_list, predicted_label_list = ([] for i in range(2))
        for index, rank_indices in enumerate(I):
            temp_predicted_list, temp_actual_list = ([] for i in range(2))
            label = test_vendor_labels[index]

            predicted_label_list.append(np.array(rank_indices))
            true_label_list.append(np.where(train_labels == int(label))[0])

        # print(predicted_label_list)
        # print(true_label_list)
        r_precision_score[vendor_id] = np.mean(recall_at_k(true_label_list, predicted_label_list, vendor_dict[vendor_id]))
        pbar.update(1)
    pbar.close()
    
    print("R precision mean:", np.mean(list(r_precision_score.values())))
    print("R precision std:", np.std(list(r_precision_score.values())))

    return r_precision_score

# Style Embedding models

In [13]:
r_precision = generate_rprecision_results_for_models()

100%|██████████| 4487/4487 [00:56<00:00, 79.16it/s] 
100%|██████████| 4487/4487 [00:22<00:00, 196.07it/s]


R precision mean: 0.8601571023493308
R precision std: 0.22739694373594802


In [11]:
r_precision = generate_rprecision_results_for_models(model_type="untrained")

100%|██████████| 4487/4487 [00:51<00:00, 87.16it/s] 
  test_vendor_embeddings = torch.tensor(test_vendor_embeddings)
100%|██████████| 4487/4487 [00:21<00:00, 207.81it/s]


R precision mean: 0.019967233403539328
R precision std: 0.07883495708402365


# Declutr models

In [12]:
r_precision = generate_rprecision_results_for_models(model="declutr")

100%|██████████| 4487/4487 [00:49<00:00, 90.88it/s] 
100%|██████████| 4487/4487 [00:23<00:00, 191.30it/s]


R precision mean: 0.8852071534463589
R precision std: 0.2086283578160882


In [13]:
r_precision = generate_rprecision_results_for_models(model="declutr", model_type="untrained")

100%|██████████| 4487/4487 [00:49<00:00, 89.89it/s] 
100%|██████████| 4487/4487 [00:24<00:00, 185.43it/s]


R precision mean: 0.16419733860200036
R precision std: 0.23623486751891842
