In [None]:
import pandas as pd
import glob, sys, os
import json, torch, random
import numpy as np
sys.path.append("../")
import utilities as utl
from sentence_transformers import SentenceTransformer
from pympler import asizeof
from sklearn.cluster import KMeans, AgglomerativeClustering
from bkmeans import BKMeans
from sklearn.metrics import pairwise_distances
import div_utilities as div_utl
import copy
from transformers import BertTokenizer, BertModel, RobertaTokenizerFast, RobertaModel
from model_classes import BertClassifierPretrained, BertClassifier
from glove_embeddings import GloveTransformer
import fasttext_embeddings as ft
from torch.nn.parallel import DataParallel
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and, when present, all CUDA devices) with the same value.
random_seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)


In [None]:
# Inputs for this notebook: a query table and the k data lake tuples shortlisted for it.
benchmark_name = "ugen_benchmark"
benchmark_folder_path = "/home/khatiwada/starmie/data/ugen_tuple"
algorithm = "Starmie"
# A trailing empty component makes os.path.join end the path with a separator,
# so file names can be appended to these folders by plain string concatenation.
union_query_folder_path = os.path.join(benchmark_folder_path, "query", "")
union_datalake_folder_path = os.path.join(benchmark_folder_path, "datalake", "")

# benchmark name -> (k, result_size)
_benchmark_sizes = {
    "ugen_benchmark": (30, 100),
    "labeled_benchmark": (100, 500),
}
if benchmark_name not in _benchmark_sizes:
    print(f"Unknown benchmark: {benchmark_name}")
    sys.exit()
k, result_size = _benchmark_sizes[benchmark_name]

lmda = 0.7  # forwarded to div_utl.compute_metrics
s_dict_max = 2500
q_dict_max = 100  # query tables with more tuples than this are randomly down-sampled
metric = "cosine" # cosine, l1, l2
embedding_type = "dust"

# Output locations (plots, result tables, statistics).
eplot_folder_path = os.path.join("div_plots", "embedding_plots", "")
cplot_folder_path = os.path.join("div_plots", "cluster_plots", "")
result_folder_path = os.path.join("div_result_tables", "")
_run_tag = "__".join((benchmark_name, metric, embedding_type))
stats_df_path = os.path.join("div_stats", _run_tag + ".csv")
updated_stats_df_path = os.path.join("final_stats", _run_tag + "__starmie.csv")

# Behaviour switches.
normalize, max_metric, compute_metric = True, False, True
full_dust = False  # when True, data lake columns absent from the query table are dropped
save_results = False  # when True, the selected tuples are written to a text file per query

In [None]:
# Folder for the result files of this benchmark / metric / embedding combination.
div_result_path = os.path.join(r"div_result_tables", benchmark_name, metric, embedding_type)
# exist_ok=True removes the check-then-create race and keeps the cell safe to re-run.
os.makedirs(div_result_path, exist_ok=True)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# diversity results for starmie
model_path = r'../out_model/tus_benchmark_corrected_roberta/checkpoints/best-checkpoint.pt'
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained('roberta-base')
model = BertClassifier(model, num_labels = 2, hidden_size = 768, output_size = 768)

# Use up to four GPUs (the original hard-coded ids 0-3) but never more than are
# actually visible, so the cell also runs on smaller machines and on CPU-only hosts.
# The model is wrapped before the checkpoint is loaded, exactly as before
# (presumably the checkpoint was saved from a DataParallel-wrapped model -- TODO confirm).
gpu_ids = list(range(min(4, torch.cuda.device_count())))
model = DataParallel(model, device_ids=gpu_ids or None)
#print(model)
# map_location lets a checkpoint that holds CUDA tensors be loaded without a GPU.
model.load_state_dict(torch.load(model_path, map_location=device))
# Assigning the result keeps the (very long) module repr out of the cell output.
model = model.to(device)
# NOTE(review): model.eval() is never called in this notebook; confirm that
# utl.EmbedTuples switches the model to evaluation mode before embedding.

In [None]:
# Load Starmie's ranked results (query file name -> list of retrieved file names)
# and keep only the first k entries per query for the accuracy evaluation.
output_tuples = utl.loadDictionaryFromPickleFile(f"starmie_results/{benchmark_name}_top-{result_size}_results_by_starmie.pickle")
output_tuples_pr = {query: ranked[:k] for query, ranked in output_tuples.items()}

# Rewrite each retrieved name as "<text before the last '_'>.<text after the last '.'>"
# so that it can be looked up in the ground truth.
output_tuples_for_accuracy = {
    query: [name.rsplit("_", 1)[0] + "." + name.rsplit(".", 1)[-1] for name in ranked]
    for query, ranked in output_tuples_pr.items()
}

# Ground truth, with the values converted to sets for fast membership tests.
union_groundtruth_file_path = f"../groundtruth/{benchmark_name}_union_groundtruth.pickle"
union_groundtruth = {
    query: set(tables)
    for query, tables in utl.loadDictionaryFromPickleFile(union_groundtruth_file_path).items()
}

In [None]:
# Per-query precision@k and "MAP" of the top-k Starmie results.
# "MAP" here is the mean of precision@i for i = 1..k (the same definition the
# original cell used), later averaged over all queries.
all_query_precision = {}
all_query_map = {}
for query, current_tuples in output_tuples_for_accuracy.items():
    relevant = union_groundtruth[query]
    # 1 where the result at that rank is in the ground truth, 0 otherwise;
    # padded with zeros when fewer than k results were returned.
    current_truth = [1 if each in relevant else 0 for each in current_tuples[:k]]
    current_truth.extend([0] * (k - len(current_truth)))

    all_query_precision[query] = sum(current_truth) / k

    # precision@i must count the hits among the first i results. The previous
    # version summed current_truth[:i - 1], which forced precision@1 to 0 and
    # ignored the result at rank i for every cut-off.
    each_precision = []
    hits_so_far = 0
    for rank, is_hit in enumerate(current_truth, start=1):
        hits_so_far += is_hit
        each_precision.append(hits_so_far / rank)
    all_query_map[query] = sum(each_precision) / len(each_precision)
print("Average Precision:", sum(list(all_query_precision.values()))/ len(all_query_precision)) 
print("MAP: ", sum(list(all_query_map.values()))/ len(all_query_map))

In [None]:
# Display the per-query MAP values (query name -> score).
all_query_map

In [None]:
# Diversity statistics of the tuples returned by Starmie.
# For every query: take the first k tuples from the retrieved data lake tables,
# embed them together with the (possibly down-sampled) query tuples, and add one
# row per entry returned by div_utl.compute_metrics to stats_df.
stats_df = pd.DataFrame(columns = ["algorithm", "embedding_type", "query_name", "|S|", "|q|", "k", "algorithm_distance_function", "evaluation_distance_function", "with_query_flag", "normalized", "max_div_score", "max-min_div_score", "avg_div_score", "time_taken_(s)"])
for query_name in output_tuples:
    try:
        query_table = utl.read_csv_file(union_query_folder_path + query_name)
        columns_in_query = set(query_table.columns.astype(str))

        # Collect the first k serialized tuples, walking the retrieved tables in rank order.
        tuple_id = 0
        dl_tuple_dict = {}
        unionable_tables = output_tuples[query_name]
        for dl_table in unionable_tables:
            current_dl_table = utl.read_csv_file(union_datalake_folder_path + dl_table)
            current_dl_table.columns = current_dl_table.columns.astype(str)
            if full_dust:
                # alignment in dataset is already done in previous phase, we only need to drop the columns not present in query.
                columns_to_drop = set(current_dl_table.columns) - columns_in_query
                current_dl_table = current_dl_table.drop(columns=list(columns_to_drop))
            for tup in utl.SerializeTable(current_dl_table):
                dl_tuple_dict[tuple_id] = tup
                tuple_id += 1
                if len(dl_tuple_dict) >= k:
                    break
            if len(dl_tuple_dict) >= k:
                break

        S_dict = utl.EmbedTuples(list(dl_tuple_dict.values()), model, embedding_type,tokenizer, 1000)
        S_dict = dict(zip(list(dl_tuple_dict.keys()), S_dict))
        print("Total data lake tuples:", len(dl_tuple_dict))
        if len(S_dict) < k and len(unionable_tables) >= k:
            # Fewer than k tuples could be embedded: record a placeholder row and move on.
            # "|S|" now reports len(S_dict), consistent with the regular rows below
            # (it used to report the size of the last table that happened to be read).
            append_list = [algorithm, embedding_type, query_name, len(S_dict), len(query_table), k, metric, "cosine", "mix", normalize, np.nan, 0, 0, "n/a"]
            stats_df.loc[len(stats_df)] = append_list
            continue

        # Query tuples continue the same id sequence, so their keys never collide
        # with the data lake tuple ids.
        query_tuple_dict = {}
        for tup in utl.SerializeTable(query_table):
            query_tuple_dict[tuple_id] = tup
            tuple_id += 1
        if len(query_tuple_dict) > q_dict_max:
            # Re-seed so that the sample does not depend on earlier queries.
            random.seed(random_seed)
            # random.sample needs a sequence: passing dict.keys() raises TypeError
            # on Python >= 3.11 (and was deprecated before that).
            sampled_keys = random.sample(list(query_tuple_dict), q_dict_max)
            query_tuple_dict = {key: query_tuple_dict[key] for key in sampled_keys}
        q_dict = utl.EmbedTuples(list(query_tuple_dict.values()), model, embedding_type,tokenizer, 1000)
        q_dict = dict(zip(list(query_tuple_dict.keys()), q_dict))
        print("Total query tuples:", len(query_tuple_dict))
        if len(q_dict) < 3:
            print(f"Query table: {query_name} has only {len(q_dict)} rows. So, ignoring this table.")
            continue

        computed_metrics, embedding_plot = div_utl.compute_metrics(set(S_dict.keys()), S_dict, q_dict, lmda, k, print_results = False, normalize=normalize, metric=metric, max_metric = max_metric)
        for each in computed_metrics:
            # each is a dict with the keys "metric", "with_query", "max_score", "max-min_score" and "avg_score"
            append_list = [algorithm, embedding_type, query_name, len(S_dict), len(q_dict), k, metric, each['metric'], each["with_query"], normalize, each["max_score"], each["max-min_score"], each["avg_score"], "n/a"]
            stats_df.loc[len(stats_df)] = append_list

        if save_results:
            f_path = os.path.join(div_result_path, "starmie")
            os.makedirs(f_path, exist_ok=True)
            current_div_results_path = f_path + os.sep + query_name.rsplit(".",1)[0] + ".txt"
            with open(current_div_results_path, "w") as f:
                # Keys are the consecutive ids 0..len-1 assigned above.
                for i in range(len(dl_tuple_dict)):
                    f.write(dl_tuple_dict[i] + "\n")
    except Exception as exc:
        # Keep going with the next query, but report why this one failed.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the run.
        print("failed on: ", query_name, "-", repr(exc))

In [None]:
# Write the collected statistics to disk. The target folder is created first,
# because to_csv fails if the folder does not exist yet.
_stats_dir = os.path.dirname(updated_stats_df_path)
if _stats_dir:
    os.makedirs(_stats_dir, exist_ok=True)
stats_df.to_csv(updated_stats_df_path, index = False)