In [None]:
import pandas as pd
import glob, sys, os
import json, torch, random
import numpy as np
sys.path.append("../")
import utilities as utl
from sentence_transformers import SentenceTransformer
from pympler import asizeof
from sklearn.cluster import KMeans, AgglomerativeClustering
from bkmeans import BKMeans
from sklearn.metrics import pairwise_distances
import div_utilities as div_utl
import copy
from transformers import BertTokenizer, BertModel, RobertaTokenizerFast, RobertaModel
from model_classes import BertClassifierPretrained, BertClassifier
from glove_embeddings import GloveTransformer
import fasttext_embeddings as ft
from torch.nn.parallel import DataParallel
# Seed every random number generator used by this notebook (Python's
# `random`, NumPy and PyTorch) with the same value so runs are repeatable.
random_seed = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(random_seed)
# Also seed all CUDA devices when a GPU is available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)


In [None]:
# Configuration for the diversity evaluation.
# Inputs are the query tables of one benchmark and, for each query table, a
# CSV with the same file name under `llm_result_folder_path` that holds the
# shortlisted data lake tuples.
benchmark_name = r"ugen_benchmark"
benchmark_folder_path = r"../data" + os.sep + benchmark_name
algorithm = "LLM"
llm_output_path = r"gpt3_generated_tuples"
union_query_folder_path = benchmark_folder_path + os.sep + "query" + os.sep
llm_result_folder_path = benchmark_folder_path + os.sep + llm_output_path + os.sep

# k: maximum number of data lake tuples kept per query; depends on the benchmark.
if benchmark_name == r"ugen_benchmark":
    k = 30
elif benchmark_name == r"labeled_benchmark":
    k = 100
else:
    # Raise instead of print + sys.exit(): in a notebook this stops "Run All"
    # with a clear message rather than leaving `k` undefined for later cells.
    raise ValueError(f"Unknown benchmark: {benchmark_name}")

lmda = 0.7  # forwarded to div_utl.compute_metrics — presumably a trade-off weight; TODO confirm
s_dict_max = 2500  # not referenced by the visible cells
q_dict_max = 100  # query tables with more tuples than this are randomly sampled down
metric = "cosine" # cosine, l1, l2
embedding_type = "dust"

# Output locations.
eplot_folder_path = r"div_plots" + os.sep + "embedding_plots" + os.sep
cplot_folder_path = r"div_plots" + os.sep + "cluster_plots" + os.sep
result_folder_path = r"div_result_tables" + os.sep
updated_stats_df_path = r"final_stats" + os.sep + benchmark_name + "__" + metric + "__" + embedding_type + "__llm.csv"

# Behaviour flags.
normalize = True
max_metric = False
compute_metric = True
full_dust = False  # when True, columns absent from the query table are dropped before serialization

# glob returns paths in arbitrary order; sort so the row order of the
# resulting statistics table is reproducible across machines.
query_tables = sorted(glob.glob(union_query_folder_path + "*.csv"))

In [None]:
# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# Build a RoBERTa-base encoder wrapped in the project's BertClassifier and
# load its weights from the checkpoint at `model_path`.
model_path = r'../out_model/tus_benchmark_corrected_roberta/checkpoints/best-checkpoint.pt'
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained('roberta-base')
model = BertClassifier(model, num_labels = 2, hidden_size = 768, output_size = 768)

# Use at most four GPUs, but never more than the machine actually has; the
# previous hard-coded [0, 1, 2, 3] fails on hosts with fewer devices.
# With no GPU, `device_ids=None` lets DataParallel act as a plain wrapper.
# The wrapper is kept in every case — presumably the checkpoint was saved
# from a DataParallel model and its keys carry the wrapper prefix; TODO confirm.
max_gpus = 4
gpu_ids = list(range(min(max_gpus, torch.cuda.device_count())))
model = DataParallel(model, device_ids=gpu_ids or None)

# map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
model.load_state_dict(torch.load(model_path, map_location=device))
# Assign the result so the cell does not print the full model repr.
model = model.to(device)
# NOTE(review): if utl.EmbedTuples does not switch the model to eval mode,
# call model.eval() here so dropout does not perturb the embeddings — confirm.

In [None]:
# Display the list of query table paths found by the configuration cell.
query_tables

In [None]:

# Compute diversity statistics for every query table.
# For each query table the loop:
#   1. reads the query table and the tuples stored under the same file name
#      in `llm_result_folder_path`,
#   2. serializes and embeds up to k of those tuples (S_dict) and the query
#      tuples (q_dict, randomly sampled down to q_dict_max),
#   3. calls div_utl.compute_metrics and appends one row per returned entry
#      to `stats_df`.
stats_df = pd.DataFrame(columns = ["algorithm", "embedding_type", "query_name", "|S|", "|q|", "k", "algorithm_distance_function", "evaluation_distance_function", "with_query_flag", "normalized", "max_div_score", "max-min_div_score", "avg_div_score", "time_taken_(s)"])
for query_table_path in query_tables:
    query_name = os.path.basename(query_table_path)
    print("Query table name: ", query_name)
    tuple_id = 0
    dl_tuple_dict = {}
    query_table = utl.read_csv_file(union_query_folder_path + query_name)
    columns_in_query = {col.strip() for col in query_table.columns.astype(str)}
    unionable_tuples = utl.read_csv_file(llm_result_folder_path + query_name)
    unionable_tuples.columns = unionable_tuples.columns.astype(str).str.strip()
    if full_dust:
        # Alignment in the dataset is already done in the previous phase; we
        # only need to drop the columns that are not in the query table.
        columns_to_drop = [col for col in unionable_tuples.columns if col not in columns_in_query]
        unionable_tuples = unionable_tuples.drop(columns=columns_to_drop)
    serialized_tuples = utl.SerializeTable(unionable_tuples)
    if len(serialized_tuples) == 0:
        # Nothing to evaluate for this query: record a placeholder row
        # (NaN max score, zero max-min and average scores) and move on.
        append_list = [algorithm, embedding_type, query_name, len(serialized_tuples), len(query_table), k, metric, "cosine", "mix", normalize, np.nan, 0, 0, "n/a"]
        stats_df.loc[len(stats_df)] = append_list
        continue

    # Keep at most k data lake tuples, keyed by a running integer id.
    for tup in serialized_tuples:
        dl_tuple_dict[tuple_id] = tup
        tuple_id += 1
        if len(dl_tuple_dict) >= k:
            break

    # Presumably EmbedTuples returns one embedding per input tuple, in input
    # order, and 1000 is a batch size — TODO confirm against utilities.py.
    dl_embeddings = utl.EmbedTuples(list(dl_tuple_dict.values()), model, embedding_type,tokenizer, 1000)
    S_dict = dict(zip(list(dl_tuple_dict.keys()), dl_embeddings))
    print("Total data lake tuples:", len(dl_tuple_dict))

    # Query tuple ids continue from the data lake ids, so the two key sets
    # never overlap.
    query_tuple_dict = {}
    serialized_tuples = utl.SerializeTable(query_table)
    for tup in serialized_tuples:
        query_tuple_dict[tuple_id] = tup
        tuple_id += 1
    if len(query_tuple_dict) > q_dict_max:
        # Re-seed so the sample depends only on this query table, not on
        # how many tables were processed before it.
        random.seed(random_seed)
        # random.sample needs a sequence: passing a dict view raises
        # TypeError on Python 3.11+. list(dict) preserves the key order.
        sampled_keys = random.sample(list(query_tuple_dict), q_dict_max)
        query_tuple_dict = {key: query_tuple_dict[key] for key in sampled_keys}
    query_embeddings = utl.EmbedTuples(list(query_tuple_dict.values()), model, embedding_type,tokenizer, 1000)
    q_dict = dict(zip(list(query_tuple_dict.keys()), query_embeddings))
    print("Total query tuples:", len(query_tuple_dict))
    if len(q_dict) < 3:
        print(f"Query table: {query_name} has only {len(q_dict)} rows. So, ignoring this table.")
        continue

    computed_metrics, embedding_plot = div_utl.compute_metrics(set(S_dict.keys()), S_dict, q_dict, lmda, k, print_results = False, normalize=normalize, metric=metric, max_metric = max_metric)
    # Each entry of computed_metrics is read as a mapping with the keys
    # "metric", "with_query", "max_score", "max-min_score" and "avg_score".
    for each in computed_metrics:
        append_list = [algorithm, embedding_type, query_name, len(S_dict), len(q_dict), k, metric, each['metric'], each["with_query"], normalize, each["max_score"], each["max-min_score"], each["avg_score"], "n/a"]
        stats_df.loc[len(stats_df)] = append_list

In [None]:
# Write the collected statistics to CSV. The parent folder is created first
# so that a missing output directory does not make the save fail after the
# whole evaluation has already run.
stats_output_dir = os.path.dirname(updated_stats_df_path)
if stats_output_dir:
    os.makedirs(stats_output_dir, exist_ok=True)
stats_df.to_csv(updated_stats_df_path, index = False)