In [None]:
import pandas as pd
import numpy as np
import os
import shutil
import pickle
import re
import copy
from tqdm import tqdm
from pandas.errors import EmptyDataError

In [None]:
def create_datalake_and_query_folders(in_dir, out_dir, gtfile, query_col="query_table", datalake_col="data_lake_table"):
    """Split the tables named in a ground-truth CSV into query/ and datalake/ folders.

    Every distinct file name found in ``query_col`` is copied from ``in_dir``
    to ``<out_dir>/query`` and every distinct name in ``datalake_col`` is
    copied to ``<out_dir>/datalake``. Files already present at the
    destination are left untouched, so the function can be re-run safely.

    Args:
        in_dir: Folder that holds all source table files.
        out_dir: Root folder under which ``query`` and ``datalake`` are created.
        gtfile: Path of the ground-truth CSV listing query/data-lake pairs.
        query_col: Name of the ground-truth column with query table file names.
        datalake_col: Name of the ground-truth column with data-lake table file names.

    Raises:
        FileNotFoundError: If a referenced table does not exist in ``in_dir``.
        KeyError: If ``query_col`` or ``datalake_col`` is missing from the CSV.
    """
    query_folder = os.path.join(out_dir, "query")
    datalake_folder = os.path.join(out_dir, "datalake")
    # makedirs also creates out_dir itself when it is missing.
    os.makedirs(query_folder, exist_ok=True)
    os.makedirs(datalake_folder, exist_ok=True)

    gt_df = pd.read_csv(gtfile)
    for column, target_folder in ((query_col, query_folder), (datalake_col, datalake_folder)):
        # A table usually appears in many ground-truth rows; visit each name once.
        for table_name in gt_df[column].drop_duplicates():
            if not os.path.isfile(os.path.join(target_folder, table_name)):
                shutil.copy(os.path.join(in_dir, table_name), target_folder)

In [None]:
# Paths for the ugen_v2 benchmark: source tables, output root and ground-truth CSV.
in_dir = "../data/ugen_v2/data/"
out_dir = "../data/ugen_v2/"
gtfile = "../data/ugen_v2/groundtruth.csv"
# One-off setup step, left disabled: uncomment to copy the tables into
# <out_dir>/query and <out_dir>/datalake.
#create_datalake_and_query_folders(in_dir, out_dir, gtfile, query_col="query_table", datalake_col="data_lake_table")

In [None]:
def create_gt_pickle(gtfile, out_pickle, query_col="query_table", datalake_col="data_lake_table", label="unionable"):
    """Convert a pairwise ground-truth CSV into a pickled ``{query: [tables]}`` dict.

    Only rows whose ``label`` value equals 1 are kept. For each query table the
    matching data-lake tables are stored once each, in the order they first
    appear in the CSV, so the pickle content is reproducible across runs.
    Queries without any positive row do not appear in the dictionary.

    Args:
        gtfile: Path of the ground-truth CSV.
        out_pickle: Destination path of the pickle file (overwritten if present).
        query_col: Column holding the query table name.
        datalake_col: Column holding the data-lake table name.
        label: Column holding the 0/1 unionability label.

    Side effects:
        Prints every query and its table list, then writes ``out_pickle``.
    """
    gt_df = pd.read_csv(gtfile)
    positives = gt_df[gt_df[label] == 1]

    # A dict per query acts as an insertion-ordered set: it removes duplicates
    # without the arbitrary ordering of a list -> set -> list round trip.
    tables_by_query = {}
    for query_table, datalake_table in zip(positives[query_col].tolist(), positives[datalake_col].tolist()):
        tables_by_query.setdefault(query_table, {})[datalake_table] = None
    query_datalake_dict = {query: list(tables) for query, tables in tables_by_query.items()}

    for key, value in query_datalake_dict.items():
        print(key, value)
    with open(out_pickle, 'wb') as handle:
        pickle.dump(query_datalake_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Input ground-truth CSV and output pickle path for the ugen_v2 benchmark.
gtfile = "../data/ugen_v2/groundtruth.csv"
out_pickle = "../data/ugen_v2/groundtruth.pickle"
# Disabled one-off step: uncomment to (re)generate the ground-truth pickle.
#create_gt_pickle(gtfile, out_pickle)

In [None]:
# Ground-truth CSV and pickle path for the ugen_v1_sparse_20 benchmark.
gtfile= "../ugen_v1_sparse_20/groundtruth.csv"
out_pickle = "../ugen_v1_sparse_20/groundtruth.pickle"
# Disabled one-off steps.
# NOTE(review): in_dir / out_dir are not set in this cell, so the folder call
# below would use whatever an earlier cell assigned -- confirm before enabling.
#create_datalake_and_query_folders(in_dir,out_dir,gtfile)
#create_gt_pickle(gtfile, out_pickle)

In [None]:
import csv

def read_file(gen_file):
    """Load one benchmark table into a DataFrame, trying several CSV dialects.

    Parsing attempts, in order:

    1. Paths containing ``"ugen_v2"`` are read as ``;``-separated and returned
       directly (errors propagate).
    2. Comma-separated read; if that yields fewer than two columns the file is
       re-read as ``|``-separated.
    3. ``|``-separated read.
    4. ``|``-separated read without a header, with as many integer-named
       columns as the widest line, so ragged rows can still be loaded.
    5. A final comma-separated read whose error, if any, propagates to the
       caller (for example ``EmptyDataError`` for an empty file).

    Args:
        gen_file: Path of the table file.

    Returns:
        pandas.DataFrame with the parsed table.
    """
    if "ugen_v2" in str(gen_file):
        return pd.read_csv(gen_file, sep=';')

    # `except Exception` (not a bare `except`) keeps the best-effort fallback
    # chain while letting KeyboardInterrupt / SystemExit through.
    try:
        data = pd.read_csv(gen_file, lineterminator='\n', low_memory=False)
        if data.shape[1] < 2:
            # A single column usually means the delimiter was wrong.
            data = pd.read_csv(gen_file, sep='|')
        return data
    except Exception:
        pass  # fall through to the pipe-separated attempt

    try:
        return pd.read_csv(gen_file, sep='|')
    except Exception:
        pass  # fall through to the ragged-row attempt

    # Find the widest line so every row fits when explicit column names are given.
    with open(gen_file) as curr_csv:
        field_counts = [len(line.split('|')) for line in curr_csv.read().splitlines()]
    max_col_num = max(field_counts, default=0)

    if max_col_num != 0:
        try:
            return pd.read_csv(gen_file, sep='|', header=None, names=range(max_col_num), low_memory=False)
        except Exception:
            pass  # last resort below

    return pd.read_csv(gen_file, lineterminator='\n', low_memory=False)

# Data-lake folder of the ugen_v1 benchmark.
# NOTE(review): in_dir is assigned again at the top of the next cell before any
# visible use of this value -- confirm whether this line is still needed.
in_dir = "../ugen_v1/datalake"

In [None]:
# Folder layout of the labeled benchmark: <root>/query and <root>/datalake.
in_dir = "../labeled_benchmark"
in_query = in_dir + "/query"
in_datalake = in_dir + "/datalake"
# Same layout for the ugen_v2 benchmark.
in_gpt_dir = "../data/ugen_v2"
in_gpt_query = in_gpt_dir + "/query"
in_gpt_datalake = in_gpt_dir + "/datalake"

def find_avg_shape(in_dir):
    """Return the mean number of rows and columns over the tables in a folder.

    Only ``*.csv`` entries are considered, matching the other statistics
    helpers in this notebook, so stray non-table files are not parsed.

    Args:
        in_dir: Folder containing the table files.

    Returns:
        Tuple ``(average_rows, average_columns)``; ``(0.0, 0.0)`` when the
        folder contains no ``.csv`` file.
    """
    csv_files = [file for file in os.listdir(in_dir) if file.endswith('.csv')]
    if not csv_files:
        # Nothing to average; avoid dividing by zero.
        return 0.0, 0.0

    shapes = [read_file(os.path.join(in_dir, file)).shape for file in csv_files]
    table_count = len(shapes)
    total_rows = sum(n_rows for n_rows, _ in shapes)
    total_cols = sum(n_cols for _, n_cols in shapes)
    return total_rows / table_count, total_cols / table_count
# Print the find_avg_shape result for the in_gpt_query and in_gpt_datalake folders.
print(find_avg_shape(in_gpt_query))
print(find_avg_shape(in_gpt_datalake))

In [None]:
def getAvgLength(feature_vector, column, mode):
    """Average length of the non-null values of one column.

    Args:
        feature_vector: DataFrame holding the column.
        column: Label of the column to measure.
        mode: ``'tokens'`` measures the character length of ``str(value)``;
            ``'words'`` counts the pieces obtained by splitting ``str(value)``
            on ``;``, ``,``, ``_`` or ``|``. Any other mode measures nothing.

    Returns:
        The mean length rounded to two decimals, or ``0`` when there is
        nothing to measure (no non-null values, or an unknown mode).
    """
    # Null cells must not be measured. The previous fillna('nan') call discarded
    # its result, so nulls were counted as text; dropna() removes them instead.
    column_values = feature_vector[column].dropna()

    lengths = []
    for value in column_values.values:
        if value == 'nan':
            # Literal 'nan' strings are treated as missing as well.
            continue
        if mode == 'tokens':
            lengths.append(len(str(value)))
        elif mode == 'words':
            lengths.append(len(re.split(r';|,|_|\|', str(value))))

    avg = 0 if len(lengths) == 0 else round(float(sum(lengths) / len(lengths)), 2)
    return avg

In [None]:
def get_words(text):
    """Split the string form of ``text`` on ``;``, ``,``, ``_`` and ``|``.

    Args:
        text: Any value; it is converted with ``str()`` before splitting.

    Returns:
        List of the pieces between delimiters (empty strings included when
        delimiters are adjacent or at either end).
    """
    # Raw string: '\|' in a normal string literal is an invalid escape sequence.
    return re.split(r';|,|_|\|', str(text))

In [None]:
def get_avg_len_all(in_dir):
    """Share of columns whose text values have short, medium or long words.

    For every ``object``-dtype column of every ``*.csv`` table in ``in_dir``
    the average word length is computed: each non-null cell is split with
    ``get_words`` and the mean word length per cell is averaged over the
    column. Columns are then bucketed as long (average >= 6), medium
    (3 <= average < 6) or short (everything else).

    Null cells are skipped so that they are not measured as the 3-character
    word ``"nan"``, and the loaded table is not modified.

    Args:
        in_dir: Folder containing the table files.

    Returns:
        Tuple ``(short_pct, medium_pct, long_pct)``. Percentages are relative
        to ALL columns of all tables (non-text columns included in the
        denominator); ``(0.0, 0.0, 0.0)`` when no column was found.
    """
    short_string_counter = 0
    medium_string_counter = 0
    long_string_counter = 0
    total_column_counter = 0

    csv_files = [file for file in os.listdir(in_dir) if file.endswith('.csv')]
    for file in csv_files:
        df = read_file(os.path.join(in_dir, file))
        total_column_counter += len(df.columns)
        for column in df.columns:
            if df[column].dtype != 'object':
                continue  # only text columns are bucketed
            cell_word_lengths = df[column].dropna().apply(
                lambda cell: np.mean([len(word) for word in get_words(cell)])
            )
            column_average = cell_word_lengths.mean()
            if column_average >= 6:
                long_string_counter += 1
            elif column_average >= 3:
                medium_string_counter += 1
            else:
                # Also reached when the average is NaN (no non-null cell).
                short_string_counter += 1

    if total_column_counter == 0:
        # No tables or no columns: avoid dividing by zero.
        return 0.0, 0.0, 0.0
    total_column_counter = float(total_column_counter)
    return (
        short_string_counter * 100 / total_column_counter,
        medium_string_counter * 100 / total_column_counter,
        long_string_counter * 100 / total_column_counter,
    )

In [None]:
def get_numbers(in_dir):
    """Percentage of columns that are numeric and contain at least one value.

    Scans every ``*.csv`` table in ``in_dir``; a column counts as numeric when
    pandas inferred a number dtype for it and it is not entirely null.

    Args:
        in_dir: Folder containing the table files.

    Returns:
        Percentage (0-100) of such columns among all columns of all tables,
        or ``0.0`` when no column was found.
    """
    total_numeric_columns = 0
    total_column_counter = 0

    csv_files = [file for file in os.listdir(in_dir) if file.endswith('.csv')]
    for file in csv_files:
        df = read_file(os.path.join(in_dir, file))
        numeric_part = df.select_dtypes(include=['number'])
        # any() per column is True when the column has at least one non-null value.
        total_numeric_columns += int(numeric_part.notnull().any().sum())
        total_column_counter += len(df.columns)

    if total_column_counter == 0:
        # No tables or no columns: avoid dividing by zero.
        return 0.0
    return total_numeric_columns * 100 / float(total_column_counter)

In [None]:
def get_densities(in_dir):
    """Average non-null density of the ``*.csv`` tables in ``in_dir``.

    For each table the fraction of non-null cells is computed per column.
    These per-table Series are placed side by side (one column per table),
    averaged per table, and the per-table averages are averaged again.

    Args:
        in_dir: Folder containing the table files.

    Returns:
        The mean of the per-table mean column densities.
    """
    per_table_density = [
        read_file(os.path.join(in_dir, name)).notnull().mean()
        for name in os.listdir(in_dir)
        if name.endswith('.csv')
    ]
    # One column per table; rows are the union of all column labels.
    density_matrix = pd.concat(per_table_density, axis=1)
    return density_matrix.mean().mean()

def get_all_nulls(in_dir):
    """Total number of null cells over the ``*.csv`` tables in ``in_dir``.

    The per-column null counts of each table are placed side by side (one
    column per table) and then summed over both axes.

    Args:
        in_dir: Folder containing the table files.

    Returns:
        The grand total of null cells.
    """
    per_table_nulls = [
        read_file(os.path.join(in_dir, name)).isnull().sum()
        for name in os.listdir(in_dir)
        if name.endswith('.csv')
    ]
    # One column per table; rows are the union of all column labels.
    null_matrix = pd.concat(per_table_nulls, axis=1)
    return null_matrix.sum().sum()

In [None]:
def get_unique_count(in_dir):
    """Bucket all columns by their percentage of distinct values.

    For each ``*.csv`` table in ``in_dir`` the number of distinct values per
    column is expressed as a percentage of the table's row count. Columns
    fall into three buckets: below 20 %, above 50 %, and everything else
    (20 % to 50 % inclusive, plus values that compare as neither).

    Args:
        in_dir: Folder containing the table files.

    Returns:
        Tuple ``(below_20_pct, between_20_50_pct, above_50_pct)`` giving the
        share of columns in each bucket as a percentage of all columns.
    """
    low_bucket = 0
    middle_bucket = 0
    high_bucket = 0
    column_total = 0

    for name in os.listdir(in_dir):
        if not name.endswith('.csv'):
            continue
        table = read_file(os.path.join(in_dir, name))
        column_total += len(table.columns)
        pct_unique = table.nunique() / len(table) * 100

        n_low = int((pct_unique < 20.0).sum())
        n_high = int((pct_unique > 50.0).sum())
        low_bucket += n_low
        high_bucket += n_high
        # Whatever is neither low nor high lands in the middle bucket.
        middle_bucket += len(pct_unique) - n_low - n_high

    denominator = float(column_total)
    return (
        low_bucket * 100 / denominator,
        middle_bucket * 100 / denominator,
        high_bucket * 100 / denominator,
    )

In [None]:
def get_total_table_num(in_dir):
    """Count the files stored in the sub-folders of ``in_dir``.

    The directory tree is walked recursively; files that sit directly in
    ``in_dir`` are not counted, files at any deeper level are.

    Args:
        in_dir: Root folder to scan.

    Returns:
        Number of files found below the top level.
    """
    return sum(
        len(file_names)
        for folder, _, file_names in os.walk(in_dir)
        if folder != in_dir
    )

In [None]:
# Benchmark statistics report for the ugen_v2 query and data-lake folders.
in_dir = "../data/ugen_v2"
in_query = in_dir + "/query"
in_datalake = in_dir + "/datalake"

_separator = "-------------------------------------------"
# (label, statistic function, folder) in the order the lines are printed.
_report_lines = [
    ("Total Number of Dataset Tables", get_total_table_num, in_dir),
    ("Average Shape in Query", find_avg_shape, in_query),
    ("Average Shape in Datalake", find_avg_shape, in_datalake),
    ("Query avg length", get_avg_len_all, in_query),
    ("Datalake avg length", get_avg_len_all, in_datalake),
    ("number of Number type columns in Query", get_numbers, in_query),
    ("number of Number type columns in Datalake", get_numbers, in_datalake),
    ("average density in Query", get_densities, in_query),
    ("average density in Datalake", get_densities, in_datalake),
    ("uniqueness Query", get_unique_count, in_query),
    ("uniqueness Datalake", get_unique_count, in_datalake),
    ("total nulls in Query", get_all_nulls, in_query),
    ("total nulls in Datalake", get_all_nulls, in_datalake),
]

print(_separator)
for _label, _statistic, _folder in _report_lines:
    # Each statistic is computed right before its line is printed.
    print(_label, _statistic(_folder))
    print(_separator)

In [None]:
# SANTOS has no labeled not-unionable
# SANTOS (santos-large): load the ground-truth CSV and print its number of rows.
in_dir = "../../starmie/data/santos-large"
gt = in_dir + "/santos_large_benchmark_groundtruth.csv"
gt_df = pd.read_csv(gt)
print(len(gt_df))

In [None]:
# TUS
# Load the TUS-small recall ground truth.
tus_gt = "../data/table-union-search-benchmark/small/tus-small-gt/recall_groundtruth.csv"
tus_gt_df = pd.read_csv(tus_gt)

# Sum the 'unionable_count' column over all rows of the ground truth.
sum_of_column = tus_gt_df['unionable_count'].sum()

print("total labeled unionable pairs:", sum_of_column)