In [None]:
from utils.data.data_builder import load_data

# Load the dataset through the project's data builder, selecting it with the
# arguments "spider" and "benchmarks".
# NOTE(review): the exact meaning of the two arguments is defined in
# utils.data.data_builder, not here — confirm against load_data.
data = load_data("spider", "benchmarks")

# Training-split records. The next cell iterates over this object and reads the
# "question" and "query" keys of every entry.
train_metadata = data.get_train_json()

## Format Results for Embedding Model Training

# Generate Embedding SFT Training Data

In [None]:
from tqdm.notebook import tqdm
from utils.linking.application import mask_question_with_schema_linking
from utils.utils import mask_query
from utils.utils import jaccard_similarity
from third_party.TSED import tsed_similarity
from tqdm import tqdm
import random
import json
import os


# Seed the RNG so the random pair sampling further down is reproducible.
random.seed(42)

# Cache the raw and masked question/SQL of every training example so the main
# loops below can look them up by index.
temp = []

for instance in tqdm(train_metadata):
    raw_question, raw_sql = instance["question"], instance["query"]

    # NOTE(review): the helper is handed a one-element list; whether it returns
    # a list or a single masked string is not visible here — confirm before
    # relying on "masked_question".
    question_masked = mask_question_with_schema_linking([instance], "<mask>", "<unk>")

    temp.append(
        {
            "question": raw_question,
            "masked_question": question_masked,
            "sql": raw_sql,
            "masked_sql": mask_query(raw_sql),
        }
    )

num_elements = len(temp)
num_pairs = 1000000

# Draw `num_pairs` distinct ordered index pairs (i, j), i != j, into `temp`.
# Each pair picks the two Spider training examples compared in one train-set
# record; (i, j) and (j, i) are treated as different pairs.
max_pairs = num_elements * (num_elements - 1)
if num_pairs > max_pairs:
    # Without this guard the rejection-sampling loop below could never finish
    # (or, for an empty `temp`, would fail inside random.randint).
    raise ValueError(
        "Cannot sample {} distinct pairs from {} elements (at most {} exist)".format(
            num_pairs, num_elements, max_pairs
        )
    )

sampled_pairs = set()
while len(sampled_pairs) < num_pairs:
    i, j = random.randint(0, num_elements - 1), random.randint(0, num_elements - 1)
    # The set discards repeated pairs on its own, so only self-pairs have to be
    # filtered out explicitly.
    if i != j:
        sampled_pairs.add((i, j))


# Build one training record per sampled pair: the two masked SQL strings plus a
# target score, which is the mean of the values returned by tsed_similarity and
# jaccard_similarity for that pair.
dataset = []

for i, j in tqdm(sampled_pairs):
    first_sql = temp[i]["masked_sql"]
    second_sql = temp[j]["masked_sql"]

    # NOTE(review): the meaning of the trailing (1, 0.8, 1) arguments is defined
    # by third_party.TSED — confirm there before changing them.
    tsed_score = tsed_similarity("sql", first_sql, second_sql, 1, 0.8, 1)
    jaccard_score = jaccard_similarity(first_sql, second_sql)

    dataset.append(
        {
            "sentence1": first_sql,
            "sentence2": second_sql,
            "score": (tsed_score + jaccard_score) / 2,
        }
    )

# Destination of the generated train set, relative to the working directory.
OUT_FILE = os.path.join("data", "spider", "train-sets", "sql-embedding-train-set.json")

# Make sure the target directory exists; otherwise open() raises
# FileNotFoundError only after the long scoring loop has already run.
os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)

# Write all records as one pretty-printed JSON array.
with open(OUT_FILE, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=4)