In [1]:
from AstridEmbed import *

In [2]:
query_type = "substring"
dataset = "imdb_movie_titles"


def get_model(query_type, dataset, device=None):
    """Load the trained embedding and selectivity models for one workload.

    Args:
        query_type: Query family forwarded to ``setup_configs`` (this notebook
            uses "prefix", "suffix" and "substring").
        dataset: Dataset name forwarded to ``setup_configs``.
        device: Device the two models are moved to. When ``None`` (default),
            "cuda" is used if a GPU is available, otherwise "cpu".

    Returns:
        Tuple ``(embedding_model, selectivity_model, string_helper)``.
    """
    # Local import so the function does not rely on the star import exposing torch.
    import torch

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    random_seed = 1234
    misc_utils.initialize_random_seeds(random_seed)

    # Set the configs
    embedding_learner_configs, frequency_configs, selectivity_learner_configs = setup_configs(query_type, dataset)

    embedding_model_file_name = selectivity_learner_configs.embedding_model_file_name
    selectivity_model_file_name = selectivity_learner_configs.selectivity_model_file_name

    string_helper = misc_utils.setup_vocabulary(frequency_configs.string_list_file_name)
    embedding_model = load_embedding_model(embedding_model_file_name, string_helper).to(device)
    selectivity_model = load_selectivity_estimation_model(selectivity_model_file_name, string_helper).to(device)

    # "to load max min" (original note): the normalized frame itself is discarded.
    # NOTE(review): presumably compute_normalized_selectivities records the
    # min/max used for de-normalization as a side effect - confirm before removing.
    df = pd.read_csv(frequency_configs.selectivity_file_name)
    df["string"] = df["string"].astype(str)
    df = compute_normalized_selectivities(df)
    return embedding_model, selectivity_model, string_helper


embed_model, sel_model, str_helper = get_model(query_type, dataset)

In [3]:
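# (Disabled cell) Load the input file and split it 50/50 into train and test sets.
# NOTE: frequency_configs and random_seed are locals of get_model, so this cell cannot run as-is.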
# df = pd.read_csv(frequency_configs.selectivity_file_name)
# df["string"] = df["string"].astype(str)
# df = compute_normalized_selectivities(df)
# train_indices, test_indices = train_test_split(df.index, random_state=random_seed, test_size=0.5)
# train_df, test_df = df.iloc[train_indices], df.iloc[test_indices]

In [4]:
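# (Disabled cell) Get the predictions from the learned model and compute basic summary statistics.
# NOTE: at notebook scope the models are named embed_model, sel_model and str_helper; test_df and
# selectivity_learner_configs are not defined outside the disabled code / get_model.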
# normalized_predictions, denormalized_predictions = get_selectivity_for_strings(
#     test_df["string"].values, embedding_model, selectivity_model, string_helper)
# actual = torch.tensor(test_df["normalized_selectivities"].values)
# test_q_error = misc_utils.compute_qerrors(normalized_predictions, actual,
#     selectivity_learner_configs.min_val, selectivity_learner_configs.max_val)
# print("Test data: Mean q-error loss ", np.mean(test_q_error))
# print("Test data: Summary stats of Loss: Percentile: [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99] ", [np.quantile(test_q_error, q) for q in [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]])

In [5]:
# Sanity check: estimate selectivities for a few hand-picked strings.
sample_strings = ["the", "a", "method"]
normalized_predictions, denormalized_predictions = get_selectivity_for_strings(
    sample_strings,
    embed_model,
    sel_model,
    str_helper,
)
denormalized_predictions


0.0 15.007574937622


tensor([[4.0475e+02],
        [1.7626e+05],
        [2.1463e+00]])

In [6]:
default_preds_path = "benchmark/preds.txt"
output_preds_path = "benchmark/preds_ASTRID.txt"
# NOTE(review): not referenced by any code visible in this notebook.
columns_to_replace = ['t.title', 'n.name',]

# Read the baseline predictions, one entry per line. The with-block makes sure
# the file handle is closed as soon as the content has been read.
with open(default_preds_path) as preds_file:
    pred_lines = preds_file.read().strip().splitlines()

print(len(pred_lines), pred_lines[1])

776 n.name ~~ '%konen',0.000320


In [7]:
def detect_like_type(line):
    """Classify the LIKE pattern of a prediction line.

    Lines look like ``n.name ~~ '%konen',0.000320``. The pattern is taken as
    the text between the first and the last single quote, so patterns that
    contain spaces or commas are classified on their full text.

    Args:
        line: One prediction line (predicate, optionally followed by
            ``,<selectivity>``).

    Returns:
        "substring" when the pattern holds exactly two ``%``, "suffix" when it
        starts with ``%``, "prefix" when it ends with ``%``.

    Raises:
        ValueError: If the pattern matches none of the three shapes.
    """
    first_quote = line.find("'")
    last_quote = line.rfind("'")
    if first_quote != -1 and last_quote > first_quote:
        pattern: str = line[first_quote + 1:last_quote]
    else:
        # No quoted literal: fall back to the third space-separated token
        # in front of the first comma.
        pattern = line.split(",")[0].split(" ")[2]

    if pattern.count("%") == 2:
        return "substring"
    elif pattern.startswith("%"):
        return "suffix"
    elif pattern.endswith("%"):
        return "prefix"
    else:
        raise ValueError(f"Undefined like type: {pattern} {line}")

In [8]:
def get_astrid_word(line):
    """Extract the bare search string from a prediction line.

    For ``n.name ~~ 'Christi%',0.005000`` this returns ``Christi``. The
    pattern is taken as the text between the first and the last single quote,
    so patterns containing spaces or commas are returned in full.

    Args:
        line: One prediction line (predicate, optionally followed by
            ``,<selectivity>``).

    Returns:
        The pattern with every ``'`` and ``%`` character removed.
    """
    first_quote = line.find("'")
    last_quote = line.rfind("'")
    if first_quote != -1 and last_quote > first_quote:
        w: str = line[first_quote:last_quote + 1]
    else:
        # No quoted literal: fall back to the third space-separated token
        # in front of the first comma.
        w = line.split(",")[0].split(" ")[2]
    return w.replace("'", "").replace("%", "")

In [9]:
# Quick check on a prefix-style line.
sample_line = "n.name ~~ 'Christi%',0.005000"
get_astrid_word(sample_line)

'Christi'

In [10]:
query_types = ["prefix", "suffix", "substring"]
datasets = ["imdb_movie_actors", "imdb_movie_titles"]
# Divisors that turn a predicted count into a fraction, aligned with `datasets`.
# NOTE(review): presumably the row counts of the two tables - confirm.
n_rows = [4167486, 2527952]
column_names = ["n.name", "t.title"]

# For every (query type, dataset) pair load the matching model and overwrite
# the selectivity of each prediction line that belongs to that pair.
for query_type in query_types:
    for dataset, column_name, n_row in zip(datasets, column_names, n_rows):
        print(dataset, query_type, column_name)
        embed_model, sel_model, str_helper = get_model(query_type, dataset)
        for i, line in enumerate(pred_lines):
            if line.startswith("-- query"):
                continue
            if not (line.startswith(column_name) and " ~~ " in line):
                continue
            if detect_like_type(line) != query_type:
                continue
            norm_pred, denorm_pred = get_selectivity_for_strings(
                [get_astrid_word(line)], embed_model, sel_model, str_helper)
            # Split on the LAST comma only: the pattern itself may contain
            # commas and must be written back unchanged.
            predicate = line.rsplit(",", 1)[0]
            sel = denorm_pred.item() / n_row
            print(i, line, f"{sel:.6f}")
            pred_lines[i] = f"{predicate},{sel:.6f}"

# Write the updated predictions, one per line. The with-block closes (and
# therefore flushes) the file; the bare `f.close` attribute access used before
# never actually called close().
with open(output_preds_path, "w") as f:
    f.writelines(f'{s}\n' for s in pred_lines)

imdb_movie_actors prefix n.name
0.0 13.641690361981155
73 n.name ~~ 'Eero%',0.005000 0.000001
0.0 13.641690361981155
91 n.name ~~ 'Mille%',0.005000 0.000009
0.0 13.641690361981155
103 n.name ~~ 'Trembl%',0.005000 0.000001
0.0 13.641690361981155
157 n.name ~~ 'Renor%',0.005000 0.000005
0.0 13.641690361981155
170 n.name ~~ 'Camer%',0.005000 0.000007
0.0 13.641690361981155
176 n.name ~~ 'Connell%',0.005000 0.000002
0.0 13.641690361981155
292 n.name ~~ 'Batis%',0.005000 0.000002
0.0 13.641690361981155
362 n.name ~~ 'Ts%',0.005000 0.000024
0.0 13.641690361981155
379 n.name ~~ 'Gerha%',0.005000 0.000001
0.0 13.641690361981155
384 n.name ~~ 'Summe%',0.005000 0.000004
0.0 13.641690361981155
424 n.name ~~ 'Timoth%',0.005000 0.000001
0.0 13.641690361981155
429 n.name ~~ 'Saza%',0.005000 0.000003
0.0 13.641690361981155
460 n.name ~~ 'Ma%',0.005000 0.000112
0.0 13.641690361981155
468 n.name ~~ 'M%',0.005000 0.001185
0.0 13.641690361981155
551 n.name ~~ 'Sophie%',0.005000 0.000005
0.0 13.6416903619

<function TextIOWrapper.close()>