In [7]:
import polars as pl
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
from collections import defaultdict

from tools.utils.settings import DefaultPath as defpath

In [4]:
# --- Experiment identifiers -------------------------------------------------
test_name = 'a_test'
dataset = 'wikipedia'
size = 'standard'
q = '100K'

# --- Paths ------------------------------------------------------------------
test_dir = f'{defpath.data_path.tests}/{test_name}/{dataset}'
results_extr_dir = f'{test_dir}/results/extracted'

# (algorithm, mode) pairs under evaluation.
solvers = [
    ('josie', 'set'),
    ('josie', 'bag'),
    ('lshforest', 'set'),
    ('lshforest', 'bag'),
    ('embedding', 'fasttext'),
]

# Load the extracted results, drop rows with nulls and rows whose
# sloth_overlap is -1 (presumably "overlap not computed" -- TODO confirm),
# then derive two comparison columns:
#   * difference_overlap      = algorithm_overlap - sloth_overlap
#   * difference_overlap_norm = algorithm_overlap / (sloth_overlap + 1)
# With the -1 rows removed, the denominator (sloth_overlap + 1) is never zero.
results = (
    pl.read_csv(f'{results_extr_dir}/final_results_q{q}.csv')
    .drop_nulls()
    .filter(pl.col('sloth_overlap') != -1)
    .with_columns(
        (pl.col('algorithm_overlap') - pl.col('sloth_overlap')).alias('difference_overlap'),
        (pl.col('algorithm_overlap') / (pl.col('sloth_overlap') + 1)).alias('difference_overlap_norm'),
    )
)

In [4]:
from tools.utils.parallel_worker import worker_fp_per_query


# One task per (algorithm, mode) group; each task is the (key, sub-frame)
# pair produced by iterating the pandas groupby.
fp_groups = (
    results
    .select(['algorithm', 'mode', 'query_id', 'result_id', 'sloth_overlap'])
    .to_pandas()
    .groupby(['algorithm', 'mode'], group_keys=True)
)

# One worker process per solver, started with the 'spawn' method.
spawn_ctx = mp.get_context('spawn')
with spawn_ctx.Pool(len(solvers)) as pool:
    r = pool.map(worker_fp_per_query, fp_groups, chunksize=1)

In [5]:
# Flatten the per-group worker outputs into one row per record, then
# summarise FP_rate (mean and standard deviation) for every (algorithm, mode).
x = pd.DataFrame(
    [record for group_records in r for record in group_records],
    columns=['algorithm', 'mode', 'FP_count', 'FP_rate'],
)
fp_per_query_pivot = pd.pivot_table(
    x,
    values=['FP_rate'],
    index=['algorithm', 'mode'],
    aggfunc=['mean', 'std'],
)

In [6]:
# Display the mean / std of FP_rate per (algorithm, mode).
fp_per_query_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Unnamed: 0_level_1,Unnamed: 1_level_1,FP_rate,FP_rate
algorithm,mode,Unnamed: 2_level_2,Unnamed: 3_level_2
embedding,fasttext,0.653241,0.392916
josie,bag,0.0,0.0
josie,set,0.0,0.0
lshforest,bag,0.004055,0.056136
lshforest,set,0.004673,0.058694


In [5]:
# Display the filtered results frame, including the two derived overlap columns.
results

query_id,result_id,algorithm,mode,algorithm_overlap,sloth_overlap,query_size,res_tab_size,intersection_mode_size,sloth_time(s),difference_overlap,difference_overlap_norm
i64,i64,str,str,i64,i64,i64,i64,i64,f64,i64,f64
1802503,1801468,"""josie""","""bag""",40,40,33,33,33,0.001,0,0.97561
1802503,1818378,"""josie""","""bag""",22,18,33,24,22,0.0,4,1.157895
1802503,1802124,"""josie""","""bag""",14,10,33,42,7,0.0,4,1.272727
1802503,1800989,"""josie""","""bag""",14,7,33,223,10,0.001,7,1.75
1802503,1802543,"""josie""","""bag""",13,9,33,43,7,0.0,4,1.3
…,…,…,…,…,…,…,…,…,…,…,…
339208,365099,"""josie""","""set""",7,6,12,13,7,0.0,1,1.0
339208,179019,"""josie""","""set""",7,6,12,19,7,0.0,1,1.0
339208,117055,"""josie""","""set""",7,5,12,23,7,0.0,2,1.166667
339208,78548,"""josie""","""set""",7,6,12,12,7,0.0,1,1.0


In [5]:
# results.filter((pl.col('query_id') == 350254) & (pl.col('algorithm') == 'josie') & (pl.col('mode') == 'set')).select('result_id').to_numpy()

In [8]:
# Per-query "silver standard": for every query_id, the distinct
# (result_id, sloth_overlap) pairs with a positive sloth_overlap, ordered by
# decreasing sloth_overlap.
#
# Every stored value is a list, so the default factory is `list` as well:
# looking up a query that is not in `results` then yields an empty list
# (ordered and sliceable, like every other entry) rather than an empty set.
silver_standard = defaultdict(list)
nqueries = results.select('query_id').unique().shape[0]
results_ids = results.select(['query_id', 'result_id', 'sloth_overlap']).unique().group_by(['query_id'])

for query_id, ids_overlaps in tqdm(results_ids, total=nqueries):
    # Column 0 is query_id (constant inside a group); keep the other two
    # columns as (result_id, sloth_overlap) tuples.
    s = set(map(tuple, ids_overlaps.to_numpy()[:, 1:].tolist()))
    # `query_id` is indexed with [0] because the group key is used as a
    # one-element sequence here (grouping was requested with a list of columns).
    # The sort is stable, so pairs with equal overlap keep the iteration order
    # of `s`.
    silver_standard[query_id[0]] = sorted(
        (pair for pair in s if pair[1] > 0),
        key=lambda pair: pair[1],
        reverse=True,
    )

100%|██████████| 99990/99990 [00:22<00:00, 4458.94it/s]


In [5]:
from tools.utils import parallel_worker
import importlib

# Pick up any edits made to the worker module since it was first imported.
importlib.reload(parallel_worker)

# Cut-offs passed to the worker for the precision computation.
p_values = [1, 3, 5, 10]
query_groups = results.select('query_id', 'algorithm', 'mode', 'result_id').to_pandas().groupby("query_id", group_keys=True)

# Never start more workers than the machine has CPUs; 72 stays the upper bound.
n_workers = min(72, mp.cpu_count())

# Parallel version needed for large query sets.
# The 'spawn' start method matches the pool used for the false-positive
# computation and avoids forking a process in which polars is already loaded.
with mp.get_context('spawn').Pool(processes=n_workers) as pool:
    # One task per query: (query_id, rows of that query, cut-offs, silver standard).
    per_query_results = pool.map(
        parallel_worker.worker_precision,
        ((name, data, p_values, silver_standard[name]) for name, data in query_groups),
    )

# Flatten the per-query lists into a single list of records.
precision_at_p_results = [x for qres in per_query_results for x in qres]

In [6]:
# Number of flattened precision records collected from the worker pool.
len(precision_at_p_results)

986712

In [33]:
# Show one randomly drawn row of the results frame (unseeded, so it changes per run).
results.sample()

query_id,result_id,algorithm,mode,algorithm_overlap,sloth_overlap,query_size,res_tab_size,intersection_mode_size,sloth_time(s),difference_overlap,difference_overlap_norm
i64,i64,str,str,i64,i64,i64,i64,i64,f64,i64,f64
425751,955188,"""josie""","""set""",5,5,6,123,5,0.0,0,0.833333


In [39]:
# Scratch check: draw the query_id of one random row (unseeded, so it changes per run).
results.select('query_id').sample()['query_id'][0]

200928

In [40]:
# Draw the query_id of one random row (unseeded) and collect the overlap
# values of its silver-standard entries, which are stored in decreasing order.
query_id = results.get_column('query_id').sample()[0]
query_silver_standard = silver_standard[query_id]
true_relevances = [entry[1] for entry in query_silver_standard]
query_id, true_relevances[:10]

(450718, [20, 20, 15, 15, 15, 15, 15, 15, 15, 15])

In [41]:
from collections import Counter as multiset

# Multiset of the three largest silver-standard overlaps; it does not depend
# on the solver, so it is built once before the loop.
tr = multiset(true_relevances[:3])

query_rows = results.filter(pl.col('query_id') == query_id).to_pandas()
for name, data in query_rows.groupby(['algorithm', 'mode'], group_keys=True):
    result_relevances = data['sloth_overlap'].tolist()
    # Multiset of the overlaps of the first ten rows returned for this solver.
    rr = multiset(result_relevances[:10])
    # Size of the multiset intersection: how many of the top-3 overlap values
    # also occur among those first ten rows.
    p = sum((tr & rr).values())
    print(name)
    print(result_relevances[:10], p)

('embedding', 'fasttext')
[0, 0, 0, 0, 0, 0, 15, 12, 15, 15] 1
('josie', 'bag')
[20, 20, 15, 15, 15, 15, 15, 15, 15, 15] 3
('josie', 'set')
[15, 12, 15, 15, 15, 10, 12, 15, 12, 12] 1
('lshforest', 'bag')
[1, 1, 1, 10, 1, 1, 1, 1, 1, 1] 0
('lshforest', 'set')
[1, 1, 1, 1, 1, 1, 1, 1, 1, 6] 0


In [31]:
from collections import Counter as multiset

# Scratch check of the multiset-intersection count on a hand-written result list.
tr = multiset(true_relevances[:10])
rr = multiset([40, 18, 10, 7, 9, 9, 9, 6, 9, 8])

# Total multiplicity of the values shared by both multisets.
sum((tr & rr).values())

8

In [8]:
import matplotlib.colors as mcolors

# Names of the Tableau palette colours, in the palette's own order.
colors = [*mcolors.TABLEAU_COLORS]

# Assign each (algorithm, mode) pair the palette colour at the same position;
# zip() stops at the shorter input, so only the first len(methods) colours are used.
methods = [
    ('josie', 'set'),
    ('josie', 'bag'),
    ('lshforest', 'set'),
    ('lshforest', 'bag'),
    ('embedding', 'fasttext'),
]
methods = dict(zip(methods, colors))
methods

{('josie', 'set'): 'tab:blue',
 ('josie', 'bag'): 'tab:orange',
 ('lshforest', 'set'): 'tab:green',
 ('lshforest', 'bag'): 'tab:red',
 ('embedding', 'fasttext'): 'tab:purple'}