In [113]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import polars.selectors as cs

In [114]:
# Query number -> key columns used to line up two result frames for that query.
# An empty list means there is no key, in which case the frames are cross-joined.
index_cols = {
    1: ['l_returnflag', 'l_linestatus'],
    3: ['l_orderkey', 'o_orderdate', 'o_shippriority'],
    4: ['o_orderpriority'],
    5: ['n_name'],
    6: [],
    7: ['supp_nation', 'cust_nation', 'l_year'],
    8: ['o_year'],
    9: ['nation', 'o_year'],
    10: [
        'c_custkey', 'c_name', 'c_acctbal', 'c_phone',
        'n_name', 'c_address', 'c_comment',
    ],
    12: ['l_shipmode'],
    13: ['c_count'],
    14: [],
    17: [],
    19: [],
    20: [],
    21: ['s_name'],
    22: ['cntrycode'],
}

# Queries whose noised values are multiplied by 2 (subsampling scaling) before
# being compared with the unnoised value.
# not 8, 14
scale_required = {4, 5, 6, 7, 9, 12, 13, 17, 19, 21, 22}

# Queries actually processed by the loops in this notebook.
# skip 3, skip 20
# skip 13, 19, 21 for now
queries_to_run = [1, 4, 5, 6, 7, 8, 9, 12, 14, 17, 22]

In [164]:
def get_m_diffs(dfs, index_cols, pct_diff_cols):
    """
    Pair two result frames and report how much each metric column changed.

    Args:
        dfs: sequence whose first element is the m=128 frame and whose second
            element is the m=1024 frame.
        index_cols: key columns to join on; when empty the frames are
            cross-joined instead.
        pct_diff_cols: names of the metric columns to compare.

    Returns:
        The joined frame (columns from the second frame that clash carry a
        "_1024" suffix) plus one "m_diff_<col>" column per metric holding
        100 * |value_1024 - value_128| / value_128.
    """
    frame_128 = dfs[0]
    frame_1024 = dfs[1]

    # Keyed join when keys are available, otherwise pair every row with every row.
    join_kwargs = {"on": index_cols} if len(index_cols) > 0 else {"how": "cross"}
    paired = frame_128.join(frame_1024, suffix="_1024", **join_kwargs)

    return paired.with_columns([
        (100 * (pl.col(f"{col}_1024") - pl.col(col)).abs() / pl.col(col)).alias(f"m_diff_{col}")
        for col in pct_diff_cols
    ])

In [166]:
# For every query, compare noised outputs against the unnoised baseline at
# m=128 and m=1024, then report how much the mean percentage error differs
# between the two m settings.
mi = 1/128.
for query_ind in queries_to_run:
    # The unnoised baseline does not depend on m, so it is read once per query.
    baseline = pl.read_csv(f'../unnoised/q{query_ind}.csv')
    if query_ind == 22:
        # Cast the key to string before joining with the noised output.
        baseline = baseline.select(
            pl.col("cntrycode").cast(str),
            pl.col("numcust"),
            pl.col("totacctbal"))
    if query_ind == 10:
        baseline = baseline.select(
            pl.col("c_custkey"), pl.col("c_name"),
            pl.col("c_acctbal").cast(str),
            pl.col("n_name"), pl.col("c_address"), pl.col("c_phone"), pl.col("c_comment")
        )

    key_cols = index_cols[query_ind]
    # Subsampling scaling: noised values of these queries are doubled before comparison.
    const = 2 if query_ind in scale_required else 1

    dfs = []
    for m in [128, 1024]:
        noised_df = pl.read_json(f'../outputs/m={m}/ap-duckdb-q{query_ind}-customer-{mi}-step3/output.json')
        if key_cols:
            merged_df = baseline.join(noised_df, on=key_cols, suffix='_noised')
        else:
            merged_df = baseline.join(noised_df, suffix='_noised', how='cross')

        noised_suffix = '_noised'
        merged_cols = merged_df.columns
        # Ordered de-duplication keeps the resulting column order stable between runs.
        base_names = list(dict.fromkeys(col.replace(noised_suffix, '') for col in merged_cols))

        new_columns = []
        for base_name in base_names:
            noised_name = base_name + noised_suffix
            # Only columns present both unnoised and noised are metrics to compare.
            if base_name not in merged_cols or noised_name not in merged_cols:
                continue

            # Pull each column into Python objects once instead of re-indexing
            # the frame for every single element.
            true_vals = merged_df[base_name].to_list()
            noised_rows = merged_df[noised_name].to_list()

            rel_errors = []
            for true_val, noised_row in zip(true_vals, noised_rows):
                noised_vals = [v for v in noised_row if v is not None]
                # 1000 is presumably the number of noised samples per row -- TODO confirm.
                if len(noised_vals) != 1000 and len(noised_vals) != 0:
                    print('reached', query_ind)
                # Mean percentage error of the (scaled) noised samples for this row.
                rel_errors.append(np.average(
                    [100*abs(const*v - true_val) / true_val for v in noised_vals]
                ))
            new_columns.append(pl.Series('pct_diff_' + base_name, rel_errors))

        if new_columns:
            # Tag the frame with the settings it was produced under.
            n_rows = merged_df.height
            new_columns.append(pl.Series('mi', [mi]*n_rows))
            new_columns.append(pl.Series('m', [m]*n_rows))
            merged_df = merged_df.with_columns(new_columns)
        dfs.append(merged_df)

    pct_diff_columns = [col for col in dfs[0].columns if col.startswith('pct_diff_')]
    m_diffs = get_m_diffs(dfs, key_cols, pct_diff_columns)
    rel_results = [col for col in m_diffs.columns if col.startswith('m_diff')]
    df = m_diffs[rel_results]
    # Extremes over every m_diff column and every row.
    global_min = df.select(pl.min_horizontal(df.columns)).min().item()
    global_max = df.select(pl.max_horizontal(df.columns)).max().item()
    print(query_ind, global_max)
    if query_ind == 8:
        print(m_diffs)
    print('\n\n')

1 8.604729996657483



4 10.847832801849622



5 11.186415272367968



6 1.2364715322252289



7 5.560724143586848



8 6.7786366395635875
shape: (2, 12)
┌────────┬───────────┬─────────────┬────────────┬───┬────────────┬───────────┬────────┬────────────┐
│ o_year ┆ mkt_share ┆ mkt_share_n ┆ pct_diff_m ┆ … ┆ pct_diff_m ┆ mi_1024   ┆ m_1024 ┆ m_diff_pct │
│ ---    ┆ ---       ┆ oised       ┆ kt_share   ┆   ┆ kt_share_1 ┆ ---       ┆ ---    ┆ _diff_mkt_ │
│ i64    ┆ f64       ┆ ---         ┆ ---        ┆   ┆ 024        ┆ f64       ┆ i64    ┆ share      │
│        ┆           ┆ list[f64]   ┆ f64        ┆   ┆ ---        ┆           ┆        ┆ ---        │
│        ┆           ┆             ┆            ┆   ┆ f64        ┆           ┆        ┆ f64        │
╞════════╪═══════════╪═════════════╪════════════╪═══╪════════════╪═══════════╪════════╪════════════╡
│ 1995   ┆ 0.028649  ┆ [0.121319,  ┆ 322.935627 ┆ … ┆ 315.175816 ┆ 0.0078125 ┆ 1024   ┆ 2.402897   │
│        ┆           ┆ 0.088126, … ┆  

In [90]:
# Display the comparison frame currently bound to `m_diffs` in the kernel.
m_diffs

o_year,mkt_share,mkt_share_noised,pct_diff_mkt_share,mi,m,mkt_share_1024,mkt_share_noised_1024,pct_diff_mkt_share_1024,mi_1024,m_1024,m_diff_pct_diff_mkt_share
i64,f64,list[f64],f64,f64,i64,f64,list[f64],f64,f64,i64,f64
1995,0.028649,"[0.121319, 0.088126, … 0.152377]",322.935627,0.0078125,128,0.028649,"[0.059623, 0.045622, … 0.060671]",315.175816,0.0078125,1024,-7.759811
1996,0.01825,"[0.020605, 0.081351, … -0.026852]",307.259321,0.0078125,128,0.01825,"[-0.019248, 0.025014, … 0.056986]",328.087314,0.0078125,1024,20.827993


In [78]:
# Smallest and largest value across all columns and rows of `df`:
# reduce each row first, then reduce the resulting single column.
metric_cols = df.columns
row_minima = df.select(pl.min_horizontal(metric_cols))
row_maxima = df.select(pl.max_horizontal(metric_cols))
global_min = row_minima.min().item()
global_max = row_maxima.max().item()

In [172]:
# For every query and every step-2 group file, compare the square root of the
# stored 'scale' value between the m=128 and m=1024 runs.
for query_ind in queries_to_run:
    print(f'query {query_ind}')
    # Group ids are the stems of the '<group>.json' files of the m=1024 run;
    # any other directory entry is ignored instead of crashing int().
    groups = [
        int(k[:-5])
        for k in os.listdir(f'../outputs/m=1024/ap-duckdb-q{query_ind}-customer-0.5-step2/')
        if k.endswith('.json')
    ]
    max_diff = None
    for group in groups:
        scales = []
        for m in [128, 1024]:
            fname = f'../outputs/m={m}/ap-duckdb-q{query_ind}-customer-0.5-step2/{group}.json'
            # Context manager so the file handle is closed promptly.
            with open(fname, 'rb') as group_file:
                d = json.load(group_file)
            scales.append(np.sqrt(d['scale']))
        # Percentage difference of the m=128 value relative to the m=1024 value.
        scale_diff = 100*abs(scales[0] - scales[1]) / scales[1]
        print(group, scales[0], scales[1], scale_diff)
        if max_diff is None or scale_diff > max_diff:
            max_diff = scale_diff

    print(query_ind, max_diff)
    print('----')

query 1
20 54.68733222134188 56.73366931676471 3.606918290437701
16 0.03566332987729239 0.03641475999276939 2.063531698756785
6 119860142.10519603 116921672.71366768 2.5131947938552353
7 58142197.766879775 58898077.558953196 1.2833692089811148
17 0.23363176920773504 0.2350423748448233 0.6001494998591401
21 359.84933632054964 356.68836352110293 0.8862001463245651
10 114000357.33902246 111094560.42104848 2.6156068370593553
0 43297.388812710444 42747.14299567188 1.2872107431700812
26 6.436745787248903e-05 5.916325462888581e-05 8.796343737760374
30 3255.3117260246754 3235.511660791688 0.6119608676713165
31 1628.7074099261774 1623.585641850151 0.315460296273005
27 8.578309188401074e-05 8.405767354373924e-05 2.052660117191655
1 2329.605518971707 2184.9717413281755 6.619480467770869
11 55308806.15941343 55977398.942036025 1.1943977306178848
2 84930.0770489626 83016.28081298493 2.305326397708637
28 1670.9137551350786 1662.0944886781958 0.5306116178687568
12 60383832.562896125 59514766.72122111

In [173]:
# Ad-hoc ratio of two hard-coded numbers; where they come from is not shown
# in this notebook -- TODO document their origin.
348056.9509657247/4497840

0.0773831330073379

In [145]:
from scipy.stats import bernoulli

N_TRIALS = 1024      # number of Bernoulli(0.5) vectors drawn
N_MEMBERS = 150000   # length of each vector
# Fixed seed so that re-running the notebook reproduces the same counts.
prior_rng = np.random.default_rng(0)

# Count, for every index, in how many of the drawn vectors it was set to 1.
# One vectorized add per vector replaces a Python-level loop over every entry.
inclusion_counts = np.zeros(N_MEMBERS, dtype=np.int64)
for _ in range(N_TRIALS):
    inclusion_counts += bernoulli.rvs(0.5, loc=0, size=N_MEMBERS, random_state=prior_rng)

# Keep the original mapping shape: index -> count.
member_priors = dict(enumerate(inclusion_counts.tolist()))



In [146]:
# Largest and smallest per-index counts accumulated in `member_priors`.
max(member_priors.values()), min(member_priors.values())

(582, 448)

In [148]:
# Largest recorded count (582) as a fraction of the 1024 sampled vectors.
582/1024

0.568359375

In [149]:
# Smallest recorded count (448) as a fraction of the 1024 sampled vectors.
448/1024

0.4375