In [191]:
import polars.selectors as cs
import numpy as np
import json
import polars as pl

import matplotlib.pyplot as plt

In [192]:
# Join keys for each query number: the columns used to line up an unnoised
# result row with its noised counterpart. An empty list marks a query that has
# no key columns; such queries are paired with a cross join instead.
index_cols = {
    1: ["l_returnflag", "l_linestatus"],
    3: ["l_orderkey", "o_orderdate", "o_shippriority"],
    4: ["o_orderpriority"],
    5: ["n_name"],
    6: [],
    7: ["supp_nation", "cust_nation", "l_year"],
    8: ["o_year"],
    9: ["nation", "o_year"],
    10: [
        "c_custkey",
        "c_name",
        "c_acctbal",
        "c_phone",
        "n_name",
        "c_address",
        "c_comment",
    ],
    12: ["l_shipmode"],
    13: ["c_count"],
    14: [],
    17: [],
    19: [],
    20: [],
    21: ["s_name"],
    22: ["cntrycode"],
}

# Queries whose noised values are rescaled before being compared with the
# unnoised answer. Queries 8 and 14 are deliberately left out of this set.
scale_required = {4, 5, 6, 7, 9, 12, 13, 17, 19, 21, 22}

# Queries actually evaluated below. Queries 3 and 20 are skipped, and
# 13, 19 and 21 are skipped for now.
queries_to_run = [1, 4, 5, 6, 7, 8, 9, 12, 14, 17, 22]

In [193]:
def get_m_diffs(dfs, index_cols, pct_diff_cols):
    """
    Pair two result frames and subtract their percentage-error columns.

    Parameters
    ----------
    dfs : sequence of polars DataFrames
        Only the first two entries are used. ``dfs[0]`` is treated as the
        m=128 frame and ``dfs[1]`` as the m=1024 frame; the function does not
        check this, so the caller must supply them in that order.
    index_cols : list of str
        Columns to join the two frames on. When empty, the frames are
        combined with a cross join instead.
    pct_diff_cols : iterable of str
        Column names present in both frames whose difference is wanted.

    Returns
    -------
    polars DataFrame
        The joined frame (columns of the second frame that clash carry the
        ``_1024`` suffix) plus one ``m_diff_<col>`` column per entry of
        ``pct_diff_cols``, equal to ``<col>_1024 - <col>``.
    """
    small_m, large_m = dfs[0], dfs[1]

    # No key columns means there is nothing to match on: pair every row of
    # one frame with every row of the other.
    if len(index_cols) == 0:
        paired = small_m.join(large_m, how='cross', suffix="_1024")
    else:
        paired = small_m.join(large_m, on=index_cols, suffix="_1024")

    deltas = [
        (pl.col(f"{name}_1024") - pl.col(name)).alias(f"m_diff_{name}")
        for name in pct_diff_cols
    ]
    return paired.with_columns(deltas)

In [194]:
def _mean_pct_error(truth, samples, scale):
    """Mean absolute percentage error of ``scale * sample`` against ``truth``.

    Parameters
    ----------
    truth : number or None
        The unnoised reference value.
    samples : list of numbers
        Noised values for the same cell, with missing entries already removed.
    scale : number
        Factor applied to every sample before comparing it with ``truth``.

    Returns
    -------
    float
        ``mean(100 * |scale * s - truth| / |truth|)`` over the samples. NaN is
        returned when there are no samples or when ``truth`` is missing or
        zero, because the relative error is undefined in those cases.
    """
    if truth is None or truth == 0 or len(samples) == 0:
        return float('nan')
    scaled = scale * np.asarray(samples, dtype=float)
    # Divide by |truth| so a negative reference value cannot flip the sign of
    # what is meant to be a non-negative error.
    return float(np.mean(100 * np.abs(scaled - truth) / abs(truth)))


mi = 1/16.
# Sample count each noised cell is checked against; a cell with a different,
# non-zero count is reported below.
# NOTE(review): presumably the number of noised trials per cell - confirm.
expected_samples = 1000
noised_suffix = '_noised'

for query_ind in queries_to_run:
    key_cols = index_cols[query_ind]
    # Subsampling scaling: noised values of these queries are doubled before
    # being compared with the unnoised answer.
    scale = 2 if query_ind in scale_required else 1

    # The unnoised answer does not depend on m, so read it once per query.
    unnoised = pl.read_csv(f'../unnoised/q{query_ind}.csv')
    if query_ind == 22:
        # Cast the join key to a string before joining with the noised output.
        unnoised = unnoised.select(
            pl.col("cntrycode").cast(str),
            pl.col("numcust"),
            pl.col("totacctbal"))
    if query_ind == 10:
        unnoised = unnoised.select(
            pl.col("c_custkey"), pl.col("c_name"),
            pl.col("c_acctbal").cast(str),
            pl.col("n_name"), pl.col("c_address"), pl.col("c_phone"), pl.col("c_comment")
        )

    dfs = []
    for m in [128, 1024]:
        noised_df = pl.read_json(f'../outputs/m={m}/ap-duckdb-q{query_ind}-customer-{mi}-step3/output.json')
        if key_cols:
            merged_df = unnoised.join(noised_df, on=key_cols, suffix=noised_suffix)
        else:
            merged_df = unnoised.join(noised_df, suffix=noised_suffix, how='cross')

        # A measure is any column that also has a "<name>_noised" partner
        # after the join. Walking the frame's own column list keeps the order
        # of the generated pct_diff_* columns deterministic.
        all_cols = set(merged_df.columns)
        measure_cols = [
            col for col in merged_df.columns
            if not col.endswith(noised_suffix) and col + noised_suffix in all_cols
        ]

        for base_name in measure_cols:
            # Pull both columns out as Python lists once, instead of indexing
            # into the frame for every single sample.
            truths = merged_df[base_name].to_list()
            sample_lists = merged_df[base_name + noised_suffix].to_list()

            rel_errors = []
            for truth, row in zip(truths, sample_lists):
                samples = [val for val in (row or []) if val is not None]
                if len(samples) != expected_samples and len(samples) != 0:
                    # Some, but not all, samples for this cell are missing.
                    print('reached', query_ind)
                rel_errors.append(_mean_pct_error(truth, samples, scale))

            merged_df = merged_df.with_columns(
                pl.Series('pct_diff_' + base_name, rel_errors, dtype=pl.Float64)
            )

        # Tag every row with the run parameters, once per frame.
        n_rows = merged_df.height
        merged_df = merged_df.with_columns(
            pl.Series('mi', [mi] * n_rows),
            pl.Series('m', [m] * n_rows),
        )
        dfs.append(merged_df)

    pct_diff_columns = [col for col in dfs[0].columns if col.startswith('pct_diff_')]
    m_diffs = get_m_diffs(dfs, key_cols, pct_diff_columns)
    rel_results = [col for col in m_diffs.columns if col.startswith('m_diff')]
    df = m_diffs[rel_results]
    # Extremes of (error at m=1024) - (error at m=128) over all rows and measures.
    global_min = df.select(pl.min_horizontal(df.columns)).min().item()
    global_max = df.select(pl.max_horizontal(df.columns)).max().item()
    print(query_ind, global_max, global_min)
    if query_ind == 8:
        print(m_diffs)
    print('\n\n')

1 0.14970810098791532 -0.4891350586952967



4 0.029338246300750548 -0.906178110889087



5 2.6850459664211 -0.76946879691803



6 -0.15236767563117004 -0.15236767563117004



7 3.4557523048117638 -1.4371711407582772



8 11.37580973514926 2.745425504196646
shape: (2, 12)
┌────────┬───────────┬─────────────┬─────────────┬───┬─────────────┬─────────┬────────┬────────────┐
│ o_year ┆ mkt_share ┆ mkt_share_n ┆ pct_diff_mk ┆ … ┆ pct_diff_mk ┆ mi_1024 ┆ m_1024 ┆ m_diff_pct │
│ ---    ┆ ---       ┆ oised       ┆ t_share     ┆   ┆ t_share_102 ┆ ---     ┆ ---    ┆ _diff_mkt_ │
│ i64    ┆ f64       ┆ ---         ┆ ---         ┆   ┆ 4           ┆ f64     ┆ i64    ┆ share      │
│        ┆           ┆ list[f64]   ┆ f64         ┆   ┆ ---         ┆         ┆        ┆ ---        │
│        ┆           ┆             ┆             ┆   ┆ f64         ┆         ┆        ┆ f64        │
╞════════╪═══════════╪═════════════╪═════════════╪═══╪═════════════╪═════════╪════════╪════════════╡
│ 1995   ┆ 0.028649 