In [42]:
import polars.selectors as cs
import numpy as np

import polars as pl

import matplotlib.pyplot as plt

In [43]:
def hist_of_numerical_values(df: pl.DataFrame):
    """
    Build a histogram for every integer- or float-typed column of a Polars DataFrame.

    Args:
        df: The Polars DataFrame to scan.

    Returns:
        A dictionary mapping each qualifying column name to the result of
        calling ``hist()`` on that column. Columns of any other dtype are
        left out.
    """
    # Only these signed/unsigned integer and float dtypes count as "numerical".
    numeric_dtypes = (
        pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
        pl.Float32, pl.Float64,
    )
    return {
        name: df[name].hist()
        for name in df.columns
        if df[name].dtype in numeric_dtypes
    }

In [44]:
# Key columns per query number. The analysis cell joins the reference result
# with the noised result on these columns; an empty list means there are no
# key columns, in which case that cell uses a cross join instead.
index_cols = {
    1: ['l_returnflag', 'l_linestatus'],
    3: ['l_orderkey', 'o_orderdate', 'o_shippriority'],
    4: ['o_orderpriority'],
    5: ['n_name'],
    6: [],
    7: ['supp_nation', 'cust_nation', 'l_year'],
    8: ['o_year'],
    9: ['nation', 'o_year'],
    12: ['l_shipmode'],
    13: ['c_count'],
    14: [],
    15: [],
    17: [],
    19: [],
    20: [],
    21: ['s_name'],
    22: ['cntrycode'],
}

# Queries whose noised values get multiplied by the subsampling factor before
# being compared with the reference. Queries 8 and 14 are intentionally absent.
scale_required = {1, 4, 5, 6, 7, 9, 12, 13, 17, 19, 21, 22}

# Queries evaluated by the analysis cell. Query 20 is skipped.
# NOTE(review): the original note also said "skip 3", yet 3 is still listed
# here -- confirm whether query 3 should really be evaluated.
queries_to_run = [
    1, 3, 4, 5, 6, 7, 8, 9,
    12, 13, 14, 15, 17, 19, 21, 22,
]

In [68]:
# Compare every query's noised trial outputs with its reference result.
#
# For each query number in `queries_to_run` this cell fills:
#   avg_errors[q] - mean, over all compared (row, value column) outputs, of the
#                   per-output mean percentage error across trials
#                   (NaN when no output had a usable trial value)
#   all_nulls[q]  - percentage of compared (row, value column) outputs that
#                   have fewer than `num_trials` non-null trial values
num_trials = 1000          # trial values expected per noised output cell
NOISED_SUFFIX = '_noised'  # suffix the join puts on clashing noised columns
avg_errors = {}
all_nulls = {}


def _subsampling_scale(query_ind, base_name):
    """Return the factor applied to noised values of `base_name` before comparison.

    Queries in `scale_required` use a factor of 2 ("subsampling scaling"); for
    query 1 only columns whose name starts with 'sum' or 'count' are rescaled.
    Everything else uses 1.
    """
    if query_ind not in scale_required:
        return 1
    if query_ind == 1 and not base_name.startswith(('sum', 'count')):
        return 1
    return 2


def _mean_pct_error(trials, truth, scale):
    """Mean of 100 * |scale * trial - truth| / |truth| over `trials`.

    Returns NaN when the error is undefined: no trial values, or a reference
    value that is missing or zero.
    """
    if not trials or truth is None or truth == 0:
        return np.nan
    scaled = scale * np.asarray(trials, dtype=float)
    # abs() on the denominator keeps the percentage non-negative for negative
    # reference values.
    return float(np.mean(100 * np.abs(scaled - truth) / abs(truth)))


for query_ind in queries_to_run:
    # Per-query accumulators; kept together so the null percentage always
    # divides counts gathered over the same set of outputs.
    num_nulls = 0
    all_outputs = 0
    total_error = []
    for mi in [1/16]:
        null_info = {}
        orig_df = pl.read_csv(f'../unnoised/q{query_ind}.csv')
        if query_ind == 22:
            # NOTE(review): presumably cast so the join key dtype matches the
            # noised output -- confirm.
            orig_df = orig_df.select(
                pl.col("cntrycode").cast(str),
                pl.col("numcust"),
                pl.col("totacctbal"))
        noised_df = pl.read_json(f'../outputs/ap-duckdb-q{query_ind}-customer-{mi}-step3/output.json')

        key_cols = index_cols[query_ind]
        if key_cols:
            merged_df = orig_df.join(noised_df, on=key_cols, suffix=NOISED_SUFFIX, how='left')
        else:
            # No key columns: pair every reference row with every noised row.
            merged_df = orig_df.join(noised_df, suffix=NOISED_SUFFIX, how='cross')

        if query_ind == 15:
            base_names = ['total_revenue']  # only look at numeric
        else:
            # Sorted so columns are processed (and messages printed) in a
            # reproducible order.
            base_names = sorted({col.replace(NOISED_SUFFIX, '') for col in merged_df.columns})

        for base_name in base_names:
            nulls_exist = False
            null_inds, null_vals = [], []

            truth_col = base_name
            noised_col = base_name + NOISED_SUFFIX
            # Only columns present on both sides can be compared (key columns
            # have no noised counterpart and fall through).
            if truth_col in merged_df.columns and noised_col in merged_df.columns:
                scale = _subsampling_scale(query_ind, base_name)
                truths = merged_df[truth_col].to_list()
                # Materialise each row's trial list once, dropping nulls; a row
                # without any noised match (null list) counts as "no trials".
                trials_per_row = [
                    [] if row is None else [val for val in row if val is not None]
                    for row in merged_df[noised_col].to_list()
                ]

                rel_errors = []
                for trials, truth in zip(trials_per_row, truths):
                    if len(trials) not in (0, num_trials):
                        # Partially populated output: some but not all trials present.
                        print('reached', query_ind)
                    pct_error = _mean_pct_error(trials, truth, scale)
                    if not np.isnan(pct_error):
                        total_error.append(pct_error)
                    rel_errors.append(pct_error)
                merged_df = merged_df.with_columns(pl.Series('pct_diff_' + base_name, rel_errors))

                all_outputs += len(trials_per_row)
                for ind, trials in enumerate(trials_per_row):
                    if len(trials) < num_trials:
                        print(trials, noised_col, ind, query_ind)
                        print('reached')
                        num_nulls += 1
                        nulls_exist = True
                        null_inds.append(ind)
                        # Percentage of this output's trials that are missing.
                        null_vals.append(100*(num_trials-len(trials))/num_trials)
            null_info[base_name] = (nulls_exist, null_inds, null_vals)

        # Guard the empty cases explicitly instead of dividing by zero or
        # averaging an empty list.
        all_nulls[query_ind] = 100*num_nulls/all_outputs if all_outputs else np.nan
        avg_errors[query_ind] = np.average(total_error) if total_error else np.nan
        print(query_ind, len(total_error))


1 32
[] revenue_noised 0 3
reached
[] revenue_noised 1 3
reached
[] revenue_noised 2 3
reached
[] revenue_noised 3 3
reached
[] revenue_noised 4 3
reached
[] revenue_noised 5 3
reached
[] revenue_noised 6 3
reached
[] revenue_noised 7 3
reached
[] revenue_noised 8 3
reached
[] revenue_noised 9 3
reached
3 0
4 5
5 5
6 1
7 4
8 2
9 175
12 4
[] custdist_noised 38 13
reached
[] custdist_noised 39 13
reached
[] custdist_noised 40 13
reached
[] custdist_noised 41 13
reached
13 38
14 1
15 1
17 1
19 1
[] numwait_noised 72 21
reached
[] numwait_noised 80 21
reached
[] numwait_noised 83 21
reached
21 97
22 14


In [70]:
# Mean percentage error of the noised outputs per query number
# (NaN when a query yielded no usable trial values).
avg_errors

{1: 0.7640571939467615,
 3: nan,
 4: 2.5469958836135804,
 5: 9.263597485973447,
 6: 1.3989311870034076,
 7: 8.66999545826608,
 8: 38.25116485495772,
 9: 7.175120996019211,
 12: 2.9308897873029274,
 13: 12.95213016495732,
 14: 2.3091034711389145,
 15: 36.05825185631173,
 17: 11.230387448796444,
 19: 22.53806528302791,
 21: 63.421272685664235,
 22: 7.9831995583919175}

In [71]:
# Per query number: percentage of compared outputs (row x value column)
# that have fewer than num_trials non-null noised values.
all_nulls

{1: 0.0,
 3: 100.0,
 4: 0.0,
 5: 0.0,
 6: 0.0,
 7: 0.0,
 8: 0.0,
 9: 0.0,
 12: 0.0,
 13: 9.523809523809524,
 14: 0.0,
 15: 0.0,
 17: 0.0,
 19: 0.0,
 21: 3.0,
 22: 0.0}