In [47]:
import polars.selectors as cs
import numpy as np

import polars as pl

import matplotlib.pyplot as plt

In [48]:
def hist_of_numerical_values(df: pl.DataFrame):
    """
    Build a histogram for each integer or floating-point column of a frame.

    Args:
        df: The Polars DataFrame whose columns are inspected.

    Returns:
        A dict mapping each numeric column name (in frame order) to the
        result of calling ``.hist()`` on that column. Columns of any other
        dtype are left out.
    """
    # Dtypes treated as "numerical": signed/unsigned integers and floats.
    numeric_dtypes = (
        pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
        pl.Float32, pl.Float64,
    )
    return {
        name: df[name].hist()
        for name in df.columns
        if df[name].dtype in numeric_dtypes
    }

In [49]:
# Join-key columns for each query number. The analysis loop joins the
# un-noised and noised results on these columns; an empty list means there
# are no key columns and the two results are combined with a cross join.
index_cols = {
    1: ["l_returnflag", "l_linestatus"],
    3: ["l_orderkey", "o_orderdate", "o_shippriority"],
    4: ["o_orderpriority"],
    5: ["n_name"],
    6: [],
    7: ["supp_nation", "cust_nation", "l_year"],
    8: ["o_year"],
    9: ["nation", "o_year"],
    12: ["l_shipmode"],
    13: ["c_count"],
    14: [],
    15: [],
    17: [],
    19: [],
    20: [],
    21: ["s_name"],
    22: ["cntrycode"],
}

# Queries whose noised values are multiplied by the subsampling factor (2)
# before being compared with the un-noised result. Queries 8 and 14 are
# intentionally not in this set.
scale_required = {1, 4, 5, 6, 7, 9, 12, 13, 17, 19, 21, 22}

# Queries processed by the analysis loop, in order.
# NOTE(review): an earlier note here said "skip 3, skip 20". Query 20 is
# absent from this list, but query 3 is still present -- confirm whether 3
# should be run.
queries_to_run = [
    1, 3, 4, 5, 6, 7, 8, 9,
    12, 13, 14, 15, 17, 19, 21, 22,
]

In [52]:
def _mean_pct_error(trial_values, reference, scale=1):
    """Mean absolute percentage error of scaled trial values vs. a reference.

    Args:
        trial_values: Non-null noised outputs for a single result row.
        reference: The un-noised value for the same row.
        scale: Multiplier applied to every trial value before comparison.

    Returns:
        The mean of ``100 * |scale * v - reference| / |reference|`` as a
        float, or NaN when there are no trial values or the reference is 0
        (the relative error is undefined in both cases).
    """
    if len(trial_values) == 0 or reference == 0:
        return float('nan')
    scaled = scale * np.asarray(trial_values, dtype=float)
    # Divide by |reference| so a negative reference cannot flip the sign.
    return float(np.average(100 * np.abs(scaled - reference) / abs(reference)))


num_trials = 1000       # expected number of noised trials per output value
SUBSAMPLE_SCALE = 2     # subsampling scaling applied to queries in scale_required
NOISED_SUFFIX = '_noised'

avg_errors = {}  # query number -> mean percentage error over all compared outputs
all_nulls = {}   # query number -> percentage of outputs with missing trials

for query_ind in queries_to_run:
    num_nulls = 0.
    total_error = []
    # Other mi values noted by the author:
    # 1/128, 1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1.0, 2.0, 4.0
    # NOTE: avg_errors/all_nulls are keyed by query only, while num_nulls and
    # total_error accumulate across mi values, so keep a single mi per run.
    for mi in [1/16]:
        null_info = {}
        orig = pl.read_csv(f'../unnoised/q{query_ind}.csv')
        if query_ind == 22:
            # Cast the key to string so it matches the noised output's dtype
            # -- presumably the JSON stores it as text; confirm.
            orig = orig.select(
                pl.col("cntrycode").cast(str),
                pl.col("numcust"),
                pl.col("totacctbal"))
        noised = pl.read_json(f'../outputs/ap-duckdb-q{query_ind}-customer-{mi}-step3/output.json')

        join_keys = index_cols[query_ind]
        if join_keys:
            merged_df = orig.join(noised, on=join_keys, suffix=NOISED_SUFFIX)
        else:
            # No key columns: pair the two results with a cross join.
            merged_df = orig.join(noised, suffix=NOISED_SUFFIX, how='cross')

        # Every column name with the noised suffix stripped. Sorted so the
        # processing (and printing) order is deterministic across runs.
        base_names = sorted({col.replace(NOISED_SUFFIX, '') for col in merged_df.columns})
        if query_ind == 15:
            base_names = ['total_revenue'] # only look at numeric

        all_outputs = 0.
        for base_name in base_names:
            nulls_exist = False
            null_inds, null_vals = [], []

            orig_col = base_name
            noised_col = base_name + NOISED_SUFFIX
            # Only columns present in both results can be compared; join keys
            # have no suffixed counterpart and fall through with empty info.
            if orig_col in merged_df.columns and noised_col in merged_df.columns:
                # The scale factor depends only on the query and column, so
                # compute it once instead of once per row.
                scale = 1
                if query_ind in scale_required:
                    scale = SUBSAMPLE_SCALE
                    if query_ind == 1 and not base_name.startswith(('sum', 'count')):
                        scale = 1 # remove for parts of q1

                # Materialise both columns once; indexing the Series inside
                # the per-trial loop rebuilt it for every single value.
                references = merged_df[orig_col].to_list()
                trial_lists = [
                    [v for v in (row if row is not None else []) if v is not None]
                    for row in merged_df[noised_col].to_list()
                ]

                rel_errors = []
                for reference, trials in zip(references, trial_lists):
                    if len(trials) not in (0, num_trials):
                        print('reached', query_ind)
                    new = _mean_pct_error(trials, reference, scale)
                    if not np.isnan(new):
                        total_error.append(new)
                    rel_errors.append(new)
                merged_df = merged_df.with_columns(pl.Series('pct_diff_' + base_name, rel_errors))

                num_outputs = len(trial_lists)
                all_outputs += num_outputs
                # Second pass kept separate so the log order matches the
                # per-row "reached" messages printed above.
                for ind, trials in enumerate(trial_lists):
                    if len(trials) < num_trials:
                        print(trials, noised_col, ind, query_ind)
                        print('reached')
                        num_nulls += 1
                        nulls_exist = True
                        null_inds.append(ind)
                        null_vals.append(100*(num_trials-len(trials))/num_trials)
            null_info[base_name] = (nulls_exist, null_inds, null_vals)

        # Guard against an empty comparison (no rows or no comparable columns).
        all_nulls[query_ind] = 100*num_nulls/all_outputs if all_outputs else float('nan')
        avg_errors[query_ind] = float(np.average(total_error)) if total_error else float('nan')
        print(query_ind, len(total_error))

#         # Get columns that start with 'pct_diff'
#         pct_diff_cols = [col for col in merged_df.columns if col.startswith('pct_diff')]

#         # Create subplots based on number of pct_diff columns
#         n_cols = len(pct_diff_cols)
#         fig, axes = plt.subplots(1, n_cols, figsize=(5*n_cols, 4))

#         # Handle case where there's only one column (axes won't be a list)
#         if n_cols == 1:
#             axes = [axes]

#         # Create histogram for each column
#         for i, col in enumerate(pct_diff_cols):
#             # Convert to pandas for easier plotting, or use polars plot method
#             values = merged_df[col].to_pandas()

#             axes[i].hist(values, bins=30, alpha=0.7, edgecolor='black')
#             axes[i].set_title(f'Histogram of {col}')
#             axes[i].set_xlabel(col)
#             axes[i].set_ylabel('Frequency')
#             axes[i].grid(True, alpha=0.3)

#         plt.tight_layout()
#         plt.savefig(f'tmp/q{query_ind}_{mi}_{col}_hist.png', facecolor='white', bbox_inches='tight')

#         for base in null_info:
#             (nulls_exist, null_inds, null_vals) = null_info[base]
#             if nulls_exist:
#                 fig, axes = plt.subplots(1, 1, figsize=(5, 4))
#                 # Handle case where there's only one column (axes won't be a list)
#                 if n_cols == 1:
#                     axes = [axes]
#                 axes[0].hist(null_vals, bins=30, alpha=0.7, edgecolor='black')
#                 axes[0].set_title(f'Histogram of nulls for column {base}')
#                 axes[0].set_xlabel('Fraction of nulls (%)')
#                 axes[0].set_ylabel('Frequency')
#                 axes[0].grid(True, alpha=0.3)
#                 plt.savefig(f'tmp/q{query_ind}_{mi}_{base}_nulls.png', facecolor='white', bbox_inches='tight')

1 32
[] revenue_noised 0 3
reached
[] revenue_noised 1 3
reached
[] revenue_noised 2 3
reached
[] revenue_noised 3 3
reached
[] revenue_noised 4 3
reached
[] revenue_noised 5 3
reached
[] revenue_noised 6 3
reached
[] revenue_noised 7 3
reached
[] revenue_noised 8 3
reached
[] revenue_noised 9 3
reached
3 0
4 5
5 5
6 1
7 4
8 2
9 175
12 4
[] custdist_noised 38 13
reached
[] custdist_noised 39 13
reached
[] custdist_noised 40 13
reached
[] custdist_noised 41 13
reached
13 38
14 1
15 1
17 1
19 1
[] numwait_noised 82 21
reached
[] numwait_noised 96 21
reached
21 98
22 14


In [53]:
# Per query: percentage of compared outputs that had fewer than num_trials non-null trials.
all_nulls

{1: 0.0,
 3: 100.0,
 4: 0.0,
 5: 0.0,
 6: 0.0,
 7: 0.0,
 8: 0.0,
 9: 0.0,
 12: 0.0,
 13: 9.523809523809524,
 14: 0.0,
 15: 0.0,
 17: 0.0,
 19: 0.0,
 21: 2.0,
 22: 0.0}

In [54]:
# Per query: mean percentage error averaged over every compared output row and column.
avg_errors

{1: 0.7732860751185819,
 3: nan,
 4: 2.457698262655051,
 5: 8.947538868703782,
 6: 1.2230274679801414,
 7: 9.0981388689475,
 8: 36.03536047108541,
 9: 7.156608821035725,
 12: 2.876811213196448,
 13: 13.029334768282093,
 14: 2.2566754193439857,
 15: 35.55656505822582,
 17: 10.695317937297462,
 19: 23.116914035307033,
 21: 63.31536583248483,
 22: 7.882563610303036}

In [27]:
# Inspect the noised trial values of `numwait` for row 52. `merged_df` holds whichever
# query the analysis loop processed last -- presumably a run ending on query 21; confirm.
merged_df['numwait_noised'][52]

null


In [11]:
# Mean percentage error per query; only contains the queries the analysis loop last ran.
avg_errors

{21: 64.20572510180371}

In [21]:
# For the last query processed, per base column:
# (any rows with missing trials?, their row indices, percentage of trials missing).
null_info

{'numwait': (True, [52], [100.0]), 's_name': (False, [], [])}

In [19]:
# Number of outputs with fewer than num_trials non-null trials for the last query processed.
num_nulls

1.0

In [22]:
# Show the un-noised `numwait` value of row 52 (printed as a one-element Series).
print(merged_df[52]['numwait'])

shape: (1,)
Series: 'numwait' [i64]
[
	13
]


In [80]:
# Print, for every merged row, its position, the un-noised `custdist` value and the
# one-element list holding that row's mean percentage error.
for row_idx in range(merged_df.height):
    row = merged_df[row_idx]
    custdist_value = row['custdist'].to_list()[0]
    pct_error = row['pct_diff_custdist'].to_list()
    print(row_idx, custdist_value, pct_error)

0 50004 [1.1196400910320623]
1 6668 [2.973363810267819]
2 6563 [2.7433035361358535]
3 6004 [2.983284372430054]
4 5890 [3.0222579261386073]
5 5600 [3.3500042940348767]
6 5029 [2.9536883946210755]
7 4680 [3.190575314304907]
8 4805 [3.5212760490062824]
9 4473 [3.6883829004876216]
10 4531 [3.8092959565633637]
11 4507 [3.61348632804923]
12 4410 [3.5748872996624077]
13 4445 [3.5487593956018784]
14 4463 [3.7854498280879487]
15 4168 [4.2912288755703845]
16 3742 [4.1767616705118975]
17 3273 [4.090101358815831]
18 3189 [4.7625144900816325]
19 2700 [4.0835136679844135]
20 2090 [5.855311957853457]
21 1957 [5.200809020599325]
22 1653 [5.882728613775029]
23 1177 [6.876115153650721]
24 1010 [7.396572650238582]
25 901 [8.461689110478826]
26 564 [9.933316303031944]
27 408 [11.934576952863091]
28 378 [13.456236907308611]
29 242 [16.59228775168772]
30 128 [20.615426875523436]
31 133 [21.341722719137696]
32 72 [34.20882338231368]
33 52 [30.408288854492405]
34 32 [45.807240242035306]
35 20 [63.949857792059

In [81]:
# Display the merged frame left by the analysis loop: join keys, un-noised values,
# noised trial lists and the added pct_diff_* columns.
merged_df

c_count,custdist,custdist_noised,pct_diff_custdist
i64,i64,list[f64],f64
0,50004,"[24929.992711, 24309.892527, … 25344.932356]",1.11964
10,6668,"[3360.383482, 3185.948872, … 3301.054971]",2.973364
9,6563,"[3316.546404, 3407.662363, … 3412.462173]",2.743304
11,6004,"[2952.667351, 3047.830932, … 3084.504024]",2.983284
8,5890,"[3070.698199, 3047.307231, … 2929.17853]",3.022258
…,…,…,…
37,8,"[2.740054, 8.15166, … 2.051058]",87.932131
41,3,[null],
40,3,[null],
39,1,[null],


In [75]:
# Un-noised `custdist` of the first merged row, as a one-element Python list.
merged_df[0]['custdist'].to_list()


[50004]

In [62]:
# Per query: percentage of compared outputs that had fewer than num_trials non-null trials.
all_nulls

{1: 0.0,
 3: 100.0,
 4: 0.0,
 5: 0.0,
 6: 0.0,
 7: 0.0,
 8: 0.0,
 9: 0.0,
 12: 0.0,
 13: 9.523809523809524,
 14: 0.0,
 15: 0.0,
 17: 0.0,
 19: 0.0,
 21: 4.0,
 22: 0.0}

In [63]:
# Per query: mean percentage error averaged over every compared output row and column.
avg_errors

{1: 0.7951532286166187,
 3: nan,
 4: 2.4076477798448783,
 5: 9.045629271965053,
 6: 1.2860405483375585,
 7: 9.017410097004426,
 8: 35.822894463656,
 9: 7.160786752071281,
 12: 2.9457904782988082,
 13: 13.58162708957928,
 14: 2.2317893717722153,
 15: 34.50384808901249,
 17: 11.455715445346588,
 19: 23.48917728195702,
 21: 63.56633656447392,
 22: 7.795469260205711}