In [1]:
import polars.selectors as cs
import numpy as np

import polars as pl

import matplotlib.pyplot as plt

In [2]:
def hist_of_numerical_values(df: pl.DataFrame):
    """
    Generates histograms for all numerical columns in a Polars DataFrame.

    Args:
        df: The Polars DataFrame.

    Returns:
        A dictionary where keys are column names and values are the histogram data.
    """
    histograms = {}
    for col_name in df.columns:
        if df[col_name].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]:
            hist_data = df[col_name].hist()
            histograms[col_name] = hist_data
    return histograms

In [3]:
index_cols = {
    1: ['l_returnflag', 'l_linestatus'],
    3: ['l_orderkey','o_orderdate','o_shippriority'],
    4: ['o_orderpriority'],
    5: ['n_name'],
    6: [],
    7: ['supp_nation', 'cust_nation', 'l_year'], 
    8: ['o_year'],
    9: ['nation', 'o_year'],
    12: ['l_shipmode'],
    13: ['c_count'],
    14: [],
    17: [],
    19: [],
    21: ['s_name'],
    22: ['cntrycode']
}
scale_required = set([4, 5, 6, 7, 9, 12, 13, 17, 19, 21, 22])
# not 8, 14
# skip 3, skip 20
queries_to_run = [1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 17, 19, 21, 22]

In [4]:
for query_ind in [3]:
    null_info = {}
    orig = pl.read_csv(f'unnoised/q{query_ind}.csv')
    if query_ind == 22:
        orig = orig.select(
        pl.col("cntrycode").cast(str),
        pl.col("numcust"),
        pl.col("totacctbal"))
    noised = pl.read_json(f'outputs/ap-duckdb-q{query_ind}-customer-step3/output.json')
    if index_cols[query_ind]:
        merged_df = orig.join(noised, on=index_cols[query_ind], suffix='_noised')
    else:
        merged_df = orig.join(noised, suffix='_noised', how='cross')
    suffix1 = ''
    suffix2 = '_noised'
    cols_with_suffixes = [col for col in merged_df.columns if suffix1 in col or suffix2 in col]

    base_names = set([col.replace(suffix1, '').replace(suffix2, '') for col in cols_with_suffixes])
    for base_name in base_names:
        nulls_exist = False
        null_inds, null_vals = [], []
        
        orig = base_name + suffix1
        noised = base_name + suffix2
        if orig in merged_df.columns and noised in merged_df.columns:
            rel_errors = []
            for ind in range(len(merged_df[orig])):
                if query_ind in scale_required:
                    const = 2 # subsampling scaling
                else:
                    const = 1
#                     print(merged_df[noised][ind])
                new = np.average(
                    [100*abs(
                        const*merged_df[noised][ind][tmp_ind] - merged_df[orig][ind]
                        ) / merged_df[orig][ind] for tmp_ind in range(
                            len(merged_df[noised][ind]))]
                )
                rel_errors.append(new)
            merged_df = merged_df.with_columns(pl.Series('pct_diff_' + base_name, rel_errors))
            num_outputs = len(list(merged_df[noised]))
            for ind in range(num_outputs):
                if len(list(merged_df[noised][ind])) < 100:
                    nulls_exist = True
                    null_inds.append(ind)
                    null_vals.append((100-len(list(merged_df[noised][ind]))))
                    print(query_ind, ind)
        null_info[base_name] = (nulls_exist, null_inds, null_vals)
        
    # Get columns that start with 'pct_diff'
    pct_diff_cols = [col for col in merged_df.columns if col.startswith('pct_diff')]

    # Create subplots based on number of pct_diff columns
    n_cols = len(pct_diff_cols)
    fig, axes = plt.subplots(1, n_cols, figsize=(5*n_cols, 4))

    # Handle case where there's only one column (axes won't be a list)
    if n_cols == 1:
        axes = [axes]

    # Create histogram for each column
    for i, col in enumerate(pct_diff_cols):
        # Convert to pandas for easier plotting, or use polars plot method
        values = merged_df[col].to_pandas()

        axes[i].hist(values, bins=30, alpha=0.7, edgecolor='black')
        axes[i].set_title(f'Histogram of {col}')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Frequency')
        axes[i].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    for base in null_info:
        (nulls_exist, null_inds, null_vals) = null_info[base]
        if nulls_exist:
            fig, axes = plt.subplots(1, 1, figsize=(5, 4))
            # Handle case where there's only one column (axes won't be a list)
            if n_cols == 1:
                axes = [axes]
            axes[0].hist(null_vals, bins=30, alpha=0.7, edgecolor='black')
            axes[0].set_title(f'Histogram of nulls for column {base}')
            axes[0].set_xlabel('Fraction of nulls (%)')
            axes[0].set_ylabel('Frequency')
            axes[0].grid(True, alpha=0.3)
            plt.show()


TypeError: unsupported operand type(s) for *: 'int' and 'NoneType'

In [7]:
noised = pl.read_json(f'outputs/ap-duckdb-q{query_ind}-customer-step3/output.json')

In [34]:
t = noised[4]['revenue'].to_list()[0]
t

[312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542,
 312604.542]

In [36]:
noised

l_orderkey,o_orderdate,o_shippriority,revenue
i64,str,i64,list[f64]
223140,"""1995-03-14""",0,[null]
405063,"""1995-03-03""",0,"[353125.4577, 353125.4577, … 353125.4577]"
573861,"""1995-03-09""",0,"[351238.277, 351238.277, … 351238.277]"
121604,"""1995-03-07""",0,"[318576.4154, 318576.4154, … 318576.4154]"
462502,"""1995-03-08""",0,"[312604.542, 312604.542, … 312604.542]"
…,…,…,…
405286,"""1995-02-09""",0,[null]
216517,"""1994-11-27""",0,[null]
391552,"""1995-01-29""",0,[null]
506661,"""1994-12-02""",0,"[27420.0, 27420.0, … 27420.0]"
