# 13. Wide-Table Coverage Filter (Additive)

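This notebook creates a coverage-filtered copy of the wide training table as an additive artifact for faster follow-up DP-SGD experiments. The filter is applied by `build_coverage_filtered_wide_table` with `min_nonzero_numeric=3`, and the result is written to `data/experiments_additive/wide_filter/wide_training_table_cov3.parquet`.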

It does not modify existing wide-table artifacts.

In [1]:
from pathlib import Path
import sys
import pandas as pd
from IPython.display import display, Markdown

# Locate the repository root: start from the resolved working directory and
# step up one level when the kernel was launched from the `notebooks/` folder.
ROOT = Path.cwd().resolve()
if Path.cwd().name == 'notebooks':
    ROOT = ROOT.parent

# Register the root on the import path (once) so project packages under it
# can be imported from this notebook.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from src.experiments.wide_filter import build_coverage_filtered_wide_table, summarize_numeric_sparsity

# Input table for the filter, plus the separate directory and file name used
# for the filtered output of this experiment.
INPUT_WIDE = ROOT.joinpath('data', 'reporting', 'wide_training_table.parquet')
OUT_DIR = ROOT.joinpath('data', 'experiments_additive', 'wide_filter')
FILTERED_WIDE = OUT_DIR.joinpath('wide_training_table_cov3.parquet')

# Create the output directory (and any missing parents); a no-op if it exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved input path so the reader can see which file is used.
display(Markdown(f"Input wide table: `{INPUT_WIDE}`"))

Input wide table: `/Users/enscribe/Repositories/School/dsc180-q2/data/reporting/wide_training_table.parquet`

In [2]:
# Run the coverage filter from INPUT_WIDE to FILTERED_WIDE with a threshold
# of 3; the returned `summary` exposes the counts and paths reported below.
summary = build_coverage_filtered_wide_table(
    input_path=INPUT_WIDE,
    output_path=FILTERED_WIDE,
    min_nonzero_numeric=3,
)

display(Markdown('## Coverage filter summary'))

# Guard the ratio so a zero-row input reports 0.0 instead of dividing by zero.
if summary.input_rows:
    retention_rate = summary.output_rows / summary.input_rows
else:
    retention_rate = 0.0

# One-row table: key order here determines the displayed column order.
summary_row = {
    'input_rows': summary.input_rows,
    'output_rows': summary.output_rows,
    'retention_rate': retention_rate,
    'numeric_cols': summary.numeric_cols,
    'threshold': summary.threshold,
    'output_path': str(summary.output_path),
}
display(pd.DataFrame([summary_row]))

## Coverage filter summary

Unnamed: 0,input_rows,output_rows,retention_rate,numeric_cols,threshold,output_path
0,1000000,282315,0.282315,59,3,/Users/enscribe/Repositories/School/dsc180-q2/...


In [3]:
# Show the same sparsity report for the original and the filtered table,
# each under its own heading, so the two can be compared side by side.
for heading, table_path in (
    ('## Lowest nonzero-rate numeric columns in original wide table', INPUT_WIDE),
    ('## Lowest nonzero-rate numeric columns in filtered wide table', FILTERED_WIDE),
):
    display(Markdown(heading))
    display(summarize_numeric_sparsity(table_path, top_k=20))

## Lowest nonzero-rate numeric columns in original wide table

Unnamed: 0,column,nonzero_rows,nonzero_rate
49,psys_rap_nrs,611,0.000611
50,psys_rap_avg,611,0.000611
54,avg_freq_avg,613,0.000613
53,avg_freq_nrs,613,0.000613
56,temp_avg,621,0.000621
55,temp_nrs,622,0.000622
58,pkg_power_avg,800,0.0008
57,pkg_power_nrs,800,0.0008
28,webcat_productivity_crm,874,0.000874
32,webcat_productivity_project_management,3627,0.003627


## Lowest nonzero-rate numeric columns in filtered wide table

Unnamed: 0,column,nonzero_rows,nonzero_rate
49,psys_rap_nrs,611,0.002164
50,psys_rap_avg,611,0.002164
54,avg_freq_avg,613,0.002171
53,avg_freq_nrs,613,0.002171
56,temp_avg,621,0.0022
55,temp_nrs,622,0.002203
58,pkg_power_avg,800,0.002834
57,pkg_power_nrs,800,0.002834
28,webcat_productivity_crm,874,0.003096
32,webcat_productivity_project_management,3627,0.012847
