# 14. PE Support Diagnostics (Additive)

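This notebook analyzes category support and coverage gaps in PE outputs to guide constrained reruns: for each categorical column (`countryname_normalized`, `chassistype`, `os`, `persona`) it compares the distinct values found in the real sysinfo table against those found in the PE sysinfo table.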

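It only reads existing artifacts (the real and PE sysinfo parquet files) and writes its diagnostic CSVs to a separate, additive output directory: `data/experiments_additive/pe_support_diagnostics/`.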

In [1]:
from pathlib import Path
import pandas as pd
from IPython.display import display, Markdown

# Project root: the working directory itself, or its parent when the kernel
# was started from inside a directory named `notebooks`.
ROOT = Path.cwd().resolve()
if Path.cwd().name == 'notebooks':
    ROOT = ROOT.parent

# Input artifacts: the real and the PE sysinfo tables share one file name and
# differ only by the `pe` subdirectory.
_reporting_dir = ROOT / 'data' / 'reporting'
_sysinfo_name = 'system_sysinfo_unique_normalized.parquet'
REAL_SYSINFO = _reporting_dir / _sysinfo_name
PE_SYSINFO = _reporting_dir / 'pe' / _sysinfo_name

# Output directory for the diagnostic CSVs; created up front (with parents)
# so later cells can write into it directly.
OUT_DIR = ROOT.joinpath('data', 'experiments_additive', 'pe_support_diagnostics')
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved input paths so the reader can see which files are used.
display(
    Markdown(f"Real sysinfo: `{REAL_SYSINFO}`"),
    Markdown(f"PE sysinfo: `{PE_SYSINFO}`"),
)

Real sysinfo: `/Users/enscribe/Repositories/School/dsc180-q2/data/reporting/system_sysinfo_unique_normalized.parquet`

PE sysinfo: `/Users/enscribe/Repositories/School/dsc180-q2/data/reporting/pe/system_sysinfo_unique_normalized.parquet`

In [2]:
# Categorical columns whose support is compared between the real and PE
# tables. Defined once and reused for both reads and for the loop below so
# the loaded columns and the analyzed columns cannot drift apart.
cols = ['countryname_normalized', 'chassistype', 'os', 'persona']

# Load only the columns under comparison from each table.
real = pd.read_parquet(REAL_SYSINFO, columns=cols)
pe = pd.read_parquet(PE_SYSINFO, columns=cols)

# One summary record per column; collected in a list and turned into a
# DataFrame in a single step afterwards.
rows = []
for c in cols:
    # Distinct non-null values, compared as whitespace-stripped strings.
    real_set = set(real[c].dropna().astype(str).str.strip())
    pe_set = set(pe[c].dropna().astype(str).str.strip())
    overlap = real_set & pe_set
    rows.append({
        'column': c,
        'real_unique': len(real_set),
        'pe_unique': len(pe_set),
        'overlap_unique': len(overlap),
        # Share of real values that also occur in PE; reported as 0.0 when
        # the real column has no non-null values (avoids division by zero).
        'coverage_of_real': len(overlap) / len(real_set) if real_set else 0.0,
        # Values that occur in PE but never in the real table.
        'extra_in_pe': len(pe_set - real_set),
    })

# Persist the summary next to the other diagnostics, then show it inline.
coverage_df = pd.DataFrame(rows)
coverage_df.to_csv(OUT_DIR / 'support_coverage_summary.csv', index=False)
display(Markdown('## Support coverage summary'))
display(coverage_df)

## Support coverage summary

Unnamed: 0,column,real_unique,pe_unique,overlap_unique,coverage_of_real,extra_in_pe
0,countryname_normalized,51,30,18,0.352941,12
1,chassistype,7,88,7,1.0,81
2,os,7,19,7,1.0,12
3,persona,11,60,10,0.909091,50


In [3]:
# Distinct non-null country labels in each table, compared as
# whitespace-stripped strings.
country_real, country_pe = (
    set(frame['countryname_normalized'].dropna().astype(str).str.strip())
    for frame in (real, pe)
)

# Countries seen only in the real table (missing from PE) and only in the PE
# table (extra in PE), each in sorted order.
missing_countries = sorted(country_real.difference(country_pe))
extra_countries = sorted(country_pe.difference(country_real))

missing_df = pd.DataFrame({'missing_country': missing_countries})
extra_df = pd.DataFrame({'extra_country': extra_countries})

# Write both lists to the diagnostics directory before displaying anything.
for _frame, _filename in (
    (missing_df, 'missing_countries.csv'),
    (extra_df, 'extra_countries.csv'),
):
    _frame.to_csv(OUT_DIR / _filename, index=False)

# Show at most the first 50 rows of each list under its own heading.
display(Markdown('## Missing countries in PE'), missing_df.head(50))
display(Markdown('## Extra countries in PE'), extra_df.head(50))

## Missing countries in PE

Unnamed: 0,missing_country
0,Argentina
1,Austria
2,Belgium
3,Colombia
4,Czech Republic
5,Denmark
6,Ecuador
7,Egypt
8,Greece
9,Hong Kong


## Extra countries in PE

Unnamed: 0,extra_country
0,-
1,Asia Pacific
2,Chinese Taipei
3,Indi a
4,Japanese
5,Republic of Korea
6,Russia
7,Russia Federation
8,"Russia',' Federation"
9,USA
