In [1]:
from pathlib import Path
import pandas as pd

# Resolve the project root: the closest directory, starting at the current
# working directory and moving up through its ancestors, that contains the
# curated applications file.
cwd = Path.cwd()
ROOT = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / "data" / "curated" / "applications_curated_full.csv").exists()
    ),
    None,
)

if ROOT is None:
    raise FileNotFoundError("Project root not found (missing data/curated/applications_curated_full.csv).")

# Both input files sit side by side in the curated data folder.
curated_dir = ROOT / "data" / "curated"
CURATED_FULL_PATH = curated_dir / "applications_curated_full.csv"
ANALYSIS_PATH = curated_dir / "applications_analysis.csv"

# Echo the resolved locations so a reader can see which files were used.
print("ROOT:", ROOT)
print("Curated exists?", CURATED_FULL_PATH.exists(), CURATED_FULL_PATH)
print("Analysis exists?", ANALYSIS_PATH.exists(), ANALYSIS_PATH)

# Load the curated layer first, then the analysis layer.
df_curated, df_analysis = (
    pd.read_csv(csv_path) for csv_path in (CURATED_FULL_PATH, ANALYSIS_PATH)
)

print("curated_full rows:", len(df_curated))
print("analysis rows:", len(df_analysis))

ROOT: c:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03
Curated exists? True c:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\curated\applications_curated_full.csv
Analysis exists? True c:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\curated\applications_analysis.csv
curated_full rows: 502
analysis rows: 500


In [2]:
# Helper to compute missing rates + basic validity checks

def timestamp_profile(df, col, id_col="application_id", dataset=None, now=None):
    """Profile one timestamp column for missing, unparseable and future values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    col : str
        Name of the timestamp column to profile.
    id_col : str, default "application_id"
        Column whose values are reported as examples of flagged rows.
    dataset : str or None, default None
        Label stored under the "dataset" key of the result.
    now : timestamp-like or None, default None
        Reference time for the future check. Naive values are interpreted as
        UTC. When None, the current UTC time is used.

    Returns
    -------
    dict
        Keys: dataset, field, n_rows, has_field, missing_count, missing_pct,
        parse_failed_count, parse_failed_pct, future_ts_count, future_ts_pct,
        example_ids. Rates are fractions in [0, 1], not percentages.
        If ``col`` is not in ``df``, has_field is 0 and every metric is None.
        If ``df`` has no rows, counts are 0 and rates are NaN.
    """
    n = len(df)

    # Single definition of the result schema; the defaults describe the
    # "column absent" case.
    profile = {
        "dataset": dataset,
        "field": col,
        "n_rows": n,
        "has_field": 0,
        "missing_count": None,
        "missing_pct": None,
        "parse_failed_count": None,
        "parse_failed_pct": None,
        "future_ts_count": None,
        "future_ts_pct": None,
        "example_ids": None,
    }
    if col not in df.columns:
        return profile

    s = df[col]
    s_clean = s.astype("string").str.strip()

    # Missing = NA, empty after stripping whitespace, or the literal text
    # "nan" (any case). Cast to plain bool so later reductions never see NA.
    missing_mask = (
        (s.isna() | (s_clean == "") | (s_clean.str.lower() == "nan"))
        .fillna(False)
        .astype(bool)
    )

    parsed = pd.to_datetime(s_clean, errors="coerce", utc=True)

    # A value was supplied but could not be turned into a timestamp.
    parse_failed_mask = (~missing_mask) & parsed.isna()

    if now is None:
        reference = pd.Timestamp.now(tz="UTC")
    else:
        reference = pd.Timestamp(now)
        if reference.tzinfo is None:
            reference = reference.tz_localize("UTC")
        else:
            reference = reference.tz_convert("UTC")

    # One day of tolerance before a timestamp is considered "in the future".
    future_mask = parsed.notna() & (parsed > (reference + pd.Timedelta(days=1)))

    def _rate(mask):
        # An empty frame has no defined rate; report NaN instead of failing.
        return float(mask.mean()) if n else float("nan")

    examples = None
    if id_col in df.columns:
        flagged = missing_mask | parse_failed_mask | future_mask
        example_ids = df.loc[flagged, id_col].head(10).tolist()
        examples = ",".join(map(str, example_ids)) if example_ids else None

    profile.update(
        {
            "has_field": 1,
            "missing_count": int(missing_mask.sum()),
            "missing_pct": _rate(missing_mask),
            "parse_failed_count": int(parse_failed_mask.sum()),
            "parse_failed_pct": _rate(parse_failed_mask),
            "future_ts_count": int(future_mask.sum()),
            "future_ts_pct": _rate(future_mask),
            "example_ids": examples,
        }
    )
    return profile

In [3]:
# Which timestamp columns to audit in each layer. The analysis layer is also
# checked for the generic "processing_timestamp" column.
profile_plan = [
    (
        "applications_curated_full",
        df_curated,
        ["raw_processing_timestamp", "clean_processing_timestamp"],
    ),
    (
        "applications_analysis",
        df_analysis,
        ["raw_processing_timestamp", "clean_processing_timestamp", "processing_timestamp"],
    ),
]

rows = []
for dataset_name, frame, fields in profile_plan:
    for col in fields:
        r = timestamp_profile(frame, col, id_col="application_id")
        r["dataset"] = dataset_name
        rows.append(r)

summary = pd.DataFrame(rows)

# Display copy only: `summary` keeps the raw fractions used further down.
show = summary.copy()

# Rates as percentages. Coerce first so a column holding only None (every
# profiled field absent) becomes NaN instead of failing on `None * 100`.
for c in ["missing_pct", "parse_failed_pct", "future_ts_pct"]:
    show[c] = (pd.to_numeric(show[c], errors="coerce") * 100).round(2)

# Counts are whole numbers; absent fields contribute None, which would
# otherwise turn the column into floats (e.g. 440.0). Nullable Int64 keeps
# them integral and shows <NA> where the field does not exist.
for c in ["missing_count", "parse_failed_count", "future_ts_count"]:
    show[c] = pd.to_numeric(show[c], errors="coerce").astype("Int64")

display_cols = [
    "dataset", "field", "n_rows", "has_field",
    "missing_count", "missing_pct",
    "parse_failed_count", "parse_failed_pct",
    "future_ts_count", "future_ts_pct",
    "example_ids",
]

print(show[display_cols].to_string(index=False))

# Save next to the notebook (the current working directory).
OUT_PATH = Path.cwd() / "provenance_gap_summary_auditability.csv"
summary.to_csv(OUT_PATH, index=False)
print(f"\nSaved: {OUT_PATH}")

                  dataset                      field  n_rows  has_field  missing_count  missing_pct  parse_failed_count  parse_failed_pct  future_ts_count  future_ts_pct                                                                     example_ids
applications_curated_full   raw_processing_timestamp     502          1          440.0        87.65                 0.0               0.0              2.0            0.4 app_037,app_215,app_024,app_275,app_099,app_246,app_042,app_348,app_309,app_320
applications_curated_full clean_processing_timestamp     502          1          440.0        87.65                 0.0               0.0              2.0            0.4 app_037,app_215,app_024,app_275,app_099,app_246,app_042,app_348,app_309,app_320
    applications_analysis   raw_processing_timestamp     500          0            NaN          NaN                 NaN               NaN              NaN            NaN                                                                            None


# Graphic Construction 

In [4]:
# Row selectors for the two summary entries that feed the chart.
is_curated_clean = (summary["dataset"] == "applications_curated_full") & (
    summary["field"] == "clean_processing_timestamp"
)
is_analysis_ts = (summary["dataset"] == "applications_analysis") & (
    summary["field"] == "processing_timestamp"
)

# Fraction of curated rows whose clean_processing_timestamp is missing.
curated_missing = float(summary.loc[is_curated_clean, "missing_pct"].iloc[0])

# 1.0 when the analysis layer has no processing_timestamp column, else 0.0.
analysis_has_field = summary.loc[is_analysis_ts, "has_field"].iloc[0]
analysis_absent = 0.0 if analysis_has_field != 0 else 1.0

curated_missing, analysis_absent

(0.8764940239043825, 1.0)

In [6]:
import matplotlib.pyplot as plt

# Bar chart of the provenance gap, written as a PNG to the working directory.
out_path = Path.cwd() / "provenance_gap.png"

labels = ["curated_full\n(clean_processing_timestamp)", "analysis\n(timestamp absent)"]
values = [curated_missing * 100, analysis_absent * 100]

fig, ax = plt.subplots(figsize=(8, 4))
bars = ax.bar(labels, values)
ax.set_ylim(0, 100)
ax.set_ylabel("% records missing / absent")
ax.set_title("Provenance gap: timestamps missing in curated and absent in analysis", pad=12)

# Value labels: inside the bar when it is tall enough, otherwise just above it.
for bar, v in zip(bars, values):
    x = bar.get_x() + bar.get_width() / 2
    fits_inside = v >= 12
    if fits_inside:
        y = v - 5
        va = "top"
    else:
        y = v + 2
        va = "bottom"
    # White text is only readable on the bar itself; a label placed above a
    # short bar sits on the axes background and needs a dark colour.
    text_color = "white" if fits_inside else "black"
    ax.text(x, y, f"{v:.1f}%", ha="center", va=va, color=text_color)

fig.tight_layout()
fig.savefig(out_path, dpi=200)
plt.close(fig)  # release the figure; only the saved file is needed

print("Saved:", out_path)

Saved: c:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\auditability_provenance_study\provenance_gap.png
