In [1]:
import os, sys, json, warnings
import pandas as pd
import numpy as np
warnings.filterwarnings('ignore')

# ── Path setup ─────────────────────────────────────────────────────────────────
# Run from repo root OR from notebooks/ folder — both work
REPO_ROOT = os.path.abspath('..') if os.path.basename(os.getcwd()) == 'notebooks' else os.getcwd()
DATA_PROC  = os.path.join(REPO_ROOT, 'data', 'processed')
DATA_RAW   = os.path.join(REPO_ROOT, 'data', 'raw')
DATA_HIST  = os.path.join(REPO_ROOT, 'data', 'history')

print(f'Repo root detected: {REPO_ROOT}')

# ── QA helper ─────────────────────────────────────────────────────────────────
results = []   # accumulate all check results

def qa(name, passed, detail=''):
    status = '✅ PASS' if passed else '❌ FAIL'
    results.append({'check': name, 'status': status, 'detail': detail})
    print(f'  {status}  {name}' + (f' — {detail}' if detail else ''))

def section(title):
    print(f'\n{"="*60}\n  {title}\n{"="*60}')

Repo root detected: C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin


---
## 1. Processed Data Files (always available — committed to Git)

In [2]:
section('1A — manzanillo_training_data.csv')

# Bounding box for Colima state (generous)
LAT_MIN, LAT_MAX = 18.5, 20.0
LON_MIN, LON_MAX = -105.5, -103.5

path = os.path.join(DATA_PROC, 'manzanillo_training_data.csv')
qa('File exists', os.path.exists(path), path)

if os.path.exists(path):
    df_train = pd.read_csv(path)
    print(f'  Columns: {list(df_train.columns)}')
    print(f'  Shape:   {df_train.shape}')

    qa('Row count > 10', len(df_train) > 10, f'{len(df_train)} rows — NOTE: only ~19 valid sites in Colima')
    qa('No fully duplicate rows', df_train.duplicated().sum() == 0, f'{df_train.duplicated().sum()} duplicates')

    for col in ['Elevation_m', 'Slope_deg', 'NDVI']:
        if col in df_train.columns:
            n_null = df_train[col].isna().sum()
            qa(f'{col}: no nulls', n_null == 0, f'{n_null} nulls')

    # Coordinate sanity
    lat_col = 'Y_C3' if 'Y_C3' in df_train.columns else ('Latitude' if 'Latitude' in df_train.columns else None)
    lon_col = 'X_C3' if 'X_C3' in df_train.columns else ('Longitude' if 'Longitude' in df_train.columns else None)

    if lat_col and lon_col:
        lat_ok = df_train[lat_col].between(LAT_MIN, LAT_MAX).all()
        lon_ok = df_train[lon_col].between(LON_MIN, LON_MAX).all()
        qa('Latitudes in Colima range (18.5–20.0)', lat_ok,
           f'min={df_train[lat_col].min():.4f}, max={df_train[lat_col].max():.4f}')
        qa('Longitudes in Colima range (-105.5 to -103.5)', lon_ok,
           f'min={df_train[lon_col].min():.4f}, max={df_train[lon_col].max():.4f}')

    if 'NDVI' in df_train.columns:
        ndvi_ok = df_train['NDVI'].between(-1, 1).all()
        qa('NDVI values in valid range (-1 to 1)', ndvi_ok,
           f'min={df_train["NDVI"].min():.3f}, max={df_train["NDVI"].max():.3f}')

    if 'Slope_deg' in df_train.columns:
        slope_ok = df_train['Slope_deg'].between(0, 90).all()
        qa('Slope_deg in valid range (0–90°)', slope_ok,
           f'min={df_train["Slope_deg"].min():.2f}, max={df_train["Slope_deg"].max():.2f}')

    print('\nSample rows:')
    display_cols = [c for c in ['UPMID','Latitude','Longitude','Elevation_m','Slope_deg','NDVI','Y_C3','X_C3'] if c in df_train.columns]
    print(df_train[display_cols].head())


  1A — manzanillo_training_data.csv
  ✅ PASS  File exists — C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin\data\processed\manzanillo_training_data.csv
  Columns: ['UPMID', 'IDConglomerado', 'X_C3', 'Y_C3', 'Tipo_cgl_estandar_C3', 'Tipo_cgl_C3', 'Muestreado_C3', 'Anio_C3', 'Cve_Estado_C3', 'Estado_C3', 'Cve_Municipio_C3', 'Municipio_C3', 'CLAVE_UMAF_C3', 'UMAFOR_C3', 'CVE_S7_C3', 'DESCRIP_S7_C3', 'FORM_S7_C3', 'FAO_S7_C3', 'ECO_S7_C3', 'CVEECON1_C3', 'DESECON1_C3', 'CVEECON2_C3', 'DESECON2_C3', 'CVEECON3_C3', 'DESECON3_C3', 'CVEECON4_C3', 'DESECON4_C3', 'TIP_PROP_C3', 'TIP_NUC_C3', 'NUC_AGR_C3', 'CUENCA_19_C3', 'SUBCUEN_20_C3', 'ANP_CAT_DECRET_C3', 'ANP_CAT_MANEJO_C3', 'Tenencia_C3', 'Altitud_C3', 'Fisiografia_C3', 'Exposicion_C3', 'Con_sitio_1_C3', 'Con_sitio_2_C3', 'Con_sitio_3_C3', 'Con_sitio_4_C3', 'Sitios_x_cgl_C3', 'con_arbolado_C3', 'con_submuestra_C3', 'con_indicadores_condicion_copa_arbolado_C3', 'con_Veg_mayor_individual_C3', 'con_Veg_mayor_gre

In [3]:
section('1B — manzanillo_FINAL_MODEL_DATA.csv')

path_final = os.path.join(DATA_PROC, 'manzanillo_FINAL_MODEL_DATA.csv')
qa('File exists', os.path.exists(path_final), path_final)

if os.path.exists(path_final):
    df_model = pd.read_csv(path_final)
    print(f'  Shape: {df_model.shape}')

    qa('Row count > 10', len(df_model) > 10,
       f'{len(df_model)} rows — CRITICAL WARNING: < 20 rows means Random Forest is likely overfitted')

    if 'Fuel_Target' in df_model.columns:
        ft = df_model['Fuel_Target']

        # Anomalously large outlier visible in raw CSV
        extreme = (ft > 1000).sum()
        qa('No extreme outliers in Fuel_Target (>1000)', extreme == 0,
           f'{extreme} extreme values detected — check UPMID 59291 which shows 19875 in raw data')

    print(df_model.describe())


  1B — manzanillo_FINAL_MODEL_DATA.csv
  ✅ PASS  File exists — C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin\data\processed\manzanillo_FINAL_MODEL_DATA.csv
  Shape: (16, 7)
  ❌ FAIL  No extreme outliers in Fuel_Target (>1000) — 1 extreme values detected — check UPMID 59291 which shows 19875 in raw data
              UPMID   Latitude   Longitude  Elevation_m  Slope_deg       NDVI  \
count     16.000000  16.000000   16.000000    16.000000  16.000000  16.000000   
mean   64456.187500  19.233116 -104.051179   800.812500  23.290002   0.384135   
std     2649.726985   0.118477    0.070705   318.535653  10.916446   0.117840   
min    59291.000000  19.001417 -104.198306   280.000000   3.707287   0.159434   
25%    62272.500000  19.135195 -104.070591   648.000000  16.831518   0.296445   
50%    65346.000000  19.273153 -104.054013   720.500000  24.973383   0.383193   
75%    66367.000000  19.318681 -104.010035   958.500000  28.944226   0.490789   
max    68405.0

In [4]:
section('1C — quantum_targets.csv')

path_qt = os.path.join(DATA_PROC, 'quantum_targets.csv')
qa('File exists', os.path.exists(path_qt), path_qt)

if os.path.exists(path_qt):
    df_qt = pd.read_csv(path_qt)
    qa('Row count == 50', len(df_qt) == 50, f'{len(df_qt)} rows')
    qa('Risk_Score column present', 'Risk_Score' in df_qt.columns)
    qa('Max Risk_Score == 100.0', abs(df_qt['Risk_Score'].max() - 100.0) < 0.01,
       f'max={df_qt["Risk_Score"].max():.2f}')

    # COORDINATE ANOMALY CHECK — targets appear in wrong location
    lat_ok = df_qt['Latitude'].between(LAT_MIN, LAT_MAX).all()
    lon_ok = df_qt['Longitude'].between(LON_MIN, LON_MAX).all()
    qa('Target latitudes in Colima bounding box', lat_ok,
       f'min={df_qt["Latitude"].min():.4f}, max={df_qt["Latitude"].max():.4f}')
    qa('Target longitudes in Colima bounding box', lon_ok,
       f'min={df_qt["Longitude"].min():.4f}, max={df_qt["Longitude"].max():.4f}')

    if not lon_ok:
        print()
        print('  ⚠️  ANOMALY DETECTED: Longitudes are ~-103.6 to -103.7.')
        print('      The Manzanillo port is at ~-104.3. These targets appear ~70 km east of the port.')
        print('      Likely cause: raster transform mismatch between DEM (INEGI) and Sentinel coordinate systems.')
        print('      ACTION: Verify new_transform in notebook 05 — the sample_rate scaling may introduce offset.')


  1C — quantum_targets.csv
  ✅ PASS  File exists — C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin\data\processed\quantum_targets.csv
  ✅ PASS  Row count == 50 — 50 rows
  ✅ PASS  Risk_Score column present
  ✅ PASS  Max Risk_Score == 100.0 — max=100.00
  ✅ PASS  Target latitudes in Colima bounding box — min=18.7608, max=19.4871
  ✅ PASS  Target longitudes in Colima bounding box — min=-104.0437, max=-103.6361


In [5]:
section('1D — optimized_sensor_network.csv')

path_sn = os.path.join(DATA_PROC, 'optimized_sensor_network.csv')
qa('File exists', os.path.exists(path_sn), path_sn)

# Config zones for bounding box check
ZONE_A_LAT = (19.03, 19.10); ZONE_A_LON = (-104.35, -104.28)
ZONE_B_LAT = (19.10, 19.18); ZONE_B_LON = (-104.30, -104.15)

if os.path.exists(path_sn):
    df_sn = pd.read_csv(path_sn)
    print(f'  Shape: {df_sn.shape}')
    print(f'  Layers: {df_sn["layer"].value_counts().to_dict()}')

    qa('Row count == 25', len(df_sn) == 25, f'{len(df_sn)} rows')
    qa('No null coordinates', df_sn[['latitude','longitude']].isna().sum().sum() == 0)

    asset_sensors = df_sn[df_sn['layer'] == 'Asset_Defense']
    wild_sensors  = df_sn[df_sn['layer'] == 'Wildland_Perimeter']
    qa('15 Asset_Defense sensors', len(asset_sensors) == 15, f'{len(asset_sensors)}')
    qa('10 Wildland_Perimeter sensors', len(wild_sensors) == 10, f'{len(wild_sensors)}')

    # Check sensors are near zones defined in config/zones.json
    # Asset sensors should be near Zone A
    asset_lat_ok = asset_sensors['latitude'].between(18.9, 19.2).all()
    qa('Asset sensors within extended Zone A/B area', asset_lat_ok,
       f'lat range: {asset_sensors["latitude"].min():.4f}–{asset_sensors["latitude"].max():.4f}')

    print(df_sn.head())


  1D — optimized_sensor_network.csv
  ✅ PASS  File exists — C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin\data\processed\optimized_sensor_network.csv
  Shape: (25, 4)
  Layers: {'Asset_Defense': 15, 'Wildland_Perimeter': 10}
  ✅ PASS  Row count == 25 — 25 rows
  ✅ PASS  No null coordinates
  ✅ PASS  15 Asset_Defense sensors — 15
  ✅ PASS  10 Wildland_Perimeter sensors — 10
  ✅ PASS  Asset sensors within extended Zone A/B area — lat range: 19.0103–19.1018
    latitude   longitude          layer sensor_id
0  19.095880 -104.283001  Asset_Defense  Asset_00
1  19.027243 -104.274907  Asset_Defense  Asset_01
2  19.073800 -104.282905  Asset_Defense  Asset_02
3  19.027595 -104.318755  Asset_Defense  Asset_03
4  19.085413 -104.284007  Asset_Defense  Asset_04


In [6]:
section('1E — sim_training_data.csv  (KNOWN MALFORMED ROWS)')

path_sim = os.path.join(DATA_HIST, 'sim_training_data.csv')
qa('File exists', os.path.exists(path_sim), path_sim)

if os.path.exists(path_sim):
    # Read raw lines to detect malformed rows
    with open(path_sim) as f:
        raw_lines = f.readlines()

    header_cols = len(raw_lines[0].split(','))
    print(f'  Expected columns from header: {header_cols}')
    print(f'  Header: {raw_lines[0].strip()}')

    malformed = []
    for i, line in enumerate(raw_lines[1:], 2):
        n_cols = len(line.split(','))
        if n_cols != header_cols:
            malformed.append((i, n_cols, line.strip()))

    qa(f'All rows have {header_cols} columns', len(malformed) == 0,
       f'{len(malformed)} malformed rows found')

    if malformed:
        print(f'  Malformed rows (line#, col_count, content):')
        for m in malformed:
            print(f'    Line {m[0]}: {m[1]} cols — {m[2][:80]}...')

    # Try reading with pandas (it will coerce/drop malformed)
    try:
        df_sim = pd.read_csv(path_sim, on_bad_lines='warn')
        print(f'\n  Pandas read {len(df_sim)} rows after coercion (of {len(raw_lines)-1} total)')
        qa('pandas can read file', True, f'{len(df_sim)} rows successfully parsed')
    except Exception as e:
        qa('pandas can read file', False, str(e))


  1E — sim_training_data.csv  (KNOWN MALFORMED ROWS)
  ✅ PASS  File exists — C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin\data\history\sim_training_data.csv
  Expected columns from header: 7
  Header: timestamp,wind_speed,wind_dir,start_x,start_y,damage_ha,growth_rate
  ❌ FAIL  All rows have 7 columns — 23 malformed rows found
  Malformed rows (line#, col_count, content):
    Line 4: 8 cols — 2025-12-02T16:05:38.837004,6.4,239.0,945,557,6.0,265.5,44.25...
    Line 5: 8 cols — 2025-12-02T16:06:05.396099,6.4,339.0,945,557,6.0,266.5,44.416666666666664...
    Line 6: 8 cols — 2025-12-02T16:06:44.556953,6.4,339.0,945,557,6.0,264.5,44.083333333333336...
    Line 7: 8 cols — 2025-12-02T16:06:49.918204,6.4,339.0,945,557,6.0,267.5,44.583333333333336...
    Line 8: 8 cols — 2025-12-02T16:10:39.155105,6.0,338.0,945,557,6.0,270.0,45.0...
    Line 9: 8 cols — 2025-12-02T16:10:55.090611,6.0,63.0,945,557,9.0,524.5,58.27777777777778...
    Line 10: 8 cols — 2025-12-0

---
## 2. Raw Data Source Checks
These cells will SKIP gracefully if raw files are not present (they are gitignored).

In [7]:
section('2A — INEGI CEM DEM (Colima_r15m.tif)')

inegi_path = os.path.join(DATA_RAW, 'inegi_cem', 'Colima_r15m.tif')
qa('File exists', os.path.exists(inegi_path), inegi_path)

if os.path.exists(inegi_path):
    try:
        import rasterio
        with rasterio.open(inegi_path) as src:
            qa('File opens without error', True)
            qa('CRS is defined', src.crs is not None, str(src.crs))
            qa('Resolution ~ 0.000139° (15 m)', abs(src.res[0] - 0.0001388889) < 0.00001,
               f'res={src.res[0]:.8f}')

            data = src.read(1)
            valid = data[data > -1000]
            qa('Has valid (>-1000) elevation values', len(valid) > 0)
            qa('Elevation range reasonable (0–4000 m)', valid.min() >= 0 and valid.max() < 4000,
               f'min={valid.min():.0f}, max={valid.max():.0f}')

            bounds = src.bounds
            qa('Bounds cover Colima state', bounds.left < -104 and bounds.right > -103.5,
               f'left={bounds.left:.3f}, right={bounds.right:.3f}')

    except ImportError:
        print('  ⚠️  rasterio not installed — run: pip install rasterio')
    except Exception as e:
        qa('File opens without error', False, str(e))
else:
    print('  SKIP — raw file not present. Download from https://www.inegi.org.mx/app/geo2/elevacionesmex/')


  2A — INEGI CEM DEM (Colima_r15m.tif)
  ✅ PASS  File exists — C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin\data\raw\inegi_cem\Colima_r15m.tif
  ✅ PASS  File opens without error
  ✅ PASS  CRS is defined — EPSG:8999
  ✅ PASS  Resolution ~ 0.000139° (15 m) — res=0.00013889
  ✅ PASS  Has valid (>-1000) elevation values
  ❌ FAIL  Elevation range reasonable (0–4000 m) — min=-8, max=32767
  ✅ PASS  Bounds cover Colima state — left=-104.691, right=-103.486


In [8]:
section('2B — Sentinel-2 ZIP (from data/raw/sentinel/)')

sentinel_dir = os.path.join(DATA_RAW, 'sentinel')
zip_files = []
if os.path.exists(sentinel_dir):
    zip_files = [f for f in os.listdir(sentinel_dir) if f.endswith('.zip')]

qa('sentinel/ directory exists', os.path.exists(sentinel_dir))
qa('At least one .zip file present', len(zip_files) > 0, f'{len(zip_files)} zips found')

if zip_files:
    import zipfile
    zip_path = os.path.join(sentinel_dir, zip_files[0])
    print(f'  Using: {zip_files[0]}')
    file_size_gb = os.path.getsize(zip_path) / 1e9
    qa('File size > 0.5 GB (real Sentinel tile)', file_size_gb > 0.5, f'{file_size_gb:.2f} GB')

    try:
        with zipfile.ZipFile(zip_path, 'r') as z:
            all_files = z.namelist()
            red_files = [f for f in all_files if 'B04' in f and '10m' in f]
            nir_files = [f for f in all_files if 'B08' in f and '10m' in f]
            qa('Band 4 (Red, 10m) present in ZIP', len(red_files) > 0, red_files[0] if red_files else 'NOT FOUND')
            qa('Band 8 (NIR, 10m) present in ZIP', len(nir_files) > 0, nir_files[0] if nir_files else 'NOT FOUND')
            qa('Tile ID is 13QEB (Manzanillo)', '13QEB' in zip_files[0], f'filename: {zip_files[0]}')
    except Exception as e:
        qa('ZIP opens without error', False, str(e))
else:
    print('  SKIP — download from https://scihub.copernicus.eu — search Tile 13QEB, Sentinel-2 L2A')


  2B — Sentinel-2 ZIP (from data/raw/sentinel/)
  ✅ PASS  sentinel/ directory exists
  ✅ PASS  At least one .zip file present — 1 zips found
  Using: S2C_MSIL2A_20260215T172411_N0512_R012_T13QEB_20260215T210013.SAFE.zip
  ✅ PASS  File size > 0.5 GB (real Sentinel tile) — 1.13 GB
  ✅ PASS  Band 4 (Red, 10m) present in ZIP — S2C_MSIL2A_20260215T172411_N0512_R012_T13QEB_20260215T210013.SAFE/GRANULE/L2A_T13QEB_A007561_20260215T172845/IMG_DATA/R10m/T13QEB_20260215T172411_B04_10m.jp2
  ✅ PASS  Band 8 (NIR, 10m) present in ZIP — S2C_MSIL2A_20260215T172411_N0512_R012_T13QEB_20260215T210013.SAFE/GRANULE/L2A_T13QEB_A007561_20260215T172845/IMG_DATA/R10m/T13QEB_20260215T172411_B08_10m.jp2
  ✅ PASS  Tile ID is 13QEB (Manzanillo) — filename: S2C_MSIL2A_20260215T172411_N0512_R012_T13QEB_20260215T210013.SAFE.zip


In [9]:
section('2C — CONAFOR INFyS Excel')

conafor_path = os.path.join(DATA_RAW, 'conafor_infys', 'INFyS_2015_2020_Colima_qM0XXKR.xlsx')
qa('File exists', os.path.exists(conafor_path), conafor_path)

if os.path.exists(conafor_path):
    try:
        df_cnf = pd.read_excel(conafor_path)
        qa('File readable by pandas', True, f'{len(df_cnf)} rows')
        qa('Row count > 50', len(df_cnf) > 50, f'{len(df_cnf)} rows')
        qa('Y_C3 column present (latitude)', 'Y_C3' in df_cnf.columns)
        qa('X_C3 column present (longitude)', 'X_C3' in df_cnf.columns)
        qa('UPMID column present (site ID)', 'UPMID' in df_cnf.columns)

        # Arbolado sheet for tree height (Fuel_Target)
        df_arb = pd.read_excel(conafor_path, sheet_name='Arbolado')
        height_cols = [c for c in df_arb.columns if 'ALTURA' in str(c).upper() and 'TOTAL' in str(c).upper()]
        qa('Arbolado sheet readable', True, f'{len(df_arb)} rows')
        qa('Tree height (ALTURA_TOTAL) column present', len(height_cols) > 0,
           height_cols[0] if height_cols else 'NOT FOUND')

        if height_cols:
            h_col = height_cols[0]
            valid_trees = df_arb[df_arb[h_col] > 0]
            qa('Has trees with positive height', len(valid_trees) > 0, f'{len(valid_trees)} valid trees')

    except Exception as e:
        qa('Excel file readable', False, str(e))
else:
    print('  SKIP — download from https://snmf.cnf.gob.mx/datos-del-inventario/')


  2C — CONAFOR INFyS Excel
  ✅ PASS  File exists — C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin\data\raw\conafor_infys\INFyS_2015_2020_Colima_qM0XXKR.xlsx
  ✅ PASS  File readable by pandas — 74 rows
  ✅ PASS  Row count > 50 — 74 rows
  ✅ PASS  Y_C3 column present (latitude)
  ✅ PASS  X_C3 column present (longitude)
  ✅ PASS  UPMID column present (site ID)
  ✅ PASS  Arbolado sheet readable — 2625 rows
  ✅ PASS  Tree height (ALTURA_TOTAL) column present — AlturaTotal_C3
  ✅ PASS  Has trees with positive height — 2625 valid trees


In [10]:
section('2D — Live Open-Meteo API (Weather)')

PORT_LAT, PORT_LON = 19.052, -104.315

try:
    import requests
    url = (f'https://api.open-meteo.com/v1/forecast?latitude={PORT_LAT}&longitude={PORT_LON}'
           f'&current=wind_speed_10m,wind_direction_10m&wind_speed_unit=kn')
    resp = requests.get(url, timeout=5)
    qa('API returns 200', resp.status_code == 200, f'status={resp.status_code}')

    if resp.status_code == 200:
        data = resp.json()
        current = data.get('current', {})
        qa('wind_speed_10m present in response', 'wind_speed_10m' in current)
        qa('wind_direction_10m present in response', 'wind_direction_10m' in current)

        ws = current.get('wind_speed_10m', -1)
        wd = current.get('wind_direction_10m', -1)
        qa('Wind speed >= 0 kn', ws >= 0, f'{ws} kn')
        qa('Wind direction 0–360°', 0 <= wd <= 360, f'{wd}°')
        print(f'\n  Live weather at Manzanillo port: {ws} kn from {wd}°')
except requests.Timeout:
    qa('API responds within 5 seconds', False, 'TIMEOUT — check network')
except Exception as e:
    qa('Open-Meteo API reachable', False, str(e))


  2D — Live Open-Meteo API (Weather)
  ✅ PASS  API returns 200 — status=200
  ✅ PASS  wind_speed_10m present in response
  ✅ PASS  wind_direction_10m present in response
  ✅ PASS  Wind speed >= 0 kn — 1.4 kn
  ✅ PASS  Wind direction 0–360° — 270°

  Live weather at Manzanillo port: 1.4 kn from 270°


In [11]:
section('2E — Trained Model File')

model_path = os.path.join(REPO_ROOT, 'models', 'manzanillo_biomass_model.joblib')
qa('File exists', os.path.exists(model_path), model_path)

if os.path.exists(model_path):
    size_kb = os.path.getsize(model_path) / 1024
    qa('File size < 500 KB (sanity check)', size_kb < 500, f'{size_kb:.1f} KB')

    try:
        import joblib
        model = joblib.load(model_path)
        qa('Model loads without error', True)
        qa('Has predict method', hasattr(model, 'predict'))

        # Smoke test with dummy data
        import pandas as pd
        dummy = pd.DataFrame({'Elevation_m': [500.0], 'Slope_deg': [15.0], 'NDVI': [0.3]})
        pred = model.predict(dummy)
        qa('Model predicts on dummy input', len(pred) == 1, f'prediction={pred[0]:.2f} m tree height')
        qa('Prediction in plausible range (1–50 m)', 1 <= pred[0] <= 50,
           f'{pred[0]:.2f} — if outside range, model may be overfitted')

        # Feature importances
        if hasattr(model, 'feature_importances_'):
            fi = dict(zip(['Elevation_m','Slope_deg','NDVI'], model.feature_importances_))
            print(f'  Feature importances: {fi}')
    except Exception as e:
        qa('Model loads without error', False, str(e))
else:
    print('  SKIP — model file not in repo. Re-run notebook 03 to regenerate.')


  2E — Trained Model File
  ✅ PASS  File exists — C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin\models\manzanillo_biomass_model.joblib
  ✅ PASS  File size < 500 KB (sanity check) — 133.5 KB
  ✅ PASS  Model loads without error
  ✅ PASS  Has predict method
  ✅ PASS  Model predicts on dummy input — prediction=6.75 m tree height
  ✅ PASS  Prediction in plausible range (1–50 m) — 6.75 — if outside range, model may be overfitted
  Feature importances: {'Elevation_m': np.float64(0.2277889084122654), 'Slope_deg': np.float64(0.14848420232438483), 'NDVI': np.float64(0.6237268892633497)}


In [12]:
import os
import osmnx as ox
import networkx as nx

section('2F — Road Network GraphML')

graph_path = os.path.join(DATA_PROC, 'manzanillo_drive.graphml')
qa('File exists', os.path.exists(graph_path), graph_path)

if os.path.exists(graph_path):
    size_mb = os.path.getsize(graph_path) / 1e6
    qa('File size 5–20 MB (sanity check)', 5 <= size_mb <= 20, f'{size_mb:.1f} MB')

    try:
        G = ox.load_graphml(graph_path)
        qa('Graph loads without error', True)
        qa('Has > 100 nodes (not empty)', len(G.nodes) > 100, f'{len(G.nodes)} nodes, {len(G.edges)} edges')
        
        # Fix: wrap generator in parentheses and specify key=len
        largest_cc = max(
            (G.subgraph(c) for c in nx.weakly_connected_components(G)),
            key=len
        )
        qa('Graph is connected (largest component > 90% of nodes)',
           len(largest_cc.nodes) / len(G.nodes) > 0.9)
    except ImportError:
        print('  ⚠️  osmnx not installed — run: pip install osmnx')
    except Exception as e:
        qa('Graph loads without error', False, str(e))
else:
    print('  SKIP — regenerate by running: python rebuild_graph.py')


  2F — Road Network GraphML
  ✅ PASS  File exists — C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin\data\processed\manzanillo_drive.graphml
  ✅ PASS  File size 5–20 MB (sanity check) — 10.7 MB
  ✅ PASS  Graph loads without error
  ✅ PASS  Has > 100 nodes (not empty) — 9066 nodes, 23523 edges
  ✅ PASS  Graph is connected (largest component > 90% of nodes)


---
## 3. Security Flags
Items that must be addressed before any public sharing of this repo.

In [13]:
section('3 — Security & Credential Checks')

# Check notebook 00 for exposed credentials
nb00 = os.path.join(REPO_ROOT, 'notebooks', '00_Data_Acquisition.ipynb')
if os.path.exists(nb00):
    with open(nb00) as f:
        content = f.read()
    # Known exposed username from cell output
    has_username = 'kingroyalfox' in content
    qa('No Earthdata username in notebook 00 output', not has_username,
       '⚠️  USERNAME "kingroyalfox" FOUND IN COMMITTED NOTEBOOK OUTPUT — clear outputs before pushing')

# Check .dodsrc for credentials
dodsrc = os.path.join(REPO_ROOT, 'notebooks', '.dodsrc')
if os.path.exists(dodsrc):
    qa('.dodsrc file not committed to repo',
       False, '⚠️  .dodsrc contains cookie/netrc paths and is committed — add to .gitignore')
else:
    qa('.dodsrc not present in repo', True)

# Check gitignore covers raw data
gitignore = os.path.join(REPO_ROOT, '.gitignore')
if os.path.exists(gitignore):
    with open(gitignore) as f:
        gi_content = f.read()
    qa('.gitignore covers data/raw/', 'data/raw/' in gi_content)
    qa('.gitignore covers logs/', 'logs/' in gi_content)
    qa('.gitignore covers __pycache__/', '__pycache__/' in gi_content)


  3 — Security & Credential Checks
  ✅ PASS  No Earthdata username in notebook 00 output — ⚠️  USERNAME "kingroyalfox" FOUND IN COMMITTED NOTEBOOK OUTPUT — clear outputs before pushing
  ✅ PASS  .dodsrc not present in repo
  ✅ PASS  .gitignore covers data/raw/
  ✅ PASS  .gitignore covers logs/
  ✅ PASS  .gitignore covers __pycache__/


---
## 4. QA Summary Report

In [14]:
section('QA SUMMARY')

df_results = pd.DataFrame(results)
n_pass = (df_results['status'] == '✅ PASS').sum()
n_fail = (df_results['status'] == '❌ FAIL').sum()
total  = len(df_results)

print(f'Total checks: {total}')
print(f'✅ Passed:    {n_pass}')
print(f'❌ Failed:    {n_fail}')
print(f'Score:        {n_pass/total*100:.0f}%')
print()

if n_fail > 0:
    print('Failed checks:')
    fails = df_results[df_results['status'] == '❌ FAIL']
    for _, row in fails.iterrows():
        print(f'  ❌ {row["check"]}')
        if row['detail']:
            print(f'     → {row["detail"]}')

# Save results
out_path = os.path.join(REPO_ROOT, 'data', 'processed', 'qa_results.csv')
df_results.to_csv(out_path, index=False)
print(f'\nFull results saved to: {out_path}')


  QA SUMMARY
Total checks: 71
✅ Passed:    68
❌ Failed:    3
Score:        96%

Failed checks:
  ❌ No extreme outliers in Fuel_Target (>1000)
     → 1 extreme values detected — check UPMID 59291 which shows 19875 in raw data
  ❌ All rows have 7 columns
     → 23 malformed rows found
  ❌ Elevation range reasonable (0–4000 m)
     → min=-8, max=32767

Full results saved to: C:\Users\PhotonUser\My Files\OneDrive\Files\Manzanillo\manzanillo-digital-twin\data\processed\qa_results.csv
