In [32]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import pandas as pd
import geopandas as gpd

from dowser.config import (
    RAW_WPDX,
    INTERIM_BOUNDARIES,
    INTERIM_POINTS_TZA,
    INTERIM_POINTS_KEN,
    TABLES,
    PROCESSED,
)
from dowser.data_io import read_adm_any, read_wpdx_csv
from dowser.geo import spatial_join_points
from dowser.summaries import quality_report, admin_counts

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# ==============================================================
# CELL 2: Load and explore data
# ==============================================================
# Read the combined Kenya + Tanzania WPDx export in one pass
# (low_memory=False makes pandas infer each column's dtype from the whole file).
wpdx_full_path = RAW_WPDX / "kenia+tanzania_wpdx.csv"
wpdx_full = pd.read_csv(wpdx_full_path, low_memory=False)

# Quick shape check: row count and the first 20 column names.
n_rows = len(wpdx_full)
leading_columns = list(wpdx_full.columns[:20])
print(f"Total rows: {n_rows}")
print(f"Columns: {leading_columns}")

# Frequency tables for the two categorical columns the labelling relies on.
# Each entry is (heading, column, max rows to show or None for all).
frequency_views = [
    ("Water Source Types", "#water_source_clean", 10),
    ("Status Distribution", "#status_clean", None),
]
for heading, column, max_rows in frequency_views:
    frequencies = wpdx_full[column].value_counts()
    if max_rows is not None:
        frequencies = frequencies.head(max_rows)
    print(f"\n--- {heading} ---")
    print(frequencies)

Total rows: 39471
Columns: ['row_id', '#source', '#lat_deg', '#lon_deg', '#report_date', '#status_id', '#water_source_clean', '#water_source_category', '#water_tech_clean', '#water_tech_category', '#facility_type', '#clean_country_name', '#clean_country_id', '#clean_adm1', '#clean_adm2', '#clean_adm3', '#clean_adm4', '#install_year', '#installer', '#rehab_year']

--- Water Source Types ---
#water_source_clean
Protected Well             17655
Borehole/Tubewell          12487
Protected Spring            5175
Undefined Well              1366
Sand or Sub-surface Dam      953
Rainwater Harvesting         850
Piped Water                  834
Delivered Water               13
Unprotected Well               7
Undefined Spring               1
Name: count, dtype: int64

--- Status Distribution ---
#status_clean
Non-Functional                22218
Non-Functional, dry season     5327
Functional                     4942
Functional, not in use         4773
Functional, needs repair       1836
Abandone

In [34]:
# ==============================================================
# CELL 3: Create 3-class dataset (WATER DEPTH)
# ==============================================================
# 
# Classes represent WATER DEPTH (all points have water):
#   - Class 1: SURFACE (0-5m) - Springs
#   - Class 2: SHALLOW (5-20m) - Wells
#   - Class 3: DEEP (30-100m) - Boreholes
#
# NO class 0 - all WPDx points are locations where water was found
# ==============================================================

def _select_source(source_clean, depth_class, source_type):
    """Return the rows of ``wpdx_full`` for one water-source type, labelled.

    Rows are matched on ``#water_source_clean``, rows missing either
    coordinate are dropped, and the ``class`` / ``source_type`` columns are
    appended (in that order). The original row index is kept.
    """
    is_source = wpdx_full["#water_source_clean"] == source_clean
    return (
        wpdx_full.loc[is_source]
        .dropna(subset=["#lon_deg", "#lat_deg"])
        .assign(**{"class": depth_class, "source_type": source_type})
    )


# The class encodes water depth via the source type:
#   3 = Deep (30-100m), 2 = Shallow (5-20m), 1 = Surface (0-5m)
borehole_df = _select_source("Borehole/Tubewell", 3, "borehole")
well_df = _select_source("Protected Well", 2, "well")
spring_df = _select_source("Protected Spring", 1, "spring")

# Keep functional status for analysis (NOT for labels)
def get_functional_status(status):
    """Collapse a raw ``#status_clean`` value into a coarse status bucket.

    Parameters
    ----------
    status : str or missing value
        Raw status text (e.g. "Functional, needs repair"). Non-string,
        non-missing values are converted with ``str()`` before matching.

    Returns
    -------
    str or None
        ``None`` for missing input, otherwise one of ``"functional"``,
        ``"non_functional"``, ``"abandoned"`` or ``"other"``.

    Notes
    -----
    Matching is case-insensitive and ignores punctuation/whitespace, so
    "Non-Functional", "Non Functional" and "nonfunctional" are all treated
    as non-functional. The negated form is tested first; a stray "non"
    elsewhere in the text (e.g. "Functional, non-potable") no longer pushes
    an otherwise functional status into "other".
    """
    if pd.isna(status):
        return None

    # Keep only letters/digits so hyphen, space and underscore spellings
    # of "non-functional" all collapse to the same token.
    squashed = "".join(ch for ch in str(status).lower() if ch.isalnum())

    # Order matters: "nonfunctional" contains "functional".
    if "nonfunctional" in squashed:
        return "non_functional"
    if "functional" in squashed:
        return "functional"
    if "abandoned" in squashed:
        return "abandoned"
    return "other"

# Tag each per-source frame with its coarse functional status.
for df in (borehole_df, well_df, spring_df):
    raw_status = df["#status_clean"]
    df["functional_status"] = raw_status.apply(get_functional_status)

# Stack the three sources into one frame with a fresh 0..n-1 index.
combined_df = pd.concat([borehole_df, well_df, spring_df], ignore_index=True)

# Keep only rows with a known, unambiguous status
# (drops both the "other" bucket and rows whose status was missing).
status_col = combined_df["functional_status"]
has_clear_status = status_col.notna() & (status_col != "other")
combined_df = combined_df[has_clear_status].copy()

# Build point geometries from lon/lat (WGS84) and promote to a GeoDataFrame.
point_geometry = gpd.points_from_xy(combined_df["#lon_deg"], combined_df["#lat_deg"])
combined_gdf = gpd.GeoDataFrame(combined_df, geometry=point_geometry, crs="EPSG:4326")

# Country code: anything whose country name mentions "tanzania" is TZA,
# every other row (including missing names) is labelled KEN.
combined_gdf["country"] = [
    "TZA" if "tanzania" in str(country_name).lower() else "KEN"
    for country_name in combined_gdf["#clean_country_name"]
]

total_points = len(combined_gdf)
print(f"Total points: {total_points}")

Total points: 35273


In [35]:
# ==============================================================
# CELL 4: Summary
# ==============================================================
class_names = {1: "SURFACE (0-5m)", 2: "SHALLOW (5-20m)", 3: "DEEP (30-100m)"}

banner = "=" * 60
rule = "-" * 45
n_total = len(combined_gdf)

# --- Class balance -------------------------------------------------
print(banner)
print("MULTI-CLASS DATASET (3 classes - water depth)")
print(banner)

print(f"\n{'Class':<25} {'Count':>10} {'%':>10}")
print(rule)
for cls, label in class_names.items():
    n_in_class = (combined_gdf["class"] == cls).sum()
    share = n_in_class / n_total * 100
    print(f"{label:<25} {n_in_class:>10} {share:>9.1f}%")
print(rule)
print(f"{'TOTAL':<25} {n_total:>10}")

# --- Functional status per class (reference only) ------------------
print("\n" + banner)
print("FUNCTIONAL STATUS (for reference, NOT used for labels)")
print(banner)
for cls, label in class_names.items():
    in_class = combined_gdf[combined_gdf["class"] == cls]
    print(f"\n{label}:")
    for status in ("functional", "non_functional", "abandoned"):
        n_status = (in_class["functional_status"] == status).sum()
        # Guard against an empty class so the percentage is 0 instead of a division by zero
        if len(in_class) > 0:
            share = n_status / len(in_class) * 100
        else:
            share = 0
        print(f"  {status:<15}: {n_status:>6} ({share:>5.1f}%)")

# --- Class counts per country --------------------------------------
print("\n" + banner)
print("PER COUNTRY")
print(banner)
for country in ("TZA", "KEN"):
    in_country = combined_gdf[combined_gdf["country"] == country]
    print(f"\n{country}: {len(in_country)} points")
    for cls, label in class_names.items():
        n_in_class = (in_country["class"] == cls).sum()
        print(f"  {label}: {n_in_class}")

MULTI-CLASS DATASET (3 classes - water depth)

Class                          Count          %
---------------------------------------------
SURFACE (0-5m)                  5171      14.7%
SHALLOW (5-20m)                17633      50.0%
DEEP (30-100m)                 12469      35.3%
---------------------------------------------
TOTAL                          35273

FUNCTIONAL STATUS (for reference, NOT used for labels)

SURFACE (0-5m):
  functional     :   1138 ( 22.0%)
  non_functional :   3891 ( 75.2%)
  abandoned      :    142 (  2.7%)

SHALLOW (5-20m):
  functional     :   4273 ( 24.2%)
  non_functional :  13264 ( 75.2%)
  abandoned      :     96 (  0.5%)

DEEP (30-100m):
  functional     :   3306 ( 26.5%)
  non_functional :   9089 ( 72.9%)
  abandoned      :     74 (  0.6%)

PER COUNTRY

TZA: 16958 points
  SURFACE (0-5m): 4973
  SHALLOW (5-20m): 5940
  DEEP (30-100m): 6045

KEN: 18315 points
  SURFACE (0-5m): 198
  SHALLOW (5-20m): 11693
  DEEP (30-100m): 6424


In [36]:
# ==============================================================
# CELL 5: Save dataset
# ==============================================================
output_path = PROCESSED / "waterpoints_multiclass.parquet"

# Create the target folder if it is missing (e.g. on a fresh checkout);
# otherwise to_parquet fails with FileNotFoundError.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the file.
combined_gdf.to_parquet(output_path, index=False)
print(f"✅ Saved: {output_path}")

# Reminder printed next to the artefact so the label semantics travel with it.
print("""
NOTE: This dataset has 3 classes for WATER DEPTH prediction.
All points are locations where water EXISTS.

  - Class 1: Surface (0-5m) - springs
  - Class 2: Shallow (5-20m) - wells  
  - Class 3: Deep (30-100m) - boreholes

The 'functional_status' column is for analysis only.
"non_functional" = broken infrastructure, NOT dry borehole.
""")

✅ Saved: /Users/leonardovannoli/work/dowser/dowser-v0/data/processed/waterpoints_multiclass.parquet

NOTE: This dataset has 3 classes for WATER DEPTH prediction.
All points are locations where water EXISTS.

  - Class 1: Surface (0-5m) - springs
  - Class 2: Shallow (5-20m) - wells  
  - Class 3: Deep (30-100m) - boreholes

The 'functional_status' column is for analysis only.
"non_functional" = broken infrastructure, NOT dry borehole.



TZA ADM1 name col: ADM1_EN
KEN ADM1 name col: adm1_name
