In [None]:
# === 1. Configuration & Setup ===

# --- Core Libraries ---
from __future__ import annotations
import os
import sys
import yaml
import logging
from pathlib import Path

# --- Project-Specific Modules ---
# Add project's src directory to path to allow imports
def find_project_root(marker='config.yml'):
    """Locate the project root by walking upward from the current working directory.

    Every directory from the resolved CWD up to and including the filesystem
    root is checked, nearest first, for an entry named ``marker``.

    Args:
        marker: File or directory name that identifies the project root.

    Returns:
        Path: The closest directory (the CWD itself or one of its ancestors)
        that contains ``marker``.

    Raises:
        FileNotFoundError: If no directory on the way up contains ``marker``.
    """
    start = Path.cwd().resolve()
    # `parents` ends at the filesystem root, so the root itself is checked too
    # (a `while path.parent != path` loop stops one level short of it).
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"Project root with marker '{marker}' not found.")

PROJECT_ROOT = find_project_root()
SRC_DIR = str(PROJECT_ROOT / 'src')
# Keep this cell idempotent: drop a previous entry before re-inserting, so
# re-running the cell does not stack duplicate paths while `src` still ends
# up first on sys.path.
if SRC_DIR in sys.path:
    sys.path.remove(SRC_DIR)
sys.path.insert(0, SRC_DIR)

from utils import setup_colored_logging

# --- Geospatial & Data Libraries ---
import geopandas as gpd
import pandas as pd
from tqdm.auto import tqdm

# --- Gold-Standard Logging Setup ---
# `setup_colored_logging` comes from the project's `utils` module (not shown here).
setup_colored_logging()
log = logging.getLogger("1.4_wepp_soil_preparation")

# --- Configuration Loading ---
CONFIG_PATH = PROJECT_ROOT / "config.yml"
# Read the YAML as UTF-8 explicitly; relying on the platform default encoding
# (e.g. cp1252 on Windows) can mis-decode non-ASCII characters in the config.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# --- Path Configuration (from config) ---
# All paths in config.yml are joined onto PROJECT_ROOT.
STUDY_AREAS_GPKG = PROJECT_ROOT / config['paths']['study_areas']
RAW_SOILS_DIR = PROJECT_ROOT / config['paths']['soils_dir']
PROCESSED_SOILS_DIR = PROJECT_ROOT / config['paths']['processed_dir'] / 'soils_sol'
GNATSGO_JOB_NAME = config['data_sources']['gnatsgo']['job_name']

# The primary input file from notebook 1.2
TABULAR_SOIL_DATA_CSV = RAW_SOILS_DIR / f"{GNATSGO_JOB_NAME}_tabular_data.csv"

# Create output directory if it doesn't exist
PROCESSED_SOILS_DIR.mkdir(parents=True, exist_ok=True)

log.info("--- Configuration Summary ---")
log.info(f"Project Root:          {PROJECT_ROOT}")
log.info(f"Input Study Areas:     {STUDY_AREAS_GPKG}")
log.info(f"Input Tabular Soils:   {TABULAR_SOIL_DATA_CSV}")
log.info(f"Output .sol Files:     {PROCESSED_SOILS_DIR}")
log.info("Setup complete.")


In [None]:
# === 2. Load Input Data ===

def load_input_data(gpkg_path: Path, csv_path: Path, layer: str = 'cv_provinces') -> tuple[gpd.GeoDataFrame, pd.DataFrame]:
    """Loads the study area polygons and the tabular soil data.

    Args:
        gpkg_path: GeoPackage holding the study area polygons.
        csv_path: CSV file with the tabular soil data.
        layer: Name of the GeoPackage layer to read. Defaults to
            'cv_provinces', the layer this notebook reads.

    Returns:
        A ``(study_areas, soil_data)`` tuple: the polygons as returned by
        ``gpd.read_file`` and the table as returned by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If either input file does not exist. The message
            names the missing path and the notebook to run first.
    """
    log.info(f"Loading study area polygons from {gpkg_path}")
    if not gpkg_path.exists():
        # Include the path so the user can see exactly which file is missing.
        raise FileNotFoundError(f"Study areas file not found: {gpkg_path}. Please run notebook 1.1.")
    study_areas = gpd.read_file(gpkg_path, layer=layer)
    log.info(f"Loaded {len(study_areas)} study area polygons.")

    log.info(f"Loading tabular soil data from {csv_path}")
    if not csv_path.exists():
        raise FileNotFoundError(f"Tabular soil data not found: {csv_path}. Please run notebook 1.2.")
    soil_data = pd.read_csv(csv_path)
    log.info(f"Loaded {len(soil_data)} soil horizon records.")

    return study_areas, soil_data

# --- Execute ---
# Read both inputs once, then preview the first rows of each as a sanity check.
study_areas_gdf, soil_data_df = load_input_data(
    gpkg_path=STUDY_AREAS_GPKG,
    csv_path=TABULAR_SOIL_DATA_CSV,
)
display(study_areas_gdf.head(), soil_data_df.head())


### Soil Profile Generation Strategy

The core challenge in creating a `.sol` file is converting the complex, multi-component gNATSGO data into a single, representative soil profile for each analysis area. WEPP models a single hillslope with one set of soil parameters, so an aggregation is required.

Our strategy will be:

1.  **Identify the Dominant Soil Component:** For each study area polygon, we must first determine which soil map units fall within it. The tabular data we fetched already links map unit keys (`mukey`) to soil components (`cokey`). The most straightforward approach is to select the single most dominant component (by `comppct_r`) within the area of interest. A more advanced method would involve area-weighting the properties of several top components.

2.  **Extract the Horizon Profile:** Once the dominant component (`cokey`) is identified, we extract all of its associated horizons (`chkey`) from the tabular data, ordered by depth.

3.  **Estimate WEPP Parameters (PTFs):** The WEPP model requires specific erosion and hydraulic parameters that are not directly provided in gNATSGO. These must be estimated from basic soil properties using **Pedotransfer Functions (PTFs)**. Key parameters to estimate include:
    *   **`Ki` (Interrill Erodibility)**
    *   **`Kr` (Rill Erodibility)**
    *   **`shcrit` (Critical Shear Stress)**
    *   **`Ksat` (Saturated Hydraulic Conductivity)**
    This notebook currently writes fixed placeholder values for these parameters; a scientifically valid implementation must research and apply appropriate PTFs (e.g., from the WEPP technical documentation or the literature).

4.  **Format and Write the `.sol` File:** A dedicated function will take the final, processed profile for a study area and write it to the strict ASCII format that WEPP requires.


In [None]:
# === 3. Core Soil Processing Logic ===

# This cell is a placeholder for the core data processing logic: it only logs
# a warning and computes nothing yet.
# A full implementation would involve:
# 1. Iterating through each polygon in `study_areas_gdf`.
# 2. Spatially intersecting the polygon with soil map unit geometry (requires more data from SDA or a GIS layer).
# 3. Based on the intersection, identifying the dominant component `cokey` from `soil_data_df`.
# 4. Filtering `soil_data_df` to get all horizons for that dominant `cokey`.
# 5. Creating a new DataFrame representing the single, representative profile.
# 6. Applying PTFs to estimate erodibility and conductivity parameters.
# 7. Passing the final profile DataFrame to the formatting function (`write_wepp_sol_file`).
#
# NOTE: `write_wepp_sol_file` is defined in the "WEPP .sol Formatter" cell, which
# comes AFTER this one. When the loop below is implemented it must run after
# that definition, otherwise a fresh "Restart & Run All" raises NameError.

log.warning("Placeholder cell: Core data processing logic needs to be implemented.")

# Example structure (pseudocode, intentionally left commented out):
# for index, area in study_areas_gdf.iterrows():
#     area_id = area['some_unique_id']  # TODO: replace with the real ID column
#     log.info(f"Processing soil profile for {area_id}...")
#
#     # Check if final .sol file already exists (makes re-runs cheap)
#     sol_output_path = PROCESSED_SOILS_DIR / f"{area_id}.sol"
#     if sol_output_path.exists():
#         log.info(f"  -> Skipping, {sol_output_path.name} already exists.")
#         continue
#
#     # ... implementation of steps 2-6 ...
#     representative_profile_df = ...
#
#     # ... call formatter (step 7) ...
#     write_wepp_sol_file(representative_profile_df, sol_output_path)



In [None]:
# === 4. WEPP .sol Formatter ===

def write_wepp_sol_file(profile_df: pd.DataFrame, out_path: Path):
    """
    Formats a DataFrame of a representative soil profile into a WEPP .sol file.

    One record is written per row of ``profile_df``, in the order given (rows
    are not re-sorted). The input is validated and the whole file is rendered
    in memory before ``out_path`` is opened, so bad input never leaves a
    truncated .sol file behind.

    Args:
        profile_df (pd.DataFrame): One row per horizon. Must contain the columns
                                   'hzdept_r', 'hzdepb_r', 'sandtotal_r', 'claytotal_r',
                                   'om_r', 'cec7_r', 'fragvol_r'. Thickness is written as
                                   (hzdepb_r - hzdept_r) / 100, i.e. the depths are assumed
                                   to be in centimetres.
        out_path (Path): The full path for the output .sol file.

    Returns:
        None. If ``profile_df`` is empty, a warning is logged and no file is written.

    Raises:
        ValueError: If a required column is missing or contains null values.

    NOTE(review): the erodibility/conductivity numbers are placeholders and the
    record layout has not been checked here against the official WEPP soil
    input specification - confirm both before using the output in a model run.
    """
    log.info(f"Generating WEPP .sol file: {out_path.name}")

    num_horizons = len(profile_df)
    if num_horizons == 0:
        log.warning(f"Cannot write {out_path.name}, profile data is empty.")
        return

    # --- Validate input before touching the filesystem ---
    required_cols = ('hzdept_r', 'hzdepb_r', 'sandtotal_r', 'claytotal_r',
                     'om_r', 'cec7_r', 'fragvol_r')
    missing_cols = [col for col in required_cols if col not in profile_df.columns]
    if missing_cols:
        raise ValueError(
            f"Cannot write {out_path.name}, profile is missing required columns: {missing_cols}"
        )
    # A null would be formatted as the text 'nan', which is not a valid number
    # in a fixed ASCII model input file.
    null_flags = profile_df[list(required_cols)].isna().any()
    null_cols = [col for col, has_null in null_flags.items() if has_null]
    if null_cols:
        raise ValueError(
            f"Cannot write {out_path.name}, profile has null values in columns: {null_cols}"
        )

    # --- Placeholder for PTF-derived parameters ---
    # These values should be calculated based on soil properties.
    Ki_placeholder = 300000  # Interrill erodibility
    Kr_placeholder = 0.000003 # Rill erodibility
    shcrit_placeholder = 3.0   # Critical shear
    Ksat_placeholder = 5.0     # Saturated hydraulic conductivity (mm/hr)

    # --- Render Header ---
    lines = [
        "98.4\n",  # WEPP version
        "#\n",
        f"# Soil file generated by notebook 1.4 on {pd.Timestamp.now().strftime('%Y-%m-%d')}\n",
        "# Profile based on dominant component from gNATSGO.\n",
        "# Erodibility and conductivity are PLACEHOLDERS and require PTFs.\n",
        "\n",
        f"{num_horizons} # Number of horizons\n",
        "\n",
    ]

    # --- Render Horizon Data ---
    for horizon_no, row in enumerate(profile_df.itertuples(index=False), start=1):
        # Calculate horizon thickness in meters
        thickness_m = (row.hzdepb_r - row.hzdept_r) / 100.0

        lines.append(f"'Horizon {horizon_no}'\n")
        lines.append(
            f"{thickness_m:.3f} {row.sandtotal_r:.1f} {row.claytotal_r:.1f} "
            f"{row.om_r:.2f} {row.cec7_r:.2f} {row.fragvol_r:.1f}\n"
        )
        lines.append(f"{Ki_placeholder} {Kr_placeholder} {shcrit_placeholder} {Ksat_placeholder}\n")
        lines.append("\n")

    # Everything rendered successfully: write once. The content is pure ASCII,
    # so pin the encoding instead of relying on the platform default.
    with open(out_path, 'w', encoding='ascii') as f:
        f.writelines(lines)

    log.info(f"✅ Successfully wrote {out_path.name}")

# The formatter is only defined in this cell; no .sol file is produced until
# the processing loop sketched in section 3 is implemented and calls it.
log.warning("Placeholder cell: Formatting function is defined but not yet called.")

'''