# Getting Started with the Data Engineering Environment

This notebook walks you through the basic workflow:
1. Retrieving data from APIs
2. Normalizing and transforming data
3. Storing data in Parquet format
4. Creating visualizations and maps

In [None]:
# Standard imports
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import sys
from pathlib import Path

# Make the parent of the current working directory importable so that the
# `scripts.*` imports below resolve.
# NOTE(review): assumes the notebook is launched from a directory directly
# under the project root — confirm against the repository layout.
PROJECT_ROOT = str(Path.cwd().parent)
if PROJECT_ROOT not in sys.path:
    # Guarded so that re-running this cell does not keep prepending duplicates
    sys.path.insert(0, PROJECT_ROOT)

# Import our modules
from scripts.data_retrieval import USGSWaterServices, EPAWaterQuality, RESTClient
from scripts.data_retrieval import generate_usgs_sites, generate_groundwater_levels  # Sample data
from scripts.data_storage import save_parquet, load_parquet, save_geoparquet, load_geoparquet
from scripts.normalization import normalize_water_data, standardize_coordinates
from scripts.visualization import (
    time_series_plot, scatter_plot, histogram, point_map, heatmap,
    set_theme, display_full_table
)

# Data-source switch: leave False to call the live APIs, set True to work
# from generated sample data instead.
OFFLINE_MODE = False

# Use the 'light' theme for the visualizations in this notebook
set_theme("light")

## 1. Data Retrieval

Let's start by fetching groundwater data from USGS: first the monitoring sites in the Upper Colorado River Basin, then the last 30 days of groundwater levels for Colorado.

In [None]:
# USGS client; created up front regardless of OFFLINE_MODE
usgs = USGSWaterServices()

if not OFFLINE_MODE:
    # Live path: ask USGS for groundwater monitoring sites in the Upper Colorado Basin
    try:
        sites = usgs.get_colorado_basin_sites(basin="upper")
        print(f"Found {len(sites)} groundwater sites in Upper Colorado Basin")
    except Exception as err:
        # Deliberate best-effort: any failure of the request degrades to
        # synthetic data so the rest of the notebook can still run.
        print(f"API request failed: {err}")
        print("Falling back to sample data...")
        sites = generate_usgs_sites(n_sites=50, state="CO")
        print(f"Generated {len(sites)} sample sites instead")
else:
    # Offline path: synthetic sample sites only, no network access
    sites = generate_usgs_sites(n_sites=50, state="CO")
    print(f"Generated {len(sites)} sample groundwater sites")

# Preview the first rows
sites.head()

In [None]:
# Get recent groundwater levels for Colorado: the 30 days ending now
end_date = datetime.now()
start_date = end_date - timedelta(days=30)

# Parameters for the synthetic data, defined once so that the offline path and
# the API-failure fallback generate sample data with the same settings.
sample_level_kwargs = dict(
    n_records=500,
    n_sites=20,
    start_date=start_date,
    end_date=end_date,
)

if OFFLINE_MODE:
    # Generate sample groundwater level data
    levels = generate_groundwater_levels(**sample_level_kwargs)
    print(f"Generated {len(levels)} sample groundwater measurements")
else:
    try:
        levels = usgs.get_groundwater_levels(
            state_code="CO",
            start_date=start_date,
            end_date=end_date,
        )
        print(f"Retrieved {len(levels)} groundwater level measurements")
    except Exception as e:
        # Deliberate best-effort: any failure of the request degrades to
        # synthetic data so the rest of the notebook can still run.
        print(f"API request failed: {e}")
        print("Falling back to sample data...")
        levels = generate_groundwater_levels(**sample_level_kwargs)
        print(f"Generated {len(levels)} sample measurements instead")

# Preview the first rows
levels.head()

## 2. Data Normalization

Clean and standardize the data for analysis.

In [None]:
# Apply standard normalizations (skipped when no measurements were retrieved)
if not levels.empty:
    normalized = normalize_water_data(levels, source='usgs')
    print("Column names after normalization:")
    print(normalized.columns.tolist())
    # An expression nested inside an `if` is not the cell's last top-level
    # expression, so the notebook would not render it; display() shows the
    # preview explicitly. `display` is provided by the IPython kernel.
    display(normalized.head())

In [None]:
# Create a GeoDataFrame with coordinates.
# `normalized` only exists when `levels` is non-empty, so the emptiness check
# must come first and short-circuit before `normalized` is touched.
if not levels.empty and 'latitude' in normalized.columns:
    geo_df = standardize_coordinates(normalized, create_geometry=True)
    print(f"Created GeoDataFrame with CRS: {geo_df.crs}")
    # An expression nested inside an `if` is not the cell's last top-level
    # expression, so the notebook would not render it; display() shows the
    # preview explicitly. `display` is provided by the IPython kernel.
    display(geo_df.head())

## 3. Data Storage

Save the data to Parquet format for efficient storage and retrieval.

In [None]:
# Save to Parquet (nothing to save when no measurements were retrieved)
if not levels.empty:
    # Regular DataFrame
    path = save_parquet(normalized, "groundwater_levels_co")
    print(f"Saved to: {path}")

    # GeoDataFrame (preserves geometry).
    # `geo_df` is only created when the normalized data has a 'latitude'
    # column, so apply the same condition here instead of hitting a NameError.
    if 'latitude' in normalized.columns:
        geo_path = save_geoparquet(geo_df, "groundwater_levels_co_geo")
        print(f"Saved GeoParquet to: {geo_path}")
    else:
        print("Skipped GeoParquet: no 'latitude' column, so no GeoDataFrame was created")

In [None]:
# Load it back.
# The dataset is only written when `levels` is non-empty, so use the same
# guard here; otherwise this cell would try to read a file that this run
# never saved.
if not levels.empty:
    loaded = load_parquet("groundwater_levels_co")
    print(f"Loaded {len(loaded)} rows")
    print(f"Data types preserved: {loaded.dtypes.to_dict()}")

## 4. Visualization

Create plots and maps to explore the data.

In [None]:
# Plot the measurements over time with a rolling_window of 7.
# The emptiness check comes first so `normalized` is only inspected when it exists.
if not levels.empty and {'datetime', 'value'}.issubset(normalized.columns):
    fig = time_series_plot(
        normalized,
        title='Groundwater Levels in Colorado',
        ylabel='Depth to Water (ft)',
        date_col='datetime',
        value_col='value',
        rolling_window=7,
    )

In [None]:
# Distribution of the measured values
if not levels.empty:
    # `normalized` is only defined when there is data, hence the nested check
    if 'value' in normalized.columns:
        fig = histogram(
            normalized,
            column='value',
            xlabel='Depth to Water (ft)',
            title='Distribution of Groundwater Depths',
            show_stats=True,
        )

In [None]:
# Interactive map of monitoring sites.
# Both coordinate columns are passed to point_map, so require both; the
# emptiness check comes first because `normalized` only exists when there is data.
if not levels.empty and {'latitude', 'longitude'}.issubset(normalized.columns):
    m = point_map(
        normalized,
        lat_col='latitude',
        lon_col='longitude',
        popup_cols=['site_code', 'site_name', 'value'],
        size_col='value',
        center=[39.0, -105.5],
        zoom=7,
    )
    # A bare `m` nested inside an `if` is not the cell's last top-level
    # expression, so the notebook would not render the map; display() shows
    # it explicitly. `display` is provided by the IPython kernel.
    display(m)

In [None]:
# Heatmap visualization.
# Both coordinate columns are passed to heatmap, so require both; the
# emptiness check comes first because `normalized` only exists when there is data.
if not levels.empty and {'latitude', 'longitude'}.issubset(normalized.columns):
    m = heatmap(
        normalized,
        lat_col='latitude',
        lon_col='longitude',
        value_col='value',
        center=[39.0, -105.5],
        zoom=7,
    )
    # A bare `m` nested inside an `if` is not the cell's last top-level
    # expression, so the notebook would not render the map; display() shows
    # it explicitly. `display` is provided by the IPython kernel.
    display(m)

## Next Steps

Check out the example notebooks for more detailed workflows:

- `examples/data_retrieval_demo.ipynb` - Working with multiple data sources
- `examples/visualization_demo.ipynb` - Advanced plotting and charting
- `examples/geospatial_demo.ipynb` - Maps and spatial analysis

See the README for environment setup and dependency management.