In [18]:
import pandas as pd
import numpy as np
import os
import glob
import rasterio
import matplotlib.pyplot as plt
from tqdm import tqdm
import requests
from pyproj import Transformer
import xml.etree.ElementTree as ET
import json
import ipyparallel as ipp

In [3]:
def get_cdl_value(year, lon, lat):
    """
    Fetches and parses the CDL value for a given year, lon, and lat coordinates.

    Args:
        year (int): The year of the data.
        lon (float): The longitude.
        lat (float): The latitude.

    Returns:
        dict: A dictionary containing the result data.
              Returns None if the request fails or the Result element is not found.
    """
    transformer = Transformer.from_crs("epsg:4326", "epsg:5070", always_xy=True)
    x, y = transformer.transform(lon, lat)
    url = f"https://nassgeodata.gmu.edu/axis2/services/CDLService/GetCDLValue?year={year}&x={x}&y={y}"
    response = requests.get(url)

    if response.status_code == 200: # success
        root = ET.fromstring(response.content)
        result_element = root.find(".//Result")

        if result_element is not None:
            result_string = result_element.text
            # weird xml format fix
            result_string = result_string.replace("{", "{\"").replace(":", "\":").replace(", ", ", \"")
            result_dict = json.loads(result_string)
            return result_dict
        else:
            print("Result element not found in the XML response.")
            return None
    else:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

In [4]:
central_5021 = pd.read_csv("../data/central_stations_observations.csv")
central_5021 = central_5021[central_5021.Basin_Subb.str.startswith("5-021")]

In [5]:
central_5021

Unnamed: 0,site_code,latitude,longitude,well_depth,well_use,geology,Region_Off,Basin_Subb,observations_2010,observations_2011,...,observations_2015,observations_2016,observations_2017,observations_2018,observations_2019,observations_2020,observations_2021,observations_2022,observations_2023,observations_2024
2039,366909N1221638W001,39.69090,-122.16384,320.0,Irrigation,['Q'],NRO,5-021.52,0,0,...,4,3,3,3,3,3,3,3,3,5
3433,380926N1215871W001,38.11300,-121.58719,23.0,Observation,['Q'],NCRO,5-021.66,0,0,...,0,0,0,0,0,0,0,0,0,0
3434,380926N1215871W002,38.11300,-121.58719,102.0,Observation,['Q'],NCRO,5-021.66,0,0,...,0,0,0,0,0,0,0,0,0,0
3458,381132N1216951W001,38.11322,-121.69513,416.0,Other,['Q'],NCRO,5-021.66,0,0,...,2,2,2,2,2,1,2,0,2,2
3462,381150N1215899W001,38.81150,-121.58993,340.0,Irrigation,['Q'],NCRO,5-021.64,0,0,...,23,3,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5862,402341N1222533W002,40.23410,-122.25329,140.0,Industrial,['QPc'],NRO,5-021.50,0,0,...,0,0,0,0,0,0,4,9,10,4
5865,402522N1222082W001,40.25221,-122.20818,200.0,Residential,['Tvp'],NRO,5-021.53,0,0,...,0,5,3,2,3,2,2,3,3,1
5866,402653N1222348W002,40.26517,-122.25339,320.0,Residential,['QPc'],NRO,5-021.50,0,0,...,0,0,0,0,0,0,3,0,0,0
5964,410921N1210855W001,40.09211,-122.08555,680.0,Observation,['QPc'],NRO,5-021.56,0,0,...,0,0,0,0,0,0,176,244,340,371


In [None]:
base_folder = '../ECOSTRESS'

yearly_dfs = {}
for year in tqdm(os.listdir(base_folder)):
    year_folder = os.path.join(base_folder, year)
    if os.path.isdir(year_folder):
        tif_files = glob.glob(os.path.join(year_folder, '*.tif'))
        df_list = []

        for tif_file in tif_files:
            with rasterio.open(tif_file) as src:
                data = src.read(1)
                transform = src.transform

            rows, cols = data.shape
            row_inds, col_inds = np.indices((rows, cols))
            xs, ys = rasterio.transform.xy(transform, row_inds, col_inds)
            xs = np.array(xs)
            ys = np.array(ys)
            df = pd.DataFrame({
                'x': xs.flatten(),
                'y': ys.flatten(),
                'value': data.flatten()
            })
            df_list.append(df)

        if df_list:
            combined_df = pd.concat(df_list, ignore_index=True)
            yearly_dfs[year] = combined_df

 14%|██████▍                                      | 1/7 [00:08<00:48,  8.04s/it]

In [13]:
obs_cols = [col for col in central_5021 if col.split("_")[0] == "observations"]
obs_cols = [col for col in obs_cols if int(col.split("_")[1]) >= 2018]

In [14]:
central_5021 = central_5021[central_5021[obs_cols].ge(25).all(axis=1)]

In [24]:
def parallel_fetch(args):
    """
    Helper that receives a tuple (index, row).  
    For each year from 2018 to 2024, it calls `get_cdl_value`.
    Returns (index, {year: <the CDL dictionary or None>}).
    """
    idx, row = args
    lon = row['longitude']
    lat = row['latitude']
    
    results = {}
    for year in range(2018, 2025):
        cdl_dict = get_cdl_value(year, lon, lat)
        results[year] = cdl_dict
    return idx, results

rc = ipp.Client()
lview = rc.load_balanced_view()
rc[:].push({"get_cdl_value":get_cdl_value})
rc[:].execute("""
import requests
from pyproj import Transformer
import xml.etree.ElementTree as ET
import json
""")
rows_to_process = [(idx, row) for idx, row in central_5021.iterrows()]
async_result = lview.map_async(parallel_fetch, rows_to_process)
results_list = async_result.get()

for idx, year_dict in results_list:
    for year, cdl in year_dict.items():
        col_name = f"CDL_{year}"
        if cdl is not None:
            central_5021.loc[idx, col_name] = str(cdl.get("category"))
        else:
            central_5021.loc[idx, col_name] = None

In [25]:
central_5021.head()

Unnamed: 0,site_code,latitude,longitude,well_depth,well_use,geology,Region_Off,Basin_Subb,observations_2010,observations_2011,...,observations_2022,observations_2023,observations_2024,CDL_2018,CDL_2019,CDL_2020,CDL_2021,CDL_2022,CDL_2023,CDL_2024
3750,384159N1217303W001,38.4159,-121.7303,43.0,Observation,['Q'],NCRO,5-021.66,391,384,...,384,378,257,Developed/Open Space,Broccoli,Almonds,Almonds,Almonds,Prunes,Grapes
3751,384159N1217303W002,38.41585,-121.73034,243.0,Observation,['Q'],NCRO,5-021.66,306,358,...,384,269,197,Developed/Open Space,Broccoli,Almonds,Almonds,Almonds,Prunes,Grapes
3752,384159N1217303W003,38.4159,-121.7303,445.0,Observation,['Q'],NCRO,5-021.66,364,384,...,384,378,197,Developed/Open Space,Broccoli,Almonds,Almonds,Almonds,Prunes,Grapes
4119,386464N1216675W002,38.6464,-121.6675,150.0,Observation,['Q'],NCRO,5-021.67,374,377,...,302,280,341,Walnuts,Fallow/Idle Cropland,Sunflowers,Winter Wheat,Safflower,Winter Wheat,Developed/Low Intensity
4120,386464N1216675W003,38.6464,-121.6675,280.0,Observation,['Q'],NCRO,5-021.67,374,377,...,163,262,341,Walnuts,Fallow/Idle Cropland,Sunflowers,Winter Wheat,Safflower,Winter Wheat,Developed/Low Intensity


In [26]:
def fetch_all_records(site_code, year):
    all_records = []
    offset = 0
    limit = 1000
    while True:
        url = f"https://data.cnra.ca.gov/api/3/action/datastore_search?resource_id=bfa9f262-24a1-45bd-8dc8-138bc8107266&q={site_code} {year}&limit={limit}&offset={offset}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            records = data['result']['records']
            all_records.extend(records)
            if len(records) < limit:
                break
            offset += limit
        else:
            print(f"Failed to fetch data for {site_code} in {year}")
            break
    return all_records

def get_readings_for_site(site_code, years):
    all_readings = []
    for year in years:
        records = fetch_all_records(site_code, year)
        for record in records:
            reading = {
                'site_code': record.get('site_code'),
                'msmt_date': record.get('msmt_date'),
                'wlm_rpe': record.get('wlm_rpe'),
                'wlm_gse': record.get('wlm_gse'),
                'gwe': record.get('gwe')
            }
            all_readings.append(reading)
    return pd.DataFrame(all_readings)

def interpolate_gwe(site_code, years, readings_df):
    site_data = readings_df
    site_data.rename(columns={"msmt_date": "ds", "gwe": "y"}, inplace=True)
    site_data['ds'] = pd.to_datetime(site_data['ds'])

    full_date_range = pd.date_range(start=f"{min(years)}-01-01", end=f"{max(years)}-12-31", freq="D")
    full_df = pd.DataFrame({'ds': full_date_range})
    full_df = full_df.merge(site_data, on='ds', how='left')

    m = Prophet(
        changepoint_prior_scale=0.01,
        seasonality_prior_scale=0.01,
        yearly_seasonality=True
    )
    m.fit(site_data.dropna())

    forecast = m.predict(full_df)

    full_df['y_interpolated'] = forecast['yhat']

    result_df = full_df[['ds', 'y_interpolated']].copy()
    result_df.rename(columns={"ds": "date", "y_interpolated": "gwe_interpolated"}, inplace=True)

    return result_df

def gwe_trend(site_code, start_yr, end_yr):
    years = range(start_yr, end_yr + 1)
    readings_df = get_readings_for_site(site_code, years)
    predicted = interpolate_gwe(site_code, years, readings_df)

    predicted['date'] = pd.to_datetime(predicted['date'])
    predicted.set_index('date', inplace=True)

    monthly_avg = predicted.resample('ME').mean()
    
    return monthly_avg

In [29]:
client = ipp.Client()
dview = client[:]
dview.push({'fetch_all_records': fetch_all_records,
           'get_readings_for_site':get_readings_for_site,
           'interpolate_gwe':interpolate_gwe,
           'gwe_trend':gwe_trend})

dview.execute("""
import pandas as pd
import requests
from prophet import Prophet
import logging
logging.getLogger("cmdstanpy").setLevel(logging.ERROR)
""")

view = client.load_balanced_view()

well_gwe_trends = {}

def process_well(well):
    return well, gwe_trend(well, 2018, 2025)

async_results = []
for well in central_5021.site_code.unique():
    async_result = view.apply_async(process_well, well)
    async_results.append(async_result)

for async_result in async_results:
    try:
        well, result = async_result.get()
        well_gwe_trends[well] = result
    except Exception as e:
        print(f"Error processing well {well}: {e}")

In [32]:
def calculate_groundwater_changes(data_dict):
    result_dict = {}
    for year in range(2018, 2025):
        year_dict = {}

        for site_code, df in data_dict.items():
            try:
                october_value = df.loc[f'{year}-10-31'].iloc[0]
                september_value = df.loc[f'{year+1}-09-30'].iloc[0]
                change = october_value - september_value
                year_dict[site_code] = change
            except KeyError:
                print(f"Warning: Missing data for site {site_code} in year {year} or {year+1}.")
                year_dict[site_code] = None

        result_dict[year] = year_dict

    return result_dict

In [33]:
well_changes = calculate_groundwater_changes(well_gwe_trends)

In [35]:
for year, site_map in well_changes.items():
    col_name = f"GWE_{year}"
    central_5021[col_name] = central_5021['site_code'].map(site_map)

In [37]:
cdl_cols = [col for col in central_5021.columns if col.split("_")[0] == "CDL"]

In [39]:
mask = (
    central_5021[cdl_cols]
    .apply(lambda row: row.astype(str).str.contains('almonds', case=False, na=False))
    .any(axis=1)
)

In [52]:
central_5021 = central_5021[mask].copy()

  central_5021 = central_5021[mask].copy()


In [46]:
def get_precipitation_data(latitude, longitude, start_date, end_date, parameter="PRECTOTCORR"):
    base_url = "https://power.larc.nasa.gov/api/temporal/daily/point"
    
    params = {
        "parameters": parameter,
        "community": "AG",
        "longitude": longitude,
        "latitude": latitude,
        "start": start_date,
        "end": end_date,
        "format": "JSON"
    }

    response = requests.get(base_url, params=params)
    response.raise_for_status()

    data = response.json()

    try:
        precip_data = data["properties"]["parameter"][parameter]
    except KeyError as e:
        raise ValueError(f"Unable to retrieve parameter '{parameter}' from response: {e}")

    return precip_data

def process_precipitation_data(precip_data):
    df = pd.DataFrame.from_dict(precip_data, orient="index", columns=["precipitation"])
    df.index = pd.to_datetime(df.index)
    monthly_avg = df.resample("ME").mean()
    
    return monthly_avg

def get_monthly_precipitation_for_sites(df, start_date=2008, end_date=2024):
    result_dict = {}

    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing sites"):
        site_code = row["site_code"]
        latitude = row["latitude"]
        longitude = row["longitude"]

        try:
            precip_data = get_precipitation_data(latitude, longitude, start_date, end_date)
            monthly_avg = process_precipitation_data(precip_data)
            result_dict[site_code] = monthly_avg

        except Exception as e:
            print(f"Error processing site {site_code}: {e}")
            result_dict[site_code] = None

    return result_dict

In [47]:
precipitation_data = get_monthly_precipitation_for_sites(central_5021)

Processing sites: 100%|█████████████████████████| 47/47 [02:32<00:00,  3.24s/it]


In [48]:
def calculate_annual_precipitation_averages(precipitation_dict):
    result_dict = {}
    for year in range(2018, 2025):
        year_dict = {}
        for site_code, monthly_avg in precipitation_dict.items():
            if monthly_avg is not None:
                try:
                    start_date = f"{year}-10-31"
                    end_date = f"{year+1}-09-30"
                    period_data = monthly_avg.loc[start_date:end_date]
                    average_precip = period_data["precipitation"].mean()
                    year_dict[site_code] = average_precip
                except KeyError:
                    print(f"Warning: Missing data for site {site_code} in year {year}.")
                    year_dict[site_code] = None
        result_dict[year] = year_dict
    return result_dict

In [49]:
well_precipitation = calculate_annual_precipitation_averages(precipitation_data)

In [51]:
for year, site_precip_map in well_precipitation.items():
    col_name = f"precip_{year}"
    central_5021[col_name] = central_5021['site_code'].map(site_precip_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  central_5021[col_name] = central_5021['site_code'].map(site_precip_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  central_5021[col_name] = central_5021['site_code'].map(site_precip_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  central_5021[col_name] = central_5021['site_code'].map(site_p