In [1]:
%load_ext autoreload
%autoreload 2
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this filter is global, so deprecation/runtime warnings raised by
# the libraries used in later cells are hidden as well.
warnings.filterwarnings('ignore')

### Generate yearly dataframe after cleaning crop names

In [3]:
from generate_county_crop_dataset.ccdc import cdl_crop_legend, ProcessYieldObs

# Build the yield-observation processor for the county, then invoke it.
# The result of the call is stored in `df` (presumably the yearly yield
# dataframe with cleaned crop names — confirm against ProcessYieldObs).
yield_processor = ProcessYieldObs(county_name="Monterey")
df = yield_processor()

In [12]:
# Disabled export helpers, kept for reference (uncomment to run):
#   1. dump the full dataframe `df` to ./all.csv
#   2. collect, for each crop_name, the list of unique key_crop_name values
#      and save that mapping as crop_name_mapping.csv
# NOTE(review): the mapping code uses `pd`, which this notebook only imports
# in a later cell — import pandas first if this is re-enabled.
# df.to_csv('./all.csv')
# crop_name_mapping = df.groupby("crop_name")["key_crop_name"].unique().to_dict()
# crop_name_mapping = {k: list(v) for k, v in crop_name_mapping.items()}
# mapping_df = pd.DataFrame(crop_name_mapping.items(), columns=["crop_name", "key_crop_name"])
# mapping_df.to_csv("crop_name_mapping.csv", index=False)

### Generate county-based dataset

In [5]:
from generate_county_crop_dataset import ccdc
import pandas as pd
# Yield table for Monterey, 2008.
yield_csv_path = '/data2/hkaman/Data/FoundationModel/Monterey/Yield/2008/yield_2008.csv'
m = pd.read_csv(yield_csv_path)

# Distinct crop keys present in that table, as a plain Python list.
key_crop_column = m['key_crop_name']
crop_names = key_crop_column.unique().tolist()

In [6]:
# Configure the downloader for one county and one year; only the first entry
# of `crop_names` is requested.
downloader = ccdc.TT(county_name='Monterey', year=2008, crop_names=crop_names[:1])

In [42]:
# Invoke the configured downloader, asking for the soil-data output with
# daily climate enabled; the result is indexed by crop name in later cells.
output_dataset = downloader(
    output_type="soil_data",
    daily_climate=True,
)

Matched Soil Attributes in Data: ['aws0100wta', 'slopegraddcp', 'awmmfpwwta', 'drclassdcd', 'hydgrpdcd']


In [45]:
import numpy as np
# Inspect the shape of the soil data returned for the 'Herbs' crop.
# NOTE(review): 'Herbs' is hard-coded — presumably it equals crop_names[0], the
# only crop passed to the downloader above; confirm before re-running.
output_dataset['Herbs']['soil_data'].shape

(5, 963)

In [35]:
import dataLoader

In [85]:
# Build the data loader for Monterey county (it prints the shapes it finds).
dataLoader.dataloader(county_name='Monterey')

(177, 7) (60, 9) (39, 9)
(9, 6, 958) (12, 2, 958) (365, 8, 2) (1, 958) 18.31


In [81]:
import os
import numpy as np
import pandas as pd

def clean_and_update_dataframes(base_path, county_name, years=None):
    """
    Drop yield rows that have no usable model inputs and rewrite each yearly CSV in place.

    For every requested year the function reads
    ``<base_path>/<county_name>/InD/<year>/yield_<year>.csv`` together with
    ``<county_name>_<year>.npz`` from the same folder. A row is kept only when
    its ``key_crop_name``:

    * is not the placeholder ``'No Match'``,
    * is a key of the dictionary stored under ``"input"`` in the NPZ file, and
    * has ``landsat_data``, ``et_data``, ``climate_data`` and ``soil_data``
      entries whose last dimension is non-empty.

    Years whose CSV or NPZ file is missing, or whose NPZ file cannot be loaded,
    are reported on stdout and left untouched. Otherwise the CSV is overwritten
    with the surviving rows (index not written).

    Args:
        base_path (str): The base directory containing county-specific folders.
        county_name (str): The county name to process.
        years (Iterable[int] | None): Years to process. ``None`` (the default)
            keeps the historical behaviour: 2008-2022, skipping 2012 for which
            no data exists.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    if years is None:
        years = [y for y in range(2008, 2023) if y != 2012]

    base_csv_path = os.path.join(base_path, county_name, "InD")
    # Every one of these per-crop entries must have a non-empty last axis.
    input_keys = ("landsat_data", "et_data", "climate_data", "soil_data")

    for year in years:
        folder_path = os.path.join(base_csv_path, str(year))
        csv_file = os.path.join(folder_path, f"yield_{year}.csv")
        npz_file_path = os.path.join(folder_path, f"{county_name}_{year}.npz")

        # Both files are required; report and move on otherwise.
        if not os.path.exists(csv_file):
            print(f"CSV file not found for {year}: {csv_file}")
            continue
        if not os.path.exists(npz_file_path):
            print(f"NPZ file not found for {year}: {npz_file_path}")
            continue

        raw_df = pd.read_csv(csv_file)
        df = raw_df[raw_df['key_crop_name'] != 'No Match']

        # SECURITY: allow_pickle=True unpickles arbitrary Python objects; only
        # point this at NPZ files produced by a trusted pipeline.
        try:
            # The context manager closes the underlying zip handle, which the
            # bare np.load(...)[...] form leaves open.
            with np.load(npz_file_path, allow_pickle=True) as npz:
                # "input" holds a 0-d object array wrapping a dict; unwrap it.
                loaded_data = npz["input"].item()
        except Exception as e:
            # Best-effort batch job: a bad archive must not abort other years.
            print(f"Error loading NPZ file for {year}: {e}")
            continue

        # Collect the index labels of the rows worth keeping.
        valid_rows = []
        for idx, crop_name in df['key_crop_name'].items():
            if crop_name not in loaded_data:
                print(f"Warning: Crop '{crop_name}' not found in NPZ file for {year}. Skipping row {idx}.")
                continue

            try:
                # Fetch all four entries first so a missing key is reported
                # even when an earlier array is already empty.
                arrays = [loaded_data[crop_name][key] for key in input_keys]
                if all(arr.shape[-1] > 0 for arr in arrays):
                    valid_rows.append(idx)
            except Exception as e:
                # Missing key, 0-d array, non-array entry, ...: drop the row.
                print(f"Error processing {crop_name} in {year}: {e}")
                continue

        df_cleaned = df.loc[valid_rows].reset_index(drop=True)

        # Overwrite the original CSV with the surviving rows.
        df_cleaned.to_csv(csv_file, index=False)
        # Count against the file as read, so 'No Match' rows are included.
        print(f"Updated CSV saved for {year}: {csv_file} (Removed {len(raw_df) - len(df_cleaned)} rows)")

In [None]:
# Clean the yearly yield CSVs for Monterey under the shared data root.
county_name = 'Monterey'
base_csv_path = '/data2/hkaman/Data/FoundationModel'
clean_and_update_dataframes(base_path=base_csv_path, county_name=county_name)