In [18]:
import os
import xarray as xr
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [2]:
folder_path = '../data/NLDAS'

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    # Construct the full file path
    file_path = os.path.join(folder_path, filename)
    
    # Try to open the file as a NetCDF file
    try:
        # Load the file using xarray
        ds = xr.open_dataset(file_path)
        
        # Convert the xarray Dataset to a DataFrame
        df = ds.to_dataframe().reset_index()
        
        # Identify the variable column(s) (exclude time, bnds, lon, lat, time_bnds)
        excluded_columns = {'time', 'bnds', 'lon', 'lat', 'time_bnds'}
        variable_columns = [col for col in df.columns if col not in excluded_columns]
        
        # If no variable columns are found, skip this file
        if not variable_columns:
            continue
        
        # Extract the year and month from the filename (e.g., "A201011" for November 2010)
        # Example filename: "HTTP_services.cgi?FILENAME=%2Fdata%2FNLDAS%2FNLDAS_NOAH0125_M.2.0%2F2010%2FNLDAS_NOAH0125_M.A201011.020.nc&..."
        year_month = filename.split("A")[-1].split(".")[0]  # Extracts "201011" from "A201011.020.nc"
        year = year_month[:4]  # Extracts "2010"
        month = year_month[4:]  # Extracts "11"
        
        # Alternatively, extract year and month from the 'time' column (if available)
        if 'time' in df.columns:
            df['time'] = pd.to_datetime(df['time'], format='%Y%m')  # Convert to datetime
            df['year'] = df['time'].dt.year  # Extract year
            df['month'] = df['time'].dt.month  # Extract month
            year_month = f"{df['year'].iloc[0]:04d}{df['month'].iloc[0]:02d}"  # Format as "YYYYMM"
        
        # Rename the variable columns to include the year and month (e.g., "201011_Qg")
        for var in variable_columns:
            df.rename(columns={var: f"{year_month}_{var}"}, inplace=True)
        
        # Drop unnecessary columns (time, bnds, time_bnds, year, month)
        df.drop(columns=excluded_columns - {'lon', 'lat'}, inplace=True, errors='ignore')
        df.drop(columns={'year', 'month'}, inplace=True, errors='ignore')
        
        # Add the DataFrame to the list
        dataframes.append(df)
    except Exception as e:
        # Skip files that are not valid NetCDF files
        continue



In [4]:
for i in range(len(dataframes)):
    dataframes[i] = dataframes[i].set_index(['lat', 'lon']).sort_index()

In [7]:
merged_df = pd.concat(dataframes, axis=1, join='outer')

In [10]:
merged_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,202009_Qg,202009_Evap,202009_Qsb,202009_AvgSurfT,202009_SoilT_100_200cm,202009_RootMoist,202009_SMLiq_100_200cm,202407_Qg,202407_Evap,202407_Qsb,...,201704_SoilT_100_200cm,201704_RootMoist,201704_SMLiq_100_200cm,201912_Qg,201912_Evap,201912_Qsb,201912_AvgSurfT,201912_SoilT_100_200cm,201912_RootMoist,201912_SMLiq_100_200cm
lat,lon,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
36.0625,-123.3125,,,,,,,,,,,...,,,,,,,,,,
36.0625,-123.3125,,,,,,,,,,,...,,,,,,,,,,
36.0625,-123.1875,,,,,,,,,,,...,,,,,,,,,,
36.0625,-123.1875,,,,,,,,,,,...,,,,,,,,,,
36.0625,-123.0625,,,,,,,,,,,...,,,,,,,,,,


In [12]:
merged_df = merged_df[sorted(merged_df.columns, key=lambda col: int(col.split('_')[0]))]

In [13]:
merged_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,201010_Qg,201010_Evap,201010_Qsb,201010_AvgSurfT,201010_SoilT_100_200cm,201010_RootMoist,201010_SMLiq_100_200cm,201011_Qg,201011_Evap,201011_Qsb,...,202408_SoilT_100_200cm,202408_RootMoist,202408_SMLiq_100_200cm,202409_Qg,202409_Evap,202409_Qsb,202409_AvgSurfT,202409_SoilT_100_200cm,202409_RootMoist,202409_SMLiq_100_200cm
lat,lon,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
36.0625,-123.3125,,,,,,,,,,,...,,,,,,,,,,
36.0625,-123.3125,,,,,,,,,,,...,,,,,,,,,,
36.0625,-123.1875,,,,,,,,,,,...,,,,,,,,,,
36.0625,-123.1875,,,,,,,,,,,...,,,,,,,,,,
36.0625,-123.0625,,,,,,,,,,,...,,,,,,,,,,


In [20]:
SGMA = "../data/GWBasins.shp"
gdf = gpd.read_file(SGMA)
gdf.set_crs(epsg=3857, inplace=True)
gdf.set_index('OBJECTID', inplace=True)
gdf['Basin_Prefix'] = gdf['Basin_Numb'].str.split('-').str[0].astype(int)
gdf_5021 = gdf[gdf.Basin_Numb == "5-021"]

In [21]:
merged_df = merged_df.reset_index()

# Create a geometry column using the longitude and latitude columns.
merged_df['geometry'] = merged_df.apply(lambda row: Point(row['lon'], row['lat']), axis=1)

# Convert to a GeoDataFrame and set the coordinate reference system (CRS) to WGS84.
NLDAS_gdf = gpd.GeoDataFrame(merged_df, geometry='geometry')
NLDAS_gdf.crs = "EPSG:4326"

In [25]:
NLDAS_gdf = NLDAS_gdf.to_crs(gdf_5021.crs)
combined_area = gdf_5021.unary_union
NLDAS_5021 = NLDAS_gdf[NLDAS_gdf.geometry.within(combined_area)]

  combined_area = gdf_5021.unary_union


In [29]:
NLDAS_5021.to_file("NLDAS_5021.gpkg", driver="GPKG")