# Explore Mesonet Data

In [None]:
import os
import pandas as pd
import glob

from datetime import datetime
start_time = datetime.now()

# Input directory holding the raw 5-minute rainfall CSV files
path_5min_rainfall = os.path.join(os.getcwd(), "RAW")

# Output directory that receives one CSV per station
path_each_station = os.path.join(os.getcwd(), "each_site")

# Create the output directory if it doesn't exist
os.makedirs(path_each_station, exist_ok=True)

# Columns every raw file must provide; also the column order of the output files
REQUIRED_COLUMNS = ['STID', 'TIME', 'RAIN']


def station_file_name(station):
    """Return the output CSV file name for a station ID.

    The ID is converted with ``str()`` first so numeric IDs do not raise,
    and '/' is replaced with '_' so the ID cannot introduce a sub-directory.
    """
    return f"{str(station).replace('/', '_')}.csv"


def split_file_by_station(file_path, output_dir, written_files):
    """Split one raw CSV into per-station CSV files inside ``output_dir``.

    Parameters
    ----------
    file_path : str
        Raw CSV to read; it must contain the columns in ``REQUIRED_COLUMNS``.
    output_dir : str
        Existing directory that receives ``<STID>.csv`` files.
    written_files : set
        Output paths already written during the current run. A path found in
        this set is appended to (without a header) instead of overwritten, so
        a station that appears in several raw files keeps all of its rows.
        The set is updated in place.

    Returns
    -------
    list of str
        Output paths written for this raw file (empty if the file was skipped).
    """
    df = pd.read_csv(file_path)

    if not set(REQUIRED_COLUMNS).issubset(df.columns):
        print(f"Skipping file {file_path}. Missing required columns.")
        return []

    saved_paths = []
    # A single groupby pass replaces one full-frame boolean filter per station.
    # sort=False keeps stations in order of first appearance; rows whose STID
    # is missing are left out because groupby drops NaN keys by default.
    for station, df_station in df[REQUIRED_COLUMNS].groupby('STID', sort=False):
        output_file_path = os.path.join(output_dir, station_file_name(station))

        # First write of this run replaces any stale file from an earlier run;
        # later writes append so data from other raw files is not lost.
        first_write = output_file_path not in written_files
        df_station.to_csv(
            output_file_path,
            mode='w' if first_write else 'a',
            header=first_write,
            index=False,
        )
        written_files.add(output_file_path)
        saved_paths.append(output_file_path)
        print(f"Saved data for station {station} to {output_file_path}")

    return saved_paths


# Process each CSV file in the input directory (sorted for a reproducible order)
written_files = set()
for file_path in sorted(glob.glob(os.path.join(path_5min_rainfall, '*.csv'))):
    try:
        split_file_by_station(file_path, path_each_station, written_files)
    except Exception as e:
        # Best effort: report the failing file and continue with the next one
        print(f"Error processing {file_path}: {e}")

print("All files have been processed and saved.")

end_time = datetime.now()
elapsed_time = end_time - start_time
print(f'Cell ran in {elapsed_time}')


In [109]:
import os
import glob
import pandas as pd

# ------------ paths ------------
input_dir = os.path.join(os.getcwd(), "each_site")
output_root = os.path.join(os.getcwd(), "Rain_Data")
os.makedirs(output_root, exist_ok=True)

# ------------ configuration ------------
START_YEAR, END_YEAR = 1994, 2024
TIME_FORMAT = "%Y-%m-%dT%H:%M"   # e.g. 1994-01-01T00:00
INCH_TO_MM = 25.4                # RAIN is converted from inches to millimetres


def split_station_by_month(df, stid, output_root, start_year, end_year):
    """Write one CSV per (year, month) for a single station.

    Parameters
    ----------
    df : pandas.DataFrame
        Station table with a 'TIME' column and, optionally, 'STID' and 'RAIN'.
        The frame passed in is not modified.
    stid : str
        Station ID used for the folder name and the file-name prefix.
    output_root : str
        Root directory; files go to ``<output_root>/<stid>/<YYYY>/<stid>_<YYYYMM>.csv``.
    start_year, end_year : int
        Inclusive range of years to keep.

    Returns
    -------
    list of str
        Paths of the files written (empty when no row falls in the year range).
    """
    # Parse TIME with the strict expected format; anything else becomes NaT
    times = pd.to_datetime(df["TIME"], format=TIME_FORMAT, errors="coerce")

    n_unparsed = int(times.isna().sum())
    if n_unparsed:
        # Make the otherwise silent data loss visible
        print(f"⚠️ {n_unparsed} rows for {stid} have a missing or unparseable TIME and were dropped.")

    # NaT yields a NaN year, which `between` treats as out of range
    in_range = times.dt.year.between(start_year, end_year)
    if not in_range.any():
        return []

    # Work on an explicit copy so the column assignments below never hit a
    # view of the caller's frame (avoids SettingWithCopyWarning).
    station = df.assign(TIME=times).loc[in_range.to_numpy()].copy()

    # Convert RAIN from inch → mm
    if "RAIN" in station.columns:
        station["RAIN"] = pd.to_numeric(station["RAIN"], errors="coerce") * INCH_TO_MM

    # Helper columns used only for grouping; they are dropped before saving
    station["year"] = station["TIME"].dt.year
    station["month"] = station["TIME"].dt.month

    # Rename columns to lowercase
    station = station.rename(columns={"STID": "stid", "TIME": "time", "RAIN": "rain"})

    stid_dir = os.path.join(output_root, stid)
    os.makedirs(stid_dir, exist_ok=True)

    written = []
    for (yr, mo), month_rows in station.groupby(["year", "month"]):
        year_dir = os.path.join(stid_dir, f"{yr:04d}")
        os.makedirs(year_dir, exist_ok=True)
        out_path = os.path.join(year_dir, f"{stid}_{yr:04d}{mo:02d}.csv")

        # Save without helper columns
        month_rows.drop(columns=["year", "month"]).to_csv(out_path, index=False)
        written.append(out_path)

    return written


# ------------ processing ------------
csv_files = sorted(glob.glob(os.path.join(input_dir, "*.csv")))
if not csv_files:
    print(f"No CSV files found in {input_dir}")

for fp in csv_files:
    stid = os.path.splitext(os.path.basename(fp))[0]  # filename == STID
    try:
        df = pd.read_csv(fp)
    except Exception as e:
        print(f"❌ Failed to read {fp}: {e}")
        continue

    if "TIME" not in df.columns:
        print(f"⚠️ No 'TIME' column found in {fp}. Skipping.")
        continue

    if not split_station_by_month(df, stid, output_root, START_YEAR, END_YEAR):
        print(f"ℹ️ No rows within {START_YEAR}-{END_YEAR} for {stid}.")
        continue

    print(f"✅ Done: {stid}")

# Only claim success when there was something to split
if csv_files:
    print("🎉 Finished splitting all stations.")


✅ Done: OKEM
✅ Done: WIST
✅ Done: STUA
✅ Done: SALL
✅ Done: ADAX
✅ Done: ANT2
✅ Done: CARL
✅ Done: BEAV
✅ Done: WASH
✅ Done: NRMN
✅ Done: KENT
✅ Done: ELKC
✅ Done: FREE
✅ Done: ELRE
✅ Done: BURN
✅ Done: PUTN
✅ Done: GRA2
✅ Done: SEMI
✅ Done: ALTU
✅ Done: EUFA
✅ Done: NOWA
✅ Done: MEDF
✅ Done: NEWK
✅ Done: TALA
✅ Done: WILB
✅ Done: BESS
✅ Done: OILT
✅ Done: APAC
✅ Done: MANG
✅ Done: HOLL
✅ Done: LAHO
✅ Done: STIL
✅ Done: RING
✅ Done: SHAW
✅ Done: HINT
✅ Done: WEB3
✅ Done: BRIS
✅ Done: WATO
✅ Done: TAHL
✅ Done: CHEY
✅ Done: COOK
✅ Done: BYAR
✅ Done: SLAP
✅ Done: TIPT
✅ Done: MCAL
✅ Done: HOBA
✅ Done: PAWN
✅ Done: BREC
✅ Done: TULN
✅ Done: SULP
✅ Done: BUTL
✅ Done: OKMU
✅ Done: TISH
✅ Done: PRYO
✅ Done: SPEN
✅ Done: BOIS
✅ Done: CENT
✅ Done: INOL
✅ Done: FTCB
✅ Done: HUGO
✅ Done: SKIA
✅ Done: CHER
✅ Done: HOLD
✅ Done: JAYX
✅ Done: FITT
✅ Done: IDAB
✅ Done: YUKO
✅ Done: STIG
✅ Done: DURA
✅ Done: HOOK
✅ Done: BLAC
✅ Done: WEAT
✅ Done: OKCE
✅ Done: SEIL
✅ Done: ARNE
✅ Done: MAYR
✅ Done: HECT

## Count the recording period for each site

Determine how long each site has been recording, based on its rainfall (`rain`) values.

In [1]:
import importlib
import station_periods

# Reload so edits made to station_periods.py are picked up without a kernel
# restart, then bind the refreshed function.
importlib.reload(station_periods)
from station_periods import calculate_station_periods

# Folder passed as root_dir, and the path passed as save_csv_to
input_folder = "Rain_Data"
output_csv = "Result/station_periods.csv"

df_periods = calculate_station_periods(
    root_dir=input_folder,
    save_csv_to=output_csv,
    stid_col="stid",
    time_col="time",
    rain_col="rain",
)

# Show the returned table as the cell output
df_periods


Unnamed: 0,STID,Start Date,End Date,Start Year,End Year,Period Days,Period Years,Valid Records
0,ACME,1994-02-18,2025-01-01,1994,2025,11275,30.87,3213630
1,ADAX,1994-01-01,2025-01-01,1994,2025,11323,31.00,3216965
2,ALTU,1994-01-01,2025-01-01,1994,2025,11323,31.00,3193774
3,ALV2,1998-12-17,2025-01-01,1998,2025,9512,26.04,2679820
4,ANT2,2011-04-15,2025-01-01,2011,2025,5010,13.72,1433795
...,...,...,...,...,...,...,...,...
116,WILB,1994-01-01,2025-01-01,1994,2025,11323,31.00,3189699
117,WIST,1994-01-01,2025-01-01,1994,2025,11323,31.00,3238740
118,WOOD,1994-01-01,2025-01-01,1994,2025,11323,31.00,3221234
119,WYNO,1994-01-01,2025-01-01,1994,2025,11323,31.00,3240335


## Remove Missing Data

Remove the sites whose percentage of missing data (NaN values) exceeds 5%. For the remaining sites, remove the NaN values from the rainfall data and extract the sites whose recording period runs from 1995-01-01 to 2023-12-31.

In [111]:
import importlib
import missing_data

# Reload in case missing_data.py was edited after its first import,
# then bind the refreshed function.
importlib.reload(missing_data)
compute_missing_stats = missing_data.compute_missing_stats

input_folder = "Rain_Data"   # root_dir (STID/YEAR/*.csv)
result_folder = "Result"     # outputs go here

overall_df, yearly_df = compute_missing_stats(
    root_dir=input_folder,
    result_dir=result_folder,
    save_by_site=True,            # also save one CSV per STID with yearly %s
    negatives_are_missing=True,   # treat rain<0 as missing
)

# Preview the first rows of both returned tables
overall_df.head(), yearly_df.head()



✅ Saved overall to: Result/missing_overall.csv
✅ Saved per-year (all sites) to: Result/missing_by_year_all_sites.csv
✅ Per-site yearly CSVs in: Result/missing_by_year


(   STID  Start Date    End Date  Missing_Percentage
 0  ACME  1994-01-01  2024-01-31                1.23
 1  ADAX  1994-01-01  2024-01-31                1.13
 2  ALTU  1994-01-01  2024-01-31                1.86
 3  ALV2  1998-12-17  2024-01-31                1.94
 4  ANT2  2011-04-15  2024-01-31                0.05,
    STID  Year  Missing_Percentage
 0  ACME  1994               13.16
 1  ACME  1995                4.67
 2  ACME  1996                6.28
 3  ACME  1997                0.04
 4  ACME  1998                0.16)

## Filter Out Sites with High Quality Data

In [None]:
import importlib
import filter_high_quality_sites

# Reload so recent edits to filter_high_quality_sites.py take effect
importlib.reload(filter_high_quality_sites)

# Keyword options for the filtering step
filter_options = {
    "missing_overall_csv": "Result/missing_overall.csv",
    "rain_data_root": "Rain_Data",
    "output_root": "Rain_Data_High_Quality",
    "threshold": 5.0,
    # NOTE(review): with False the recorded run printed "Copied <STID>" lines;
    # presumably True only previews without copying - confirm in the module.
    "dry_run": False,
}

res = filter_high_quality_sites.filter_high_quality_sites(**filter_options)


✅ Keeping 116 sites with Missing_Percentage ≤ 5.0%
   Copied ACME
   Copied ADAX
   Copied ALTU
   Copied ALV2
   Copied ANT2
   Copied ARD2
   Copied ARNE
   Copied BBOW
   Copied BEAV
   Copied BESS
   Copied BIXB
   Copied BLAC
   Copied BOIS
   Copied BREC
   Copied BRIS
   Copied BROK
   Copied BUFF
   Copied BURB
   Copied BURN
   Copied BUTL
   Copied BYAR
   Copied CAMA
   Copied CARL
   Copied CENT
   Copied CHAN
   Copied CHER
   Copied CHEY
   Copied CHIC
   Copied CLAY
   Copied CLOU
   Copied COOK
   Copied COPA
   Copied DURA
   Copied ELKC
   Copied ELRE
   Copied ERIC
   Copied EUFA
   Copied EVAX
   Copied FAI2
   Copied FITT
   Copied FORA
   Copied FREE
   Copied FTCB
   Copied GRA2
   Copied GUTH
   Copied HASK
   Copied HINT
   Copied HOBA
   Copied HOLD
   Copied HOLL
   Copied HOOK
   Copied HUGO
   Copied IDAB
   Copied INOL
   Copied JAYX
   Copied KENT
   Copied KETC
   Copied KIN2
   Copied LAHO
   Copied LANE
   Copied MADI
   Copied MANG
   Copied MARE
   C

# ------------------Computation-----------------------#

## Storm Identification

In [139]:
import importlib
import identify_storms

# re-read if you just edited the .py
importlib.reload(identify_storms)

# Source folder (the high-quality sites) and destination for this step
input_folder = "Rain_Data_High_Quality"
output_folder = "storms_identification"

summary_df = identify_storms.identify_storms(
    input_dir=input_folder,
    output_dir=output_folder,
    recursive=True,
    time_col="time",   # set to None if not present
    rain_col="rain",   # your column names are lowercase
)

# Preview the first rows of the returned summary
summary_df.head()


Unnamed: 0,Input_File,Output_File,Rows_Input,Rows_Kept,Pct_Kept
0,Rain_Data_High_Quality/ACME/1994/ACME_199402.csv,storms_identification/ACME/1994/ACME_199402.csv,8064,169,2.1
1,Rain_Data_High_Quality/ACME/1994/ACME_199403.csv,storms_identification/ACME/1994/ACME_199403.csv,8928,274,3.07
2,Rain_Data_High_Quality/ACME/1994/ACME_199404.csv,storms_identification/ACME/1994/ACME_199404.csv,8640,192,2.22
3,Rain_Data_High_Quality/ACME/1994/ACME_199405.csv,storms_identification/ACME/1994/ACME_199405.csv,8928,254,2.84
4,Rain_Data_High_Quality/ACME/1994/ACME_199406.csv,storms_identification/ACME/1994/ACME_199406.csv,8640,81,0.94


## Estimate rainfall intensity in each time interval

In [1]:
import importlib
import process_intervals

# Reload so recent edits to process_intervals.py take effect
importlib.reload(process_intervals)

# Input is the output folder of the storm-identification step
STORMS_DIR = "storms_identification"
INTERVALS_DIR = "storm_interval_information"

process_intervals.process_directory(
    input_dir=STORMS_DIR,
    output_dir=INTERVALS_DIR,
    recursive=True,
)


  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="c

Done!


## Separate Storms

In [3]:
import importlib
import separate_storm_events

# Reload so recent edits to separate_storm_events.py take effect
importlib.reload(separate_storm_events)

# Keyword options for the separation step
separation_options = {
    "input_dir": "storm_interval_information",
    "output_dir": "single_storm",
    "recursive": True,
    "station_col": "stid",
    "cumulative_col": "cumulative rain depth (mm)",
    "columns_to_reset": None,   # or pass your own list (must match lowercase names)
    "sort_by_time_col": "time",
}

summary = separate_storm_events.separate_storm_events(**separation_options)

# Preview the first rows of the returned summary
summary.head()


  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")
  df[t_col] = pd.to_datetime(df[t_col], errors="coerce")


Unnamed: 0,file,station_id,event_id,year,month,event_start,event_end,event_duration_min,event_total_depth_mm,event_peak_intensity_mm_hr,output_file,note
0,ACME_199402.csv,ACME,1.0,1994.0,2.0,1994-02-18 17:30:00,1994-02-18 17:40:00,10.0,2.54,15.24,single_storm/ACME/1994/ACME_1994_02_1.csv,
1,ACME_199402.csv,ACME,2.0,1994.0,2.0,1994-02-19 18:25:00,1994-02-19 22:10:00,225.0,10.16,51.82,single_storm/ACME/1994/ACME_1994_02_2.csv,
2,ACME_199402.csv,ACME,3.0,1994.0,2.0,1994-02-20 09:10:00,1994-02-20 09:15:00,5.0,0.25,3.05,single_storm/ACME/1994/ACME_1994_02_3.csv,
3,ACME_199402.csv,ACME,4.0,1994.0,2.0,1994-02-21 20:55:00,1994-02-22 00:00:00,185.0,18.54,15.24,single_storm/ACME/1994/ACME_1994_02_4.csv,
4,ACME_199402.csv,ACME,5.0,1994.0,2.0,1994-02-22 00:05:00,1994-02-22 22:15:00,1330.0,29.97,12.19,single_storm/ACME/1994/ACME_1994_02_5.csv,


## Erosive Storms

In [4]:
import importlib
import erosive_storms

# Reload so recent edits to erosive_storms.py take effect
importlib.reload(erosive_storms)

EROSIVE_THRESHOLD_MM = 12.7                     # 12.7 mm (~0.5 in)
CUMULATIVE_COL = "cumulative rain depth (mm)"   # must match your files

summary = erosive_storms.filter_erosive_storms(
    input_dir="single_storm",
    output_dir="erosive_storms",
    threshold_mm=EROSIVE_THRESHOLD_MM,
    cumulative_col=CUMULATIVE_COL,
    station_col="stid",
    time_col="time",
)

# Preview the first rows of the returned summary
summary.head()


Unnamed: 0,file,station_id,kept,reason,output_file,max_cumulative_mm,threshold_mm
0,ACME_1994_02_1.csv,ACME,False,max(cumulative rain depth (mm))=2.54 < thresho...,,,
1,ACME_1994_02_2.csv,ACME,False,max(cumulative rain depth (mm))=10.16 < thresh...,,,
2,ACME_1994_02_3.csv,ACME,False,max(cumulative rain depth (mm))=0.254 < thresh...,,,
3,ACME_1994_02_4.csv,ACME,True,,erosive_storms/ACME/1994/ACME_1994_02_4.csv,18.542,12.7
4,ACME_1994_02_5.csv,ACME,True,,erosive_storms/ACME/1994/ACME_1994_02_5.csv,29.972,12.7


## Rainfall Erosivity

In [1]:
import importlib
import rainfall_erosivity

# Reload so recent edits to rainfall_erosivity.py take effect
importlib.reload(rainfall_erosivity)

# Input is the output folder of the erosive-storm filtering step
erosivity_options = {
    "input_dir": "erosive_storms",
    "output_dir": "rainfall_erosivity",
    "recursive": True,
}

summary = rainfall_erosivity.process_rainfall_erosivity(**erosivity_options)

# Preview the first rows of the returned summary
summary.head()


  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
  df[time_col] = pd.to_datetime(df[time_col], errors="coerce")


Unnamed: 0,stid,year,month,day,total rainfall for single event (mm),rainfall @ max 30-min (mm),max 30-min intensity (mm/h),total energy (MJ/ha),rainfall erosivity ((MJ-mm)/(ha-hr)),storm file,output_file
0,ACME,1994,2,21,18.542,11.938,23.876,3.296564,78.708765,ACME_1994_02_4.csv,rainfall_erosivity/ACME/ACME_1994_02.csv
1,ACME,1994,2,22,29.972,21.336,42.672,4.869256,207.780905,ACME_1994_02_5.csv,rainfall_erosivity/ACME/ACME_1994_02.csv
2,ACME,1994,3,8,55.626,33.782,67.564,10.435005,705.030662,ACME_1994_03_9.csv,rainfall_erosivity/ACME/ACME_1994_03.csv
3,ACME,1994,4,11,13.462,9.906,19.812,3.172214,62.84791,ACME_1994_04_14.csv,rainfall_erosivity/ACME/ACME_1994_04.csv
4,ACME,1994,4,24,22.86,9.398,18.796,5.506361,103.497569,ACME_1994_04_15.csv,rainfall_erosivity/ACME/ACME_1994_04.csv


## Monthly Erosivity


In [7]:
from importlib import reload
import monthly_erosivity

# only needed if you edit the .py file
reload(monthly_erosivity)

# Input is the output folder of the rainfall-erosivity step
monthly_input_dir = "rainfall_erosivity"
monthly_output_dir = "monthly_erosivity"

monthly_erosivity.process_monthly_erosivity(
    input_dir=monthly_input_dir,
    output_dir=monthly_output_dir,
)


✅ Monthly erosivity written under: monthly_erosivity


## Make Publishable Data

In [1]:
from pathlib import Path

import pandas as pd
import rainfalltools as rt

# Example CSV from one station
input_csv = Path("Rain_Data_High_Quality/ACME/1994/ACME_199401.csv")
df = pd.read_csv(input_csv)

# Station ID is the file-name prefix before the first underscore
# ("ACME_199401" -> "ACME"); it names the output so the saved file matches
# the station that was actually read.
station_id = input_csv.stem.split("_")[0]

# Call a function from the package
storms = rt.identify_storms(df)

# Preview
print(storms.head())

# Save result
storms.to_csv(f"{station_id}_storms.csv", index=False)


ImportError: attempted relative import beyond top-level package