In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the sample ERA5 CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere without editing it.
era5 = pd.read_csv(
    "C:/Users/mirih/OneDrive/Desktop/EDL Port/era5-severe-weather-pipeline/data/sample-era5.csv"
)


In [2]:
# Work on a copy so the frame loaded into `era5` is left unmodified.
df = era5.copy()

# Feature columns to keep in the output table.
df["CAPE"] = df["cape"] * 1  # kept as-is for now (scale factor of 1)
df["CIN"] = df["cin"] * 1  # ditto
df["temp"] = df["temperature"] - 273.15  # Kelvin to Celsius
df["pressure"] = df["pressure_hpa"]

# Z-score of temperature: how many standard deviations each row's temperature
# lies from the mean of the column. The numerator must be the per-row deviation
# (value - mean); dividing the mean alone by the std yields a single constant
# for every row, which flags either all rows or none as anomalous.
temp_mean = df["temp"].mean()
temp_std = df["temp"].std()  # pandas default is the sample std (ddof=1)
df["z_anomaly"] = (df["temp"] - temp_mean) / temp_std

# Flag rows whose |z| exceeds the threshold.
# Lower threshold -> more rows flagged; higher threshold -> fewer rows flagged.
Z_THRESHOLD = 1
df["is_anomaly"] = df["z_anomaly"].abs() > Z_THRESHOLD

In [3]:
import matplotlib.dates as mdates

# Write the table, including the new feature columns, without the index.
df.to_csv(
    "C:/Users/mirih/OneDrive/Desktop/EDL Port/era5-severe-weather-pipeline/outputs/features.csv",
    index=False,
)

# Parse the date column once and reuse it for both the line and the markers.
dates = pd.to_datetime(df["date"])
anomalous = df["is_anomaly"]

# Larger, higher-resolution canvas.
fig, ax = plt.subplots(figsize=(10, 6), dpi=150)

# Temperature time series.
ax.plot(dates, df["temp"],
        label="Temperature (°C)", linewidth=1.5, color="steelblue")

# Overlay the rows flagged as anomalies.
ax.scatter(dates[anomalous], df.loc[anomalous, "temp"],
           color="red", label="Anomaly", edgecolors="black", s=50)

# Title and axis labels.
ax.set_title("Daily Temperature and Detected Anomalies", fontsize=14, weight="bold")
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Temperature (°C)", fontsize=12)

# Date ticks: abbreviated month + day, at most 10 ticks, rotated labels.
ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %d"))
ax.xaxis.set_major_locator(mdates.AutoDateLocator(maxticks=10))
plt.xticks(rotation=45, ha="right")

# Dashed, semi-transparent grid for readability.
ax.grid(True, linestyle="--", alpha=0.5)

ax.legend()

# Tighten spacing so rotated labels are not clipped.
fig.tight_layout()

# Save as PNG, then close the figure to release it.
fig.savefig(
    "C:/Users/mirih/OneDrive/Desktop/EDL Port/era5-severe-weather-pipeline/outputs/fig-anomalies.png",
    dpi=150,
)
plt.close(fig)

In [4]:
# AUTO MEMO SUMMARY
# Span of the `date` column (min/max of the stored values) and the percentage
# of rows whose CAPE exceeds 1500.
date_range = "{} to {}".format(df["date"].min(), df["date"].max())
pct_high_cape = 100 * (df["CAPE"] > 1500).mean()

# Assemble the memo line by line and emit it in one print.
# NOTE(review): the last line is fixed text; nothing in this cell derives the
# "late spring" clustering from df.
memo_lines = [
    "===AUTO MEMO===",
    f"Date range: {date_range}, Rows used: {len(df)}",
    f"{pct_high_cape:.1f}% of days had CAPE > 1500 J/kg",
    "High-CAPE days clustering in late spring; consider commuter outreach windows.",
]
print("\n".join(memo_lines))


===AUTO MEMO===
Date range: 2025-01-01 to 2025-04-01, Rows used: 91
25.3% of days had CAPE > 1500 J/kg
High-CAPE days clustering in late spring; consider commuter outreach windows.
