# 1 – Data Exploration

This notebook performs initial data exploration for the
Extreme Precipitation Prediction project.

Daily weather observations from NOAA’s GHCN-Daily dataset
are downloaded, parsed, and examined to understand
precipitation patterns and feature availability.


## Import Required Libraries

Load core libraries for data processing, downloading NOAA data,
and performing exploratory data analysis.


In [12]:
# Core libraries
import os
import requests
import boto3
import pandas as pd
import numpy as np
from datetime import datetime

# SageMaker session setup
import sagemaker
from sagemaker import get_execution_role

sess = sagemaker.Session()
bucket = sess.default_bucket()
region = boto3.Session().region_name
role = get_execution_role()

s3 = boto3.client("s3")

print("Bucket:", bucket)
print("Region:", region)
print("Role:", role)


Bucket: sagemaker-us-east-1-083422367993
Region: us-east-1
Role: arn:aws:iam::083422367993:role/LabRole


2

In [13]:
# Project S3 structure
project_prefix = "ghcn-extreme"
raw_prefix = f"{project_prefix}/raw"
processed_prefix = f"{project_prefix}/processed"

print("Raw path:", f"s3://{bucket}/{raw_prefix}")
print("Processed path:", f"s3://{bucket}/{processed_prefix}")


Raw path: s3://sagemaker-us-east-1-083422367993/ghcn-extreme/raw
Processed path: s3://sagemaker-us-east-1-083422367993/ghcn-extreme/processed


## Define NOAA Data Source and Station List

Specify NOAA HTTPS endpoint and selected weather stations.
Stations were selected for long coverage and availability
of precipitation and temperature variables.


In [14]:
BASE_URL = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/all"

STATIONS = [
    "USW00023174",  # LAX
    "USW00012921",  # San Antonio
    "USW00094728",  # NYC Central Park
    "USW00023293",  # San Diego Miramar
    "USW00013904",  # Houston Hobby
]

LOCAL_TMP = "./tmp"
os.makedirs(LOCAL_TMP, exist_ok=True)


## Parse NOAA .dly File Format

Convert fixed-width NOAA daily files into structured format.
Each row will contain station_id, date, element, and value.


In [15]:
def parse_dly(filepath):
    rows = []
    with open(filepath, "r") as f:
        for line in f:
            station = line[0:11]
            year = int(line[11:15])
            month = int(line[15:17])
            element = line[17:21]

            for day in range(1, 32):
                value = int(line[21 + (day - 1) * 8 : 26 + (day - 1) * 8])
                if value == -9999:
                    continue

                rows.append({
                    "station_id": station,
                    "date": datetime(year, month, day),
                    "element": element,
                    "value": value / 10.0
                })

    return pd.DataFrame(rows)


## Download and Parse Selected Stations

Download daily records and combine into one dataset.


In [16]:
dfs = []

for station_id in STATIONS:
    print(f"Downloading {station_id}")

    url = f"{BASE_URL}/{station_id}.dly"
    local_path = f"{LOCAL_TMP}/{station_id}.dly"

    r = requests.get(url, timeout=120)
    r.raise_for_status()

    with open(local_path, "wb") as f:
        f.write(r.content)

    df = parse_dly(local_path)
    dfs.append(df)

weather_long = pd.concat(dfs, ignore_index=True)
weather_long.head()


Downloading USW00023174
Downloading USW00012921
Downloading USW00094728
Downloading USW00023293
Downloading USW00013904


Unnamed: 0,station_id,date,element,value
0,USW00023174,1944-01-01,TAVG,15.0
1,USW00023174,1944-01-02,TAVG,10.9
2,USW00023174,1944-01-03,TAVG,10.5
3,USW00023174,1944-01-04,TAVG,10.8
4,USW00023174,1944-01-05,TAVG,11.6


In [17]:
print("Total rows:", weather_long.shape[0])
print("Stations:", weather_long["station_id"].nunique())
print("Elements available:")
weather_long["element"].value_counts()


Total rows: 1645273
Stations: 5
Elements available:


element
TMAX    145821
TMIN    145783
PRCP    145707
SNOW    123189
SNWD    107521
         ...  
WT10         7
WV20         7
TOBS         2
WT12         1
FRGT         1
Name: count, Length: 68, dtype: int64

## Examine Available Weather Elements


In [18]:
weather_long = weather_long[weather_long["element"].isin(["PRCP", "TMAX", "TMIN"])]

weather_wide = (
    weather_long
    .pivot_table(index=["station_id", "date"], columns="element", values="value")
    .reset_index()
)

weather_wide.head()


element,station_id,date,PRCP,TMAX,TMIN
0,USW00012921,1946-09-01,0.5,33.3,21.1
1,USW00012921,1946-09-02,0.3,32.2,23.3
2,USW00012921,1946-09-03,0.0,32.8,23.3
3,USW00012921,1946-09-04,0.0,32.2,20.6
4,USW00012921,1946-09-05,0.0,32.2,21.1


In [19]:
weather_wide = weather_wide.dropna(subset=["PRCP"])

max_date = weather_wide["date"].max()
cutoff = max_date - pd.DateOffset(years=20)

weather_wide = weather_wide[weather_wide["date"] >= cutoff].copy()

print("Date range:",
      weather_wide["date"].min(),
      "to",
      weather_wide["date"].max())


Date range: 2006-02-12 00:00:00 to 2026-02-12 00:00:00


In [20]:
weather_wide = weather_wide.sort_values(["station_id", "date"])

threshold = weather_wide["PRCP"].quantile(0.95)

weather_wide["extreme_precip"] = (
    weather_wide["PRCP"] >= threshold
).astype(int)

weather_wide["extreme_precip_tomorrow"] = (
    weather_wide.groupby("station_id")["extreme_precip"].shift(-1)
)

weather_wide = weather_wide.dropna(subset=["extreme_precip_tomorrow"])
weather_wide["extreme_precip_tomorrow"] = weather_wide["extreme_precip_tomorrow"].astype(int)

weather_wide["extreme_precip_tomorrow"].value_counts(normalize=True)


extreme_precip_tomorrow
0    0.949875
1    0.050125
Name: proportion, dtype: float64

In [21]:
weather_wide["prcp_lag_1"] = (
    weather_wide.groupby("station_id")["PRCP"].shift(1)
)

weather_wide["prcp_roll_7"] = (
    weather_wide.groupby("station_id")["PRCP"]
    .rolling(7)
    .mean()
    .reset_index(0, drop=True)
)

weather_wide["year"] = weather_wide["date"].dt.year
weather_wide["month"] = weather_wide["date"].dt.month

final_df = weather_wide[
    [
        "station_id",
        "date",
        "year",
        "month",
        "TMAX",
        "TMIN",
        "prcp_lag_1",
        "prcp_roll_7",
        "extreme_precip_tomorrow"
    ]
].dropna().copy()

final_df.head()


element,station_id,date,year,month,TMAX,TMIN,prcp_lag_1,prcp_roll_7,extreme_precip_tomorrow
21720,USW00012921,2006-02-18,2006,2,3.9,-1.1,1.5,0.214286,0
21721,USW00012921,2006-02-19,2006,2,5.6,-1.7,0.0,0.257143,0
21722,USW00012921,2006-02-20,2006,2,8.9,1.7,0.3,0.4,0
21723,USW00012921,2006-02-21,2006,2,13.9,6.1,1.0,0.442857,0
21724,USW00012921,2006-02-22,2006,2,22.2,12.8,0.3,0.442857,0


In [22]:
# Separate storage prefixes by format
processed_csv_prefix = f"{project_prefix}/processed_csv"
processed_parquet_prefix = f"{project_prefix}/processed_parquet"

csv_key = f"{processed_csv_prefix}/extreme_precip_processed.csv"
parquet_key = f"{processed_parquet_prefix}/extreme_precip_processed.parquet"

# Save locally
final_df.to_csv("extreme_precip_processed.csv", index=False)
final_df.to_parquet("extreme_precip_processed.parquet", index=False)

# Upload separately
s3.upload_file("extreme_precip_processed.csv", bucket, csv_key)
s3.upload_file("extreme_precip_processed.parquet", bucket, parquet_key)

print("Saved CSV to:", f"s3://{bucket}/{csv_key}")
print("Saved Parquet to:", f"s3://{bucket}/{parquet_key}")


Saved CSV to: s3://sagemaker-us-east-1-083422367993/ghcn-extreme/processed_csv/extreme_precip_processed.csv
Saved Parquet to: s3://sagemaker-us-east-1-083422367993/ghcn-extreme/processed_parquet/extreme_precip_processed.parquet
