In [None]:
# =========================
# 1 - Imports, paths, config 
# =========================
import os
import json
import random
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 180)
pd.set_option("display.float_format", lambda x: f"{x:,.4f}")

# Project directories
ROOT = Path(".").resolve()
DATA_DIR = ROOT / "data"
REPORTS_DIR = ROOT / "reports"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

RAW_CSV = DATA_DIR / "Buoy_raw.csv"

# Placeholder values for missing data
PLACEHOLDERS = {-999, -999.0, np.nan}

# Dataset-specific columns
SENSOR_COLS = [
    'AtmosphericPressure',   # mbar
    'WindDirection',         # degrees_true
    'WindSpeed',             # knots
    'Gust',                  # knots
    'WaveHeight',            # metres
    'WavePeriod',            # seconds
    'MeanWaveDirection',     # degrees_true
    'Hmax',                  # metres
    'AirTemperature',        # degree_C
    'DewPoint',              # degree_C
    'SeaTemperature',        # degree_C
    'RelativeHumidity'       # percent
]

TIME_COL = 'time'
BUOY_COL = 'station_id'

print("==Environment==")
print(f"Root: {ROOT}")
print(f"Data dir: {DATA_DIR}")
print(f"Reports: {REPORTS_DIR}")
print(f"Seed: {SEED}")

if not RAW_CSV.exists():
    raise FileNotFoundError(f"CSV not found at {RAW_CSV}")

print(f"\n Found dataset: {RAW_CSV.name} ({RAW_CSV.stat().st_size/1024/1024:.2f} MB)")

# Preview
first_two = pd.read_csv(RAW_CSV, nrows=2)
if any(col.lower().startswith('unnamed') for col in first_two.columns):
    head = pd.read_csv(RAW_CSV, skiprows=[1], nrows=5)
else:
    if any(str(v).isalpha() for v in first_two.iloc[0].values):
        head = pd.read_csv(RAW_CSV, skiprows=[1], nrows=5)
    else:
        head = pd.read_csv(RAW_CSV, nrows=5)

print("\nPreview (first 5 data rows):")
display(head)

# Column validation
expected_cols = {TIME_COL, BUOY_COL} | set(SENSOR_COLS)
missing_cols = expected_cols - set(head.columns)
if missing_cols:
    raise ValueError(f"Missing expected columns: {missing_cols}")

print("\nColumns:", list(head.columns))

# Quick metadata
meta_df = pd.read_csv(RAW_CSV, skiprows=[1], usecols=[TIME_COL, BUOY_COL], parse_dates=[TIME_COL])
meta_df = meta_df.dropna(subset=[TIME_COL, BUOY_COL])
print(f"\nApprox. number of rows: {len(meta_df):,}")
print(f"Approx. stations: {meta_df[BUOY_COL].nunique()} → {meta_df[BUOY_COL].unique()}")
print(f"Date range: {meta_df[TIME_COL].min()} → {meta_df[TIME_COL].max()}")


==Environment==
Root: C:\Users\pesic\Desktop\GRU
Data dir: C:\Users\pesic\Desktop\GRU\data
Reports: C:\Users\pesic\Desktop\GRU\reports
Seed: 42

 Found dataset: Buoy_raw.csv (63.49 MB)

Preview (first 5 data rows):


Unnamed: 0,station_id,longitude,latitude,time,AtmosphericPressure,WindDirection,WindSpeed,Gust,WaveHeight,WavePeriod,MeanWaveDirection,Hmax,AirTemperature,DewPoint,SeaTemperature,RelativeHumidity,QC_Flag
0,M1,-11.2,53.1266,2001-02-06T13:00:00Z,967.6,270.0,21.98,33.1,,,,,8.9,,9.0,,1.0
1,M1,-11.2,53.1266,2001-02-06T14:00:00Z,969.8,270.0,23.93,35.0,,,,,8.5,,9.0,,1.0
2,M1,-11.2,53.1266,2001-02-06T15:00:00Z,972.0,270.0,19.07,31.1,,,,,8.9,,9.0,,1.0
3,M1,-11.2,53.1266,2001-02-06T16:00:00Z,973.6,270.0,15.95,25.3,,,,,9.1,,9.0,,1.0
4,M1,-11.2,53.1266,2001-02-06T18:00:00Z,976.4,270.0,12.06,19.5,,,,,8.8,,9.0,,1.0



Columns: ['station_id', 'longitude', 'latitude', 'time', 'AtmosphericPressure', 'WindDirection', 'WindSpeed', 'Gust', 'WaveHeight', 'WavePeriod', 'MeanWaveDirection', 'Hmax', 'AirTemperature', 'DewPoint', 'SeaTemperature', 'RelativeHumidity', 'QC_Flag']

Approx. number of rows: 613,392
Approx. stations: 9 → ['M1' 'M2' 'M3' 'FS1' 'M4-Archive' 'M5' 'M6' 'M4' 'Belmullet-AMETS']
Date range: 2001-02-06 13:00:00+00:00 → 2017-11-28 10:00:00+00:00


In [2]:
# =========================
# 2 -Load, clean, scale, and basic stats
# =========================
from sklearn.preprocessing import StandardScaler

# Load full dataset, skipping the unit row (row 1)
df_raw = pd.read_csv(
    RAW_CSV,
    skiprows=[1],
    parse_dates=[TIME_COL],
    low_memory=False
)

# Drop rows with missing buoy or time
df_raw = df_raw.dropna(subset=[TIME_COL, BUOY_COL])

# Replace placeholders with NaN
for col in SENSOR_COLS:
    if col in df_raw.columns:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce')
        df_raw[col] = df_raw[col].replace(list(PLACEHOLDERS), np.nan)

# Per-buoy scaling
def scale_per_buoy(df, feature_cols):
    scaled_parts = []
    for buoy, sub_df in df.groupby(BUOY_COL):
        scaler = StandardScaler()
        sub_df[feature_cols] = scaler.fit_transform(sub_df[feature_cols])
        scaled_parts.append(sub_df)
    return pd.concat(scaled_parts, axis=0)

df_raw = scale_per_buoy(df_raw, SENSOR_COLS)

# Sort by station and time
df_raw = df_raw.sort_values([BUOY_COL, TIME_COL]).reset_index(drop=True)

# Standardize timezone
if df_raw[TIME_COL].dt.tz is None:
    df_raw[TIME_COL] = df_raw[TIME_COL].dt.tz_localize('UTC')
else:
    df_raw[TIME_COL] = df_raw[TIME_COL].dt.tz_convert('UTC')

# Drop rows where all sensors are NaN
df_raw = df_raw.dropna(subset=SENSOR_COLS, how='all')

print(f"Total records after cleaning & scaling: {len(df_raw):,}")
print(f"Stations: {df_raw[BUOY_COL].nunique()} → {df_raw[BUOY_COL].unique()}")
print(f"Date range: {df_raw[TIME_COL].min()} → {df_raw[TIME_COL].max()}")

# Missingness per sensor
miss_stats = df_raw[SENSOR_COLS].isna().mean().mul(100).round(2)
print("\n==Missingness per sensor (%)==")
print(miss_stats)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction ** 2 / new_sample_count


Total records after cleaning & scaling: 613,392
Stations: 9 → ['Belmullet-AMETS' 'FS1' 'M1' 'M2' 'M3' 'M4' 'M4-Archive' 'M5' 'M6']
Date range: 2001-02-06 13:00:00+00:00 → 2017-11-28 10:00:00+00:00

==Missingness per sensor (%)==
AtmosphericPressure    2.3100
WindDirection          4.0400
WindSpeed              9.1600
Gust                  16.8100
WaveHeight            12.2400
WavePeriod            11.8800
MeanWaveDirection     78.7700
Hmax                  79.8200
AirTemperature         1.2900
DewPoint              36.5300
SeaTemperature         9.2600
RelativeHumidity      15.9200
dtype: float64


In [3]:
# =========================
# 3 — Per-station coverage, missingness, and danger rate
# =========================

SEQ_LEN = 72
HORIZON = 6

# Ensure numeric WaveHeight
df_raw['WaveHeight'] = pd.to_numeric(df_raw['WaveHeight'], errors='coerce')

# Add month column
df_raw['month'] = df_raw[TIME_COL].dt.month

# Per-buoy monthly thresholds
thr_lookup = {
    (row[BUOY_COL], row['month']): row['WaveHeight']
    for _, row in (
        df_raw.groupby([BUOY_COL, 'month'])['WaveHeight']
        .quantile(0.90)
        .reset_index()
        .rename(columns={'WaveHeight': 'WaveHeight'})
    ).iterrows()
}

stats = []
for station, grp in df_raw.groupby(BUOY_COL):
    grp = grp.sort_values(TIME_COL)
    start_date, end_date = grp[TIME_COL].min(), grp[TIME_COL].max()
    span_days = (end_date - start_date).days + 1
    expected_hours = span_days * 24
    actual_hours = grp[TIME_COL].nunique()
    coverage_pct = 100 * actual_hours / expected_hours if expected_hours > 0 else np.nan

    miss_rates = {
        f"miss_{col}": 100 * grp[col].isna().mean()
        for col in ['AtmosphericPressure', 'WindSpeed', 'Gust', 'WaveHeight', 'WavePeriod', 'SeaTemperature']
    }

    # Danger rate
    wh = grp['WaveHeight'].to_numpy()
    months = grp['month'].to_numpy()
    danger_count, safe_count = 0, 0

    for i in range(len(wh) - SEQ_LEN - HORIZON):
        future = wh[i+SEQ_LEN : i+SEQ_LEN+HORIZON]
        m = months[i]
        thr = thr_lookup.get((station, m), np.nan)
        if np.isnan(thr):
            continue
        if np.any(future >= thr):
            danger_count += 1
        else:
            safe_count += 1

    total_seq = danger_count + safe_count
    danger_rate = (100 * danger_count / total_seq) if total_seq > 0 else np.nan

    stats.append({
        BUOY_COL: station,
        'start': start_date,
        'end': end_date,
        'span_days': span_days,
        'expected_hours': expected_hours,
        'actual_hours': actual_hours,
        'coverage_%': coverage_pct,
        'danger_rate_%': danger_rate,
        **miss_rates
    })

df_stats = pd.DataFrame(stats).sort_values(['coverage_%', 'danger_rate_%'], ascending=[False, False])
display(df_stats)


Unnamed: 0,station_id,start,end,span_days,expected_hours,actual_hours,coverage_%,danger_rate_%,miss_AtmosphericPressure,miss_WindSpeed,miss_Gust,miss_WaveHeight,miss_WavePeriod,miss_SeaTemperature
2,M1,2001-02-06 13:00:00+00:00,2007-07-09 12:00:00+00:00,2344,56256,51626,91.7698,15.1082,2.799,14.1557,18.2389,3.0508,2.6169,1.2358
0,Belmullet-AMETS,2011-04-07 18:00:00+00:00,2011-08-10 16:00:00+00:00,125,3000,2731,91.0333,14.8511,0.0,0.2197,0.0,0.0,0.0,0.0
5,M4,2007-05-03 09:00:00+00:00,2017-11-28 10:00:00+00:00,3863,92712,82930,89.449,9.3987,8.3528,12.6794,12.6239,37.9139,37.8862,2.9012
6,M4-Archive,2003-04-16 09:00:00+00:00,2007-05-01 11:00:00+00:00,1477,35448,31215,88.0586,14.6931,1.5826,1.8805,9.9311,2.281,1.1981,0.9835
7,M5,2004-10-18 10:00:00+00:00,2017-11-28 10:00:00+00:00,4790,114960,101137,87.9758,12.4739,1.5642,7.9071,9.4624,16.4984,16.2305,1.8124
3,M2,2001-05-03 14:00:00+00:00,2017-11-28 10:00:00+00:00,6053,145272,125032,86.0675,15.861,1.1165,1.5148,4.3477,7.5549,7.3165,11.7122
8,M6,2006-09-25 13:00:00+00:00,2017-11-28 10:00:00+00:00,4082,97968,83695,85.431,13.758,0.84,21.5795,31.4045,14.5277,14.4704,2.4147
4,M3,2002-07-22 14:00:00+00:00,2017-06-30 03:00:00+00:00,5457,130968,106487,81.3076,14.7901,1.3607,8.8809,12.1827,2.7675,1.7767,5.9979
1,FS1,2003-01-23 12:00:00+00:00,2008-02-17 02:00:00+00:00,1851,44424,28539,64.2423,15.6916,0.5431,1.2194,90.5813,0.473,0.48,100.0


In [4]:
# =========================
# 4 — Automatic Main and Generalization Buoy Selection (lifespan-aware)
# =========================

# Config
MIN_COVERAGE = 70.0         # Min coverage % for main buoy
MAX_MISS_WAVE = 50.0        # Max WaveHeight missingness %
MIN_SPAN_DAYS = 365 * 3     # Require at least 3 years of data (~26,280 hours)
TOP_GEN_CANDIDATES = 5      # Number of gen candidates to inspect further

# Dynamic target danger based on eligible buoys
eligible_for_target = df_stats[df_stats['coverage_%'] >= MIN_COVERAGE]
TARGET_DANGER = np.nanmedian(eligible_for_target['danger_rate_%'])
print(f"Dynamic TARGET_DANGER set to {TARGET_DANGER:.2f}% based on median of eligible buoys")

# Filter for main buoy eligibility
df_filtered = df_stats[
    (df_stats['coverage_%'] >= MIN_COVERAGE) &
    (df_stats['miss_WaveHeight'] <= MAX_MISS_WAVE) &
    (df_stats['span_days'] >= MIN_SPAN_DAYS) &
    (df_stats['actual_hours'] >= (SEQ_LEN + HORIZON))
].copy()

if df_filtered.empty:
    raise ValueError("No buoys meet the minimum criteria. Lower MIN_SPAN_DAYS or adjust filters.")

# Composite score
df_filtered['score'] = (
    (df_filtered['coverage_%'] / df_filtered['coverage_%'].max()) * 0.30 +  # coverage
    ((MAX_MISS_WAVE - df_filtered['miss_WaveHeight']) / MAX_MISS_WAVE) * 0.20 +  # low missingness
    (1 - abs(df_filtered['danger_rate_%'] - TARGET_DANGER) / TARGET_DANGER) * 0.25 +  # danger rate closeness
    (df_filtered['span_days'] / df_filtered['span_days'].max()) * 0.25  # reward long lifespan
)

# Sort and select
df_filtered = df_filtered.sort_values(['score', 'coverage_%'], ascending=[False, False]).reset_index(drop=True)

main_candidate = df_filtered.iloc[0]['station_id']

# Select extra training buoy (next best after main)
extra_train_candidates = df_filtered[df_filtered['station_id'] != main_candidate].head(1)['station_id'].tolist()

# Candidate list for generalization (exclude main and extras)
generalization_candidates = df_filtered[
    ~df_filtered['station_id'].isin([main_candidate] + extra_train_candidates)
].head(TOP_GEN_CANDIDATES)

print(f"==Automatically Selected MAIN buoy: {main_candidate}==")
print(f"==Extra training buoy(s): {extra_train_candidates}==")
display(df_filtered.head(10))

print(f"==Top {TOP_GEN_CANDIDATES} GENERALIZATION candidates==")
display(generalization_candidates)

# Save selections for later
MAIN_BUOY = main_candidate
EXTRA_TRAIN_BUOYS = extra_train_candidates
GEN_CANDIDATES = generalization_candidates['station_id'].tolist()


Dynamic TARGET_DANGER set to 14.74% based on median of eligible buoys
==Automatically Selected MAIN buoy: M2==
==Extra training buoy(s): ['M3']==


Unnamed: 0,station_id,start,end,span_days,expected_hours,actual_hours,coverage_%,danger_rate_%,miss_AtmosphericPressure,miss_WindSpeed,miss_Gust,miss_WaveHeight,miss_WavePeriod,miss_SeaTemperature,score
0,M2,2001-05-03 14:00:00+00:00,2017-11-28 10:00:00+00:00,6053,145272,125032,86.0675,15.861,1.1165,1.5148,4.3477,7.5549,7.3165,11.7122,0.9322
1,M3,2002-07-22 14:00:00+00:00,2017-06-30 03:00:00+00:00,5457,130968,106487,81.3076,14.7901,1.3607,8.8809,12.1827,2.7675,1.7767,5.9979,0.9293
2,M5,2004-10-18 10:00:00+00:00,2017-11-28 10:00:00+00:00,4790,114960,101137,87.9758,12.4739,1.5642,7.9071,9.4624,16.4984,16.2305,1.8124,0.831
3,M1,2001-02-06 13:00:00+00:00,2007-07-09 12:00:00+00:00,2344,56256,51626,91.7698,15.1082,2.799,14.1557,18.2389,3.0508,2.6169,1.2358,0.8284
4,M6,2006-09-25 13:00:00+00:00,2017-11-28 10:00:00+00:00,4082,97968,83695,85.431,13.758,0.84,21.5795,31.4045,14.5277,14.4704,2.4147,0.8231
5,M4-Archive,2003-04-16 09:00:00+00:00,2007-05-01 11:00:00+00:00,1477,35448,31215,88.0586,14.6931,1.5826,1.8805,9.9311,2.281,1.1981,0.9835,0.7889
6,M4,2007-05-03 09:00:00+00:00,2017-11-28 10:00:00+00:00,3863,92712,82930,89.449,9.3987,8.3528,12.6794,12.6239,37.9139,37.8862,2.9012,0.6597


==Top 5 GENERALIZATION candidates==


Unnamed: 0,station_id,start,end,span_days,expected_hours,actual_hours,coverage_%,danger_rate_%,miss_AtmosphericPressure,miss_WindSpeed,miss_Gust,miss_WaveHeight,miss_WavePeriod,miss_SeaTemperature,score
2,M5,2004-10-18 10:00:00+00:00,2017-11-28 10:00:00+00:00,4790,114960,101137,87.9758,12.4739,1.5642,7.9071,9.4624,16.4984,16.2305,1.8124,0.831
3,M1,2001-02-06 13:00:00+00:00,2007-07-09 12:00:00+00:00,2344,56256,51626,91.7698,15.1082,2.799,14.1557,18.2389,3.0508,2.6169,1.2358,0.8284
4,M6,2006-09-25 13:00:00+00:00,2017-11-28 10:00:00+00:00,4082,97968,83695,85.431,13.758,0.84,21.5795,31.4045,14.5277,14.4704,2.4147,0.8231
5,M4-Archive,2003-04-16 09:00:00+00:00,2007-05-01 11:00:00+00:00,1477,35448,31215,88.0586,14.6931,1.5826,1.8805,9.9311,2.281,1.1981,0.9835,0.7889
6,M4,2007-05-03 09:00:00+00:00,2017-11-28 10:00:00+00:00,3863,92712,82930,89.449,9.3987,8.3528,12.6794,12.6239,37.9139,37.8862,2.9012,0.6597


In [5]:
# =========================
# CELL 5 — Correlation & Distance Analysis for Generalization Selection
# =========================

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points (km) using Haversine formula."""
    R = 6371.0
    lat1, lon1, lat2, lon2 = map(lambda x: np.radians(float(x)), [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Config for filtering
MIN_DISTANCE_KM = 200
MAX_CORR = 0.85

main_df = df_raw[df_raw[BUOY_COL] == MAIN_BUOY].copy()
main_lat = main_df['latitude'].iloc[0]
main_lon = main_df['longitude'].iloc[0]

distances, correlations, ids = [], [], []
other_ids = [sid for sid in df_raw[BUOY_COL].unique() if sid != MAIN_BUOY]

for sid in other_ids:
    df_other = df_raw[df_raw[BUOY_COL] == sid].copy()
    if df_other.empty:
        continue
    
    lat, lon = df_other['latitude'].iloc[0], df_other['longitude'].iloc[0]
    dist_km = haversine_distance(main_lat, main_lon, lat, lon)

    merged = pd.merge(
        main_df[[TIME_COL, 'WaveHeight']],
        df_other[[TIME_COL, 'WaveHeight']],
        on=TIME_COL,
        suffixes=('_main', '_other')
    ).dropna()

    corr = merged['WaveHeight_main'].corr(merged['WaveHeight_other']) if len(merged) > 10 else np.nan

    ids.append(sid)
    distances.append(dist_km)
    correlations.append(corr)

df_corr = pd.DataFrame({
    BUOY_COL: ids,
    'distance_km': distances,
    'waveheight_corr': correlations
}).sort_values('distance_km')

# Apply distance & correlation filters
generalization_candidates_after_filter = df_corr[
    (df_corr['distance_km'] >= MIN_DISTANCE_KM) &
    (df_corr['waveheight_corr'] <= MAX_CORR) &
    (df_corr[BUOY_COL].isin(GEN_CANDIDATES))
].copy()

print(f"=== Distance vs Correlation for MAIN buoy {MAIN_BUOY} ===")
display(df_corr)

print(f"=== Generalization candidates after distance & correlation filters ===")
display(generalization_candidates_after_filter)

# Save correlation-distance table
corr_path = REPORTS_DIR / f"distance_corr_{MAIN_BUOY}.csv"
df_corr.to_csv(corr_path, index=False)

# Save filtered gen candidates for later
filtered_path = REPORTS_DIR / f"gen_candidates_filtered_{MAIN_BUOY}.csv"
generalization_candidates_after_filter.to_csv(filtered_path, index=False)

# Select final generalization buoy
if not generalization_candidates_after_filter.empty:
    final_gen_buoy = generalization_candidates_after_filter.iloc[0][BUOY_COL]
else:
    final_gen_buoy = GEN_CANDIDATES[0] if GEN_CANDIDATES else None

# Save final selection
FINAL_SELECTION = {
    "main_buoy": MAIN_BUOY,
    "extra_train_buoys": EXTRA_TRAIN_BUOYS,
    "generalization_buoy": final_gen_buoy
}

sel_path = REPORTS_DIR / "selected_buoys.json"
with open(sel_path, 'w') as f:
    json.dump(FINAL_SELECTION, f, indent=2)

print(f"Final MAIN buoy: {MAIN_BUOY}")
print(f"Extra training buoy(s): {EXTRA_TRAIN_BUOYS}")
print(f"Final GENERALIZATION buoy: {final_gen_buoy}")
print(f"Saved buoy selection to {sel_path}")


=== Distance vs Correlation for MAIN buoy M2 ===


Unnamed: 0,station_id,distance_km,waveheight_corr
6,M5,217.0704,0.7635
5,M4-Archive,271.7304,0.5126
1,FS1,285.8127,0.6974
0,Belmullet-AMETS,321.3171,0.4043
4,M4,341.8246,0.5468
2,M1,385.7047,0.6236
3,M3,429.4566,0.6407
7,M6,696.1143,0.5901


=== Generalization candidates after distance & correlation filters ===


Unnamed: 0,station_id,distance_km,waveheight_corr
6,M5,217.0704,0.7635
5,M4-Archive,271.7304,0.5126
4,M4,341.8246,0.5468
2,M1,385.7047,0.6236
7,M6,696.1143,0.5901


Final MAIN buoy: M2
Extra training buoy(s): ['M3']
Final GENERALIZATION buoy: M5
Saved buoy selection to C:\Users\pesic\Desktop\GRU\reports\selected_buoys.json
