In [1]:
import pandas as pd
import maidenhead as mh
import numpy as np
import os
from datetime import datetime
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# All inputs and outputs of this notebook live under one storm-events directory.
_STORMEVENTS_ROOT = '/usr/datalake/silver/stormevents'
_CSVFILES_ROOT = f'{_STORMEVENTS_ROOT}/csvfiles'

# Station list handed to OlieIgra (read for its id / latitude / longitude columns).
BRAINS_CSV = f'{_STORMEVENTS_ROOT}/artifacts/igra_storm_event_autoencoder/brains_station_list.csv'
# Storm-event table handed to OlieStormEvents (MAIDENHEAD / EFFECTIVE / LABEL columns).
SEVERE_CSV = f'{_CSVFILES_ROOT}/severe_maidenhead.csv'
# Per-station encoded IGRA file; the literal '{station_id}' placeholder is filled in at read time.
IGRA_CSV_PATTERN = f'{_STORMEVENTS_ROOT}/igra_encoded/' + '{station_id}_igra_encoded.csv'
# Output directory: one balanced CSV per grid square is written here.
IGRA_MAIDENHEAD_PATH = f'{_CSVFILES_ROOT}/igra_maidenhead'
# Two-letter Maidenhead fields that are expanded into four-character grid squares.
MAIDENHEADS = [
    'DN', 'EN', 'FN',
    'DM', 'EM', 'FM',
    'DL', 'EL',
]

In [2]:
# Make sure the directory that receives the per-grid-square CSV files exists;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(IGRA_MAIDENHEAD_PATH, exist_ok=True)

In [3]:
class OlieStormEvents:
    """Turns the storm-event table into a daily label series per grid square.

    The CSV is read once in the constructor and must provide the columns
    ``MAIDENHEAD``, ``EFFECTIVE`` and ``LABEL``. Each call to
    :meth:`get_storm_events` filters that table to one grid square and aligns
    it to a contiguous daily calendar.
    """

    # Inclusive bounds of the daily calendar every result is aligned to.
    start_date = datetime(1990, 1, 1)
    stop_date = datetime(2025, 5, 30)

    def __init__(self, severe_path: str):
        """Load the storm-event CSV located at ``severe_path``."""
        self.storm_events = pd.read_csv(severe_path)

    def get_storm_events(self, maidenhead: str) -> pd.DataFrame:
        """Return one row per calendar day (plus any duplicate event rows).

        Args:
            maidenhead: Grid-square identifier matched exactly against the
                ``MAIDENHEAD`` column.

        Returns:
            A frame with an ``effective`` date column and an integer ``LABEL``
            column; days without a matching event carry ``LABEL == 0``.
        """
        return self._merge_temporal(self._get_filtered(maidenhead))

    def _get_filtered(self, maidenhead: str) -> pd.DataFrame:
        """Select the events of one grid square and parse their dates.

        Only the matching rows are materialised; the loaded table itself is
        never modified, and no column is written onto a filtered slice.
        """
        in_square = self.storm_events['MAIDENHEAD'] == maidenhead

        return (
            self.storm_events.loc[in_square]
            .drop(columns=['MAIDENHEAD'])
            # assign() builds a new frame, so the parsed dates never touch self.storm_events.
            .assign(EFFECTIVE=lambda events: pd.to_datetime(events['EFFECTIVE']))
        )

    def _merge_temporal(self, df_source: pd.DataFrame) -> pd.DataFrame:
        """Left-join the events onto the daily calendar and zero-fill the gaps.

        NOTE(review): the join is an exact timestamp match against midnight
        calendar entries, so it presumably expects ``EFFECTIVE`` to hold
        date-only values — confirm against the CSV.
        """
        calendar = pd.DataFrame(pd.date_range(self.start_date, self.stop_date), columns=['effective'])

        merged = (
            calendar.merge(df_source, how='left', left_on=['effective'], right_on=['EFFECTIVE'])
            .drop(columns=['EFFECTIVE'])
            # Days without an event come out of the left join as NaN; treat them as label 0.
            .fillna(0)
        )
        # The NaNs introduced by the join turned LABEL into floats; restore integers.
        merged['LABEL'] = merged['LABEL'].astype(int)

        return merged

class OlieIgra:
    """Combines encoded IGRA files of the stations closest to a grid square.

    The station list read in the constructor must provide the columns ``id``,
    ``latitude`` and ``longitude``.
    """

    def __init__(self, stations_path: str):
        """Load the station list CSV located at ``stations_path``."""
        self.stations = pd.read_csv(stations_path)

    def get_igra(self, maidenhead: str, limit: int) -> pd.DataFrame:
        """Return the merged encoded data of the ``limit`` nearest stations.

        Args:
            maidenhead: Grid-square identifier understood by ``mh.to_location``.
            limit: Number of nearest stations to combine.

        Returns:
            A frame whose columns are ``effective_date`` followed by
            ``'0'``, ``'1'``, ... in merge order.
        """
        ids = self._get_closest_stations(maidenhead, limit)

        return self._merge_closest_igras(ids)

    def _get_closest_stations(self, maidenhead: str, limit: int) -> list[str]:
        """Return the ids of the ``limit`` stations nearest to the grid square."""
        # NOTE(review): the second positional argument is passed through as-is;
        # presumably it requests the centre of the square — confirm against the
        # maidenhead package.
        lat, lon = mh.to_location(maidenhead, True)

        ranked = self.stations.copy()
        # The haversine helper is pure NumPy, so whole columns can be passed at
        # once instead of evaluating it row by row.
        ranked['distance'] = self._haversine_distance(ranked['latitude'], ranked['longitude'], lat, lon)
        ranked = ranked.sort_values(by='distance').iloc[0:limit]

        return ranked['id'].tolist()

    def _merge_closest_igras(self, station_ids: list[str]) -> pd.DataFrame:
        """Inner-join the encoded files of every given station on ``effective_date``.

        The first station keeps its ``day_num`` column; it is dropped from
        every further station. Columns are finally renamed positionally, so
        the result is ``effective_date, '0', '1', ...`` for any number of
        stations and any encoded width.

        NOTE(review): the positional rename assumes ``effective_date`` is the
        first column of each station file — confirm against the CSV layout.

        Raises:
            ValueError: If ``station_ids`` is empty.
        """
        if len(station_ids) == 0:
            raise ValueError('at least one station id is required')

        merged = None
        for position, station_id in enumerate(station_ids):
            frame = self._read_igra_by_id(station_id)
            if position > 0:
                frame = frame.drop(columns=['day_num'])
            # Give every station its own column namespace so repeated merges
            # can never collide on feature names; the names are replaced below.
            frame = frame.rename(
                columns=lambda name, position=position: name if name == 'effective_date' else f'{position}:{name}'
            )
            merged = frame if merged is None else merged.merge(frame, how='inner', on='effective_date')

        merged.columns = self._get_column_names(merged.shape[1])

        return merged

    @staticmethod
    def _haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float, earth_radius=6371.0): # Earth radius in km
        """Great-circle distance between two points given in degrees.

        Accepts scalars or array-likes (NumPy broadcasting applies) and returns
        the distance in the unit of ``earth_radius`` (kilometres by default).
        """
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

        half_dlat = (lat2 - lat1) / 2.0
        half_dlon = (lon2 - lon1) / 2.0

        a = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2
        # Rounding can push `a` marginally outside [0, 1], which would make
        # sqrt(1 - a) NaN for (near-)antipodal points.
        a = np.clip(a, 0.0, 1.0)
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

        return earth_radius * c

    @staticmethod
    def _read_igra_by_id(station_id: str) -> pd.DataFrame:
        """Read one station's encoded CSV and parse its ``effective_date`` column."""
        filename = IGRA_CSV_PATTERN.format(station_id=station_id)
        result = pd.read_csv(filename)
        result['effective_date'] = pd.to_datetime(result['effective_date'])

        return result

    @staticmethod
    def _get_column_names(width: int = 62):
        """Return ``width`` column names: ``effective_date``, then ``'0'``, ``'1'``, ...

        The default of 62 reproduces the historical fixed layout
        (``effective_date`` plus ``'0'`` .. ``'60'``).
        """
        names = ['effective_date'] + [f'{i}' for i in range(width - 1)]

        return np.array(names)
    
class OlieMaidenhead:
    """Grid-square enumeration plus helpers that build a balanced training set."""

    def get_maidenheads(self, fields: list[str]) -> list[str]:
        """Expand each two-letter field into its 100 ``<field><row><col>`` squares.

        Squares are emitted field by field, rows before columns
        (``XX00, XX01, ..., XX99``).
        """
        return [
            f'{key}{row}{col}'
            for key in fields
            for row in range(10)
            for col in range(10)
        ]

    def merge_severe_igra(self, features: pd.DataFrame, labels: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, int, int]:
        """Align features and labels on their date columns.

        Args:
            features: Frame with an ``effective_date`` column.
            labels: Frame with ``effective`` and ``LABEL`` columns.

        Returns:
            ``(feature_frame, label_frame, severe_count, significant_count)``
            where ``severe_count`` counts rows with ``LABEL > 0`` and
            ``significant_count`` rows with ``LABEL > 1``. The label frame is
            an independent copy, so callers may modify it in place.
        """
        merged = features.merge(labels, how='inner', left_on=['effective_date'], right_on=['effective'])
        merged = merged.drop(columns=['effective', 'effective_date'])

        feature_result = merged.drop(columns=['LABEL'])
        # Copy so a later in-place relabel by the caller does not write into a slice of `merged`.
        label_result = merged[['LABEL']].copy()

        severe_count = int((label_result['LABEL'] > 0).sum())
        significant_count = int((label_result['LABEL'] > 1).sum())

        return feature_result, label_result, severe_count, significant_count

    def undersample_non_severe(self, x: pd.DataFrame, y: pd.DataFrame, threshold: int, random_state=None) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Cap the number of ``LABEL == 0`` rows at ``threshold``.

        The inputs are returned unchanged when class 0 is absent or already
        within the cap. All other classes keep their current row counts.

        Args:
            x: Feature frame.
            y: Label frame with a ``LABEL`` column, row-aligned with ``x``.
            threshold: Maximum number of ``LABEL == 0`` rows to keep.
            random_state: Optional seed forwarded to ``RandomUnderSampler``;
                ``None`` keeps the sampler's default behaviour.
        """
        class_counts = y['LABEL'].value_counts().to_dict()
        # .get: a frame without any label-0 rows has nothing to undersample.
        if class_counts.get(0, 0) <= threshold:
            return x, y
        class_counts[0] = threshold

        sampler = RandomUnderSampler(sampling_strategy=class_counts, random_state=random_state)
        r_x, r_y = sampler.fit_resample(x, y)

        return r_x, r_y

    def oversample_severe(self, x: pd.DataFrame, y: pd.DataFrame, random_state=None) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Oversample with SMOTE using its default settings.

        Args:
            x: Feature frame.
            y: Label frame, row-aligned with ``x``.
            random_state: Optional seed forwarded to ``SMOTE``; ``None`` keeps
                the sampler's default behaviour.
        """
        smote = SMOTE(random_state=random_state)
        r_x, r_y = smote.fit_resample(x, y)

        return r_x, r_y

In [4]:
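# Scratch leftover, safe to delete: the export loop in the next cell already writes one CSV
# per grid square. Note that `MAIDENHEAD` (singular) is not defined anywhere in this notebook.
# df_merged.to_csv(f'{IGRA_MAIDENHEAD_PATH}/{MAIDENHEAD}.csv', index=False)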

In [5]:
# Tunable knobs of the export below.
NEAREST_STATIONS = 3            # IGRA stations combined per grid square
MIN_SEVERE_RECORDS = 100        # grid squares with fewer LABEL > 0 rows are skipped
MIN_SIGNIFICANT_RECORDS = 6     # below this, LABEL > 1 rows are folded into class 1
NON_SEVERE_PER_SEVERE = 10      # cap on LABEL == 0 rows, as a multiple of the severe count
RANDOM_SEED = 42

# The samplers are created without an explicit random_state and therefore draw
# from NumPy's global generator; seeding it makes a Restart & Run All reproducible.
np.random.seed(RANDOM_SEED)

olie_storm_events = OlieStormEvents(SEVERE_CSV)
olie_igra = OlieIgra(BRAINS_CSV)
olie_maidenhead = OlieMaidenhead()

for maidenhead in olie_maidenhead.get_maidenheads(MAIDENHEADS):
    storm_events = olie_storm_events.get_storm_events(maidenhead)
    igras = olie_igra.get_igra(maidenhead, NEAREST_STATIONS)
    df_features, df_labels, severe_count, significant_count = olie_maidenhead.merge_severe_igra(igras, storm_events)

    # Skip grid squares that do not carry enough severe days to learn from.
    if severe_count < MIN_SEVERE_RECORDS:
        reason = 'No severe records' if severe_count == 0 else 'Not enough severe records'
        print(f'Maidenhead {maidenhead}: {reason}')
        continue

    if significant_count < MIN_SIGNIFICANT_RECORDS:
        # SMOTE requires at least 6 samples per class, so a too-small
        # "significant" class (LABEL > 1) is merged into class 1.
        # Work on a private copy: the frame handed back by merge_severe_igra
        # may be a slice of a larger frame.
        df_labels = df_labels.copy()
        df_labels.loc[df_labels['LABEL'] > 1, 'LABEL'] = 1
        significant_count = 0

    df_features, df_labels = olie_maidenhead.undersample_non_severe(
        df_features, df_labels, severe_count * NON_SEVERE_PER_SEVERE
    )
    df_balanced_features, df_balanced_labels = olie_maidenhead.oversample_severe(df_features, df_labels)

    df_merged = pd.concat([df_balanced_features, df_balanced_labels], axis=1)
    df_merged.to_csv(f'{IGRA_MAIDENHEAD_PATH}/{maidenhead}.csv', index=False)

    print(f'Maidenhead {maidenhead}: {df_merged.shape}')

Maidenhead DN00: Not enough severe records
Maidenhead DN01: Not enough severe records
Maidenhead DN02: Not enough severe records
Maidenhead DN03: Not enough severe records
Maidenhead DN04: Not enough severe records
Maidenhead DN05: Not enough severe records
Maidenhead DN06: Not enough severe records
Maidenhead DN07: Not enough severe records
Maidenhead DN08: Not enough severe records
Maidenhead DN09: Not enough severe records
Maidenhead DN10: Not enough severe records
Maidenhead DN11: Not enough severe records
Maidenhead DN12: Not enough severe records
Maidenhead DN13: Not enough severe records
Maidenhead DN14: Not enough severe records
Maidenhead DN15: Not enough severe records
Maidenhead DN16: Not enough severe records
Maidenhead DN17: Not enough severe records
Maidenhead DN18: Not enough severe records
Maidenhead DN19: No severe records
Maidenhead DN20: Not enough severe records
Maidenhead DN21: Not enough severe records
Maidenhead DN22: Not enough severe records
Maidenhead DN23: No