In [1]:
from glob import glob
import os
import re
from typing import List

import numpy as np
import geopandas as gpd
import pandas as pd
from tqdm import tqdm

from open_gira.io import STORM_BASIN_IDS
from open_gira.io import STORM_CSV_SCHEMA as schema
from open_gira.utils import natural_sort

import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Divide by this factor to 'convert' STORM's 10-minutely sustained winds to
# 1-minutely sustained wind speeds, noting the vagaries of this process as
# explained here: https://library.wmo.int/doc_num.php?explnum_id=290
# (the factor is below 1, so the division increases the wind speeds)
STORM_1MIN_WIND_FACTOR = 0.88
# Passed as `freq` to pd.date_range when building each track's artificial
# time index.
STORM_FREQUENCY = "3H"  # temporal frequency of STORM synthetic tracks

In [3]:
# Root of the open-gira results tree. Both paths below hang off it, so it is
# named once here. The default is the original machine-specific location; set
# the OPEN_GIRA_RESULTS_DIR environment variable to run against another tree.
RESULTS_DIR = os.environ.get(
    "OPEN_GIRA_RESULTS_DIR", "/home/mark/projects/open-gira/results"
)

# directory holding the STORM event CSVs to parse (globbed for *.csv)
csv_dir = f"{RESULTS_DIR}/input/STORM/events/constant/WP"
# destination of the combined GeoParquet file of tracks
output = f"{RESULTS_DIR}/storm_tracks/STORM-constant/tracks_WP.geoparquet"

In [4]:
# Parse all the STORM data in the CSV files and accumulate the processed data
# into a list (one DataFrame per file) which we will then concatenate.

# integer basin id -> string basin code lookup, built once for all files
basin_codes = np.array(STORM_BASIN_IDS)

data = []
for path in tqdm(natural_sort(glob(f"{csv_dir}/*.csv"))):

    df = pd.read_csv(path, names=schema.keys(), dtype=schema)

    # example paths containing sample number:
    # STORM_DATA_HadGEM3-GC31-HM_WP_1000_YEARS_9_IBTRACSDELTA.csv
    # STORM_DATA_IBTRACS_EP_1000_YEARS_0.csv
    # `\d+` rather than a single digit, so multi-digit sample numbers are
    # read in full instead of being truncated to their first digit
    sample_match = re.search(r"1000_YEARS_(\d+)", os.path.basename(path))
    if sample_match is None:
        raise ValueError(f"Could not parse a sample number from filename: {path}")

    df["sample"] = int(sample_match.group(1))

    # change geometry from 0-360 to -180-180
    df["lon"] = np.where(df["lon"] > 180, df["lon"] - 360, df["lon"])

    # lookup string basin code from integer representation
    df["basin_id"] = basin_codes[df["basin_id"].to_numpy()]

    # different track_id format for STORM vs. IBTrACS, ensures no collisions
    df["track_id"] = (
        df["basin_id"] + "_"
        + df["sample"].astype(str) + "_"
        + df["year"].astype(int).astype(str) + "_"
        + df["tc_number"].astype(int).astype(str)
    )

    # we'll want to interpolate and then measure the speed of tracks later,
    # this is easiest when we have some temporal index (as in IBTrACS)
    # so make up an artificial one here based on the STORM reporting frequency

    # Position of every row within its own track (0, 1, 2, ...). Numbering
    # rows per track does not depend on the order in which a counting routine
    # reports the tracks, nor on each track's rows being contiguous.
    step_in_track = df.groupby("track_id", sort=False).cumcount().to_numpy()
    n_steps = int(step_in_track.max()) + 1 if step_in_track.size else 0

    # every track starts its artificial clock at the same arbitrary instant;
    # row k of a track gets the k-th timestamp of this shared timeline
    timeline = pd.date_range(start="2000-01-01", periods=n_steps, freq=STORM_FREQUENCY)
    df = df.set_index(timeline[step_in_track])

    # reorder columns
    df = df.loc[:, list(schema.keys()) + ["track_id", "sample"]]

    data.append(df)

100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [01:01<00:00,  6.17s/it]


In [7]:
# Bind the first ten parsed frames to individual names: data_N is data[N - 1].
(
    data_1, data_2, data_3, data_4, data_5,
    data_6, data_7, data_8, data_9, data_10,
) = (data[position] for position in range(10))

In [8]:
def preprocess(df):
    """Rescale wind speeds and turn a parsed track table into a GeoDataFrame.

    The input frame is left untouched. Dividing the wind column in place would
    also rescale the caller's frame, so any later use of that same frame
    (e.g. concatenating it and rescaling again) would apply the conversion
    twice.

    Args:
        df: Table with `max_wind_speed_ms`, `lon` and `lat` columns.

    Returns:
        A GeoDataFrame with `max_wind_speed_ms` divided by
        STORM_1MIN_WIND_FACTOR, a point geometry (EPSG:4326) built from
        `lon`/`lat`, and the `lon`/`lat` columns removed.
    """
    # assign() returns a new frame; the column keeps its original position
    rescaled = df.assign(
        max_wind_speed_ms=df["max_wind_speed_ms"] / STORM_1MIN_WIND_FACTOR
    )
    points = gpd.points_from_xy(rescaled["lon"], rescaled["lat"], crs=4326)
    return gpd.GeoDataFrame(
        data=rescaled.drop(columns=["lon", "lat"]),
        geometry=points,
    )

In [9]:
# Run `preprocess` over each per-sample frame, in order, and rebind every name
# to the returned frame.
(
    data_1, data_2, data_3, data_4, data_5,
    data_6, data_7, data_8, data_9, data_10,
) = map(
    preprocess,
    (
        data_1, data_2, data_3, data_4, data_5,
        data_6, data_7, data_8, data_9, data_10,
    ),
)

In [10]:
# Write each per-sample table to its own GeoParquet file alongside `output`,
# inserting the 1-based sample position before the extension, e.g.
# ".../tracks_WP.geoparquet" -> ".../tracks_WP_1.geoparquet".
sample_dir = os.path.dirname(output)
if sample_dir:
    # make sure the destination exists before the first write; an empty
    # dirname means the current directory, which needs no creation
    os.makedirs(sample_dir, exist_ok=True)

path_stem, path_ext = os.path.splitext(output)
sample_tables = (
    data_1, data_2, data_3, data_4, data_5,
    data_6, data_7, data_8, data_9, data_10,
)
for sample_number, sample_table in enumerate(sample_tables, start=1):
    sample_table.to_parquet(f"{path_stem}_{sample_number}{path_ext}")

In [16]:
# Concat data: stack the per-file frames accumulated in `data` into a single
# table. Each frame keeps its own index values (pd.concat does not renumber
# them by default), so the artificial timestamps repeat across tracks.
df = pd.concat(data)

In [17]:
# Rescale winds to 1-minutely and construct point geometry from lon and lat.
# This is the same transformation `preprocess` performs, so call it here
# instead of repeating its body; the two code paths cannot drift apart.
# NOTE(review): the frames in `data` must still hold un-rescaled wind speeds
# when they are concatenated into `df`; if any of them was rescaled in place
# beforehand, the conversion ends up applied twice in the combined table.
df = preprocess(df)

In [18]:
# Save data: create the destination directory if needed, then write the
# combined table to `output`.
output_dir = os.path.dirname(output)
if output_dir:
    # os.makedirs("") raises, and a bare filename needs no directory anyway
    os.makedirs(output_dir, exist_ok=True)
df.to_parquet(output)