In [1]:
import datetime
import os
import re
import tempfile
from pathlib import Path

import cfgrib
import numpy as np
import pandas as pd
import requests
import xarray as xr
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy.exc import SQLAlchemyError






In [2]:
def latest_url(date=None):
    """Return the NOMADS directory URL of the 00z GEFS gridded wave products.

    Parameters
    ----------
    date : datetime.date, datetime.datetime or str, optional
        Run date. A string is used verbatim and must already be in
        ``YYYYMMDD`` form. When omitted, the current UTC date is used:
        the run directories are dated in UTC, so the local wall clock
        would point at a different directory depending on where the
        notebook runs.

    Returns
    -------
    str
        Directory URL, ending with a trailing ``/``.

    Notes
    -----
    The 00z run is published some time after 00:00 UTC, so shortly after
    UTC midnight the default directory may not exist yet; pass an earlier
    ``date`` in that case.
    """
    if date is None:
        date = datetime.datetime.now(datetime.timezone.utc)
    stamp = date if isinstance(date, str) else date.strftime("%Y%m%d")
    return f"https://nomads.ncep.noaa.gov/pub/data/nccf/com/gens/prod/gefs.{stamp}/00/wave/gridded/"

In [4]:
# Display the directory URL that the scraping cells below will request.
latest_url()

'https://nomads.ncep.noaa.gov/pub/data/nccf/com/gens/prod/gefs.20231126/00/wave/gridded/'

In [5]:
def get_grib2_links():
    """List the mean-model GRIB2 file names published for the latest run.

    Downloads the directory listing at ``latest_url()`` and collects the
    ``href`` of every anchor matching the 00z, global, 0.25-degree mean
    file pattern (one file per three-digit forecast hour).

    Returns
    -------
    list of str
        Matching hrefs in page order. Empty when nothing matches or when
        the listing request does not return HTTP 200.
    """
    # A timeout keeps the notebook from hanging forever on a stalled server.
    response = requests.get(latest_url(), timeout=30)
    if response.status_code != 200:
        # Without this check an error page is parsed like a listing and the
        # failure shows up only as an unexplained empty list.
        print(f"Failed to list grib2 files: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    # match the average global model for all forecast hours
    pattern = re.compile(r'gefs\.wave\.t00z\.mean\.global\.0p25\.f\d{3}\.grib2')
    return [a.get('href') for a in soup.find_all('a', href=pattern)]

In [6]:
# Collect the GRIB2 file names listed for the latest run.
hrefs = get_grib2_links()

In [7]:
# Inspect the matched file names.
hrefs

['gefs.wave.t00z.mean.global.0p25.f000.grib2',
 'gefs.wave.t00z.mean.global.0p25.f003.grib2',
 'gefs.wave.t00z.mean.global.0p25.f006.grib2',
 'gefs.wave.t00z.mean.global.0p25.f009.grib2',
 'gefs.wave.t00z.mean.global.0p25.f012.grib2',
 'gefs.wave.t00z.mean.global.0p25.f015.grib2',
 'gefs.wave.t00z.mean.global.0p25.f018.grib2',
 'gefs.wave.t00z.mean.global.0p25.f021.grib2',
 'gefs.wave.t00z.mean.global.0p25.f024.grib2',
 'gefs.wave.t00z.mean.global.0p25.f027.grib2',
 'gefs.wave.t00z.mean.global.0p25.f030.grib2',
 'gefs.wave.t00z.mean.global.0p25.f033.grib2',
 'gefs.wave.t00z.mean.global.0p25.f036.grib2',
 'gefs.wave.t00z.mean.global.0p25.f039.grib2',
 'gefs.wave.t00z.mean.global.0p25.f042.grib2',
 'gefs.wave.t00z.mean.global.0p25.f045.grib2',
 'gefs.wave.t00z.mean.global.0p25.f048.grib2',
 'gefs.wave.t00z.mean.global.0p25.f051.grib2',
 'gefs.wave.t00z.mean.global.0p25.f054.grib2',
 'gefs.wave.t00z.mean.global.0p25.f057.grib2',
 'gefs.wave.t00z.mean.global.0p25.f060.grib2',
 'gefs.wave.t

## Convert grib2 data to dataframe

In [8]:
def grib2_url_to_dataframe(target):
    """Download one GRIB2 file of the latest run and convert it to a DataFrame.

    Parameters
    ----------
    target : str
        File name (href) relative to the directory from ``latest_url()``.

    Returns
    -------
    pandas.DataFrame or None
        Rows whose ``swh`` value is not NaN, with ``latitude`` and
        ``longitude`` moved from the index into columns and ``step``
        rendered as text such as ``"3.0 hours"``. ``None`` when the
        download does not return HTTP 200 (the status is printed).
    """
    # latest_url() already ends with "/"; strip it so the path has no "//".
    file_url = f"{latest_url().rstrip('/')}/{target}"
    response = requests.get(file_url, timeout=60)
    if response.status_code != 200:
        print(f"Failed to get data: {response.status_code}")
        return None

    # cfgrib opens files by path, so spool the payload to a temporary file.
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(response.content)
        tmp.flush()

        # indexpath='' stops cfgrib from writing a sidecar index file next to
        # the temporary file; that file would outlive the temp file itself.
        with xr.open_dataset(
            tmp.name, engine='cfgrib', backend_kwargs={'indexpath': ''}
        ) as ds:
            # load() pulls the data into memory before the file goes away
            raw_df = ds.load().to_dataframe()

    return (
        raw_df
        # drop landlocked rows
        .dropna(subset=['swh'])
        .reset_index(level=['latitude', 'longitude'])
        # timedelta -> number of hours as a string with ' hours' appended
        .assign(
            step=lambda d: (d['step'].dt.total_seconds() / 3600.0).astype(str) + ' hours'
        )
    )

In [9]:
# Use the second listed file (index 1) as a single test case.
target = hrefs[1]

In [10]:
# Download and convert the selected file.
test_df = grib2_url_to_dataframe(target)

In [None]:
# Database target for the forecast rows.
table_name = 'wave_forecast'
# Read the connection string from the DATABASE_URL environment variable so real
# credentials never have to be saved in the notebook; the fallback is the
# local placeholder URL.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://postgres:your_password@localhost:5432/surfing_data",
)
engine = create_engine(DATABASE_URL)

In [None]:
# Preview the first rows of the converted file.
test_df.head()

In [None]:
def save_dataframe_to_db(df, engine, table_name):
    """Append ``df`` to ``table_name`` with an ``entry_updated`` timestamp column.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Rows to write. ``None`` is reported and skipped. The caller's frame
        is left unmodified; the timestamp is added to a copy.
    engine : object providing ``begin()``
        Typically a SQLAlchemy engine; ``begin()`` must return a context
        manager yielding a connection usable by ``DataFrame.to_sql``.
    table_name : str
        Destination table; rows are appended (``if_exists='append'``).

    Returns
    -------
    None
        ``SQLAlchemyError`` is caught and printed rather than propagated.
    """
    if df is None:
        print("No data to write")
        return

    # assign() returns a new frame, so re-running a cell never sees a
    # caller-side frame that was silently altered by an earlier save.
    stamped = df.assign(entry_updated=datetime.datetime.now())

    # The try must wrap the transaction block: an exception has to leave the
    # `with` for engine.begin() to roll back. Catching it inside would make
    # the block exit normally and attempt a commit instead.
    try:
        with engine.begin() as connection:
            stamped.to_sql(table_name, con=connection, if_exists='append', index=False)
    except SQLAlchemyError as e:
        print(f"An error occurred: {e}")
        return

    print("Successfully wrote grib2 file")

In [None]:
# Append the single test file to the forecast table.
save_dataframe_to_db(test_df, engine, table_name)

In [None]:
# test a loop
def all_wave_forecasts_to_db(url, engine, table_name):
    """Download every listed GRIB2 file and append each one to the database.

    Parameters
    ----------
    url : str
        Kept for signature compatibility and not forwarded: the helpers are
        called as ``get_grib2_links()`` and ``grib2_url_to_dataframe(target)``,
        the forms their definitions accept, and they build the URL from
        ``latest_url()`` themselves.
    engine : object
        Passed through to ``save_dataframe_to_db``.
    table_name : str
        Passed through to ``save_dataframe_to_db``.
    """
    targets = get_grib2_links()
    total = len(targets)
    for count, target in enumerate(targets, start=1):
        df = grib2_url_to_dataframe(target)
        if df is None:
            # The download helper returns None on a non-200 response.
            print(f"Skipped grib file number {count} out of {total}")
            continue
        save_dataframe_to_db(df, engine, table_name)
        print(f"Wrote grib file number {count} out of {total}")

In [None]:
# `url` was never defined at notebook scope; pass the directory URL explicitly.
all_wave_forecasts_to_db(latest_url(), engine, table_name)