In [1]:
import asyncio
from typing import List, Dict, Optional
import json
import time
from pathlib import Path

import aiohttp
import numpy as np
import pandas as pd
import pendulum
import logging
import requests
from dataclasses import dataclass
from enum import Enum

from sqlalchemy import create_engine, select, text, and_

from sqlalchemy.orm import sessionmaker, declarative_base
from utils.utils import LOCAL_AIRFLOW_PG_URI, LOCAL_PG_URI

ImportError: cannot import name 'LOCAL_AIRFLOW_PG_URI' from 'utils.utils' (/home/peter-legion-wsl2/peter-projects/bodhi-cast/nbs/python/utils/utils.py)

In [None]:
import nest_asyncio
nest_asyncio.apply()

In [None]:
logging.basicConfig(level=logging.INFO)

In [None]:
Base = declarative_base()

In [None]:
engine = create_engine(LOCAL_AIRFLOW_PG_URI)
SessionLocal = sessionmaker(bind=engine)

## Note:
See `240220_sl_surf_spots.ipynb` for spot getter

In [None]:
response = requests.get("https://services.surfline.com/taxonomy?type=taxonomy&id=58f7ed51dadb30820bb3879c&maxDepth=0")

* You will not get Surfline forecast data without a valid Surfline premium login. Add your credentials to `.env.development`:
  ```
  SURFLINE_EMAIL=xxx
  SURFLINE_PASSWORD=yyy
  ```

##### Requests

`https://services.surfline.com/kbyg/spots/forecasts/{type}?{params}`


Type|Data
----|----
rating|array of human-readable and numeric (0-6) ratings
wave|array of min/max sizes & optimal scores
wind|array of wind directions/speeds & optimal scores
tides|array of types & heights
weather|array of sunrise/set times, array of temperatures/weather conditions

Param|Values|Effect
-----|------|------
spotId|string|Surfline spot id that you want data for. A typical Surfline URL is `https://www.surfline.com/surf-report/venice-breakwater/590927576a2e4300134fbed8` where `590927576a2e4300134fbed8` is the `spotId`
days|integer|Number of forecast days to get (Max 6 w/o access token, Max 17 w/ premium token)
intervalHours|integer|Minimum of 1 (hour)
maxHeights|boolean|`true` seems to remove min & optimal values from the wave data output
sds|boolean|If true, use the new LOTUS forecast engine
accesstoken|string|Auth token to get premium data access (optional)

Anywhere there is an `optimalScore` the value can be interpreted as follows:

Value|Meaning
-----|-------
0|Suboptimal
1|Good
2|Optimal


In [None]:
types = ["rating", "wave", "wind", "tides", "weather"]
params = ["spotId", "days", "intervalHours", "maxHeights", "sds", "accesstoken"]
base = "https://services.surfline.com/kbyg/spots/forecasts"

In [None]:
datapath = Path('./data')


In [None]:
df = pd.read_csv(datapath/'spot_list.csv')

In [None]:
df.head()

Get the spot `id` for 1st Street Jetty in Virginia Beach.

In [None]:
jetty_id = df[df['names'].str.contains('1st Street Jetty', case=False, na=False)]['ids'].values[0]
jetty_id

In [None]:
ex_params = {params[0]: jetty_id}
ex_params

Surfline seems to change their spot IDs periodically. To check whether that has happened, open a spot on the website and pass the object ID from its URL as the `spotId` param. If the IDs have changed, you'll need to re-run the notebook `240220_sl_surf_spots.ipynb` (mentioned above) to refresh the spots dataset.

In [None]:
debug_params = {params[0]: "584204214e65fad6a7709ce7"}

In [None]:
res = requests.get(f"{base}/{types[0]}", params=ex_params)
res.status_code

In [None]:
rating_json = res.json()

In [None]:
four_day_json = res.json()
if 'data' in four_day_json and 'rating' in four_day_json['data']:
    four_day_json['data']['rating'] = four_day_json['data']['rating'][:24]

In [None]:
def cull_extra_days(full_json, key='rating', max_entries=24):
    """Truncate one forecast series of a decoded Surfline response, in place.

    Only the first ``max_entries`` items of ``full_json['data'][key]`` are kept.
    A payload without a ``'data'`` mapping, or whose ``'data'`` has no ``key``
    entry, is left untouched.

    Args:
        full_json: Decoded JSON response (a dict). It is mutated in place.
        key: Name of the list under ``'data'`` to truncate. Defaults to
            ``'rating'``, the series the original implementation hard-coded.
        max_entries: Number of leading entries to keep. Defaults to 24, the
            limit the original implementation hard-coded.

    Returns:
        None. The function works purely by side effect, so calling it as the
        last statement of a cell displays nothing.
    """
    forecast = full_json.get('data') if isinstance(full_json, dict) else None
    # Guard on the type so an error payload such as {"data": None} is skipped
    # instead of raising a TypeError on the membership test.
    if isinstance(forecast, dict) and key in forecast:
        forecast[key] = forecast[key][:max_entries]

Drop extra days of forecast

In [None]:
cull_extra_days(four_day_json)

In [None]:
len(four_day_json['data']['rating'])

Convert a unix timestamp -> utc

In [None]:
pendulum.from_timestamp(rating_json['data']['rating'][0]['timestamp'], 'UTC')

In [None]:
pendulum.from_timestamp(rating_json['data']['rating'][int(72 / 3)-1]['timestamp'], 'UTC')

The `utcOffset` field seems to be aware that I'm currently working in EST. Either that, or it reflects the time zone of the spot itself.

Let's check a west coast spot to confirm how this is handled

In [None]:
df

In [None]:
la_jolla_id = df[df['names'].str.contains("La Jolla", case=False, na=False)]['ids'].values[0]
la_jolla_dict = {params[0]: la_jolla_id}

In [None]:
la_jolla_dict

In [None]:
pendulum.now("utc")

In [None]:
new_dict = {"spot_id": "test", "spot_name": "test_2", "date": pendulum.now("utc"), "forecast": four_day_json}

In [None]:
class SlApiEndpoints(Enum):
    """Forecast types that can be requested from the Surfline forecasts API.

    Each member's ``value`` is the path segment appended to
    ``https://services.surfline.com/kbyg/spots/forecasts``.
    """

    # Deliberately NOT decorated with @dataclass: on an Enum the generated
    # field-less __eq__ makes every member compare equal and sets __hash__ to
    # None, so members could not be told apart or used as dict keys.
    RATING = 'rating'
    WAVE = 'wave'
    WIND = 'wind'
    TIDES = 'tides'
    WEATHER = 'weather'

In [None]:
class SlApiParams(Enum):
    """Query-string parameter names used with the Surfline forecasts API.

    Each member's ``value`` is the parameter name exactly as it is sent in the
    request.
    """

    # Deliberately NOT decorated with @dataclass: on an Enum the generated
    # field-less __eq__ makes every member compare equal and sets __hash__ to
    # None, so members could not be told apart or used as dict keys.
    SPOT_ID = 'spotId'
    DAYS = 'days'
    INTERVAL_HOURS = 'intervalHours'
    MAX_HEIGHTS = 'maxHeights'
    SDS = 'sds'
    ACCESSTOKEN = 'accesstoken'

In [None]:
def fetch_from_sl_api(
    endpoint: "SlApiEndpoints | str",
    param_type: "SlApiParams | str",
    param: str,
    timeout: float = 30.0,
):
    """Request one forecast type from the Surfline forecasts API.

    Args:
        endpoint: Forecast type, either an ``SlApiEndpoints`` member or its raw
            string value (e.g. ``"rating"``).
        param_type: Query parameter name, either an ``SlApiParams`` member or
            its raw string value (e.g. ``"spotId"``).
        param: Value sent for ``param_type``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    base_url = "https://services.surfline.com/kbyg/spots/forecasts"
    # An Enum member interpolates as "SlApiEndpoints.RATING" in an f-string, so
    # unwrap members to their value; plain strings have no `.value` and pass
    # through unchanged. The annotations are strings so both forms are accepted
    # without evaluating the enum names at definition time.
    endpoint_name = getattr(endpoint, "value", endpoint)
    param_name = getattr(param_type, "value", param_type)
    res = requests.get(
        f"{base_url}/{endpoint_name}",
        params={param_name: param},
        timeout=timeout,
    )
    # Fail here rather than handing an error payload to downstream cells.
    res.raise_for_status()
    return res.json()

In [None]:
test_res = fetch_from_sl_api(SlApiEndpoints.RATING.value, SlApiParams.SPOT_ID.value, param=jetty_id)

In [None]:
# Fetch the rating forecast for the first three spots in the spot list and tag
# each payload with the spot it belongs to and the UTC date of the fetch.
# The date is taken once so every record of this batch carries the same value,
# even if the loop happens to straddle midnight UTC.
current_date = pendulum.now("utc")
utc_date = current_date.strftime("%Y-%m-%d")

spot_ratings = []
for spot_id, spot_name in df[['ids', 'names']][:3].values:
    res = requests.get(f"{base}/rating", params={'spotId': spot_id}, timeout=30)
    # Stop on an HTTP error instead of appending an error payload that would
    # only fail later, when the records are normalised into a DataFrame.
    res.raise_for_status()
    data = res.json()
    cull_extra_days(data)
    data['spot_id'] = spot_id
    data['spot_name'] = spot_name
    data['utc_fetch_date'] = utc_date
    spot_ratings.append(data)

In [None]:
pendulum.from_timestamp(rating_json['data']['rating'][0]['timestamp'], 'UTC')

In [None]:
ratings_df = pd.json_normalize(spot_ratings, record_path=['data', 'rating'], meta=['spot_id', 'spot_name', 'utc_fetch_date'] )

In [None]:
ratings_df

In [None]:
ratings_df['timestamp'] = ratings_df['timestamp'].apply(lambda x: pendulum.from_timestamp(x).to_datetime_string())

Alright, so it looks like each spot's forecast starts at 12am *local time*, with that moment given as a Unix timestamp. Unix timestamps are UTC by definition, so the parsed `timestamp` is already in UTC; adding the `utcOffset` included in the response shifts it to the spot's local time.

In [None]:
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'])

In [None]:
ratings_df

In [None]:
# Shift every timestamp by its row's `utcOffset` (in hours) with one vectorised
# operation instead of a row-wise `apply`.
# NOTE(review): `timestamp` was parsed from Unix epoch seconds, which are UTC
# already, so this sum looks like spot-local time despite the column name.
# Confirm before relying on it.
ratings_df['timestamp_utc'] = ratings_df['timestamp'] + pd.to_timedelta(ratings_df['utcOffset'], unit='h')

In [None]:
ratings_df

In [None]:
ratings_df.dtypes

In [None]:
# Query the `spots` table once per spot name from the spot list and keep every
# non-empty result set. The statement is built once; only the bound `:spot`
# value changes between executions.
stmt = text("""select * from spots where spot_name like :spot""")
with SessionLocal() as db:
    matching_spots = []
    for spot in df['names']:
        result = db.execute(stmt, {"spot": spot}).fetchall()
        if result:
            matching_spots.append(result)
    

In [None]:
len(matching_spots)

In [None]:
jetty_waves = fetch_from_sl_api(SlApiEndpoints.WAVE.value, SlApiParams.SPOT_ID.value, jetty_id)

In [None]:
jetty_waves['associated']

In [None]:
jetty_waves['associated']['spotId'] = jetty_id

In [None]:
jetty_meta_df = pd.json_normalize(jetty_waves)
jetty_meta_df.drop(['permissions.violations', 'permissions.data', 'data.wave'], inplace=True, axis=1)

In [None]:
jetty_waves['data']['spotId'] = jetty_id

In [None]:
jetty_wave_df = pd.json_normalize(
    jetty_waves, record_path=["data", "wave"], meta=[["data", "spotId"]]
)
jetty_wave_df.drop("swells", inplace=True, axis=1)

In [None]:
jetty_wave_df.rename(columns={"power": "wave_power"}, inplace=True)

In [None]:
jetty_swell_df = pd.json_normalize(
    jetty_waves,
    record_path=['data', 'wave', 'swells'],
    meta=[['data', 'wave', 'timestamp'], ['data', 'spotId']]
)

In [None]:
jetty_swell_df['swells_idx'] = jetty_swell_df.groupby('data.wave.timestamp').cumcount()

In [None]:
jetty_swell_df.head()

In [None]:
# `columns=` is required here: given a bare mapping, `DataFrame.rename` relabels
# the index, which silently left the swell `power` column unrenamed.
jetty_swell_df.rename(columns={"power": 'swell_power'}, inplace=True)

In [None]:
jetty_swell_df['data.wave.timestamp'].value_counts()

In [None]:
jetty_meta_df.head()

In [None]:
jetty_wave_df.head()

In [None]:
jetty_swell_df.head(n=10)

In [None]:
combined_waves_df = pd.merge(
    jetty_wave_df,
    jetty_swell_df,
    how="inner",
    left_on=["timestamp", "data.spotId"],
    right_on=["data.wave.timestamp", 'data.spotId'],
)

In [None]:
len(combined_waves_df)

In [None]:
combined_waves_df

In [None]:
combined_df = pd.merge(jetty_meta_df, combined_waves_df, how='cross')

In [None]:
combined_df

In [None]:
class SurflineSpots:
    """Walk Surfline's taxonomy API level by level and collect its leaf nodes as spots.

    The crawl starts at a fixed root node and follows each node's ``contains``
    list through three levels, which this class calls "states", "counties" and
    "regions". Nodes at the last level whose ``contains`` list is empty are
    recorded as spots.

    The raw responses of each level are kept on ``state_data``, ``county_data``
    and ``region_data``; the per-spot fields are kept on the ``spot_*`` lists.
    """

    TAXONOMY_URL = "https://services.surfline.com/taxonomy?type=taxonomy&id={}&maxDepth=0"
    ROOT_TAXONOMY_ID = "58f7ed51dadb30820bb3879c"

    def __init__(self):
        self.states = []
        self.state_ids = []
        self.state_urls = []
        self.state_data = []
        self.county_data = []
        self.region_data = []
        self.spot_ids = []
        self.spot_names = []
        self.spot_address = []
        self.spot_lon = []
        self.spot_lat = []
        self.spot_urls = []

    def _taxonomy_url(self, taxonomy_id):
        """Return the taxonomy API URL for a single node id."""
        return self.TAXONOMY_URL.format(taxonomy_id)

    def _child_urls(self, nodes):
        """Return the taxonomy URL of every child listed under ``contains`` in ``nodes``."""
        return [
            self._taxonomy_url(child['_id'])
            for node in nodes
            for child in node['contains']
        ]

    def _update_states(self):
        """Fetch the root node and record its children's names, ids and URLs.

        Raises:
            requests.HTTPError: If the root request returns a 4xx/5xx status.
        """
        response = requests.get(self._taxonomy_url(self.ROOT_TAXONOMY_ID), timeout=30)
        response.raise_for_status()
        children = response.json()["contains"]
        # Rebuild the lists rather than appending, so a repeated call cannot
        # leave duplicate entries behind.
        self.states = [child["name"] for child in children]
        self.state_ids = [child["_id"] for child in children]
        self.state_urls = [self._taxonomy_url(state_id) for state_id in self.state_ids]

    async def fetch_url(self, url, session):
        """GET ``url`` on an open aiohttp ``session`` and return the decoded JSON body."""
        async with session.get(url) as response:
            return await response.json()

    async def fetch_all_urls(self, target):
        """Fetch every URL in ``target`` concurrently.

        Args:
            target: Iterable of URLs to request.

        Returns:
            List of decoded JSON bodies, in the same order as ``target``.
        """
        async with aiohttp.ClientSession() as session:
            # Iterate over `target`, not `self.state_urls`: the county and
            # region levels pass their own URL lists through this method.
            return await asyncio.gather(
                *(self.fetch_url(url, session) for url in target)
            )

    def update_data(self, data_target: List[str], attr_target):
        """Fetch all URLs in ``data_target`` and store the responses on ``self.<attr_target>``.

        Uses ``asyncio.run``, which cannot be called from an already running
        event loop; this notebook applies ``nest_asyncio`` so the call works
        inside Jupyter.
        """
        data = asyncio.run(self.fetch_all_urls(data_target))
        setattr(self, attr_target, data)

    def process_spots(self):
        """Crawl all three taxonomy levels and return the spots as a DataFrame.

        Returns:
            pandas.DataFrame with one row per spot and the columns ``ids``,
            ``names``, ``lon``, ``lat`` and ``urls``. ``urls`` holds the href of
            the spot's first ``"www"`` link, or ``""`` when it has none.
        """
        if len(self.states) == 0:
            self._update_states()

        self.update_data(self.state_urls, "state_data")
        logging.info("spots")

        self.update_data(self._child_urls(self.state_data), "county_data")
        self.update_data(self._child_urls(self.county_data), "region_data")

        # Start from empty lists so re-running this method does not append a
        # second copy of every spot.
        self.spot_ids, self.spot_names, self.spot_address = [], [], []
        self.spot_lon, self.spot_lat, self.spot_urls = [], [], []

        for node in self.region_data:
            if len(node['contains']) != 0:
                # Only nodes without children are treated as spots.
                continue

            coordinates = node['location']['coordinates']
            links = node['associated']['links']
            # Record exactly one URL per spot. Appending once per "www" link
            # (zero or several times) would leave `spot_urls` a different
            # length from the other lists and break the DataFrame constructor.
            www_url = next(
                (link['href'] for link in links if link.get('key') == "www"),
                "",
            )

            self.spot_ids.append(node.get("spot", ""))
            self.spot_names.append(node.get("name", ""))
            self.spot_address.append("")
            self.spot_lon.append(coordinates[0])
            self.spot_lat.append(coordinates[1])
            self.spot_urls.append(www_url)

        df = pd.DataFrame({"ids": self.spot_ids, "names": self.spot_names, "lon": self.spot_lon, "lat": self.spot_lat, "urls": self.spot_urls})
        return df

In [None]:
spots = SurflineSpots()

In [None]:
spots.process_spots()

In [None]:
def fetch_sl_spots():
    """Fetch the root Surfline taxonomy node and list its direct children.

    The original version built these lists and then discarded them; they are
    now returned so the call has a usable result.

    Returns:
        pandas.DataFrame with one row per child of the root node and the
        columns ``names`` (child name), ``ids`` (child ``_id``) and ``urls``
        (the taxonomy API URL for that child).

    Raises:
        requests.HTTPError: If the request returns a 4xx/5xx status.
    """
    taxonomy_url = "https://services.surfline.com/taxonomy?type=taxonomy&id={}&maxDepth=0"
    response = requests.get(taxonomy_url.format("58f7ed51dadb30820bb3879c"), timeout=30)
    response.raise_for_status()
    children = response.json()["contains"]

    states = [child["name"] for child in children]
    state_ids = [child["_id"] for child in children]
    state_urls = [taxonomy_url.format(state_id) for state_id in state_ids]

    # NOTE(review): despite the function name, only the first taxonomy level is
    # fetched here; no individual spots are resolved.
    return pd.DataFrame({"names": states, "ids": state_ids, "urls": state_urls})

In [None]:
def transform_sl_wave_data(data: Dict) -> pd.DataFrame:
    if not data:
        raise ValueError("Data is empty")

    data["associated"]["spotId"] = jetty_id
    meta_df = pd.json_normalize(jetty_waves)
    jetty_meta_df.drop(['permissions.violations', 'permissions.data', 'data.wave'], inplace=True, axis=1)   