In [1]:
import asyncio
from typing import List, Dict, Optional
import json
import time
from pathlib import Path

import aiohttp
import numpy as np
import pandas as pd
import pendulum
import logging
import requests
from dataclasses import dataclass
from enum import Enum

from sqlalchemy import create_engine, select, text, and_

from sqlalchemy.orm import sessionmaker, declarative_base
from utils.utils import LOCAL_AIRFLOW_PG_URI, LOCAL_PG_URI

In [2]:
# The notebook kernel already runs an asyncio event loop. Patching asyncio with
# nest_asyncio lets SurflineSpots.update_data call asyncio.run() from inside it.
import nest_asyncio
nest_asyncio.apply()

In [3]:
# Configure the root logger at INFO level so the logging.info() progress
# messages emitted while crawling show up in the cell output.
logging.basicConfig(level=logging.INFO)

In [4]:
# SQLAlchemy declarative base for ORM model classes.
# NOTE(review): no model subclasses Base in the cells visible here — confirm it
# is used further down the notebook.
Base = declarative_base()

In [5]:
# Database engine built from LOCAL_PG_URI (imported from utils.utils;
# presumably a local Postgres connection string — verify there).
engine = create_engine(LOCAL_PG_URI)
# Session factory bound to that engine; call SessionLocal() to open a session.
SessionLocal = sessionmaker(bind=engine)

In [6]:
class SurflineSpots:
    """Crawl Surfline's taxonomy API and collect surf spots into a DataFrame.

    The taxonomy is walked one level at a time, using the level names of this
    class: root -> states -> counties -> regions.  A region payload whose
    ``contains`` list is empty is recorded as a spot.

    Attributes:
        states, state_ids, state_urls: parallel lists filled by
            ``_update_states`` from the root taxonomy node.
        state_data, county_data, region_data: raw JSON payloads for each
            level, stored by ``update_data``.
        spot_ids, spot_names, spot_address, spot_lon, spot_lat, spot_urls:
            parallel lists holding exactly one entry per spot; rebuilt on every
            ``process_spots`` call.
    """

    # URL template shared by every taxonomy request ("{}" is the node id).
    TAXONOMY_URL = "https://services.surfline.com/taxonomy?type=taxonomy&id={}&maxDepth=0"
    # Id of the root node whose children are stored as "states".
    ROOT_TAXONOMY_ID = "58f7ed51dadb30820bb3879c"
    # Seconds the blocking root request may take; without a timeout
    # requests.get can hang indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.states = []
        self.state_ids = []
        self.state_urls = []
        self.state_data = []
        self.county_data = []
        self.region_data = []
        self.spot_ids = []
        self.spot_names = []
        self.spot_address = []
        self.spot_lon = []
        self.spot_lat = []
        self.spot_urls = []

    @classmethod
    def _taxonomy_url(cls, taxonomy_id):
        """Return the taxonomy endpoint URL for a single node id."""
        return cls.TAXONOMY_URL.format(taxonomy_id)

    @staticmethod
    def _child_ids(nodes):
        """Return the ``_id`` of every child listed under ``contains`` in ``nodes``."""
        return [child["_id"] for node in nodes for child in node["contains"]]

    def _update_states(self):
        """Fetch the root taxonomy node and rebuild the state name/id/url lists.

        The lists are rebuilt rather than appended to, so calling this more
        than once does not duplicate entries.

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
            KeyError: if the payload lacks ``contains``, ``name`` or ``_id``.
        """
        response = requests.get(
            self._taxonomy_url(self.ROOT_TAXONOMY_ID),
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail here with a clear HTTP error instead of a KeyError further down.
        response.raise_for_status()
        children = response.json()["contains"]

        self.states = [child["name"] for child in children]
        self.state_ids = [child["_id"] for child in children]
        self.state_urls = [self._taxonomy_url(state_id) for state_id in self.state_ids]
        logging.info(f"{len(self.state_urls)}")

    async def fetch_url(self, url, session):
        """GET ``url`` with the given aiohttp session and return the decoded JSON.

        Raises:
            aiohttp.ClientResponseError: if the response status is 400 or higher.
        """
        async with session.get(url) as response:
            response.raise_for_status()
            return await response.json()

    async def fetch_all_urls(self, target):
        """Fetch every URL in ``target`` concurrently.

        Returns:
            A list of decoded JSON payloads in the same order as ``target``.
        """
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(
                *(self.fetch_url(url, session) for url in target)
            )

    def update_data(self, data_target: List[str], attr_target):
        """Download all URLs in ``data_target`` and store them on ``self``.

        Args:
            data_target: URLs to download.
            attr_target: name of the attribute that receives the list of
                payloads (e.g. ``"state_data"``).
        """
        # asyncio.run inside a notebook requires nest_asyncio.apply() to have
        # been called, because the kernel already has a running event loop.
        data = asyncio.run(self.fetch_all_urls(data_target))
        setattr(self, attr_target, data)

    def process_spots(self):
        """Walk states -> counties -> regions and return all spots found.

        Every call rebuilds the ``spot_*`` lists from scratch, so re-running
        the method (or the notebook cell calling it) does not duplicate rows.

        Returns:
            pandas.DataFrame with columns ``ids``, ``names``, ``lon``, ``lat``
            and ``urls``; ``urls`` is ``""`` for a spot without a ``www`` link.

        Raises:
            KeyError: if a payload lacks ``contains`` or a spot lacks
                ``location``/``coordinates``.
        """
        if not self.states:
            self._update_states()

        self.update_data(self.state_urls, "state_data")
        logging.info("spots")

        county_urls = [
            self._taxonomy_url(county_id)
            for county_id in self._child_ids(self.state_data)
        ]
        self.update_data(county_urls, "county_data")

        region_urls = [
            self._taxonomy_url(region_id)
            for region_id in self._child_ids(self.county_data)
        ]
        self.update_data(region_urls, "region_data")

        # Start from empty lists so repeated calls do not accumulate spots.
        self.spot_ids = []
        self.spot_names = []
        self.spot_address = []
        self.spot_lon = []
        self.spot_lat = []
        self.spot_urls = []

        for region in self.region_data:
            # Only leaf nodes (nothing nested below them) are spots.
            if len(region["contains"]) != 0:
                continue

            coordinates = region["location"]["coordinates"]
            links = (region.get("associated") or {}).get("links") or []
            # Exactly one URL per spot keeps every column the same length:
            # take the first "www" link, or "" when there is none.
            www_url = next(
                (link["href"] for link in links if link.get("key") == "www"),
                "",
            )

            self.spot_ids.append(region.get("spot", ""))
            self.spot_names.append(region.get("name", ""))
            self.spot_address.append("")
            self.spot_lon.append(coordinates[0])
            self.spot_lat.append(coordinates[1])
            self.spot_urls.append(www_url)

        df = pd.DataFrame(
            {
                "ids": self.spot_ids,
                "names": self.spot_names,
                "lon": self.spot_lon,
                "lat": self.spot_lat,
                "urls": self.spot_urls,
            }
        )
        return df

In [7]:
# Create the crawler. The constructor only initialises empty lists; no network
# request is made until process_spots() is called.
spots = SurflineSpots()

In [8]:
# Crawl the taxonomy (HTTP requests for every state, county and region) and
# collect the spots into a DataFrame with columns ids, names, lon, lat, urls.
spots_df = spots.process_spots()

INFO:root:30
INFO:root:spots


In [9]:
# Preview the first rows to sanity-check ids, names, coordinates and URLs.
spots_df.head()

Unnamed: 0,ids,names,lon,lat,urls
0,5842041f4e65fad6a7708b92,Boca Chica,-97.14769,26.061231,https://www.surfline.com/surf-report/boca-chic...
1,5842041f4e65fad6a7708b96,Port Mansfield,-97.26967,26.56132,https://www.surfline.com/surf-report/port-mans...
2,5842041f4e65fad6a7708b93,Isla Blanca State Park,-97.15243,26.069619,https://www.surfline.com/surf-report/isla-blan...
3,584204204e65fad6a7709436,South Padre Island,-97.16126,26.107378,https://www.surfline.com/surf-report/south-pad...
4,6033da7c9ddc9303e4202738,Port O'Connor Jetties,-96.32321,28.417735,https://www.surfline.com/surf-report/port-o-co...
