In [1]:
import asyncio
from typing import List, Dict, Optional
import json
import time
from pathlib import Path

import aiohttp
import numpy as np
import pandas as pd
import pendulum
import logging
import requests
from dataclasses import dataclass
from enum import Enum

from sqlalchemy import create_engine, select, text, and_, Column, Text, String, Integer, Float

from sqlalchemy.orm import sessionmaker, declarative_base
from utils.utils import LOCAL_AIRFLOW_PG_URI, LOCAL_PG_URI

In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
logging.basicConfig(level=logging.INFO)

In [4]:
Base = declarative_base()

In [5]:
engine = create_engine(LOCAL_PG_URI)
SessionLocal = sessionmaker(bind=engine)

In [6]:
class SurflineSpots:
    def __init__(self):
        self.states = []
        self.state_ids = []
        self.state_urls = []
        self.state_data = []
        self.county_data = []
        self.region_data = [] 
        self.spot_ids = []
        self.spot_names = []
        self.spot_address = []
        self.spot_lon = []
        self.spot_lat = []
        self.spot_urls = []

    def _update_states(self):
        response = requests.get(
            "https://services.surfline.com/taxonomy?type=taxonomy&id=58f7ed51dadb30820bb3879c&maxDepth=0"
        )
        json_data = response.json()
        json_contains = json_data["contains"]
        for x in json_contains:
            self.states.append(x["name"])
            self.state_ids.append(x["_id"])
        
        for state_id in self.state_ids:
            self.state_urls.append("https://services.surfline.com/taxonomy?type=taxonomy&id=" + state_id + "&maxDepth=0")
        logging.info(f"{len(self.state_urls)}")


    async def fetch_url(self, url, session):
        async with session.get(url) as response:
            return await response.json()


    async def fetch_all_urls(self, target):
        data = []
        async with aiohttp.ClientSession() as session:
            tasks = []
            for url in target:
                tasks.append(self.fetch_url(url, session))
            data = await asyncio.gather(*tasks)
        return data


    def update_data(self, data_target: List[str], attr_target):
        data = asyncio.run(self.fetch_all_urls(data_target))
        setattr(self, attr_target, data)
    

    def process_spots(self):
        if len(self.states) == 0:
            self._update_states() 
        
        self.update_data(self.state_urls, "state_data")
        logging.info("spots")



        county_ids = []    
        for state in self.state_data:
            state_contains = state['contains']
            for y in state_contains:
                county_ids.append(y['_id'])
        
        county_urls = []
        for county_id in county_ids:
            county_urls.append("https://services.surfline.com/taxonomy?type=taxonomy&id=" + county_id + "&maxDepth=0")

        self.update_data(county_urls, "county_data")

        region_ids = []
        region_names = []
        for county in self.county_data:
            county_contains = county['contains']
            for z in county_contains:
                region_ids.append(z['_id'])
                region_names.append(z['name'])

        region_urls = []
        for region_id in region_ids:
            region_urls.append("https://services.surfline.com/taxonomy?type=taxonomy&id=" + region_id + "&maxDepth=0")

        self.update_data(region_urls, "region_data")


        for region in self.region_data:
            region_contains = region['contains']
            if len(region_contains) == 0:
                self.spot_ids.append(region.get("spot", ""))
                self.spot_names.append(region.get("name", ""))
                self.spot_address.append("")
                region_associated = region['associated']
                region_links = region_associated['links']
                region_location = region['location']
                region_coordinates = region_location['coordinates']
                self.spot_lon.append(region_coordinates[0])
                self.spot_lat.append(region_coordinates[1])
                for i in region_links:
                    if i['key'] == "www":
                        self.spot_urls.append(i['href'])

        df = pd.DataFrame({"spot_id": self.spot_ids, "spot_name": self.spot_names, "spot_lon": self.spot_lon, "spot_lat": self.spot_lat, "spot_url": self.spot_urls})
        return df

In [7]:
spots = SurflineSpots()

In [8]:
spots_df = spots.process_spots()

INFO:root:30
INFO:root:spots


In [20]:
spots_df.columns

Index(['ids', 'names', 'lon', 'lat', 'urls'], dtype='object')

In [19]:
spots_df.to_sql('sl_spots', con=engine, if_exists='append')

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "index" of relation "sl_spots" does not exist
LINE 1: INSERT INTO sl_spots (index, ids, names, lon, lat, urls) VAL...
                              ^

[SQL: INSERT INTO sl_spots (index, ids, names, lon, lat, urls) VALUES (%(index)s, %(ids)s, %(names)s, %(lon)s, %(lat)s, %(urls)s)]
[parameters: ({'index': 0, 'ids': '5842041f4e65fad6a7708a1b', 'names': 'North End to Ocean City Inlet', 'lon': -75.08017670566352, 'lat': 38.33889038664497, 'urls': 'https://www.surfline.com/surf-report/north-end-to-ocean-city-inlet/5842041f4e65fad6a7708a1b'}, {'index': 1, 'ids': '6307df22782926254d3d5210', 'names': 'Midtown', 'lon': -75.07013, 'lat': 38.365050614750835, 'urls': 'https://www.surfline.com/surf-report/midtown/6307df22782926254d3d5210'}, {'index': 2, 'ids': '5842041f4e65fad6a770886d', 'names': 'Ocean City Boardwalk', 'lon': -75.08117, 'lat': 38.338461, 'urls': 'https://www.surfline.com/surf-report/ocean-city-boardwalk/5842041f4e65fad6a770886d'}, {'index': 3, 'ids': '5842041f4e65fad6a7708a1a', 'names': 'Assateague', 'lon': -75.17704, 'lat': 38.148058, 'urls': 'https://www.surfline.com/surf-report/assateague/5842041f4e65fad6a7708a1a'}, {'index': 4, 'ids': '584204204e65fad6a7709562', 'names': 'Park Point', 'lon': -92.04979, 'lat': 46.731855, 'urls': 'https://www.surfline.com/surf-report/park-point/584204204e65fad6a7709562'}, {'index': 5, 'ids': '584204204e65fad6a7709563', 'names': 'Stoney Point', 'lon': -91.811896, 'lat': 46.928071, 'urls': 'https://www.surfline.com/surf-report/stoney-point/584204204e65fad6a7709563'}, {'index': 6, 'ids': '584204204e65fad6a7709570', 'names': 'Magney State Park', 'lon': -90.047617, 'lat': 47.814652, 'urls': 'https://www.surfline.com/surf-report/magney-state-park/584204204e65fad6a7709570'}, {'index': 7, 'ids': '584204204e65fad6a770956c', 'names': 'Temperance River Mouth', 'lon': -90.872941, 'lat': 47.55269, 'urls': 'https://www.surfline.com/surf-report/temperance-river-mouth/584204204e65fad6a770956c'}  ... displaying 10 of 1306 total bound parameter sets ...  {'index': 1304, 'ids': '61e2073a51944e645f142ef4', 'names': 'Stone Jetty', 'lon': -80.13096, 'lat': 42.161137, 'urls': 'https://www.surfline.com/surf-report/stone-jetty/61e2073a51944e645f142ef4'}, {'index': 1305, 'ids': '584204204e65fad6a7709524', 'names': 'Freeport', 'lon': -79.833447, 'lat': 42.24236, 'urls': 'https://www.surfline.com/surf-report/freeport/584204204e65fad6a7709524'})]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [10]:
class SlSpots(Base):
    __tablename__ = 'sl_spots'
    spot_id = Column(Text, primary_key=True)
    spot_name = Column(Text)
    spot_lon = Column(Float)
    spot_lat = Column(Float)
    spot_url = Column(Text) 

In [11]:

def create_tables():
    Base.metadata.create_all(bind=engine)

In [12]:
create_tables()

In [15]:
with SessionLocal() as db:
    stmt = select(SlSpots.spot_id)
    results = db.execute(stmt).scalars().all()
    

In [16]:
results

[]