# WECC Weather Data

<b>Author:</b> Neal Ma

<b>Creation Date:</b> October 20, 2024

<b>Description:</b> This notebook pulls weather data for 10 of the largest cities in WECC. Hourly weather data is retrieved over the past 10 years from open-meteo.com. The 10 chosen cities are:

1) Seattle
2) Portland
3) Sacramento
4) San Jose
5) Los Angeles
6) Las Vegas
7) Pheonix
8) Salt Lake City
9) Denver
10) Albuquerque

These 10 cities were selected due to their location within WECC, spacing from each other, and relatively high populations.

In [7]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
import json
import os
from typing import List, Dict
from geopy.geocoders import Nominatim
import time

In [66]:
BASE_URL = "https://archive-api.open-meteo.com/v1/archive"

CITIES = [
    "Seattle, WA",
    "Portland, OR",
    "Sacramento, CA",
    "San Jose, CA",
    "Los Angeles, CA",
    "Las Vegas, CA",
    "Phoenix, AZ",
    "Salt Lake City, UT",
    "Denver, CO",
    "Albuquerque, NM",
]

START_DATE = pd.Timestamp("October 15, 2004")
END_DATE = pd.Timestamp("October 15, 2024")

TIMEZONE = "America/Los_Angeles"  # our price data is all in this time zone
TEMP_KEY = "temperature_2m"

geolocator = Nominatim(user_agent="my_app")

city_coords = []
for city in CITIES:
    loc = geolocator.geocode(city)
    city_coords.append([city, loc.latitude, loc.longitude])


city_df = pd.DataFrame(data=city_coords, columns=["city", "latitude", "longitude"])

city_df

Unnamed: 0,city,latitude,longitude
0,"Seattle, WA",47.603832,-122.330062
1,"Portland, OR",45.520247,-122.674194
2,"Sacramento, CA",38.581061,-121.493895
3,"San Jose, CA",37.336166,-121.890591
4,"Los Angeles, CA",34.053691,-118.242766
5,"Las Vegas, CA",10.270969,-67.740856
6,"Phoenix, AZ",33.448437,-112.074141
7,"Salt Lake City, UT",40.75962,-111.886797
8,"Denver, CO",39.739236,-104.984862
9,"Albuquerque, NM",35.084103,-106.650985


In [67]:
def pull_temperature_data(
    city_info: pd.DataFrame = None,
    start_date: pd.Timestamp = None,
    end_date: pd.Timestamp = None,
):
    if city_info is None or start_date is None or end_date is None:
        return None

    temperature_data = pd.DataFrame()

    for i, row in city_info.iterrows():
        city_temperature_data = pd.DataFrame()
        current_start = start_date
        city_name = ("_".join(row["city"].split(",")[0].split())).lower()
        print(city_name)

        while current_start < end_date:
            current_end = min(
                current_start + pd.DateOffset(years=1) - pd.DateOffset(days=1), end_date
            )

            try:
                params = {
                    "latitude": row["latitude"],
                    "longitude": row["longitude"],
                    "start_date": current_start.strftime("%Y-%m-%d"),
                    "end_date": current_end.strftime("%Y-%m-%d"),
                    "hourly": TEMP_KEY,
                    "timezone": TIMEZONE,
                }

                response = requests.get(BASE_URL, params=params)
                response.raise_for_status()
                data = response.json()

                # Extract hourly data
                temps = data["hourly"][TEMP_KEY]
                times = data["hourly"]["time"]

                # Create city-specific DataFrame for this time period
                period_df = pd.DataFrame(
                    {"timestamp": times, f"{city_name}_temp": temps}
                )

                city_temperature_data = pd.concat([city_temperature_data, period_df])

                # Respect API rate limits
                time.sleep(1)

            except Exception as e:
                print(
                    f"Error fetching data for {row['city']} from {current_start} to {current_end}: {str(e)}"
                )

            current_start = current_end + pd.DateOffset(days=1)

        if temperature_data.empty:
            temperature_data = city_temperature_data
        else:
            temperature_data = temperature_data.merge(
                city_temperature_data, on="timestamp", how="outer"
            )
        print(len(temperature_data), temperature_data.columns)

    if not temperature_data.empty:
        temperature_data["timestamp"] = pd.to_datetime(temperature_data["timestamp"])
        temperature_data.set_index("timestamp", inplace=True)
        temperature_data.sort_index(inplace=True)

    return temperature_data

In [68]:
temperature_data = pull_temperature_data(city_df, START_DATE, END_DATE)
print(temperature_data.head())

seattle
175320 Index(['timestamp', 'seattle_temp'], dtype='object')
portland
175320 Index(['timestamp', 'seattle_temp', 'portland_temp'], dtype='object')
sacramento
175320 Index(['timestamp', 'seattle_temp', 'portland_temp', 'sacramento_temp'], dtype='object')
san_jose
175320 Index(['timestamp', 'seattle_temp', 'portland_temp', 'sacramento_temp',
       'san_jose_temp'],
      dtype='object')
los_angeles
175320 Index(['timestamp', 'seattle_temp', 'portland_temp', 'sacramento_temp',
       'san_jose_temp', 'los_angeles_temp'],
      dtype='object')
las_vegas
175320 Index(['timestamp', 'seattle_temp', 'portland_temp', 'sacramento_temp',
       'san_jose_temp', 'los_angeles_temp', 'las_vegas_temp'],
      dtype='object')
phoenix
175320 Index(['timestamp', 'seattle_temp', 'portland_temp', 'sacramento_temp',
       'san_jose_temp', 'los_angeles_temp', 'las_vegas_temp', 'phoenix_temp'],
      dtype='object')
salt_lake_city
175320 Index(['timestamp', 'seattle_temp', 'portland_temp', 'sacramen

In [75]:
# save weather data to parquet file in data
temperature_data.to_parquet(
    f'./data/temperature_data_{START_DATE.strftime("%Y-%m-%d")}_{END_DATE.strftime("%Y-%m-%d")}.parquet'
)