In [7]:
import pandas as pd
import requests
from datetime import datetime
from dotenv import load_dotenv
import os


# Making the API call and viewing the json

If you want to search for the city by name and country instead of latitude and longitude, try using the [Geocoding API](https://openweathermap.org/api/geocoding-api).

In [8]:
# Berlin
latitude = 52.520008
longitude = 13.404954

# load_dotenv() only copies the entries of a local .env file into the process
# environment; the key itself still has to be read back with os.getenv().
# Assigning the literal 'weather_api_key' would send the variable's *name* as
# the key, which the API rejects with a 401.
load_dotenv()
API_key = os.getenv('weather_api_key')
if not API_key:
    raise RuntimeError(
        "Environment variable 'weather_api_key' is not set - add it to your .env file."
    )

# check out the docs for more info on making an api call https://openweathermap.org/forecast5
url = (f"https://api.openweathermap.org/data/2.5/forecast?lat={latitude}&lon={longitude}&appid={API_key}&units=metric")

# A timeout stops the cell from hanging forever if the service does not answer.
response = requests.get(url, timeout=10)
# NOTE: the name `json` is kept because the exploration cells below refer to it.
json = response.json()

json

{'cod': 401,
 'message': 'Invalid API key. Please see https://openweathermap.org/faq#error401 for more info.'}

# Exploring the json
Remember, if you want to know more about any of the information in the json, or you're unsure what a key is, check out the [open weather docs](https://openweathermap.org/forecast5).

First, let's look at the keys

In [None]:
json.keys()

dict_keys(['cod', 'message', 'cnt', 'list', 'city'])

In [None]:
json["cod"]

'200'

In [None]:
json["message"]

0

In [None]:
json["cnt"]

40

In [None]:
json["list"]

[{'dt': 1730732400,
  'main': {'temp': 10.96,
   'feels_like': 10.36,
   'temp_min': 10.28,
   'temp_max': 10.96,
   'pressure': 1031,
   'sea_level': 1031,
   'grnd_level': 1024,
   'humidity': 86,
   'temp_kf': 0.68},
  'weather': [{'id': 803,
    'main': 'Clouds',
    'description': 'broken clouds',
    'icon': '04d'}],
  'clouds': {'all': 83},
  'wind': {'speed': 0.61, 'deg': 197, 'gust': 0.89},
  'visibility': 10000,
  'pop': 0,
  'sys': {'pod': 'd'},
  'dt_txt': '2024-11-04 15:00:00'},
 {'dt': 1730743200,
  'main': {'temp': 10.19,
   'feels_like': 9.48,
   'temp_min': 9.63,
   'temp_max': 10.19,
   'pressure': 1030,
   'sea_level': 1030,
   'grnd_level': 1024,
   'humidity': 85,
   'temp_kf': 0.56},
  'weather': [{'id': 804,
    'main': 'Clouds',
    'description': 'overcast clouds',
    'icon': '04n'}],
  'clouds': {'all': 91},
  'wind': {'speed': 1.71, 'deg': 132, 'gust': 2.91},
  'visibility': 10000,
  'pop': 0,
  'sys': {'pod': 'n'},
  'dt_txt': '2024-11-04 18:00:00'},
 {'dt'

Looks like the information we'll want is in `list`. Let's look at the first element of `list` to investigate further.

In [None]:
json["list"][0]

{'dt': 1730732400,
 'main': {'temp': 10.96,
  'feels_like': 10.36,
  'temp_min': 10.28,
  'temp_max': 10.96,
  'pressure': 1031,
  'sea_level': 1031,
  'grnd_level': 1024,
  'humidity': 86,
  'temp_kf': 0.68},
 'weather': [{'id': 803,
   'main': 'Clouds',
   'description': 'broken clouds',
   'icon': '04d'}],
 'clouds': {'all': 83},
 'wind': {'speed': 0.61, 'deg': 197, 'gust': 0.89},
 'visibility': 10000,
 'pop': 0,
 'sys': {'pod': 'd'},
 'dt_txt': '2024-11-04 15:00:00'}

In [None]:
json["city"]

{'id': 6545310,
 'name': 'Mitte',
 'coord': {'lat': 52.52, 'lon': 13.405},
 'country': 'DE',
 'population': 329078,
 'timezone': 3600,
 'sunrise': 1730700519,
 'sunset': 1730734273}

# Building a DataFrame

## What would we like from the json?
Let's begin by exploring the first 2 items of `list` and see what we have.

In [None]:
json["list"][0]

{'dt': 1730732400,
 'main': {'temp': 10.96,
  'feels_like': 10.36,
  'temp_min': 10.28,
  'temp_max': 10.96,
  'pressure': 1031,
  'sea_level': 1031,
  'grnd_level': 1024,
  'humidity': 86,
  'temp_kf': 0.68},
 'weather': [{'id': 803,
   'main': 'Clouds',
   'description': 'broken clouds',
   'icon': '04d'}],
 'clouds': {'all': 83},
 'wind': {'speed': 0.61, 'deg': 197, 'gust': 0.89},
 'visibility': 10000,
 'pop': 0,
 'sys': {'pod': 'd'},
 'dt_txt': '2024-11-04 15:00:00'}

In [None]:
json["list"][1]

{'dt': 1730743200,
 'main': {'temp': 10.19,
  'feels_like': 9.48,
  'temp_min': 9.63,
  'temp_max': 10.19,
  'pressure': 1030,
  'sea_level': 1030,
  'grnd_level': 1024,
  'humidity': 85,
  'temp_kf': 0.56},
 'weather': [{'id': 804,
   'main': 'Clouds',
   'description': 'overcast clouds',
   'icon': '04n'}],
 'clouds': {'all': 91},
 'wind': {'speed': 1.71, 'deg': 132, 'gust': 2.91},
 'visibility': 10000,
 'pop': 0,
 'sys': {'pod': 'n'},
 'dt_txt': '2024-11-04 18:00:00'}

When deciding what is useful, remember the project goals.

The time the forecast is for will be useful.

In [None]:
json["list"][0]["dt_txt"]

'2024-11-04 15:00:00'

The temperature will be useful. Scooters don't have heaters!

In [None]:
json["list"][0]["main"]["temp"]

10.96

The forecast will be useful. Scooters don't have a roof!

In [None]:
json["list"][0]["weather"][0]["main"]

'Clouds'

Whether it has rained recently will also be useful. Wet conditions will affect scooter rentals.

In [None]:
json["list"][0]["rain"]["3h"]

KeyError: 'rain'

In [None]:
json["list"][0].get("rain",{}).get("3h",0)

0

> If you get a `KeyError` on `'rain'`, it's because it hasn't rained in the last 3 hours, so the `rain` key is left out of that entry. We still know this key can exist, because we read the [docs](https://openweathermap.org/forecast5).

Wind speed will be useful. It's likely to affect whether people rent scooters or not.

In [None]:
json["list"][0]["wind"]["speed"]

0.61

In [None]:
json["list"][0].get("rain", {}).get("3h", 0)

0

## Bringing together our chosen items

In [None]:
# Build one flat record per element of the json list. Each element is a
# dictionary, and .get(KEY, default) returns the value stored under KEY
# (or the default when KEY is absent).
weather_items = [
    {
        "forecast_time": entry.get("dt_txt", None),
        # "main" is itself a dictionary: index into it first, then use .get on it
        "temperature": entry["main"].get("temp", None),
        # "weather" is a list: take its first element, then use .get on that
        "forecast": entry["weather"][0].get("main", None),
        "rain_in_last_3h": entry.get("rain", {}).get("3h", 0),
        "wind_speed": entry["wind"].get("speed", None),
    }
    for entry in json["list"]
]

weather_df = pd.DataFrame(weather_items)

weather_df.head()

Unnamed: 0,forecast_time,temperature,forecast,rain_in_last_3h,wind_speed
0,2024-11-04 15:00:00,10.96,Clouds,0,0.61
1,2024-11-04 18:00:00,10.19,Clouds,0,1.71
2,2024-11-04 21:00:00,7.68,Clouds,0,2.33
3,2024-11-05 00:00:00,6.63,Clouds,0,2.37
4,2024-11-05 03:00:00,6.06,Clear,0,2.37


The `rain` key in the JSON data is not always present. In the absence of this key, it's assumed that there hasn't been any recent rainfall. Because the key isn't always available, reading it directly with `item["rain"]` inside our loop raises a `KeyError` whenever it is missing. To address this, we'll chain two `.get()` calls to handle the absence of the `rain` key.

> **Remember:** The second parameter of `.get()` serves as a fallback value: it is what `.get()` returns (and therefore what ends up in our new dictionary) if the key we're looking for doesn't exist.

The first `.get()` method checks for the existence of the `rain` key. If present, it will use the value associated with that key. If the `rain` key is missing, an empty dictionary will be used, preventing the code from breaking. The second `.get()` method checks for the `3h` key. If present, it will use its value. Otherwise, a default value of `0` will be used.

# Creating a function for multiple cities

In [None]:
import pandas as pd
from datetime import datetime
from pytz import timezone

def fetch_weather_data(cities, country="DE", api_key=None):
  """Fetch the OpenWeather forecast for each named city and return one DataFrame.

  Every city name is first resolved to coordinates with the Geocoding API,
  then the forecast endpoint is queried for those coordinates.

  Parameters
  ----------
  cities : iterable of str
      City names to look up.
  country : str, default "DE"
      Country code appended to every geocoding query.
  api_key : str, optional
      OpenWeather API key. When omitted, the key is read with
      ``userdata.get('openWeatherApi')``.

  Returns
  -------
  pandas.DataFrame
      One row per city and forecast entry with the columns ``city``,
      ``forecast_time``, ``temperature``, ``forecast``, ``rain_in_last_3h``,
      ``wind_speed`` and ``data_retrieved_at``. If `cities` is empty the
      frame is empty but still has these columns.

  Raises
  ------
  requests.HTTPError
      If either API answers with an error status (e.g. 401 for a bad key).
  ValueError
      If the geocoder returns no match for a city.
  """
  # Let's ensure the timezone is our local timezone
  berlin_timezone = timezone('Europe/Berlin')
  if api_key is None:
    # NOTE(review): `userdata` is not imported in the visible notebook -
    # presumably `from google.colab import userdata`; confirm, or pass api_key.
    api_key = userdata.get('openWeatherApi')

  request_timeout = 10  # seconds; keeps a stalled connection from hanging the cell
  columns = [
    "city",
    "forecast_time",
    "temperature",
    "forecast",
    "rain_in_last_3h",
    "wind_speed",
    "data_retrieved_at",
  ]
  weather_items = []

  for city in cities:
    # Geocoder api to get the latitude and longitude
    # This information should be from your SQL table, but we don't have that here
    # https (not http) so the API key is not sent in cleartext; `params` lets
    # requests URL-encode city names containing spaces or special characters.
    geo_response = requests.get(
      "https://api.openweathermap.org/geo/1.0/direct",
      params={"q": f"{city},{country}", "limit": 5, "appid": api_key},
      timeout=request_timeout,
    )
    geo_response.raise_for_status()
    geo_json = geo_response.json()
    if not geo_json:
      # An unknown city yields an empty list; fail with a readable message
      # instead of an IndexError on geo_json[0].
      raise ValueError(f"No geocoding result for city {city!r} in country {country!r}")
    latitude = geo_json[0]["lat"]
    longitude = geo_json[0]["lon"]

    # weather api
    weather_response = requests.get(
      "https://api.openweathermap.org/data/2.5/forecast",
      params={"lat": latitude, "lon": longitude, "appid": api_key, "units": "metric"},
      timeout=request_timeout,
    )
    # Error payloads carry no "list" key, so surface the HTTP error directly.
    weather_response.raise_for_status()
    weather_json = weather_response.json()

    # Added the time retrieved so we know when the forecast was made
    retrieval_time = datetime.now(berlin_timezone).strftime("%Y-%m-%d %H:%M:%S")

    for item in weather_json["list"]:
      weather_items.append({
        # Added the city name, so the information is clear when looking at multiple cities
        "city": city,
        "forecast_time": item.get("dt_txt", None),
        "temperature": item["main"].get("temp", None),
        "forecast": item["weather"][0].get("main", None),
        # "rain" is absent from entries without rain, hence the chained .get
        "rain_in_last_3h": item.get("rain", {}).get("3h", 0),
        "wind_speed": item["wind"].get("speed", None),
        "data_retrieved_at": retrieval_time
      })

  # Passing `columns` keeps the schema stable even when no rows were collected.
  weather_df = pd.DataFrame(weather_items, columns=columns)

  return weather_df

In [None]:
fetch_weather_data(["Berlin", "Hamburg", "Munich"])

Unnamed: 0,city,forecast_time,temperature,forecast,rain_in_last_3h,wind_speed,data_retrieved_at
0,Berlin,2024-11-04 15:00:00,10.97,Clouds,0,0.61,2024-11-04 13:43:18
1,Berlin,2024-11-04 18:00:00,10.19,Clouds,0,1.71,2024-11-04 13:43:18
2,Berlin,2024-11-04 21:00:00,7.68,Clouds,0,2.33,2024-11-04 13:43:18
3,Berlin,2024-11-05 00:00:00,6.63,Clouds,0,2.37,2024-11-04 13:43:18
4,Berlin,2024-11-05 03:00:00,6.06,Clear,0,2.37,2024-11-04 13:43:18
...,...,...,...,...,...,...,...
115,Munich,2024-11-09 00:00:00,5.26,Clear,0,0.43,2024-11-04 13:43:19
116,Munich,2024-11-09 03:00:00,4.62,Clouds,0,0.26,2024-11-04 13:43:19
117,Munich,2024-11-09 06:00:00,4.03,Clouds,0,0.55,2024-11-04 13:43:19
118,Munich,2024-11-09 09:00:00,8.55,Clear,0,0.06,2024-11-04 13:43:19


# Making our function work with our SQL cities data
This section delves into a more advanced aspect of the project, which may be beyond the point many of you got to. However, it serves as an illustrative example of how to structure your work effectively. It demonstrates how we can compartmentalise various aspects of our data processing pipeline into distinct functions, enhancing code readability and facilitating debugging.
> This part will only work locally, not on Colab. You must also create your `weather` table in SQL first.

In [None]:
import pandas as pd
import requests
from pytz import timezone
from datetime import datetime

def retrieve_and_send_data():
  """Run the whole pipeline once: read the cities, fetch their forecasts, store them.

  Returns the fixed status string "Data has been updated" when every step
  has finished without raising.
  """
  db_url = create_connection_string()
  # The cities frame is only needed as input for the weather lookup.
  forecasts = fetch_weather_data(fetch_cities_data(db_url))
  store_weather_data(forecasts, db_url)
  return "Data has been updated"

def create_connection_string(schema="gans", host="127.0.0.1", user="root",
                             password="YOUR_PASSWORD_HERE", port=3306):
  """Build the `mysql+pymysql` connection URL for the project database.

  Called without arguments it returns the same URL as before; the settings
  are exposed as keyword arguments so they can be overridden without editing
  the function body.

  Parameters
  ----------
  schema : str
      Database (schema) name placed at the end of the URL.
  host : str
      Database host.
  user : str
      Database user name.
  password : str
      Database password.
  port : int
      Database port.

  Returns
  -------
  str
      URL of the form ``mysql+pymysql://user:password@host:port/schema``.
  """
  from urllib.parse import quote_plus

  # Characters such as "@", "/" or ":" inside the credentials would otherwise
  # be parsed as URL delimiters and corrupt the connection string.
  safe_user = quote_plus(user)
  safe_password = quote_plus(password)
  return f'mysql+pymysql://{safe_user}:{safe_password}@{host}:{port}/{schema}'

def fetch_cities_data(connection_string):
  """Read the `cities` table through `connection_string` and return it as a DataFrame."""
  cities_table = pd.read_sql("cities", con=connection_string)
  return cities_table

def fetch_weather_data(cities_df, api_key='YOUR_API_HERE'):
  """Fetch the OpenWeather forecast for every row of `cities_df`.

  Parameters
  ----------
  cities_df : pandas.DataFrame
      Must provide the columns ``latitude``, ``longitude`` and ``city_id``.
  api_key : str
      OpenWeather API key; the default is a placeholder that must be replaced
      (or overridden by the caller) before the request can succeed.

  Returns
  -------
  pandas.DataFrame
      One row per city and forecast entry with the columns ``city_id``,
      ``forecast_time``, ``temperature``, ``forecast``, ``rain_in_last_3h``,
      ``wind_speed`` and ``data_retrieved_at``; the two time columns are
      converted with ``pd.to_datetime``. An empty `cities_df` gives an empty
      frame with the same columns.

  Raises
  ------
  requests.HTTPError
      If the API answers with an error status (e.g. 401 for a bad key).
  """
  berlin_timezone = timezone('Europe/Berlin')
  request_timeout = 10  # seconds; keeps a stalled connection from hanging the run
  columns = [
    "city_id",
    "forecast_time",
    "temperature",
    "forecast",
    "rain_in_last_3h",
    "wind_speed",
    "data_retrieved_at",
  ]
  weather_items = []

  # Walk the three needed columns in parallel instead of using .iterrows():
  # iterrows() packs each row into a single Series, which can coerce an
  # integer city_id to float when the row holds mixed numeric dtypes.
  city_rows = zip(cities_df["latitude"], cities_df["longitude"], cities_df["city_id"])
  for latitude, longitude, city_id in city_rows:
    response = requests.get(
      "https://api.openweathermap.org/data/2.5/forecast",
      params={"lat": latitude, "lon": longitude, "appid": api_key, "units": "metric"},
      timeout=request_timeout,
    )
    # Error payloads carry no "list" key, so surface the HTTP error directly.
    response.raise_for_status()
    weather_data = response.json()

    # Timestamp (Berlin local time) recording when this forecast was fetched.
    retrieval_time = datetime.now(berlin_timezone).strftime("%Y-%m-%d %H:%M:%S")

    for item in weather_data["list"]:
      weather_items.append({
        "city_id": city_id,
        "forecast_time": item.get("dt_txt"),
        "temperature": item["main"].get("temp"),
        "forecast": item["weather"][0].get("main"),
        # "rain" is absent from entries without rain, hence the chained .get
        "rain_in_last_3h": item.get("rain", {}).get("3h", 0),
        "wind_speed": item["wind"].get("speed"),
        "data_retrieved_at": retrieval_time
      })

  # Passing `columns` keeps the conversions below from raising KeyError when
  # no rows were collected.
  weather_df = pd.DataFrame(weather_items, columns=columns)
  weather_df["forecast_time"] = pd.to_datetime(weather_df["forecast_time"])
  weather_df["data_retrieved_at"] = pd.to_datetime(weather_df["data_retrieved_at"])

  return weather_df

def store_weather_data(weather_df, connection_string):
  """Append the rows of `weather_df` to the `weather` table.

  The frame's index is not written, and rows are added to the table's
  existing contents rather than replacing them.
  """
  write_options = {
    "if_exists": "append",
    "con": connection_string,
    "index": False,
  }
  weather_df.to_sql('weather', **write_options)

In [1]:
retrieve_and_send_data()

NameError: name 'retrieve_and_send_data' is not defined

In [None]:
import pandas as pd
# Toy frame used only to show what .iterrows() hands back on each pass.
df = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 32, 37],
    }
)

# .iterrows() yields (index label, row) pairs; the row's values are read by column name.
for row_label, person in df.iterrows():
    print(f"Row {row_label} data:")
    print(f"Name: {person['Name']}, Age: {person['Age']}")

Row 0 data:
Name: Alice, Age: 25
Row 1 data:
Name: Bob, Age: 32
Row 2 data:
Name: Charlie, Age: 37


In [None]:
df

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,32
2,Charlie,37


## Another option without using `.iterrows()`

In [None]:
import pandas as pd
import requests
from pytz import timezone
from datetime import datetime

def retreiving_and_sending_data():
  """Read the cities, fetch their forecasts and write them to the database.

  Returns the fixed status string "Data has been updated" once all steps
  have completed without raising.
  """
  db_url = connection()
  cities = get_cities_data(db_url)
  forecasts = get_weather_data(cities)
  send_weather_data(forecasts, db_url)
  return "Data has been updated"

def connection(schema="gans", host="127.0.0.1", user="root",
               password="YOUR_PASSWORD_HERE", port=3306):
  """Build the `mysql+pymysql` connection URL for the project database.

  Called without arguments it returns the same URL as before; the settings
  are exposed as keyword arguments so they can be overridden without editing
  the function body.

  Parameters
  ----------
  schema : str
      Database (schema) name placed at the end of the URL.
  host : str
      Database host.
  user : str
      Database user name.
  password : str
      Database password.
  port : int
      Database port.

  Returns
  -------
  str
      URL of the form ``mysql+pymysql://user:password@host:port/schema``.
  """
  from urllib.parse import quote_plus

  # Percent-encode the credentials so that "@", "/" or ":" inside them are not
  # mistaken for URL delimiters.
  credentials = f"{quote_plus(user)}:{quote_plus(password)}"
  return f'mysql+pymysql://{credentials}@{host}:{port}/{schema}'

def get_cities_data(connection_string):
  """Return the contents of the `cities` table, read via `connection_string`."""
  table_name = "cities"
  return pd.read_sql(table_name, con=connection_string)

def get_weather_data(cities_df, api_key='YOUR_API_HERE'):
  """Fetch the OpenWeather forecast for every city in `cities_df`, queried by name.

  Parameters
  ----------
  cities_df : pandas.DataFrame
      Must provide the columns ``city`` (name sent to the API) and
      ``city_id`` (identifier stored with each forecast row).
  api_key : str
      OpenWeather API key; the default is a placeholder that must be replaced
      (or overridden by the caller) before the request can succeed.

  Returns
  -------
  pandas.DataFrame
      One row per city and forecast entry with the columns ``city_id``,
      ``forecast_time``, ``temperature``, ``forecast``, ``rain_in_last_3h``,
      ``wind_speed`` and ``data_retrieved_at``; the two time columns are
      converted with ``pd.to_datetime``. An empty `cities_df` gives an empty
      frame with the same columns.

  Raises
  ------
  requests.HTTPError
      If the API answers with an error status (e.g. 401 for a bad key or
      404 for an unknown city name).
  """
  berlin_timezone = timezone('Europe/Berlin')
  request_timeout = 10  # seconds; keeps a stalled connection from hanging the run
  columns = [
    "city_id",
    "forecast_time",
    "temperature",
    "forecast",
    "rain_in_last_3h",
    "wind_speed",
    "data_retrieved_at",
  ]
  weather_items = []

  # As we are now using the data from our relational database
  # the city should reflect the city_id and not the city name.
  # Pairing the two columns row by row gives each row its own city_id, even
  # when the same city name appears more than once, and avoids a filtered
  # lookup over the whole frame for every city.
  for city, city_id in zip(cities_df["city"], cities_df["city_id"]):
    # https (not http) so the API key is not sent in cleartext; `params` lets
    # requests URL-encode city names containing spaces or special characters.
    response = requests.get(
      "https://api.openweathermap.org/data/2.5/forecast",
      params={"q": city, "appid": api_key, "units": "metric"},
      timeout=request_timeout,
    )
    # Error payloads carry no "list" key, so surface the HTTP error directly.
    response.raise_for_status()
    # Named `forecast_json` rather than `json` so the notebook-level `json`
    # variable is not shadowed inside this function.
    forecast_json = response.json()

    # Timestamp (Berlin local time) recording when this forecast was fetched.
    retrieval_time = datetime.now(berlin_timezone).strftime("%Y-%m-%d %H:%M:%S")

    for item in forecast_json["list"]:
      weather_items.append({
        "city_id": city_id,
        "forecast_time": item.get("dt_txt", None),
        "temperature": item["main"].get("temp", None),
        "forecast": item["weather"][0].get("main", None),
        # "rain" is absent from entries without rain, hence the chained .get
        "rain_in_last_3h": item.get("rain", {}).get("3h", 0),
        "wind_speed": item["wind"].get("speed", None),
        "data_retrieved_at": retrieval_time
      })

  # Passing `columns` keeps the conversions below from raising KeyError when
  # no rows were collected.
  weather_df = pd.DataFrame(weather_items, columns=columns)
  weather_df["forecast_time"] = pd.to_datetime(weather_df["forecast_time"])
  weather_df["data_retrieved_at"] = pd.to_datetime(weather_df["data_retrieved_at"])

  return weather_df

def send_weather_data(weather_df, connection_string):
  """Write `weather_df` to the `weather` table, appending to any existing rows.

  The DataFrame index is left out of the written columns.
  """
  weather_df.to_sql('weather', con=connection_string, if_exists='append', index=False)