In [1]:
import pandas as pd
from datetime import datetime, timedelta
import re
import requests
import json
import os

# Logical Structure

1. We download the real weather data and store it in a DataFrame.
2. Loop over the forecasts:
    - Take each forecast and restrict it to the following 2 days.
    - Compare each forecast with the real weather and store the deviation as a ratio (forecast / real).
    - Add these ratios to a list, so that we can calculate the average afterwards.

# 1. Download Real Weather Data

In [2]:
def create_date_and_hour(df:pd.DataFrame):
    """Add ``date`` and ``hour`` columns derived from the frame's ``time`` column.

    The ``time`` column is parsed with ``pd.to_datetime``; ``date`` receives the
    calendar date and ``hour`` the hour of day of every row.

    The frame passed in is modified in place and the same object is returned.
    """
    timestamps = pd.to_datetime(df.time)
    df["date"], df["hour"] = timestamps.dt.date, timestamps.dt.hour
    return df

def get_weather(
    start_date:str,
    end_date:str,
    latitude:float = 51.51,
    longitude:float = -0.13,
    timeout:float = 30,
) -> pd.DataFrame:
    """Download hourly historical weather from the Open-Meteo archive API.

    Parameters
    ----------
    start_date, end_date
        Requested date range as ``YYYY-MM-DD`` strings.
    latitude, longitude
        Coordinates to query. The defaults are the coordinates this function
        previously had hard-coded in its URL.
    timeout
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    pd.DataFrame
        One row per hour with the requested variables plus the ``date`` and
        ``hour`` columns added by ``create_date_and_hour``. Rows whose date
        equals ``end_date`` are removed.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    ValueError
        If the response contains no ``hourly`` data, or ``end_date`` is not
        formatted as ``YYYY-MM-DD``.
    """
    response = requests.get(
        "https://archive-api.open-meteo.com/v1/archive",
        # Let requests build and encode the query string instead of formatting it by hand
        params={
            "latitude": latitude,
            "longitude": longitude,
            "start_date": start_date,
            "end_date": end_date,
            "hourly": "temperature_2m,relativehumidity_2m,surface_pressure,direct_radiation,windspeed_10m,winddirection_10m",
        },
        # Without a timeout a stalled connection would block the notebook forever
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error payload
    response.raise_for_status()

    payload = response.json()
    hourly = payload.get("hourly")
    if not hourly:
        raise ValueError(f"Weather API response contains no hourly data: {payload}")

    # Create better readable date and hour
    df = create_date_and_hour(pd.DataFrame(hourly))

    # Remove the last day, because it's empty
    last_day = datetime.strptime(end_date, "%Y-%m-%d").date()
    return df.loc[df.date != last_day]

In [21]:
# Right now the API takes 5 days to publish the historical weather data, so the
# requested range has to end far enough in the past.
# Note: get_weather removes the rows of the end date itself, so the result
# covers the days from the start date up to the day before the end date.
real_weather = get_weather(start_date="2023-05-11", end_date="2023-05-15")

# 2. Loop over forecasts

In [22]:
# Folder that contains the stored forecast files
directory = 'forecast-data'

# Build the path of every entry in that folder, keep only regular files
# (sub-directories are skipped) and store the paths in sorted order
filepaths = sorted(
    path
    for path in (os.path.join(directory, name) for name in os.listdir(directory))
    if os.path.isfile(path)
)

In [23]:
temps = []
# Only the first 5 forecast files are processed for now.
# Remove the [:5] slice when you want to do it on more data.
for file in filepaths[:5]:
    # Data is stored each day in a file. The only indicator which date we have is
    # the filename, so the date is taken from the part before the first "_".
    forecast_date = os.path.basename(file).split("_")[0]
    forecast_date = datetime.strptime(forecast_date, "%Y-%m-%d").date()
    window_end = forecast_date + timedelta(days=2)

    df = create_date_and_hour(pd.read_json(file))
    # Restrict the forecast to the two days following the forecast date
    df = df.loc[(df["date"] > forecast_date) & (df["date"] <= window_end)]

    # Now we merge the real weather data into the forecasts. The inner join keeps
    # only timestamps that exist on both sides; columns present in both frames
    # get the "_forecast" / "_real" suffix.
    df = pd.merge(df, real_weather, on="time", how="inner", suffixes=("_forecast", "_real"))

    # Ratio forecast / real: values above 1 mean the forecast was higher than the real value
    temp_diff = df["temperature_2m_forecast"] / df["temperature_2m_real"]
    # Missing values (NaN) and a real temperature of exactly 0 (+/-inf after the
    # division) would turn the overall average into NaN/inf, so drop those rows
    temp_diff = temp_diff.replace([float("inf"), float("-inf")], float("nan")).dropna()

    # extend() instead of a list comprehension that is only used for its side effect
    temps.extend(temp_diff)




In [24]:
# `temps` holds the ratios forecast / real. A mean of about 1.13 therefore means
# that the forecast is on average roughly 13 percent higher than the real
# weather (not the other way round).
# Be aware that the database has a 5 days delay, so we don't have all the data yet.
# If no ratios were collected, show NaN instead of failing with a ZeroDivisionError.
sum(temps) / len(temps) if temps else float("nan")

1.1338483259650256