In [217]:
import pandas as pd
from datetime import datetime, timedelta
import re
import requests
import json
import os

# Logical Structure

1. We download the real weather data and store it in a DataFrame
2. Loop over the forecasts
    - Take each forecast and restrict it to the next 2 days
    - Compare each forecast with the real weather and store the difference as a percentage
    - Append this percentage to a list, so that we can calculate the average afterwards

# 1. Download Real Weather Data

In [218]:
def create_date_and_hour(df:pd.DataFrame):
    """Add ``date`` and ``hour`` columns derived from the ``time`` column.

    The ``time`` column is parsed with ``pd.to_datetime``; its calendar date
    goes into ``date`` and its hour of day into ``hour``.

    Note: the frame passed in is modified in place, and the same object is
    returned for convenience.
    """
    # Parse once and reuse the result for both derived columns
    timestamps = pd.to_datetime(df.time)
    df["date"] = timestamps.dt.date
    df["hour"] = timestamps.dt.hour
    return df

def get_weather(start_date:str, end_date:str) -> pd.DataFrame:
    """Download hourly archive weather data from the Open-Meteo archive API.

    The request is made for the fixed coordinates latitude 51.51,
    longitude -0.13 and asks for temperature, relative humidity, surface
    pressure, direct radiation, wind speed and wind direction.

    Args:
        start_date: First day to request, formatted ``YYYY-MM-DD``.
        end_date: Last day to request, formatted ``YYYY-MM-DD``. All rows of
            this day are removed from the result.

    Returns:
        The "hourly" section of the response as a DataFrame, extended with
        the ``date`` and ``hour`` columns added by ``create_date_and_hour``.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the API does not answer within the timeout.
        ValueError: If the response contains no "hourly" section, or if
            ``end_date`` is not formatted ``YYYY-MM-DD``.
    """
    api = f"https://archive-api.open-meteo.com/v1/archive?latitude=51.51&longitude=-0.13&start_date={start_date}&end_date={end_date}&hourly=temperature_2m,relativehumidity_2m,surface_pressure,direct_radiation,windspeed_10m,winddirection_10m"

    # Without a timeout a stalled connection would block the notebook forever
    response = requests.get(api, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error payload
    response.raise_for_status()

    hourly = json.loads(response.text).get("hourly")
    if hourly is None:
        raise ValueError(
            f"Weather API response for {start_date}..{end_date} has no 'hourly' data"
        )
    df = pd.DataFrame(hourly)

    # Create better readable date and hour
    df = create_date_and_hour(df)

    # Remove the last day, because it's empty
    df = df.loc[df.date != datetime.strptime(end_date, "%Y-%m-%d").date()]

    return df

In [222]:
# At the time of writing, the API publishes historical weather data with a
# delay of 5 days, so only dates at least that far in the past are requested.
real_weather = get_weather(start_date="2023-05-11", end_date="2023-05-13")

# 2. Loop over forecasts

In [223]:
# Folder that holds the stored forecast files
directory = 'forecast-data'

# Build the path of every directory entry, keep only regular files
# (sub-folders are skipped) and sort the resulting paths.
filepaths = sorted(
    path
    for path in (os.path.join(directory, entry) for entry in os.listdir(directory))
    if os.path.isfile(path)
)

In [224]:
for file in filepaths[:1]:
    # NOTE: drop the [:1] slice to process every forecast file.
    # One file is stored per day, and the only indicator of its date is the
    # filename: the part before the first underscore.
    date_prefix = os.path.basename(file).split("_", 1)[0]
    forecast_date = datetime.strptime(date_prefix, "%Y-%m-%d").date()

    # Load the forecast and add the readable date / hour columns
    df = create_date_and_hour(pd.read_json(file))

    # Keep only the two days that follow the forecast date
    last_day = forecast_date + timedelta(days=2)
    in_window = (df["date"] > forecast_date) & (df["date"] <= last_day)
    df = df.loc[in_window]

    # Attach the real weather to each forecast row; columns present in both
    # frames receive the _x (forecast) / _y (real weather) suffixes.
    df = pd.merge(df, real_weather, on="time", how="inner")
    print(df.head())




               time  temperature_2m_x  relativehumidity_2m_x   
0  2023-05-12T00:00              11.4                     94  \
1  2023-05-12T01:00              10.7                     96   
2  2023-05-12T02:00              10.8                     90   
3  2023-05-12T03:00              11.0                     88   
4  2023-05-12T04:00              10.8                     86   

   surface_pressure_x  windspeed_10m_x  winddirection_10m_x  is_day   
0              1015.3             10.0                   69       0  \
1              1015.8             10.3                   54       0   
2              1016.2              9.4                   50       0   
3              1016.6             11.2                   42       0   
4              1016.7             11.9                   35       0   

   terrestrial_radiation      date_x  hour_x  temperature_2m_y   
0                    0.0  2023-05-12       0               9.8  \
1                    0.0  2023-05-12       1            