In [1]:
# Import libraries
import pandas as pd
import os
from datetime import datetime
from meteostat import Hourly

# Folders for loading and saving data
load_path = "divvy/"
save_path = "data/"
if os.path.isdir(load_path) == False:
    os.mkdir(load_path)
if os.path.isdir(save_path) == False:
    os.mkdir(save_path)

### 1. Weather dataset from Meteostat 'Chicago Midway airport', ID 72534 ###

Data structure:
https://dev.meteostat.net/python/hourly.html#example

|Column|Description|Type|
|---|---|---|
|time|The datetime of the observation|Datetime64|
|temp|The air temperature in °C|Float64|
|dwpt|The dew point in °C|Float64|
|rhum|The relative humidity in percent (%)|Float64|
|prcp|The one hour precipitation total in mm|Float64|
|snow|The snow depth in mm|Float64|
|wdir|The average wind direction in degrees (°)|Float64|
|wspd|The average wind speed in km/h|Float64|
|wpgt|The peak wind gust in km/h|Float64|
|pres|The average sea-level air pressure in hPa|Float64|
|tsun|The one hour sunshine total in minutes (m)|Float64|
|coco|The weather condition code|Float64|

In [2]:
# Get hourly weather data from Chicago Midway airport (ID 72534) from Meteostat-API
df_meteostat = pd.DataFrame

# Set time period
start = datetime(2023, 9, 1, 0, 0)
end = datetime(2023, 9, 30, 23, 59)

# Get hourly data for weather station 'Chicago Midway airport' (ID 72534)
data = Hourly('72534', start, end)
df_meteostat = data.fetch()

# Drop unused columns
df_meteostat.drop(columns=['dwpt', 'rhum', 'wpgt', 'pres', 'tsun', 'coco'], inplace=True)
df_meteostat.reset_index(drop=False, inplace=True)

# Cleaning
df_meteostat = df_meteostat.astype({'wdir': int})
df_meteostat['prcp'].fillna(0, inplace=True)
df_meteostat['snow'].fillna(0, inplace=True)

# Renaming
df_meteostat.rename(columns = {'time':'datetime', 'prcp':'rain_1h', 'snow':'snow_1h', 'wdir':'wind_deg', 'wspd':'wind_speed_km_h'}, inplace = True)

# Sorting
df_meteostat.drop_duplicates(subset=['datetime'], inplace=True)

# Check if received Datapoints match expected number of Datapoints (days * 24)
if (((end - start).days + 1) * 24) == len(df_meteostat):
    print(f'Received Datapoints from Meteostat (' + str(len(df_meteostat)) + ') match expected Datapoints (' + str(((end - start).days + 1) * 24) + ').')
else:
    print(f'Received Datapoints from Meteostat (' + str(len(df_meteostat)) + ') DO NOT match expected Datapoints (' + str(((end - start).days + 1) * 24) + ')!')

# Raw estimation: when in DataFrame precipitation and temperatures < 0.0 °C -> snow instead of rain
for i, row in df_meteostat.iterrows():
    if (row['temp'] < 0.0) and (row['rain_1h'] > 0.0):
        df_meteostat.at[i,'snow_1h'] = df_meteostat.at[i,'rain_1h']
        df_meteostat.at[i,'rain_1h'] = 0

# Print DataFrame
df_meteostat.info(show_counts=True)

Received Datapoints from Meteostat (720) match expected Datapoints (720).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   datetime         720 non-null    datetime64[ns]
 1   temp             720 non-null    float64       
 2   rain_1h          720 non-null    float64       
 3   snow_1h          720 non-null    float64       
 4   wind_deg         720 non-null    int32         
 5   wind_speed_km_h  720 non-null    float64       
dtypes: datetime64[ns](1), float64(4), int32(1)
memory usage: 31.1 KB


### 2. Get weather data from September 2023 on from Openweather API 3.0 call ###

In [None]:
# Be cautious to have not more than 60 API calls / minute and 2000 API calls / day


### 3. Weather dataset from bought Openweather csv-file 'History Bulk' (including August 2023) ###
https://openweathermap.org/history-bulk

In [3]:
# Load DataFrames from csv-files
df_temp_openweather = pd.read_csv(load_path + 'Chicago_weather.csv', low_memory=False)

# Calculate local time
df_temp_openweather['datetime'] = pd.to_datetime(df_temp_openweather['dt'] + df_temp_openweather['timezone'], unit='s')

# Drop duplicates
df_temp_openweather.drop_duplicates(subset=['datetime'], inplace=True)

# Cleaning
df_temp_openweather['rain_1h'].fillna(0, inplace=True)
df_temp_openweather['snow_1h'].fillna(0, inplace=True)

# Change wind_speed to unit km/h and round value to 1 decimal
df_temp_openweather['wind_speed'] = round(df_temp_openweather['wind_speed'] * 3.6, 1)
df_temp_openweather.rename(columns = {'wind_speed':'wind_speed_km_h'}, inplace = True)

# Using weather data from '2013-06-27 00:00' to '2023-09-01 00:00'
df_temp_openweather = df_temp_openweather[(df_temp_openweather['datetime'] >= '2013-06-27 00:00') & (df_temp_openweather['datetime'] < '2023-09-01 00:00')]

# Copy only relevant columns to final DataFrame
df_openweather = df_temp_openweather[['datetime', 'temp', 'rain_1h', 'snow_1h', 'wind_deg', 'wind_speed_km_h']]
df_openweather.reset_index(drop=True, inplace=True)
df_openweather.info(show_counts=True)

### Merge DataFrames into one: ###

In [5]:
df_weather = pd.concat([df_openweather, df_meteostat])
df_weather.reset_index(drop=True, inplace=True)

In [6]:
# Save DataFrame as csv-file
df_weather.to_csv(save_path + 'chicago_weather.csv', index=False)

# Print DataFrame
df_weather

Unnamed: 0,datetime,temp,rain_1h,snow_1h,wind_deg,wind_speed_km_h
0,2013-06-27 00:00:00,22.36,0.0,0.0,350,9.4
1,2013-06-27 01:00:00,22.20,0.0,0.0,0,0.0
2,2013-06-27 02:00:00,22.26,0.0,0.0,0,0.0
3,2013-06-27 03:00:00,22.17,0.0,0.0,300,7.6
4,2013-06-27 04:00:00,21.92,0.0,0.0,0,0.0
...,...,...,...,...,...,...
89937,2023-09-30 19:00:00,27.80,0.0,0.0,167,7.6
89938,2023-09-30 20:00:00,28.30,0.0,0.0,110,13.0
89939,2023-09-30 21:00:00,27.80,0.0,0.0,120,16.6
89940,2023-09-30 22:00:00,26.70,0.0,0.0,130,13.0
