In [29]:
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd


def generate_synthetic_seasonal_weather_data(start_date, end_date, seed=None):
    """Generate synthetic daily weather with season-dependent temperature and humidity.

    One row is produced per calendar day in ``[start_date, end_date]``
    (both inclusive). The season is derived from the month (Mar-May spring,
    Jun-Aug summer, Sep-Nov fall, otherwise winter), and temperature and
    humidity are drawn from normal distributions whose means depend on the
    season.

    Parameters
    ----------
    start_date : str
        First day of the range, formatted as ``YYYY-MM-DD``.
    end_date : str
        Last day of the range, formatted as ``YYYY-MM-DD``.
    seed : int, optional
        When given, samples come from a dedicated ``np.random.default_rng(seed)``
        so the output is reproducible. When omitted, NumPy's global
        ``np.random`` state is used (so ``np.random.seed`` still applies).

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``temperature``, ``humidity`` and ``season``.
        If ``end_date`` precedes ``start_date`` the frame is empty but still
        has these columns.

    Raises
    ------
    ValueError
        If either date string does not match ``YYYY-MM-DD``.
    """
    date_format = "%Y-%m-%d"
    start_date = datetime.strptime(start_date, date_format)
    end_date = datetime.strptime(end_date, date_format)

    # (mean temperature, mean humidity) for each season.
    season_means = {
        "spring": (15, 50),
        "summer": (25, 80),
        "fall": (15, 60),
        "winter": (5, 30),
    }
    std_temp = 5
    std_humidity = 10

    month_to_season = {
        3: "spring", 4: "spring", 5: "spring",
        6: "summer", 7: "summer", 8: "summer",
        9: "fall", 10: "fall", 11: "fall",
        12: "winter", 1: "winter", 2: "winter",
    }

    # Both the np.random module and a Generator expose `.normal(loc, scale)`.
    rng = np.random if seed is None else np.random.default_rng(seed)

    columns = ["date", "temperature", "humidity", "season"]
    weather_data = []
    for date in pd.date_range(start_date, end_date, freq="D"):
        season = month_to_season[date.month]
        mean_temp, mean_humidity = season_means[season]

        # Draw order (temperature, then humidity) is kept stable so seeded
        # runs stay comparable.
        temperature = max(0.0, rng.normal(mean_temp, std_temp))
        # Humidity is treated as a percentage, so clamp it to [0, 100];
        # without the upper bound, summer draws regularly exceed 100.
        humidity = min(100.0, max(0.0, rng.normal(mean_humidity, std_humidity)))

        weather_data.append(
            {
                "date": date,
                "temperature": temperature,
                "humidity": humidity,
                "season": season,
            }
        )

    # Explicit columns keep the schema intact even when the range is empty.
    return pd.DataFrame(weather_data, columns=columns)

In [30]:
# Seed NumPy's global generator so the synthetic data is identical on every
# "Restart & Run All"; the generator function draws from np.random.
np.random.seed(42)

start_date = "2016-01-01"
end_date = "2017-12-31"

df_tokyo_weather = generate_synthetic_seasonal_weather_data(start_date, end_date)

In [33]:
# Derive the calendar year from the date column for the per-year summary below.
df_tokyo_weather = df_tokyo_weather.assign(year=df_tokyo_weather["date"].dt.year)

In [34]:
# Average only the measurement columns. Selecting them explicitly keeps the
# result independent of how the installed pandas treats the datetime `date`
# column, whose handling (the `numeric_only` default) differs between versions.
df_tokyo_weather.groupby(["year", "season"])[["temperature", "humidity"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,temperature,humidity
year,season,Unnamed: 2_level_1,Unnamed: 3_level_1
2016,fall,15.182122,61.1078
2016,spring,14.699303,49.456428
2016,summer,24.510443,79.468291
2016,winter,6.408534,31.659593
2017,fall,15.682703,60.184843
2017,spring,14.546191,50.451363
2017,summer,25.355923,80.619643
2017,winter,5.363609,28.39497


In [36]:
# Summarize the frame: row count, column dtypes and non-null counts.
df_tokyo_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         731 non-null    datetime64[ns]
 1   temperature  731 non-null    float64       
 2   humidity     731 non-null    float64       
 3   season       731 non-null    object        
 4   year         731 non-null    int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 28.7+ KB


In [35]:
output_path = Path("../data/preprocessed-data/tokyo_weather.csv")
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
df_tokyo_weather.to_csv(output_path, index=False)