In [131]:
import pandas as pd
import numpy as np
from pycaret.time_series import *
from pycaret.regression import *
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3
from pycaret import *
from sklearn.model_selection import train_test_split
import os
from pycaret.regression import load_model, plot_model
import plotly.express as px
import matplotlib.pyplot as plt
import logging
from datetime import datetime, timedelta
from meteostat import Hourly, Point
import warnings


In [132]:
boto3.setup_default_session(profile_name='anthony_garove_fellow_dssgx_24')

bucket = "dssgx-munich-2024-bavarian-forest"
raw_data_folder = "raw-data"
preprocessed_data_folder = "preprocessed_data"

In [133]:
def load_csv_files_from_aws_s3(path: str, **kwargs) -> pd.DataFrame:
    """Loads individual or multiple CSV files from an AWS S3 bucket.
    Args:
        path (str): The path to the CSV files on AWS S3.
        **kwargs: Additional arguments to pass to the read_csv function.
    Returns:
        pd.DataFrame: The DataFrame containing the data from the CSV files.
    """
    df = wr.s3.read_csv(path=path, **kwargs)
    return df
df = load_csv_files_from_aws_s3(
    path="s3://dssgx-munich-2024-bavarian-forest/preprocessed_data/joined_sensor_weather_visitorcenter_2016-2024.csv"
)
df.head()

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,...,Schneehoehe,GS mit,GS max,Total,Temperature (°C),Relative Humidity (%),Precipitation (mm),Wind Speed (km/h),Sunshine Duration (min),coco_2
0,2016-05-10 03:00:00,,,2.0,0.0,,,,,,...,,,,,,,,,,
1,2016-05-10 04:00:00,,,0.0,0.0,,,,,,...,,,,,,,,,,
2,2016-05-10 05:00:00,,,0.0,0.0,,,,,,...,,,,,,,,,,
3,2016-05-10 06:00:00,,,0.0,0.0,,,,,,...,,,,,,,,,,
4,2016-05-10 07:00:00,,,0.0,9.0,,,,,,...,,,,,,,,,,


In [134]:
df.tail()

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,...,Schneehoehe,GS mit,GS max,Total,Temperature (°C),Relative Humidity (%),Precipitation (mm),Wind Speed (km/h),Sunshine Duration (min),coco_2
75784,2024-12-31 19:00:00,,,,,,,,,,...,,,,1.0,,,,,,
75785,2024-12-31 20:00:00,,,,,,,,,,...,,,,1.0,,,,,,
75786,2024-12-31 21:00:00,,,,,,,,,,...,,,,1.0,,,,,,
75787,2024-12-31 22:00:00,,,,,,,,,,...,,,,1.0,,,,,,
75788,2024-12-31 23:00:00,,,,,,,,,,...,,,,1.0,,,,,,


In [135]:
# Specify the columns to use
columns_to_use = [
'Time',  'Bayerisch Eisenstein IN',  'Bayerisch Eisenstein OUT',  'Brechhäuslau IN',  'Brechhäuslau OUT',  
'Deffernik IN',  'Deffernik OUT',  'Diensthüttenstraße IN',  'Diensthüttenstraße OUT',  'Felswandergebiet IN',  
'Felswandergebiet OUT',  'Ferdinandsthal IN',  'Ferdinandsthal OUT',  'Fredenbrücke IN',  'Fredenbrücke OUT',  
'Gfäll IN',  'Gfäll OUT',  'Gsenget IN',  'Gsenget OUT',  'Klingenbrunner Wald IN',  'Klingenbrunner Wald OUT',  
'Klosterfilz IN',  'Klosterfilz OUT',  'Racheldiensthütte IN',  'Racheldiensthütte OUT',  'Sagwassersäge IN',  
'Sagwassersäge OUT',  'Scheuereck IN',  'Scheuereck OUT',  'Schillerstraße IN',  'Schillerstraße OUT',  
'Schwarzbachbrücke IN',  'Schwarzbachbrücke OUT',  'Falkenstein 2 OUT',  'Falkenstein 2 IN',  'Lusen 2 IN',  
'Lusen 2 OUT',  'Lusen 3 IN',  'Lusen 3 OUT',  'Waldhausreibe IN',  'Waldhausreibe OUT',  'Waldspielgelände IN',  
'Waldspielgelände OUT',  'Wistlberg IN',  'Wistlberg OUT',  'Bucina MERGED IN',  'Bucina MERGED OUT',  
'Falkenstein 1 MERGED IN',  'Falkenstein 1 MERGED OUT',  'Lusen 1 MERGED IN',  'Lusen 1 MERGED OUT',  
'Trinkwassertalsperre MERGED IN',  'Trinkwassertalsperre MERGED OUT',  
'traffic_abs',  'sum_IN_abs',  'sum_OUT_abs',  'Temperature (°C)',  'Relative Humidity (%)',  
'Wind Speed (km/h)', 'Tag',  'Monat', 'Wochentag',  'Wochenende',  'Jahreszeit',  'Laubfärbung',  'Schulferien_Bayern',  
'Schulferien_CZ', 'Feiertag_Bayern',  'Feiertag_CZ',  'HEH_geoeffnet',  'HZW_geoeffnet',  'WGM_geoeffnet',  
'Lusenschutzhaus_geoeffnet',  'Racheldiensthuette_geoeffnet',  'Falkensteinschutzhaus_geoeffnet',  
'Schwellhaeusl_geoeffnet'
]


In [136]:
# Filter the dataframe to only include the specified columns
df = df[columns_to_use]

# Display the first few rows to ensure the data is loaded correctly
df.tail()

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,...,Schulferien_CZ,Feiertag_Bayern,Feiertag_CZ,HEH_geoeffnet,HZW_geoeffnet,WGM_geoeffnet,Lusenschutzhaus_geoeffnet,Racheldiensthuette_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet
75784,2024-12-31 19:00:00,,,,,,,,,,...,True,True,False,True,True,True,True,True,True,True
75785,2024-12-31 20:00:00,,,,,,,,,,...,True,True,False,True,True,True,True,True,True,True
75786,2024-12-31 21:00:00,,,,,,,,,,...,True,True,False,True,True,True,True,True,True,True
75787,2024-12-31 22:00:00,,,,,,,,,,...,True,True,False,True,True,True,True,True,True,True
75788,2024-12-31 23:00:00,,,,,,,,,,...,True,True,False,True,True,True,True,True,True,True


In [137]:
# CREATE NEW REGION VARIABLE

# Remove MERGED from column names with this unnecessary label
df.columns = df.columns.str.replace(' MERGED', '', regex=False)

# Create a dictionary for mapping
location_mapping = {
    'Bayerisch Eisenstein IN': 'Falkenstein-Schwellhäusl',
    'Bayerisch Eisenstein OUT': 'Falkenstein-Schwellhäusl', 
    'Brechhäuslau IN': 'Falkenstein-Schwellhäusl', 
    'Brechhäuslau OUT': 'Falkenstein-Schwellhäusl', 
    'Deffernik IN': 'Falkenstein-Schwellhäusl',
    'Deffernik OUT': 'Falkenstein-Schwellhäusl',
    'Falkenstein 1 IN': 'Nationalparkzentrum Falkenstein', 
    'Falkenstein 1 OUT': 'Nationalparkzentrum Falkenstein',
    'Falkenstein 2 IN': 'Nationalparkzentrum Falkenstein', 
    'Falkenstein 2 OUT': 'Nationalparkzentrum Falkenstein',
    'Ferdinandsthal IN': 'Falkenstein-Schwellhäusl', 
    'Ferdinandsthal OUT': 'Falkenstein-Schwellhäusl', 
    'Gsenget IN': 'Scheuereck-Schachten-Trinkwassertalsperre', 
    'Gsenget OUT': 'Scheuereck-Schachten-Trinkwassertalsperre', 
    'Scheuereck IN': 'Scheuereck-Schachten-Trinkwassertalsperre',
    'Scheuereck OUT': 'Scheuereck-Schachten-Trinkwassertalsperre', 
    'Schillerstraße IN': 'Falkenstein-Schwellhäusl', 
    'Schillerstraße OUT': 'Falkenstein-Schwellhäusl', 
    'Trinkwassertalsperre IN': 'Scheuereck-Schachten-Trinkwassertalsperre',
    'Trinkwassertalsperre OUT': 'Scheuereck-Schachten-Trinkwassertalsperre',
    'Bucina IN': 'Lusen-Mauth-Finsterau',
    'Bucina OUT': 'Lusen-Mauth-Finsterau', 
    'Diensthüttenstraße IN': 'Rachel-Spiegelau', 
    'Diensthüttenstraße OUT': 'Rachel-Spiegelau',
    'Felswandergebiet IN': 'Lusen-Mauth-Finsterau', 
    'Felswandergebiet OUT': 'Lusen-Mauth-Finsterau',
    'Fredenbrücke IN': 'Lusen-Mauth-Finsterau', 
    'Fredenbrücke OUT': 'Lusen-Mauth-Finsterau', 
    'Gfäll IN': 'Rachel-Spiegelau', 
    'Gfäll OUT': 'Rachel-Spiegelau', 
    'Klingenbrunner Wald IN': 'Rachel-Spiegelau', 
    'Klingenbrunner Wald OUT': 'Rachel-Spiegelau', 
    'Klosterfilz IN': 'Rachel-Spiegelau', 
    'Klosterfilz OUT': 'Rachel-Spiegelau',
    'Lusen 1 IN': 'Nationalparkzentrum Lusen', 
    'Lusen 1 OUT': 'Nationalparkzentrum Lusen', 
    'Lusen 2 IN': 'Nationalparkzentrum Lusen',
    'Lusen 2 OUT': 'Nationalparkzentrum Lusen', 
    'Lusen 3 IN': 'Nationalparkzentrum Lusen', 
    'Lusen 3 OUT': 'Nationalparkzentrum Lusen',
    'Racheldiensthütte IN': 'Rachel-Spiegelau', 
    'Racheldiensthütte OUT': 'Rachel-Spiegelau',
    'Schwarzbachbrücke IN': 'Lusen-Mauth-Finsterau', 
    'Schwarzbachbrücke OUT': 'Lusen-Mauth-Finsterau', 
    'Waldhausreibe IN': 'Lusen-Mauth-Finsterau', 
    'Waldhausreibe OUT': 'Lusen-Mauth-Finsterau', 
    'Waldspielgelände IN': 'Rachel-Spiegelau', 
    'Waldspielgelände OUT': 'Rachel-Spiegelau', 
    'Wistlberg IN': 'Lusen-Mauth-Finsterau', 
    'Wistlberg OUT': 'Lusen-Mauth-Finsterau', 
    'Sagwassersäge IN': 'Lusen-Mauth-Finsterau',
    'Sagwassersäge OUT': 'Lusen-Mauth-Finsterau'
}

# Extract unique regions
regions = set(location_mapping.values())

# Iterate over each region
for region in regions:
    # Filter the keys in location_mapping that belong to the current region
    region_in_columns = [col for col in location_mapping if location_mapping[col] == region and ' IN' in col]
    region_out_columns = [col for col in location_mapping if location_mapping[col] == region and ' OUT' in col]

    # Sum the values for all IN columns of the current region, while retaining NaN where all are NaN
    df[f'{region} IN'] = df[region_in_columns].sum(axis=1, min_count=1)
    
    # Sum the values for all OUT columns of the current region, while retaining NaN where all are NaN
    df[f'{region} OUT'] = df[region_out_columns].sum(axis=1, min_count=1)

# Display the updated DataFrame
df.tail()

# EXPLANATION OF LOOPING FUNCTION BELOW
#min_count=1 in sum():
#The sum(axis=1, min_count=1) method ensures that if all values being summed are NaN, the result will be NaN.
#If at least one value is not NaN, it will compute the sum, ignoring the NaN values.
#Explanation:
#min_count=1: This parameter in the sum() function specifies the minimum number of non-NaN values required to perform the summation. If the count of non-NaN values is less than min_count, the result will be NaN.
#Result: The DataFrame will have the new region columns that sum the sensors while retaining NaN if all sensors in a region are NaN for a given row.

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,...,Nationalparkzentrum Lusen IN,Nationalparkzentrum Lusen OUT,Falkenstein-Schwellhäusl IN,Falkenstein-Schwellhäusl OUT,Rachel-Spiegelau IN,Rachel-Spiegelau OUT,Scheuereck-Schachten-Trinkwassertalsperre IN,Scheuereck-Schachten-Trinkwassertalsperre OUT,Lusen-Mauth-Finsterau IN,Lusen-Mauth-Finsterau OUT
75784,2024-12-31 19:00:00,,,,,,,,,,...,,,,,,,,,,
75785,2024-12-31 20:00:00,,,,,,,,,,...,,,,,,,,,,
75786,2024-12-31 21:00:00,,,,,,,,,,...,,,,,,,,,,
75787,2024-12-31 22:00:00,,,,,,,,,,...,,,,,,,,,,
75788,2024-12-31 23:00:00,,,,,,,,,,...,,,,,,,,,,


In [138]:
df["Time"].loc['2023-01-01':'2024-07-22'].isnull().sum()

0

In [139]:
def load_csv_files_from_aws_s3(path: str, **kwargs) -> pd.DataFrame:
    """Loads individual or multiple CSV files from an AWS S3 bucket.
    Args:
        path (str): The path to the CSV files on AWS S3.
        **kwargs: Additional arguments to pass to the read_csv function.
    Returns:
        pd.DataFrame: The DataFrame containing the data from the CSV files.
    """
    df = wr.s3.read_csv(path=path, **kwargs)
    return df
df_newfeatures = load_csv_files_from_aws_s3(
    path="s3://dssgx-munich-2024-bavarian-forest/preprocessed_data/holidays_deltaweather_features_df.csv"
)
df_newfeatures.head()

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,...,Sunshine Duration (min),coco_2,Date,Distance_to_Nearest_Holiday_Bayern,Distance_to_Nearest_Holiday_CZ,ZScore_Daily_Max_Temperature (°C),ZScore_Daily_Max_Relative Humidity (%),ZScore_Daily_Max_Precipitation (mm),ZScore_Daily_Max_Wind Speed (km/h),ZScore_Daily_Max_Sunshine Duration (min)
0,2017-01-01 00:00:00,,,257.0,412.0,,,,,,...,0.0,,2017-01-01,0,0,,,,,
1,2017-01-01 01:00:00,,,,,,,,,,...,0.0,,2017-01-01,0,0,,,,,
2,2017-01-01 02:00:00,,,,,,,,,,...,0.0,,2017-01-01,0,0,,,,,
3,2017-01-01 03:00:00,,,,,,,,,,...,0.0,,2017-01-01,0,0,,,,,
4,2017-01-01 04:00:00,,,,,,,,,,...,0.0,,2017-01-01,0,0,,,,,


In [140]:
df['Time'] = pd.to_datetime(df['Time'])
df_newfeatures['Time'] = pd.to_datetime(df_newfeatures['Time'])

# Step 2: Select the columns you want to add from df_newfeatures
columns_to_add = [
    'ZScore_Daily_Max_Temperature (°C)',
    'ZScore_Daily_Max_Relative Humidity (%)',
    'ZScore_Daily_Max_Wind Speed (km/h)',
    'Distance_to_Nearest_Holiday_Bayern',
    'Distance_to_Nearest_Holiday_CZ'
]

# Ensure that the selected columns exist in df_newfeatures
selected_columns = [col for col in columns_to_add if col in df_newfeatures.columns]

# Step 3: Merge df with df_newfeatures on 'Time' and add the selected columns
df = pd.merge(df, df_newfeatures[['Time'] + selected_columns], on='Time', how='left')

# Optionally, you can display the merged dataframe
df.tail()

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,...,Rachel-Spiegelau OUT,Scheuereck-Schachten-Trinkwassertalsperre IN,Scheuereck-Schachten-Trinkwassertalsperre OUT,Lusen-Mauth-Finsterau IN,Lusen-Mauth-Finsterau OUT,ZScore_Daily_Max_Temperature (°C),ZScore_Daily_Max_Relative Humidity (%),ZScore_Daily_Max_Wind Speed (km/h),Distance_to_Nearest_Holiday_Bayern,Distance_to_Nearest_Holiday_CZ
75784,2024-12-31 19:00:00,,,,,,,,,,...,,,,,,,,,0.0,5.0
75785,2024-12-31 20:00:00,,,,,,,,,,...,,,,,,,,,0.0,5.0
75786,2024-12-31 21:00:00,,,,,,,,,,...,,,,,,,,,0.0,5.0
75787,2024-12-31 22:00:00,,,,,,,,,,...,,,,,,,,,0.0,5.0
75788,2024-12-31 23:00:00,,,,,,,,,,...,,,,,,,,,0.0,5.0


In [141]:
# Specify the data types
dtype_dict = {
    'Time': 'datetime64[ns]',
    'traffic_abs': 'float64',
    'Temperature (°C)': 'float64',
    'Relative Humidity (%)': 'float64',
    'Wind Speed (km/h)': 'float64',
    'Monat': 'float64',
    'Wochentag': 'category',
    'Wochenende': 'category',
    'Jahreszeit': 'category',
    'Laubfärbung': 'category',
    'Feiertag_Bayern': 'category',
    'Feiertag_CZ': 'category',
    'HEH_geoeffnet': 'category',
    'HZW_geoeffnet': 'category',
    'WGM_geoeffnet': 'category',
    'Lusenschutzhaus_geoeffnet': 'category',
    'Racheldiensthuette_geoeffnet': 'category',
    'Falkensteinschutzhaus_geoeffnet': 'category',
    'Schwellhaeusl_geoeffnet': 'category',
    'Schulferien_Bayern': 'category',
    'Schulferien_CZ': 'category',
    'sum_IN_abs': 'float64',
    'sum_OUT_abs': 'float64',
    'Falkenstein-Schwellhäusl IN': 'float64',
    'Rachel-Spiegelau IN': 'float64',
    'Nationalparkzentrum Falkenstein IN': 'float64',
    'Nationalparkzentrum Lusen IN': 'float64',
    'Lusen-Mauth-Finsterau IN': 'float64',
    'Scheuereck-Schachten-Trinkwassertalsperre IN': 'float64',
    'Falkenstein-Schwellhäusl OUT': 'float64',
    'Rachel-Spiegelau OUT': 'float64',
    'Nationalparkzentrum Falkenstein OUT': 'float64',
    'Nationalparkzentrum Lusen OUT': 'float64',
    'Lusen-Mauth-Finsterau OUT': 'float64',
    'Scheuereck-Schachten-Trinkwassertalsperre OUT': 'float64',
    'Bayerisch Eisenstein IN': 'float64',
    'Bayerisch Eisenstein OUT': 'float64',
    'Brechhäuslau IN': 'float64',
    'Brechhäuslau OUT': 'float64',
    'Deffernik IN': 'float64',
    'Deffernik OUT': 'float64',
    'Diensthüttenstraße IN': 'float64',
    'Diensthüttenstraße OUT': 'float64',
    'Felswandergebiet IN': 'float64',
    'Felswandergebiet OUT': 'float64',
    'Ferdinandsthal IN': 'float64',
    'Ferdinandsthal OUT': 'float64',
    'Fredenbrücke IN': 'float64',
    'Fredenbrücke OUT': 'float64',
    'Gfäll IN': 'float64',
    'Gfäll OUT': 'float64',
    'Gsenget IN': 'float64',
    'Gsenget OUT': 'float64',
    'Klingenbrunner Wald IN': 'float64',
    'Klingenbrunner Wald OUT': 'float64',
    'Klosterfilz IN': 'float64',
    'Klosterfilz OUT': 'float64',
    'Racheldiensthütte IN': 'float64',
    'Racheldiensthütte OUT': 'float64',
    'Sagwassersäge IN': 'float64',
    'Sagwassersäge OUT': 'float64',
    'Scheuereck IN': 'float64',
    'Scheuereck OUT': 'float64',
    'Schillerstraße IN': 'float64',
    'Schillerstraße OUT': 'float64',
    'Schwarzbachbrücke IN': 'float64',
    'Schwarzbachbrücke OUT': 'float64',
    'Falkenstein 2 OUT': 'float64',
    'Falkenstein 2 IN': 'float64',
    'Lusen 2 IN': 'float64',
    'Lusen 2 OUT': 'float64',
    'Lusen 3 IN': 'float64',
    'Lusen 3 OUT': 'float64',
    'Waldhausreibe IN': 'float64',
    'Waldhausreibe OUT': 'float64',
    'Waldspielgelände IN': 'float64',
    'Waldspielgelände OUT': 'float64',
    'Wistlberg IN': 'float64',
    'Wistlberg OUT': 'float64',
    'Bucina IN': 'float64',
    'Bucina OUT': 'float64',
    'Falkenstein 1 IN': 'float64',
    'Falkenstein 1 OUT': 'float64',
    'Lusen 1 IN': 'float64',
    'Lusen 1 OUT': 'float64',
    'Trinkwassertalsperre IN': 'float64',
    'Trinkwassertalsperre OUT': 'float64',
    'ZScore_Daily_Max_Temperature (°C)': 'float64',
    'ZScore_Daily_Max_Relative Humidity (%)': 'float64',
    'ZScore_Daily_Max_Wind Speed (km/h)': 'float64',
    'Distance_to_Nearest_Holiday_Bayern': 'float64',
    'Distance_to_Nearest_Holiday_CZ': 'float64'
}

# Apply data types
df = df.astype(dtype_dict)

# Set 'Time' column as index
df.set_index('Time', inplace=True)

# Add 'Hour' column based on the index
df["Hour"] = df.index.hour

# Convert 'Hour' to categorical
df['Hour'] = pd.Categorical(df['Hour'])

# Reset the index to make 'Time' a column again
df.reset_index(inplace=True)

df.tail()

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,...,Scheuereck-Schachten-Trinkwassertalsperre IN,Scheuereck-Schachten-Trinkwassertalsperre OUT,Lusen-Mauth-Finsterau IN,Lusen-Mauth-Finsterau OUT,ZScore_Daily_Max_Temperature (°C),ZScore_Daily_Max_Relative Humidity (%),ZScore_Daily_Max_Wind Speed (km/h),Distance_to_Nearest_Holiday_Bayern,Distance_to_Nearest_Holiday_CZ,Hour
75784,2024-12-31 19:00:00,,,,,,,,,,...,,,,,,,,0.0,5.0,19
75785,2024-12-31 20:00:00,,,,,,,,,,...,,,,,,,,0.0,5.0,20
75786,2024-12-31 21:00:00,,,,,,,,,,...,,,,,,,,0.0,5.0,21
75787,2024-12-31 22:00:00,,,,,,,,,,...,,,,,,,,0.0,5.0,22
75788,2024-12-31 23:00:00,,,,,,,,,,...,,,,,,,,0.0,5.0,23


In [142]:
df["Time"].loc['2023-01-01':'2024-07-22'].isnull().sum()

0

In [143]:
# Ensure 'Time' is in datetime format
df['Time'] = pd.to_datetime(df['Time'])

# Set 'Time' as the index
df.set_index('Time', inplace=True)

# Slice the data from January 1, 2023, to August 19, 2024
df = df.loc['2023-01-01':'2024-08-19']
# Display the info to check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 14328 entries, 2023-01-01 00:00:00 to 2024-08-19 23:00:00
Data columns (total 93 columns):
 #   Column                                         Non-Null Count  Dtype   
---  ------                                         --------------  -----   
 0   Bayerisch Eisenstein IN                        14050 non-null  float64 
 1   Bayerisch Eisenstein OUT                       14050 non-null  float64 
 2   Brechhäuslau IN                                14049 non-null  float64 
 3   Brechhäuslau OUT                               14049 non-null  float64 
 4   Deffernik IN                                   13773 non-null  float64 
 5   Deffernik OUT                                  13773 non-null  float64 
 6   Diensthüttenstraße IN                          13906 non-null  float64 
 7   Diensthüttenstraße OUT                         13906 non-null  float64 
 8   Felswandergebiet IN                            13643 non-null  float64 
 9   Fels

In [144]:
df.tail()

Unnamed: 0_level_0,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,Felswandergebiet OUT,...,Scheuereck-Schachten-Trinkwassertalsperre IN,Scheuereck-Schachten-Trinkwassertalsperre OUT,Lusen-Mauth-Finsterau IN,Lusen-Mauth-Finsterau OUT,ZScore_Daily_Max_Temperature (°C),ZScore_Daily_Max_Relative Humidity (%),ZScore_Daily_Max_Wind Speed (km/h),Distance_to_Nearest_Holiday_Bayern,Distance_to_Nearest_Holiday_CZ,Hour
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-08-19 19:00:00,,,,,,,,,,,...,,,2.0,2.0,-0.489326,0.535474,0.794311,4.0,40.0,19
2024-08-19 20:00:00,,,,,,,,,,,...,,,0.0,3.0,-0.489326,0.535474,0.794311,4.0,40.0,20
2024-08-19 21:00:00,,,,,,,,,,,...,,,0.0,0.0,-0.489326,0.535474,0.794311,4.0,40.0,21
2024-08-19 22:00:00,,,,,,,,,,,...,,,0.0,0.0,-0.489326,0.535474,0.794311,4.0,40.0,22
2024-08-19 23:00:00,,,,,,,,,,,...,,,0.0,0.0,-0.489326,0.535474,0.794311,4.0,40.0,23


In [145]:
region_columns = [
    'Falkenstein-Schwellhäusl IN', 
    'Rachel-Spiegelau IN', 
    'Nationalparkzentrum Falkenstein IN',
    'Nationalparkzentrum Lusen IN', 
    'Lusen-Mauth-Finsterau IN', 
    'Scheuereck-Schachten-Trinkwassertalsperre IN',
    'Falkenstein-Schwellhäusl OUT', 
    'Rachel-Spiegelau OUT', 
    'Nationalparkzentrum Falkenstein OUT',
    'Nationalparkzentrum Lusen OUT', 
    'Lusen-Mauth-Finsterau OUT', 
    'Scheuereck-Schachten-Trinkwassertalsperre OUT'
]

for column in region_columns:
    if column in df.columns:
        missing_dates = df[df[column].isna()].index
        if not missing_dates.empty:
            first_missing_date = missing_dates[0]
            print(f"Column '{column}' has its first missing value on {first_missing_date}")
        else:
            print(f"Column '{column}' has no missing values")
    else:
        print(f"Column '{column}' is not in the DataFrame")

Column 'Falkenstein-Schwellhäusl IN' has its first missing value on 2024-08-08 11:00:00
Column 'Rachel-Spiegelau IN' has its first missing value on 2024-08-07 14:00:00
Column 'Nationalparkzentrum Falkenstein IN' has no missing values
Column 'Nationalparkzentrum Lusen IN' has no missing values
Column 'Lusen-Mauth-Finsterau IN' has its first missing value on 2024-07-23 00:00:00
Column 'Scheuereck-Schachten-Trinkwassertalsperre IN' has its first missing value on 2024-08-06 18:00:00
Column 'Falkenstein-Schwellhäusl OUT' has its first missing value on 2024-08-08 11:00:00
Column 'Rachel-Spiegelau OUT' has its first missing value on 2024-08-07 14:00:00
Column 'Nationalparkzentrum Falkenstein OUT' has no missing values
Column 'Nationalparkzentrum Lusen OUT' has no missing values
Column 'Lusen-Mauth-Finsterau OUT' has its first missing value on 2024-07-23 00:00:00
Column 'Scheuereck-Schachten-Trinkwassertalsperre OUT' has its first missing value on 2024-08-06 18:00:00


In [146]:
df["Time"].loc['2023-01-01':'2024-07-22'].isnull().sum()

KeyError: 'Time'

In [None]:
# Slice the data from January 1, 2023, to July 22, 2024
df = df.loc['2023-01-01':'2024-07-22']
# Display the info to check data types
df.info()

In [None]:
df["Time"].loc['2023-01-01':'2024-07-22'].isnull().sum()

In [None]:
df.tail()

In [None]:
# Check for missing values in the dataset
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

In [None]:
print(df.info())

In [147]:
LATITUDE = 49.31452390542327
LONGITUDE = 12.711573421032

# Define start and end dates for inference
#start_date = datetime.now()
#end_date = start_date + pd.Timedelta(days=7)
start_date = datetime(2023, 1, 1)
end_date = datetime.now() + timedelta(days=7)

def get_hourly_data_forecasted(bavarian_forest):
    
    """
    Fetch hourly weather data for the Bavarian Forest - forecasted from todays date

    Returns:
        pd.DataFrame: Hourly weather data
    
    """
    data = Hourly(bavarian_forest, start_date, end_date)
    data = data.fetch()

    # Reset the index
    data.reset_index(inplace=True)
    return data 


def source_weather_data():

    """
    Source the weather data from METEOSTAT API

    Returns:
        pd.DataFrame: Hourly weather data for the Bavarian Forest National Park for the next 7 days
    """


    # Create a Point object for the Bavarian Forest National Park entry
    bavarian_forest = Point(lat=LATITUDE, lon=LONGITUDE)
    bavarian_forest.max_count = 10

    print(bavarian_forest.max_count)

    # Fetch hourly data for the location
    weather_hourly = get_hourly_data_forecasted(bavarian_forest)

    # Drop unnecessary columns
    weather_hourly = weather_hourly.drop(columns=['dwpt', 'wdir', 'wpgt', 'pres', 'tsun', 'prcp', 'snow'])

    # Convert the 'Time' column to datetime format
    weather_hourly['time'] = pd.to_datetime(weather_hourly['time'])
    return weather_hourly

# Source the weather data
weather_data_df = source_weather_data()

10


In [148]:
coco_to_coco_2_mapping = {
    1: 1,  # Clear
    2: 1,  # Fair
    3: 2,  # Cloudy
    4: 2,  # Overcast
    5: 2,  # Fog
    6: 5,  # Freezing Fog
    7: 3,  # Light Rain
    8: 3,  # Rain
    9: 3,  # Heavy Rain
    10: 5, # Freezing Rain
    11: 5, # Heavy Freezing Rain
    12: 5, # Sleet
    13: 5, # Heavy Sleet
    14: 4, # Light Snowfall
    15: 4, # Snowfall
    16: 4, # Heavy Snowfall
    17: 3, # Rain Shower
    18: 3, # Heavy Rain Shower
    19: 3, # Sleet Shower
    20: 5, # Heavy Sleet Shower
    21: 4, # Snow Shower
    22: 4, # Heavy Snow Shower
    23: 6, # Lightning
    24: 6, # Hail
    25: 6, # Thunderstorm\
    26: 6, # Heavy Thunderstorm
    27: 6  # Storm
}

# Creating the new 'coco_2' column based on the mapping
weather_data_df['coco_2'] = weather_data_df['coco'].map(coco_to_coco_2_mapping)

# Drop the original 'coco' column
weather_data_df = weather_data_df.drop(columns=['coco'])

weather_data_df.tail()

Unnamed: 0,time,temp,rhum,wspd,coco_2
15247,2024-09-27 07:00:00,10.6,91.0,14.8,3
15248,2024-09-27 08:00:00,11.7,88.0,16.7,3
15249,2024-09-27 09:00:00,12.7,83.0,16.7,3
15250,2024-09-27 10:00:00,13.7,75.0,18.5,3
15251,2024-09-27 11:00:00,14.3,73.0,20.4,3


In [149]:
weather_data_df["coco_2"].isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
15247    False
15248    False
15249    False
15250    False
15251    False
Name: coco_2, Length: 15252, dtype: bool

In [150]:
# Step 1: Convert the 'time' column to datetime type
weather_data_df['time'] = pd.to_datetime(weather_data_df['time'])

# Step 2: Set the 'time' column as the index
weather_data_df = weather_data_df.set_index('time')

# Optionally, sort the index if needed
weather_data_df = weather_data_df.sort_index()

In [151]:
df['coco_2'] = weather_data_df['coco_2'].reindex(df.index)

# df['coco'] = weather_data_df['coco'].reindex(df.index)


df.tail()

Unnamed: 0_level_0,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,Felswandergebiet OUT,...,Scheuereck-Schachten-Trinkwassertalsperre OUT,Lusen-Mauth-Finsterau IN,Lusen-Mauth-Finsterau OUT,ZScore_Daily_Max_Temperature (°C),ZScore_Daily_Max_Relative Humidity (%),ZScore_Daily_Max_Wind Speed (km/h),Distance_to_Nearest_Holiday_Bayern,Distance_to_Nearest_Holiday_CZ,Hour,coco_2
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-08-19 19:00:00,,,,,,,,,,,...,,2.0,2.0,-0.489326,0.535474,0.794311,4.0,40.0,19,2
2024-08-19 20:00:00,,,,,,,,,,,...,,0.0,3.0,-0.489326,0.535474,0.794311,4.0,40.0,20,2
2024-08-19 21:00:00,,,,,,,,,,,...,,0.0,0.0,-0.489326,0.535474,0.794311,4.0,40.0,21,2
2024-08-19 22:00:00,,,,,,,,,,,...,,0.0,0.0,-0.489326,0.535474,0.794311,4.0,40.0,22,2
2024-08-19 23:00:00,,,,,,,,,,,...,,0.0,0.0,-0.489326,0.535474,0.794311,4.0,40.0,23,2


In [153]:
# Features to apply cyclic transformation
cyclic_features = ['Tag', 'Monat', 'Hour', 'Wochentag']

# Convert categorical features to numeric if they are not already
for feature in cyclic_features:
    if feature in df.columns:
        if pd.api.types.is_categorical_dtype(df[feature]):
            df[feature] = df[feature].cat.codes  # Convert categorical to numeric codes
        
        max_value = df[feature].max()  # Get max value for scaling
        
        # Apply sine and cosine transformations
        df[f'{feature}_sin'] = np.sin(2 * np.pi * df[feature] / max_value)
        df[f'{feature}_cos'] = np.cos(2 * np.pi * df[feature] / max_value)
    else:
        print(f"Warning: Feature '{feature}' not found in DataFrame")


In [155]:
# List of numeric features to normalize
standardized_features = ['Temperature (°C)', 'Relative Humidity (%)', 'Wind Speed (km/h)', 'Distance_to_Nearest_Holiday_Bayern', 'Distance_to_Nearest_Holiday_CZ']

# Loop through each numeric feature and apply z-score normalization
for feature in standardized_features:
    if feature in df.columns:
        mean_value = df[feature].mean()  # Calculate mean
        std_value = df[feature].std()    # Calculate standard deviation
        
        # Apply z-score normalization
        df[feature] = (df[feature] - mean_value) / std_value
    else:
        print(f"Warning: Feature '{feature}' not found in DataFrame")

In [None]:
df["Time"].loc['2023-01-01':'2024-07-22'].isnull().sum()

In [113]:
df = pd.get_dummies(df, columns=['Jahreszeit', 'coco_2'], drop_first=False)

In [None]:
df["Time"].loc['2023-01-01':'2024-07-22'].isnull().sum()

In [85]:
columns_to_drop = ['Tag', 'Monat', 'Wochentag', 'Hour']
df = df.drop(columns=columns_to_drop)

In [None]:
df["Time"].loc['2023-01-01':'2024-07-22'].isnull().sum()

In [None]:
print(df.columns.tolist())

In [None]:
# List of columns to update
columns_to_update = [
    'Jahreszeit_Frühling',
    'Jahreszeit_Herbst',
    'Jahreszeit_Sommer',
    'Jahreszeit_Winter',
    'coco_2_1',
    'coco_2_2',
    'coco_2_3',
    'coco_2_4',
    'coco_2_5',
    'coco_2_6'
]

# Replace TRUE with 1 and FALSE with 0, then convert to category
for column in columns_to_update:
    if column in df.columns:
        df[column] = df[column].replace({True: 1, False: 0})
        df[column] = df[column].astype('category')

# Verify changes
print(df[columns_to_update].dtypes)

In [None]:
print(df.dtypes)

In [88]:
for col in df.select_dtypes(include=['object', 'category']).columns:
    # Replace 'True' with 1 and 'False' with 0 if the column contains these values
    if df[col].astype(str).str.contains('True').any() or df[col].astype(str).str.contains('False').any():
        df[col] = df[col].replace({'True': 1, 'False': 0})
    
    # Convert column to integer if it was replaced
    if df[col].dtype == 'object' or df[col].dtype.name == 'category':
        df[col] = df[col].astype('int')  # Convert to integer type
        df[col] = df[col].astype('category')  # Convert to category dtype

In [None]:
df.isnull().sum()

In [None]:
# Define target and feature columns
target_vars_et = ['traffic_abs', 'sum_IN_abs', 'sum_OUT_abs', 'Lusen-Mauth-Finsterau IN', 'Lusen-Mauth-Finsterau OUT', 
               'Nationalparkzentrum Lusen IN', 'Nationalparkzentrum Lusen OUT', 'Rachel-Spiegelau IN', 'Rachel-Spiegelau OUT', 
               'Falkenstein-Schwellhäusl IN', 'Falkenstein-Schwellhäusl OUT', 
               'Scheuereck-Schachten-Trinkwassertalsperre IN', 'Scheuereck-Schachten-Trinkwassertalsperre OUT', 
              'Nationalparkzentrum Falkenstein IN', 'Nationalparkzentrum Falkenstein OUT']

numeric_features = ['Tag_sin', 'Tag_cos', 'Monat_sin', 'Monat_cos', 'Hour_sin', 'Hour_cos', 'Wochentag_sin', 'Wochentag_cos',
                    'Temperature (°C)', 'Relative Humidity (%)', 'Wind Speed (km/h)', 
                    'ZScore_Daily_Max_Temperature (°C)', 
                    'ZScore_Daily_Max_Relative Humidity (%)','ZScore_Daily_Max_Wind Speed (km/h)',
                    'Distance_to_Nearest_Holiday_Bayern','Distance_to_Nearest_Holiday_CZ']

categorical_features = ['Wochenende', 'Jahreszeit_Frühling', 'Jahreszeit_Herbst', 'Jahreszeit_Sommer', 'Jahreszeit_Winter', 'Laubfärbung',
                       'coco_2_1', 'coco_2_2', 'coco_2_3', 'coco_2_4', 'coco_2_5', 'coco_2_6',
                        'Schulferien_Bayern', 'Schulferien_CZ', 
                       'Feiertag_Bayern', 'Feiertag_CZ', 'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet', 
                       'Lusenschutzhaus_geoeffnet', 'Racheldiensthuette_geoeffnet', 'Falkensteinschutzhaus_geoeffnet', 
                        'Schwellhaeusl_geoeffnet']

for catfeature in categorical_features: 
    df[catfeature] = df[catfeature].astype(str)

# Dictionary to store dataframes
target_dataframes_et = {}

# Iterate over each target variable
for target in target_vars_et:
    if target in df.columns:
        # Select the target variable and features
        target_df_et = df[numeric_features + categorical_features + [target]].copy()
        target_dataframes_et[target] = target_df_et
        print(f"DataFrame for target variable '{target}' created.")
    else:
        print(f"Target variable '{target}' is not in the DataFrame columns.")

In [None]:
predictions_dict_et = {}

save_path = r"C:\Users\garov\OneDrive\Documents\GitHub\bavarian-forest-visitor-monitoring-dssgx-24"

for target in target_vars_et:
    # Ensure the target is in the dictionary of processed DataFrames
    if target in target_dataframes_et:
        df = target_dataframes_et[target]
        
        # Ensure the DataFrame has a date-time index
        if isinstance(df.index, pd.DatetimeIndex):
            # Define date ranges for training, testing, and unseen data
            train_start = '2023-01-01'
            train_end = '2023-12-31'
            test_start = '2024-01-01'
            test_end = '2024-04-30'
            unseen_start = '2024-05-01'
            unseen_end = '2024-07-22'
            
            # Split the data into train, test, and unseen sets based on date ranges
            df_train = df.loc[train_start:train_end]
            df_test = df.loc[test_start:test_end]
            df_unseen = df.loc[unseen_start:unseen_end]
            
            # Combine train and test data for model training
            df_train_and_test = pd.concat([df_train, df_test])
            
            # Setup PyCaret for the target variable with the combined data
            reg_setup = setup(data=df_train_and_test,
                              target=target, 
                              numeric_features=numeric_features, 
                              categorical_features=categorical_features,
                              fold=5,
                              preprocess=False,
                              data_split_shuffle=False,  # Do not shuffle data to maintain date order
                              session_id=123,
                              train_size=0.9)  # Use 90% of data for training 
            
            # Train the Extra Trees Regressor model
            extra_trees_model = create_model('et')
            
            # Predict on the unseen data
            predictions_unseen = predict_model(extra_trees_model, data=df_unseen)
            
            # Save the model
            save_model(extra_trees_model, f"{save_path}/extra_trees_{target}")
            
            # Save the predictions in the dictionary for future use
            predictions_dict_et[f"extra_trees_{target}"] = predictions_unseen
            
            print(f"Predictions for unseen data saved for {target}") 
            
            # Optionally, save the predictions to a CSV
            # predictions_unseen.to_csv(f"{save_path}/predictions_unseen_{target}.csv", index=False)


In [None]:
# Define the path to load models
save_path = r"C:\Users\garov\OneDrive\Documents\GitHub\bavarian-forest-visitor-monitoring-dssgx-24"

# Define target variable lists
target_vars_et = ['traffic_abs', 'sum_IN_abs', 'sum_OUT_abs', 'Lusen-Mauth-Finsterau IN', 'Lusen-Mauth-Finsterau OUT', 
               'Nationalparkzentrum Lusen IN', 'Nationalparkzentrum Lusen OUT', 'Rachel-Spiegelau IN', 'Rachel-Spiegelau OUT', 
               'Falkenstein-Schwellhäusl IN', 'Falkenstein-Schwellhäusl OUT', 
               'Scheuereck-Schachten-Trinkwassertalsperre IN', 'Scheuereck-Schachten-Trinkwassertalsperre OUT', 
               'Nationalparkzentrum Falkenstein IN', 'Nationalparkzentrum Falkenstein OUT']


# Plot feature importance for Extra Trees models
for target in target_vars_et:
    model_filename = f'extra_trees_{target}'
    full_model_path = os.path.join(save_path, model_filename)
    
    try:
        # Load the saved model
        loaded_model = load_model(full_model_path)
        
        # Plot feature importance
        print(f"Feature importance for Extra Trees model on target '{target}':")
        plot_model(loaded_model, plot='feature_all')
    
    except FileNotFoundError as e:
        print(f"File not found: {e}")
              

In [None]:
# Visualize predictions for Extra Trees models
for key, predictions_et in predictions_dict_et.items():
    target = key.split('_', 2)[-1]  # This assumes the format 'extra_trees_<target>'
    
    if "prediction_label" in predictions_et.columns and target in predictions_et.columns:
        predictions_vs_real_et = predictions_et[[target, "prediction_label"]].sort_index(ascending=True)
        
        # Create a line plot using Matplotlib
        plt.figure(figsize=(12, 6))
        plt.plot(predictions_vs_real_et.index, predictions_vs_real_et[target], label='Actual', color='blue')
        plt.plot(predictions_vs_real_et.index, predictions_vs_real_et["prediction_label"], label='Predicted', color='red')
        plt.xlabel('Index')
        plt.ylabel('Value')
        plt.title(f"Predictions vs. Real Values for {key}")
        plt.legend()
        plt.show()
    else:
        print(f"Columns '{target}' and 'prediction_label' not found in predictions for {key}")

In [None]:
for key, predictions_et in predictions_dict_et.items():
    target = key.split('_', 2)[-1]  # This assumes the format 'extra_trees_<target>'
    
    if "prediction_label" in predictions_et.columns and target in predictions_et.columns:
        # Resample predictions and actual values on a daily basis
        daily_prediction_comparison = predictions_et[[target, "prediction_label"]].resample("1d").sum()

        # Calculate the mean absolute error (MAE)
        daily_prediction_comparison["mae"] = abs(daily_prediction_comparison[target] - daily_prediction_comparison["prediction_label"])
        print(f"The MAE on a daily basis for {target} is {daily_prediction_comparison['mae'].mean()}.")

        # Plot the actual vs predicted values using Plotly Express
        fig = px.line(daily_prediction_comparison, y=[target, "prediction_label"],
                      labels={"value": "Value", "variable": "Legend"},
                      title=f"Daily Predictions vs Actuals for {target}")
        fig.show()

    else:
        print(f"Columns '{target}' and 'prediction_label' not found in predictions for {key}")


In [None]:
for key, predictions_et in predictions_dict_et.items():
    target = key.split('_', 2)[-1]  # This assumes the format 'extra_trees_<target>'
    
    if "prediction_label" in predictions_et.columns and target in predictions_et.columns:
        # Resample predictions and actual values on a daily basis
        daily_prediction_comparison = predictions_et[[target, "prediction_label"]].resample("1d").sum()

        # Calculate the mean absolute error (MAE) on a daily basis
        daily_prediction_comparison["mae"] = abs(daily_prediction_comparison[target] - daily_prediction_comparison["prediction_label"])

        # Print the average number of people visiting the park (or any other target)
        print(f"On average, {daily_prediction_comparison[target].mean()} people are visiting the park daily for {target}.")

        # Create a box plot of the MAE using Plotly Express
        fig_box = px.box(daily_prediction_comparison, y="mae", title=f"MAE Distribution for {target}")
        fig_box.show()

        # Identify the top 50 days with the highest error
        high_error_dates = daily_prediction_comparison["mae"].sort_values(ascending=False).head(50)
        print(high_error_dates)

        # Retrieve and print training data columns using PyCaret's get_config function
        X_train = get_config('X_train')
        X_train_columns = X_train.columns.to_list()
        print(f"Training columns for {target}: {X_train_columns}")

    else:
        print(f"Columns '{target}' and 'prediction_label' not found in predictions for {key}")



In [None]:
# Define start and end dates for inference
start_date = datetime.now()
end_date = datetime.now() + timedelta(days=7)
#start_date = datetime(2016, 1, 1)
#end_date = datetime (2024, 7, 22)

# Coordinates of the Bavarian Forest (Haselbach)
# These coordinates are based on the weather recommendation by Google for a Bavarian Forest Weather search
LATITUDE = 49.31452390542327
LONGITUDE = 12.711573421032

# Create an hourly date range for inference
inference_index = pd.date_range(
    start=pd.to_datetime(start_date),
    end=pd.to_datetime(end_date),
    freq="1h"
)

# Create an empty DataFrame for inference with X_train columns
inference_df = pd.DataFrame(index=inference_index, columns=X_train_columns)

# Ensure the index is at the hourly level by flooring it to the hour
inference_df.index = pd.DatetimeIndex(inference_df.index).floor('H')

# Drop rows in inference_df that are outside the START_TIME and END_TIME range
inference_df = inference_df[(inference_df.index >= pd.Timestamp(start_date).floor('H')) & 
                            (inference_df.index <= pd.Timestamp(end_date).floor('H'))]

inference_df.head()


In [None]:
#note df2 = visitor center data

def load_csv_files_from_aws_s3(path: str, **kwargs) -> pd.DataFrame:
    """Loads individual or multiple CSV files from an AWS S3 bucket.
    Args:
        path (str): The path to the CSV files on AWS S3.
        **kwargs: Additional arguments to pass to the read_csv function.
    Returns:
        pd.DataFrame: The DataFrame containing the data from the CSV files.
    """
    df2 = wr.s3.read_csv(path=path, **kwargs)
    return df2
df2 = load_csv_files_from_aws_s3(
    path="s3://dssgx-munich-2024-bavarian-forest/preprocessed_data/df_visitcenters_hourly.csv"
)

df2.tail()

In [35]:
# Ignore warnings
warnings.filterwarnings('ignore')

def fill_missing_values(data, parameters):
    """
    Fill missing values in the weather data using linear interpolation or zero values.

    Args:
        data (pandas.DataFrame): Processed hourly weather data.
        parameters (list): List of column names to process.

    Returns:
        pandas.DataFrame: DataFrame with missing values filled.
    """
    total_rows = data.shape[0]

    for parameter in parameters:
        # Calculate missing values and their percentage
        missing_values = data[parameter].isnull().sum()
        missing_percentage = (missing_values / total_rows) * 100

        # Calculate zero values and their percentage
        zero_values = data[parameter].eq(0).sum()
        zero_percentage = (zero_values / total_rows) * 100

        # Check for missing values in the 'Time' column
        if parameter == 'Time' and missing_values > 0:
            print(f'Missing values in Time column: {missing_percentage:.2f}%')
            print('Please check the missing values in the Time column')
            exit()

        if missing_values == 0:
            print(f'No missing values in {parameter} column')
        else:
            print(f'Missing values in {parameter} column: {missing_percentage:.2f}%')

            if zero_percentage > 60:
                # Fill missing values with 0.0 if zero values are significant
                print(f'Zero values in {parameter} column: {zero_percentage:.2f}%')
                data[parameter].fillna(0.0, inplace=True)
                print(f'Missing values in {parameter} column filled with 0.0')
            else:
                # Use linear interpolation to fill missing values
                data[parameter].interpolate(method='linear', inplace=True)
                # Round the interpolated values to 2 decimal places
                data[parameter] = data[parameter].round(2)
                print(f'Missing values in {parameter} column filled using linear interpolation')

    return data


def process_weather_data(weather_data_df):
    """
    Process the hourly weather data by filling missing values.

    Args:
        weather_data_df (pandas.DataFrame): Hourly weather data.
    
    Returns:
        pandas.DataFrame: Processed weather data with missing values filled.
    """


    # Get the list of columns to process
    parameters = weather_data_df.columns.to_list()

    print(f'Processing weather data with the following columns: {parameters}')

    # Fill missing values in the weather data
    imputed_data = fill_missing_values(weather_data_df, parameters)

    return imputed_data

process_weather_data=weather_data_df

In [None]:
# Identify common columns between inference_df and the new DataFrames
common_columns_df2 = df2.columns.intersection(inference_df.columns)
common_columns_process_weather_data = process_weather_data.columns.intersection(inference_df.columns)

# Merge df2 and process_weather_data into inference_df based on datetime index
inference_df.update(df2[common_columns_df2])
inference_df.update(process_weather_data[common_columns_process_weather_data])

In [262]:
def add_cyclic_features(df):
    """
    Applies cyclic transformations to various columns based on their specific cyclic patterns
    and updates the corresponding columns in the DataFrame.

    Parameters:
    df (pd.DataFrame): DataFrame with a DatetimeIndex and columns for which to apply cyclic transformations.

    Returns:
    pd.DataFrame: DataFrame with cyclic feature columns updated.
    """
    # Ensure the index is a DatetimeIndex
    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("The index of the DataFrame must be a DatetimeIndex")

    # Extract features from the datetime index
    df['day_of_month'] = df.index.day
    df['month'] = df.index.month
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek

    # Define maximum values for cyclic transformations
    max_values = {
        'day_of_month': 31,  # Maximum number of days in a month
        'month': 12,         # Maximum number of months in a year
        'hour': 24,          # Maximum number of hours in a day
        'day_of_week': 7     # Number of days in a week
    }

    # Define transformation columns
    transformations = {
        'day_of_month': ('Tag_sin', 'Tag_cos'),
        'month': ('Monat_sin', 'Monat_cos'),
        'hour': ('Hour_sin', 'Hour_cos'),
        'day_of_week': ('Wochentag_sin', 'Wochentag_cos')
    }

    # Apply transformations
    for feature, (sin_col, cos_col) in transformations.items():
        max_value = max_values[feature]
        df[sin_col] = np.sin(2 * np.pi * df[feature] / max_value)
        df[cos_col] = np.cos(2 * np.pi * df[feature] / max_value)
    
    # Drop the temporary columns used for transformations
    df.drop(columns=['day_of_month', 'month', 'hour', 'day_of_week'], inplace=True)

    return df

# Apply the function to inference_df
inference_df = add_cyclic_features(inference_df)

In [263]:
def normalize_weather_data(process_weather_data, inference_df):
    """
    Applies z-score normalization to 'temp', 'rhum', and 'wspd' columns in process_weather_data,
    then updates the corresponding columns in inference_df with the normalized data, 
    based on matching the DatetimeIndex.
    
    Parameters:
    process_weather_data (pd.DataFrame): DataFrame with weather data containing 'temp', 'rhum', and 'wspd' columns.
    inference_df (pd.DataFrame): DataFrame where normalized data will be placed, matching on DatetimeIndex.
    
    Returns:
    pd.DataFrame: inference_df with updated 'Temperature (°C)', 'Relative Humidity (%)', 'Wind Speed (km/h)' columns.
    """
    # Ensure both DataFrames have a DatetimeIndex
    if not isinstance(process_weather_data.index, pd.DatetimeIndex):
        raise ValueError("The index of process_weather_data must be a DatetimeIndex")
    if not isinstance(inference_df.index, pd.DatetimeIndex):
        raise ValueError("The index of inference_df must be a DatetimeIndex")
    
    # Ensure the weather columns exist in process_weather_data
    required_columns = ['temp', 'rhum', 'wspd']
    if not all(col in process_weather_data.columns for col in required_columns):
        raise ValueError(f"process_weather_data must contain columns: {required_columns}")
    
    # Z-score normalization function
    def z_score_normalize(series):
        return (series - series.mean()) / series.std()
    
    # Apply z-score normalization to 'temp', 'rhum', and 'wspd' columns
    process_weather_data['temp_normalized'] = z_score_normalize(process_weather_data['temp'])
    process_weather_data['rhum_normalized'] = z_score_normalize(process_weather_data['rhum'])
    process_weather_data['wspd_normalized'] = z_score_normalize(process_weather_data['wspd'])
    
    # Find common DatetimeIndex between the two DataFrames
    common_index = process_weather_data.index.intersection(inference_df.index)
    
    if common_index.empty:
        raise ValueError("No matching dates found between process_weather_data and inference_df")
    
    # Update corresponding columns in inference_df only where DatetimeIndex matches
    inference_df = inference_df.copy()  # Make a copy to avoid modifying original
    inference_df.loc[common_index, 'Temperature (°C)'] = process_weather_data.loc[common_index, 'temp_normalized']
    inference_df.loc[common_index, 'Relative Humidity (%)'] = process_weather_data.loc[common_index, 'rhum_normalized']
    inference_df.loc[common_index, 'Wind Speed (km/h)'] = process_weather_data.loc[common_index, 'wspd_normalized']
    
    return inference_df

# Usage
inference_df = normalize_weather_data(process_weather_data, inference_df)

In [None]:
def set_time_column_as_datetime_index(df2):
    """
    Sets the 'Time' column in df2 as the DatetimeIndex and drops the original 'Time' column.
    
    Parameters:
    df2 (pd.DataFrame): DataFrame with a 'Time' column that will be converted to the DatetimeIndex.
    
    Returns:
    pd.DataFrame: DataFrame with 'Time' column set as the DatetimeIndex.
    """
    if 'Time' not in df2.columns:
        raise ValueError("The column 'Time' does not exist in df2")
    
    # Convert 'Time' column to datetime
    df2['Time'] = pd.to_datetime(df2['Time'])
    
    # Set 'Time' column as the index
    df2 = df2.set_index('Time')
    
    return df2

# Example usage
df2 = set_time_column_as_datetime_index(df2)

df2.head()

In [265]:
def match_columns_and_update(df2, inference_df):
    """
    Updates the columns in inference_df with the corresponding data from df2,
    based on matching DatetimeIndex and common column names.
    
    Parameters:
    df2 (pd.DataFrame): DataFrame with data to update inference_df.
    inference_df (pd.DataFrame): DataFrame where matched columns will be updated with df2 data.
    
    Returns:
    pd.DataFrame: inference_df with updated data for matching columns based on the DatetimeIndex.
    """
    # Ensure both DataFrames have a DatetimeIndex
    if not isinstance(df2.index, pd.DatetimeIndex):
        raise ValueError("The index of df2 must be a DatetimeIndex")
    if not isinstance(inference_df.index, pd.DatetimeIndex):
        raise ValueError("The index of inference_df must be a DatetimeIndex")
    
    # Find the common DatetimeIndex between the two DataFrames
    common_index = df2.index.intersection(inference_df.index)
    
    if common_index.empty:
        raise ValueError("No matching dates found between df2 and inference_df")
    
    # Find the common columns between the two DataFrames
    common_columns = df2.columns.intersection(inference_df.columns)
    
    if common_columns.empty:
        raise ValueError("No matching columns found between df2 and inference_df")
    
    # Update corresponding columns in inference_df only where both index and columns match
    inference_df = inference_df.copy()  # Make a copy to avoid modifying original
    inference_df.loc[common_index, common_columns] = df2.loc[common_index, common_columns]
    
    return inference_df

# Example usage
inference_df = match_columns_and_update(df2, inference_df)


In [267]:
def add_coco_2_dummies(process_weather_data, inference_df):
    """
    Creates dummy variables for the 'coco_2' column in process_weather_data, 
    and updates the corresponding columns in inference_df where the DatetimeIndex matches.
    
    Parameters:
    process_weather_data (pd.DataFrame): DataFrame containing the 'coco_2' column.
    inference_df (pd.DataFrame): DataFrame where dummy columns will be updated, matching on DatetimeIndex.
    
    Returns:
    pd.DataFrame: inference_df with updated 'coco_2_1', 'coco_2_2', ..., 'coco_2_6' columns.
    """
    # Ensure both DataFrames have a DatetimeIndex
    if not isinstance(process_weather_data.index, pd.DatetimeIndex):
        raise ValueError("The index of process_weather_data must be a DatetimeIndex")
    if not isinstance(inference_df.index, pd.DatetimeIndex):
        raise ValueError("The index of inference_df must be a DatetimeIndex")
    
    # Ensure 'coco_2' exists in process_weather_data
    if 'coco_2' not in process_weather_data.columns:
        raise ValueError("'coco_2' column is missing in process_weather_data")

    # Get dummy codes for 'coco_2'
    coco_2_dummies = pd.get_dummies(process_weather_data['coco_2'], prefix='coco_2')

    # Find common DatetimeIndex between the two DataFrames
    common_index = process_weather_data.index.intersection(inference_df.index)

    if common_index.empty:
        raise ValueError("No matching dates found between process_weather_data and inference_df")

    # Select only the common index from coco_2_dummies
    coco_2_dummies_common = coco_2_dummies.loc[common_index]

    # Create the columns in inference_df if they don't exist
    for i in range(1, 7):  # Assuming there are 6 dummy columns
        column_name = f'coco_2_{i}'
        if column_name not in inference_df.columns:
            inference_df[column_name] = 0  # Initialize with zeros

    # Update inference_df with dummy codes where the DatetimeIndex matches
    inference_df.loc[common_index, coco_2_dummies_common.columns] = coco_2_dummies_common

    return inference_df

# Usage
inference_df = add_coco_2_dummies(process_weather_data, inference_df)


In [271]:
def add_jahreszeit_dummies(df2, inference_df):
    """
    Creates dummy variables for the 'Jahreszeit' column in df2,
    and updates the corresponding columns in inference_df where the DatetimeIndex matches.
    
    Parameters:
    df2 (pd.DataFrame): DataFrame containing the 'Jahreszeit' column.
    inference_df (pd.DataFrame): DataFrame where dummy columns will be updated, matching on DatetimeIndex.
    
    Returns:
    pd.DataFrame: inference_df with updated 'Jahreszeit_Frühling', 'Jahreszeit_Herbst', 
                  'Jahreszeit_Sommer', 'Jahreszeit_Winter' columns.
    """
    # Ensure both DataFrames have a DatetimeIndex
    if not isinstance(df2.index, pd.DatetimeIndex):
        raise ValueError("The index of df2 must be a DatetimeIndex")
    if not isinstance(inference_df.index, pd.DatetimeIndex):
        raise ValueError("The index of inference_df must be a DatetimeIndex")
    
    # Ensure 'Jahreszeit' exists in df2
    if 'Jahreszeit' not in df2.columns:
        raise ValueError("'Jahreszeit' column is missing in df2")

    # Get dummy codes for 'Jahreszeit'
    jahreszeit_dummies = pd.get_dummies(df2['Jahreszeit'], prefix='Jahreszeit')

    # Find common DatetimeIndex between the two DataFrames
    common_index = df2.index.intersection(inference_df.index)

    if common_index.empty:
        raise ValueError("No matching dates found between df2 and inference_df")

    # Select only the common index from jahreszeit_dummies
    jahreszeit_dummies_common = jahreszeit_dummies.loc[common_index]

    # Create the columns in inference_df if they don't exist
    for season in ['Frühling', 'Herbst', 'Sommer', 'Winter']:
        column_name = f'Jahreszeit_{season}'
        if column_name not in inference_df.columns:
            inference_df[column_name] = 0  # Initialize with zeros

    # Update inference_df with dummy codes where the DatetimeIndex matches
    inference_df.loc[common_index, jahreszeit_dummies_common.columns] = jahreszeit_dummies_common

    return inference_df

# Usage
inference_df = add_jahreszeit_dummies(df2, inference_df)


In [279]:
def convert_boolean_to_category(inference_df):
    """
    Converts instances of values equal to TRUE to 1 and FALSE to 0 in all object type columns of inference_df,
    then converts these columns to category type.
    
    Parameters:
    inference_df (pd.DataFrame): DataFrame containing object type columns.
    
    Returns:
    pd.DataFrame: Updated inference_df with converted columns as category type.
    """
    # Identify object type columns
    object_columns = inference_df.select_dtypes(include=['object']).columns

    for col in object_columns:
        # Convert TRUE/FALSE strings to 1/0
        inference_df[col] = inference_df[col].replace({'TRUE': 1, 'FALSE': 0})

        # Convert column to category type
        inference_df[col] = inference_df[col].astype('category')

    return inference_df

# Usage
inference_df = convert_boolean_to_category(inference_df)


In [None]:
def generate_forecasts_for_targets(inference_df, target_vars, model_path):
    """
    Generate forecasts for each target variable using corresponding models.

    Parameters:
    - inference_df: DataFrame containing features for predictions.
    - target_vars: List of target variables.
    - model_path: Path to save and load models.

    Returns:
    - forecasts_dict: Dictionary containing DataFrames with predictions for each target variable.
    """
    forecasts_dict = {}

    for target in target_vars:
        if target in inference_df.columns:
            # Create a copy of inference_df for the current target variable
            current_df = inference_df.copy()

            # Load the corresponding model for the target variable
            model = load_model(f"{model_path}/extra_trees_{target}")

            # Generate predictions using the model
            predictions = predict_model(model, data=current_df)

            # Store the predictions in the DataFrame
            current_df['predicted_' + target] = predictions['prediction_label']

            # Store the DataFrame in the dictionary
            forecasts_dict[target] = current_df

            print(f"Forecasts for target variable '{target}' added to DataFrame.")
        else:
            print(f"Target variable '{target}' is not in the inference DataFrame.")
    
    return forecasts_dict

forecasts = generate_forecasts_for_targets(inference_df, target_vars_et, save_path)


