In [121]:
import pandas as pd
import numpy as np
from pycaret.time_series import *
from pycaret.regression import *
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3
from pycaret import *
from sklearn.model_selection import train_test_split
import os
from pycaret.regression import load_model, plot_model
import plotly.express as px
import matplotlib.pyplot as plt
import logging

In [122]:
# AWS session and S3 layout shared by the load/write cells of this notebook.
# The profile can be overridden per user through the AWS_PROFILE environment
# variable; the value used so far remains the fallback so existing setups keep
# working unchanged.
boto3.setup_default_session(
    profile_name=os.environ.get("AWS_PROFILE", "anthony_garove_fellow_dssgx_24")
)

bucket = "dssgx-munich-2024-bavarian-forest"
raw_data_folder = "raw-data"
preprocessed_data_folder = "preprocessed_data"

In [None]:
def load_csv_files_from_aws_s3(path: str, **kwargs) -> pd.DataFrame:
    """Read one or more CSV files from an AWS S3 bucket into a DataFrame.

    Thin wrapper around ``wr.s3.read_csv``.

    Args:
        path (str): The S3 path of the CSV file(s) to read.
        **kwargs: Extra keyword arguments, forwarded unchanged to ``wr.s3.read_csv``.

    Returns:
        pd.DataFrame: The data read from ``path``.
    """
    return wr.s3.read_csv(path=path, **kwargs)
# Load the joined sensor / weather / visitor-center CSV and preview its first rows.
df = load_csv_files_from_aws_s3(
    "s3://dssgx-munich-2024-bavarian-forest/preprocessed_data/joined_sensor_weather_visitorcenter_2016-2024.csv"
)
df.head()

In [None]:
# Inspect the last rows of the freshly loaded table.
df.tail()

In [125]:
# Specify the columns to use (the order below is the column order of the filtered frame)
columns_to_use = [
    # Timestamp
    'Time',
    # Sensor columns, one IN/OUT pair per line
    'Bayerisch Eisenstein IN', 'Bayerisch Eisenstein OUT',
    'Brechhäuslau IN', 'Brechhäuslau OUT',
    'Deffernik IN', 'Deffernik OUT',
    'Diensthüttenstraße IN', 'Diensthüttenstraße OUT',
    'Felswandergebiet IN', 'Felswandergebiet OUT',
    'Ferdinandsthal IN', 'Ferdinandsthal OUT',
    'Fredenbrücke IN', 'Fredenbrücke OUT',
    'Gfäll IN', 'Gfäll OUT',
    'Gsenget IN', 'Gsenget OUT',
    'Klingenbrunner Wald IN', 'Klingenbrunner Wald OUT',
    'Klosterfilz IN', 'Klosterfilz OUT',
    'Racheldiensthütte IN', 'Racheldiensthütte OUT',
    'Sagwassersäge IN', 'Sagwassersäge OUT',
    'Scheuereck IN', 'Scheuereck OUT',
    'Schillerstraße IN', 'Schillerstraße OUT',
    'Schwarzbachbrücke IN', 'Schwarzbachbrücke OUT',
    'Falkenstein 2 OUT', 'Falkenstein 2 IN',
    'Lusen 2 IN', 'Lusen 2 OUT',
    'Lusen 3 IN', 'Lusen 3 OUT',
    'Waldhausreibe IN', 'Waldhausreibe OUT',
    'Waldspielgelände IN', 'Waldspielgelände OUT',
    'Wistlberg IN', 'Wistlberg OUT',
    # Sensor columns carrying the ' MERGED' tag
    'Bucina MERGED IN', 'Bucina MERGED OUT',
    'Falkenstein 1 MERGED IN', 'Falkenstein 1 MERGED OUT',
    'Lusen 1 MERGED IN', 'Lusen 1 MERGED OUT',
    'Trinkwassertalsperre MERGED IN', 'Trinkwassertalsperre MERGED OUT',
    # Aggregate count columns
    'traffic_abs', 'sum_IN_abs', 'sum_OUT_abs',
    # Weather columns
    'Temperature (°C)', 'Relative Humidity (%)', 'Precipitation (mm)',
    'Wind Speed (km/h)', 'Sunshine Duration (min)',
    # Calendar, season and holiday columns
    'Tag', 'Monat', 'Wochentag', 'Wochenende', 'Jahreszeit', 'Laubfärbung',
    'Schulferien_Bayern', 'Schulferien_CZ', 'Feiertag_Bayern', 'Feiertag_CZ',
    # '*_geoeffnet' columns
    'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet',
    'Lusenschutzhaus_geoeffnet', 'Racheldiensthuette_geoeffnet',
    'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
]


In [None]:
# Filter the dataframe to only include the specified columns.
# `.copy()` turns the selection into an independent frame, so the column
# assignments made in later cells modify `df` itself rather than a slice of the
# loaded table (which would trigger pandas' SettingWithCopyWarning).
df = df[columns_to_use].copy()

# Display the first few rows to ensure the data is loaded correctly
df.head()

In [None]:
# CREATE NEW REGION VARIABLE

# Drop the redundant ' MERGED' tag from every column label that carries it
df.columns = df.columns.map(lambda label: label.replace(' MERGED', ''))

# Create a dictionary for mapping each sensor column to its region.
# It is expanded from a compact region -> sensors table: every sensor
# contributes one '<sensor> IN' and one '<sensor> OUT' key.
location_mapping = {
    f'{sensor} {direction}': region
    for region, sensors in {
        'Falkenstein-Schwellhäusl': [
            'Bayerisch Eisenstein', 'Brechhäuslau', 'Deffernik',
            'Ferdinandsthal', 'Schillerstraße',
        ],
        'Nationalparkzentrum Falkenstein': [
            'Falkenstein 1', 'Falkenstein 2',
        ],
        'Scheuereck-Schachten-Trinkwassertalsperre': [
            'Gsenget', 'Scheuereck', 'Trinkwassertalsperre',
        ],
        'Lusen-Mauth-Finsterau': [
            'Bucina', 'Felswandergebiet', 'Fredenbrücke', 'Schwarzbachbrücke',
            'Waldhausreibe', 'Wistlberg', 'Sagwassersäge',
        ],
        'Rachel-Spiegelau': [
            'Diensthüttenstraße', 'Gfäll', 'Klingenbrunner Wald', 'Klosterfilz',
            'Racheldiensthütte', 'Waldspielgelände',
        ],
        'Nationalparkzentrum Lusen': [
            'Lusen 1', 'Lusen 2', 'Lusen 3',
        ],
    }.items()
    for sensor in sensors
    for direction in ('IN', 'OUT')
}

# Extract unique regions in first-appearance order.
# A plain set() of strings iterates in an order that can change from one kernel
# session to the next (string hashing is randomised per process), which made the
# order of the new region columns non-reproducible.
regions = list(dict.fromkeys(location_mapping.values()))

# For every region, add '<region> IN' and '<region> OUT' columns holding the
# row-wise sum of that region's sensor columns.
#
# Why min_count=1: it is the minimum number of non-NaN values required to
# compute a sum. If at least one sensor of the region has a value in a row, the
# NaNs are ignored and the remaining values are summed; if every sensor is NaN,
# the result stays NaN instead of becoming 0.
for region in regions:
    # Sensor columns of the current region, split by direction. The direction is
    # matched as a suffix so a sensor name can never be mistaken for it.
    region_in_columns = [
        col for col, mapped_region in location_mapping.items()
        if mapped_region == region and col.endswith(' IN')
    ]
    region_out_columns = [
        col for col, mapped_region in location_mapping.items()
        if mapped_region == region and col.endswith(' OUT')
    ]

    df[f'{region} IN'] = df[region_in_columns].sum(axis=1, min_count=1)
    df[f'{region} OUT'] = df[region_out_columns].sum(axis=1, min_count=1)

# Display the updated DataFrame
df.tail()

In [None]:
# Load the additional engineered features.
# `load_csv_files_from_aws_s3` is defined once in the first data-loading cell of
# this notebook; it is reused here instead of being defined a second time.
df_newfeatures = load_csv_files_from_aws_s3(
    path="s3://dssgx-munich-2024-bavarian-forest/preprocessed_data/holidays_deltaweather_features_df.csv"
)
df_newfeatures.head()

In [None]:
# Step 1: Parse the timestamps on both frames so the merge keys share one dtype
df['Time'] = pd.to_datetime(df['Time'])
df_newfeatures['Time'] = pd.to_datetime(df_newfeatures['Time'])

# Step 2: Select the columns you want to add from df_newfeatures
columns_to_add = [
    'ZScore_Daily_Max_Temperature (°C)',
    'ZScore_Daily_Max_Relative Humidity (%)',
    'ZScore_Daily_Max_Precipitation (mm)',
    'ZScore_Daily_Max_Wind Speed (km/h)',
    'ZScore_Daily_Max_Sunshine Duration (min)',
    'Distance_to_Nearest_Holiday_Bayern',
    'Distance_to_Nearest_Holiday_CZ'
]

# Only columns that exist in df_newfeatures can be merged. Report the absent ones
# instead of dropping them silently, since later cells expect all of them.
selected_columns = [col for col in columns_to_add if col in df_newfeatures.columns]
missing_columns = [col for col in columns_to_add if col not in df_newfeatures.columns]
if missing_columns:
    print(f"Warning: columns not found in df_newfeatures and skipped: {missing_columns}")

# Step 3: Left-merge on 'Time' so every row of df is kept
rows_before_merge = len(df)
df = pd.merge(df, df_newfeatures[['Time'] + selected_columns], on='Time', how='left')

# A left merge adds rows when a 'Time' value occurs more than once in
# df_newfeatures; make that visible rather than letting it pass unnoticed.
if len(df) != rows_before_merge:
    print(f"Warning: merge changed the row count from {rows_before_merge} to {len(df)}; "
          "check df_newfeatures for duplicate 'Time' values")

# Display the merged dataframe
df.tail()

In [None]:
# Specify the data types: column name -> target dtype, assembled per dtype group
dtype_dict = {
    'Time': 'datetime64[ns]',
    # Aggregate counts, weather readings, month number and engineered features
    **dict.fromkeys(
        [
            'traffic_abs', 'sum_IN_abs', 'sum_OUT_abs',
            'Temperature (°C)', 'Relative Humidity (%)', 'Precipitation (mm)',
            'Wind Speed (km/h)', 'Sunshine Duration (min)',
            'Monat',
            'ZScore_Daily_Max_Temperature (°C)',
            'ZScore_Daily_Max_Relative Humidity (%)',
            'ZScore_Daily_Max_Precipitation (mm)',
            'ZScore_Daily_Max_Wind Speed (km/h)',
            'ZScore_Daily_Max_Sunshine Duration (min)',
            'Distance_to_Nearest_Holiday_Bayern',
            'Distance_to_Nearest_Holiday_CZ',
        ],
        'float64',
    ),
    # '<name> IN' / '<name> OUT' count columns: the six region sums, then the sensors
    **{
        f'{site} {direction}': 'float64'
        for site in [
            'Falkenstein-Schwellhäusl', 'Rachel-Spiegelau',
            'Nationalparkzentrum Falkenstein', 'Nationalparkzentrum Lusen',
            'Lusen-Mauth-Finsterau', 'Scheuereck-Schachten-Trinkwassertalsperre',
            'Bayerisch Eisenstein', 'Brechhäuslau', 'Deffernik',
            'Diensthüttenstraße', 'Felswandergebiet', 'Ferdinandsthal',
            'Fredenbrücke', 'Gfäll', 'Gsenget', 'Klingenbrunner Wald',
            'Klosterfilz', 'Racheldiensthütte', 'Sagwassersäge', 'Scheuereck',
            'Schillerstraße', 'Schwarzbachbrücke', 'Falkenstein 2', 'Lusen 2',
            'Lusen 3', 'Waldhausreibe', 'Waldspielgelände', 'Wistlberg',
            'Bucina', 'Falkenstein 1', 'Lusen 1', 'Trinkwassertalsperre',
        ]
        for direction in ('IN', 'OUT')
    },
    # Calendar flags, season and opening-status columns
    **dict.fromkeys(
        [
            'Wochentag', 'Wochenende', 'Jahreszeit', 'Laubfärbung',
            'Feiertag_Bayern', 'Feiertag_CZ',
            'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet',
            'Lusenschutzhaus_geoeffnet', 'Racheldiensthuette_geoeffnet',
            'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
            'Schulferien_Bayern', 'Schulferien_CZ',
        ],
        'category',
    ),
}

# Cast every column listed in dtype_dict to its target dtype
df = df.astype(dtype_dict)

# Hour of day, read directly from the parsed 'Time' column and stored as a
# categorical feature ('Time' stays a regular column throughout)
df['Hour'] = pd.Categorical(df['Time'].dt.hour)

df.tail()

In [None]:
# Index the table by timestamp. The guard keeps this cell re-runnable: after the
# first run 'Time' is the index and no longer exists as a column.
if 'Time' in df.columns:
    # Ensure 'Time' is in datetime format
    df['Time'] = pd.to_datetime(df['Time'])
    # Set 'Time' as the index
    df = df.set_index('Time')

# Slice the data from January 1, 2023, to August 19, 2024.
# The index is sorted first because slicing by date strings relies on a
# chronologically ordered DatetimeIndex, and `.copy()` detaches the result so
# later cells can add columns without SettingWithCopyWarning.
df = df.sort_index(kind='mergesort').loc['2023-01-01':'2024-08-19'].copy()

# Display the info to check data types
df.info()

In [None]:
# Inspect the last rows to confirm where the sliced window ends.
df.tail()

In [None]:
# Region aggregate columns to check for gaps: all IN columns first, then all OUT columns
region_columns = [
    f'{region} {direction}'
    for direction in ('IN', 'OUT')
    for region in (
        'Falkenstein-Schwellhäusl',
        'Rachel-Spiegelau',
        'Nationalparkzentrum Falkenstein',
        'Nationalparkzentrum Lusen',
        'Lusen-Mauth-Finsterau',
        'Scheuereck-Schachten-Trinkwassertalsperre',
    )
]

# Report, per region column, the first index entry (in row order) whose value is missing
for column in region_columns:
    if column not in df.columns:
        print(f"Column '{column}' is not in the DataFrame")
        continue

    missing_dates = df.index[df[column].isna().to_numpy()]
    if missing_dates.empty:
        print(f"Column '{column}' has no missing values")
        continue

    first_missing_date = missing_dates[0]
    print(f"Column '{column}' has its first missing value on {first_missing_date}")

In [None]:
# Slice the data from January 1, 2023, to July 22, 2024.
# `.copy()` makes the result an independent frame, so the feature columns added
# in the following cells do not raise SettingWithCopyWarning.
df = df.loc['2023-01-01':'2024-07-22'].copy()
# Display the info to check data types
df.info()

In [None]:
# Inspect the last rows of the final modelling window.
df.tail()

In [None]:
# Count the NaNs in each column and list only the columns that have at least one
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

In [None]:
# df.info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [138]:
# Features to apply cyclic transformation, each mapped to the length of its cycle.
# The divisor has to be the number of positions in the cycle, not the largest
# observed value: dividing hours 0-23 by their maximum (23) places hour 0 and
# hour 23 on the same angle, so the two become indistinguishable, and it makes
# the encoding depend on which rows happen to be loaded.
cyclic_periods = {'Tag': 31, 'Monat': 12, 'Hour': 24, 'Wochentag': 7}
cyclic_features = list(cyclic_periods)

for feature, period in cyclic_periods.items():
    if feature not in df.columns:
        print(f"Warning: Feature '{feature}' not found in DataFrame")
        continue

    values = df[feature]
    if isinstance(values.dtype, pd.CategoricalDtype):
        # Categorical columns are encoded through their integer codes (0-based,
        # in category order).
        # NOTE(review): for 'Wochentag' this is only a meaningful cycle if the
        # categories sort in weekday order and there are 7 of them - confirm.
        values = values.cat.codes

    # Apply sine and cosine transformations
    angle = 2 * np.pi * values / period
    df[f'{feature}_sin'] = np.sin(angle)
    df[f'{feature}_cos'] = np.cos(angle)

    # Drop the original feature column
    df.drop(columns=[feature], inplace=True)


In [139]:
# Weather columns to z-score in place
standardized_features = [
    'Temperature (°C)',
    'Relative Humidity (%)',
    'Precipitation (mm)',
    'Wind Speed (km/h)',
    'Sunshine Duration (min)',
]

# Replace each listed column by (value - mean) / standard deviation.
# NOTE(review): mean and std are taken over every row currently in df, i.e. not
# only over the rows later used to fit the models.
for feature in standardized_features:
    if feature not in df.columns:
        print(f"Warning: Feature '{feature}' not found in DataFrame")
        continue

    column = df[feature]
    df[feature] = (column - column.mean()) / column.std()

In [None]:
# One-hot encode the season: 'Jahreszeit' is replaced by one
# 'Jahreszeit_<value>' column per category
df = pd.get_dummies(df, columns=['Jahreszeit'], prefix='Jahreszeit')

# Show the complete column list after the encoding
new_columns = df.columns
print("New columns after one-hot encoding 'Jahreszeit':", new_columns.tolist(), sep="\n")


In [141]:
# Flag-like columns to normalise and store as categoricals
categorical_columns = [
    'Wochenende',
    *[f'Jahreszeit_{season}' for season in ('Frühling', 'Herbst', 'Sommer', 'Winter')],
    'Laubfärbung',
    'Schulferien_Bayern',
    'Schulferien_CZ',
    'Feiertag_Bayern',
    'Feiertag_CZ',
    'HEH_geoeffnet',
    'HZW_geoeffnet',
    'WGM_geoeffnet',
    'Lusenschutzhaus_geoeffnet',
    'Racheldiensthuette_geoeffnet',
    'Falkensteinschutzhaus_geoeffnet',
    'Schwellhaeusl_geoeffnet',
]

# For every listed column that exists: map True/False to 1/0, then cast to 'category'
for col in categorical_columns:
    if col not in df.columns:
        continue
    df[col] = df[col].replace({True: 1, False: 0})
    df[col] = df[col].astype('category')


In [None]:
# List the dtype of every column after the encoding steps above.
print(df.dtypes)

In [None]:
# Define target and feature columns

# Targets: the three aggregate counts, then an IN/OUT pair for every region
target_vars_et = ['traffic_abs', 'sum_IN_abs', 'sum_OUT_abs'] + [
    f'{region} {direction}'
    for region in (
        'Lusen-Mauth-Finsterau',
        'Nationalparkzentrum Lusen',
        'Rachel-Spiegelau',
        'Falkenstein-Schwellhäusl',
        'Scheuereck-Schachten-Trinkwassertalsperre',
        'Nationalparkzentrum Falkenstein',
    )
    for direction in ('IN', 'OUT')
]

# Numeric inputs: sine/cosine pairs of the cyclic features, weather columns,
# daily z-scores and holiday distances
numeric_features = [
    *[
        f'{feature}_{component}'
        for feature in ('Tag', 'Monat', 'Hour', 'Wochentag')
        for component in ('sin', 'cos')
    ],
    'Temperature (°C)',
    'Relative Humidity (%)',
    'Precipitation (mm)',
    'Wind Speed (km/h)',
    'Sunshine Duration (min)',
    'ZScore_Daily_Max_Temperature (°C)',
    'ZScore_Daily_Max_Relative Humidity (%)',
    'ZScore_Daily_Max_Precipitation (mm)',
    'ZScore_Daily_Max_Wind Speed (km/h)',
    'ZScore_Daily_Max_Sunshine Duration (min)',
    'Distance_to_Nearest_Holiday_Bayern',
    'Distance_to_Nearest_Holiday_CZ',
]

# Categorical inputs: weekend flag, one-hot season columns, holiday flags and
# '*_geoeffnet' columns
categorical_features = [
    'Wochenende',
    *[f'Jahreszeit_{season}' for season in ('Frühling', 'Herbst', 'Sommer', 'Winter')],
    'Laubfärbung',
    'Schulferien_Bayern',
    'Schulferien_CZ',
    'Feiertag_Bayern',
    'Feiertag_CZ',
    'HEH_geoeffnet',
    'HZW_geoeffnet',
    'WGM_geoeffnet',
    'Lusenschutzhaus_geoeffnet',
    'Racheldiensthuette_geoeffnet',
    'Falkensteinschutzhaus_geoeffnet',
    'Schwellhaeusl_geoeffnet',
]

# Store every categorical feature as plain strings
for name in categorical_features:
    df[name] = df[name].astype(str)

# One modelling frame per target: all feature columns plus that target column
target_dataframes_et = {}
feature_columns = numeric_features + categorical_features

for target in target_vars_et:
    if target not in df.columns:
        print(f"Target variable '{target}' is not in the DataFrame columns.")
        continue

    # Copy so each target frame is independent of df and of the other targets
    target_dataframes_et[target] = df[feature_columns + [target]].copy()
    print(f"DataFrame for target variable '{target}' created.")

In [None]:
predictions_dict_et = {}

save_path = r"C:\Users\garov\OneDrive\Documents\GitHub\bavarian-forest-visitor-monitoring-dssgx-24"

# Date ranges for training, testing, and unseen data. They are identical for
# every target, so they are defined once instead of once per loop iteration.
train_start = '2023-01-01'
train_end = '2023-12-31'
test_start = '2024-01-01'
test_end = '2024-04-30'
unseen_start = '2024-05-01'
unseen_end = '2024-07-22'

for target in target_vars_et:
    # Ensure the target is in the dictionary of processed DataFrames
    if target not in target_dataframes_et:
        print(f"Skipping '{target}': no prepared DataFrame available.")
        continue

    # Use a dedicated name here: binding this frame to `df` would replace the
    # notebook-wide feature table with a single target's frame.
    target_df = target_dataframes_et[target]

    # The date-based splits below need a date-time index
    if not isinstance(target_df.index, pd.DatetimeIndex):
        print(f"Skipping '{target}': the DataFrame index is not a DatetimeIndex.")
        continue

    # Split the data into train, test, and unseen sets based on date ranges
    df_train = target_df.loc[train_start:train_end]
    df_test = target_df.loc[test_start:test_end]
    df_unseen = target_df.loc[unseen_start:unseen_end]

    # Combine train and test data for model training
    df_train_and_test = pd.concat([df_train, df_test])

    # Setup PyCaret for the target variable with the combined data
    reg_setup = setup(data=df_train_and_test,
                      target=target,
                      numeric_features=numeric_features,
                      categorical_features=categorical_features,
                      fold=5,
                      preprocess=True,
                      data_split_shuffle=False,  # Do not shuffle data to maintain date order
                      session_id=123,
                      train_size=0.9)  # Use 90% of data for training

    # Train the Extra Trees Regressor model
    extra_trees_model = create_model('et')

    # Predict on the unseen data
    predictions_unseen = predict_model(extra_trees_model, data=df_unseen)

    # Save the model; the path is built with os.path.join, the same way the
    # feature-importance cell builds it when loading the models again
    save_model(extra_trees_model, os.path.join(save_path, f"extra_trees_{target}"))

    # Save the predictions in the dictionary for future use
    predictions_dict_et[f"extra_trees_{target}"] = predictions_unseen

    print(f"Predictions for unseen data saved for {target}")

    # Optionally, save the predictions to a CSV
    #predictions_unseen.to_csv(f"{save_path}/predictions_unseen_{target}.csv", index=False)




In [None]:
# Folder the trained models were saved to
save_path = r"C:\Users\garov\OneDrive\Documents\GitHub\bavarian-forest-visitor-monitoring-dssgx-24"

# Targets that have a saved model: three aggregate counts, then IN/OUT per region
target_vars_et = ['traffic_abs', 'sum_IN_abs', 'sum_OUT_abs'] + [
    f'{region} {direction}'
    for region in (
        'Lusen-Mauth-Finsterau',
        'Nationalparkzentrum Lusen',
        'Rachel-Spiegelau',
        'Falkenstein-Schwellhäusl',
        'Scheuereck-Schachten-Trinkwassertalsperre',
        'Nationalparkzentrum Falkenstein',
    )
    for direction in ('IN', 'OUT')
]

# Load each saved Extra Trees model and plot its feature importance;
# a missing model file is reported and the loop moves on
for target in target_vars_et:
    full_model_path = os.path.join(save_path, f'extra_trees_{target}')

    try:
        loaded_model = load_model(full_model_path)
        print(f"Feature importance for Extra Trees model on target '{target}':")
        plot_model(loaded_model, plot='feature_all')
    except FileNotFoundError as err:
        print(f"File not found: {err}")
              

In [None]:
# Line plot of actual vs. predicted values for every stored prediction frame
for key, predictions_et in predictions_dict_et.items():
    target = key.split('_', 2)[-1]  # keys look like 'extra_trees_<target>'

    has_required_columns = (
        "prediction_label" in predictions_et.columns and target in predictions_et.columns
    )
    if not has_required_columns:
        print(f"Columns '{target}' and 'prediction_label' not found in predictions for {key}")
        continue

    # Order by index so the lines run left to right
    predictions_vs_real_et = predictions_et[[target, "prediction_label"]].sort_index()

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(predictions_vs_real_et.index, predictions_vs_real_et[target], label='Actual', color='blue')
    ax.plot(predictions_vs_real_et.index, predictions_vs_real_et["prediction_label"], label='Predicted', color='red')
    ax.set_xlabel('Index')
    ax.set_ylabel('Value')
    ax.set_title(f"Predictions vs. Real Values for {key}")
    ax.legend()
    plt.show()

In [None]:
# Daily comparison of predicted vs. actual totals for every stored prediction frame
for key, predictions_et in predictions_dict_et.items():
    target = key.split('_', 2)[-1]  # This assumes the format 'extra_trees_<target>'

    if "prediction_label" in predictions_et.columns and target in predictions_et.columns:
        # Compare like with like: hours without an observed value are left out of
        # both sums, and a day without any observation stays NaN (min_count=1).
        # A plain sum() would turn missing actuals into 0 and count the whole
        # prediction for those hours as error.
        observed = predictions_et[[target, "prediction_label"]].dropna(subset=[target])
        daily_prediction_comparison = observed.resample("1D").sum(min_count=1)

        # Absolute error per day; its mean over all days is the daily MAE
        daily_prediction_comparison["mae"] = (
            daily_prediction_comparison[target] - daily_prediction_comparison["prediction_label"]
        ).abs()
        print(f"The MAE on a daily basis for {target} is {daily_prediction_comparison['mae'].mean()}.")

        # Plot the actual vs predicted values using Plotly Express
        fig = px.line(daily_prediction_comparison, y=[target, "prediction_label"],
                      labels={"value": "Value", "variable": "Legend"},
                      title=f"Daily Predictions vs Actuals for {target}")
        fig.show()

    else:
        print(f"Columns '{target}' and 'prediction_label' not found in predictions for {key}")


In [None]:
# Retrieve the training columns using PyCaret's get_config function.
# Nothing inside the loop below changes the PyCaret experiment, so the result is
# the same for every target; fetching it once also guarantees that
# `X_train_columns` exists for later cells even when no prediction frame matches.
X_train = get_config('X_train')
X_train_columns = X_train.columns.to_list()

for key, predictions_et in predictions_dict_et.items():
    target = key.split('_', 2)[-1]  # This assumes the format 'extra_trees_<target>'

    if "prediction_label" in predictions_et.columns and target in predictions_et.columns:
        # Compare like with like: hours without an observed value are left out of
        # both sums, and a day without any observation stays NaN (min_count=1)
        # instead of being counted as 0.
        observed = predictions_et[[target, "prediction_label"]].dropna(subset=[target])
        daily_prediction_comparison = observed.resample("1D").sum(min_count=1)

        # Absolute error per day
        daily_prediction_comparison["mae"] = (
            daily_prediction_comparison[target] - daily_prediction_comparison["prediction_label"]
        ).abs()

        # Print the average daily total of the target
        print(f"On average, {daily_prediction_comparison[target].mean()} people are visiting the park daily for {target}.")

        # Create a box plot of the daily absolute error using Plotly Express
        fig_box = px.box(daily_prediction_comparison, y="mae", title=f"MAE Distribution for {target}")
        fig_box.show()

        # Identify the top 50 days with the highest error
        high_error_dates = daily_prediction_comparison["mae"].sort_values(ascending=False).head(50)
        print(high_error_dates)

        print(f"Training columns for {target}: {X_train_columns}")

    else:
        print(f"Columns '{target}' and 'prediction_label' not found in predictions for {key}")



In [None]:
# Inference window
start_date = "2024-08-30 00:00"
end_date = "2024-09-07 23:00"

# Hourly timestamps from start_date to end_date
inference_index = pd.date_range(
    start=pd.Timestamp(start_date),
    end=pd.Timestamp(end_date),
    freq="1h",
)

# Empty frame with the X_train columns, plus the raw hour, month and year of
# each timestamp (no sine/cosine encoding is applied here)
inference_df = pd.DataFrame(index=inference_index, columns=X_train_columns).assign(
    Hour=inference_index.hour,
    Monat=inference_index.month,
    Jahr=inference_index.year,
)

inference_df.head()


In [None]:
# Load the hourly visitor-center table.
# `load_csv_files_from_aws_s3` is defined once in the first data-loading cell of
# this notebook; it is reused here instead of being defined again.
df2 = load_csv_files_from_aws_s3(
    path="s3://dssgx-munich-2024-bavarian-forest/preprocessed_data/df_visitcenters_hourly.csv"
)

df2.tail()

In [None]:
# Load the processed weather table for the inference period.
# `load_csv_files_from_aws_s3` is defined once in the first data-loading cell of
# this notebook; it is reused here instead of being defined again.
df3 = load_csv_files_from_aws_s3(
    path="s3://dssgx-munich-2024-bavarian-forest/preprocessed_data/processed_weather_data_inference_df.csv"
)

df3.tail()

In [169]:
# Index df2 and df3 by their parsed 'Time' stamps.
# The guards keep this cell re-runnable: after the first run 'Time' is the index
# of each frame and no longer exists as a column.
if 'Time' in df2.columns:
    df2['Time'] = pd.to_datetime(df2['Time'])
    df2.set_index('Time', inplace=True)
if 'Time' in df3.columns:
    df3['Time'] = pd.to_datetime(df3['Time'])
    df3.set_index('Time', inplace=True)

# Identify common columns between inference_df and the new DataFrames
common_columns_df2 = df2.columns.intersection(inference_df.columns)
common_columns_df3 = df3.columns.intersection(inference_df.columns)

# Copy the values of the shared columns into inference_df, in place, matching
# rows on the datetime index
inference_df.update(df2[common_columns_df2])
inference_df.update(df3[common_columns_df3])


In [None]:
# Last timestamp to keep
cutoff_time = pd.Timestamp('2024-09-04 16:00')

# Keep every row up to and including the cutoff timestamp
inference_df = inference_df.loc[:cutoff_time]

# Show the truncated frame
inference_df

In [None]:
# Define start and end dates for inference
start_date = "2024-08-30 00:00"
end_date = "2024-09-07 23:00"

# Create an hourly date range for inference
inference_index = pd.date_range(
    start=pd.to_datetime(start_date),
    end=pd.to_datetime(end_date),
    freq="1h"
)

# Define target variables for inference
target_vars_et = ['traffic_abs', 'sum_IN_abs', 'sum_OUT_abs', 'Lusen-Mauth-Finsterau IN', 'Lusen-Mauth-Finsterau OUT',
                  'Nationalparkzentrum Lusen IN', 'Rachel-Spiegelau IN', 'Rachel-Spiegelau OUT',
                  'Nationalparkzentrum Lusen OUT', 'Falkenstein-Schwellhäusl IN', 'Falkenstein-Schwellhäusl OUT',
                  'Scheuereck-Schachten-Trinkwassertalsperre IN', 'Scheuereck-Schachten-Trinkwassertalsperre OUT',
                  'Nationalparkzentrum Falkenstein IN', 'Nationalparkzentrum Falkenstein OUT']

inference_dfs = {}

# The feature frame does not depend on the target, so it is built once. It gets
# its own name so that it does not overwrite the `inference_df` that earlier
# cells filled from df2/df3.
inference_features = pd.DataFrame(index=inference_index, columns=numeric_features + categorical_features)

# Raw hour, month and year of each timestamp
inference_features["Hour"] = inference_features.index.hour
inference_features["Monat"] = inference_features.index.month
inference_features["Jahr"] = inference_features.index.year

# Convert categorical features to string type
for catfeature in categorical_features:
    inference_features[catfeature] = inference_features[catfeature].astype(str)

# Name the date-time index
inference_features.index.name = 'DateTime'

# NOTE(review): nothing above fills the model's input columns. Only 'Hour',
# 'Monat' and 'Jahr' receive values; every numeric feature is still NaN and the
# categorical features are the string 'nan'. The frame populated from df2/df3 in
# the earlier cells is not used here. Make that state visible instead of
# predicting silently.
if inference_features[numeric_features].isna().all().all():
    print("Warning: all numeric features in the inference data are missing; "
          "predictions below are not based on real feature values")

for target in target_vars_et:
    # Load the pre-trained model for the target variable
    model = load_model(os.path.join(save_path, f"extra_trees_{target}"))

    # Make predictions on a fresh copy of the inference data
    pred_unseen = predict_model(model, data=inference_features.copy())

    # Process predictions (if needed), ensuring consistency with prior code
    formatted_pred_unseen = pred_unseen  # Apply any necessary formatting or transformations here

    # Save predictions in dictionary
    predictions_dict_et[f"extra_trees_{target}"] = formatted_pred_unseen

    # Display the first few rows of predictions
    print(f"Predictions for {target}:\n", formatted_pred_unseen.head())


In [172]:
# Make predictions on the unseen data using the pre-trained model
#pred_unseen = predict_model(model, data=inference_df)

# Process predictions (if needed), ensuring consistency with prior code
#formatted_pred_unseen = pred_unseen  # Apply any necessary formatting or transformations here

# Display the first few rows of predictions
#formatted_pred_unseen.head()


In [173]:
#inference_df.head()

In [174]:
#pred_unseen

In [175]:
#from sklearn.preprocessing import MinMaxScaler

#scaler = MinMaxScaler()


#formatted_pred_unseen = pred_unseen
#formatted_pred_unseen.prediction_label = formatted_pred_unseen.prediction_label.round()
#formatted_pred_unseen['weekly_relative_traffic'] = scaler.fit_transform(formatted_pred_unseen['prediction_label'].values.reshape(-1, 1))
#formatted_pred_unseen.sort_values(by='weekly_relative_traffic',ascending=False)

In [None]:
def write_parquet_file_to_aws_s3(df: pd.DataFrame, path: str, **kwargs) -> None:
    """Writes an individual Parquet file to AWS S3.

    The write is best-effort: a failure is logged with ``logging.error`` and not
    re-raised, so the caller can continue with the next file.

    Args:
        df (pd.DataFrame): The DataFrame to write.
        path (str): The path to the Parquet files on AWS S3.
        **kwargs: Additional arguments to pass to the to_parquet function.

    Returns:
        None. The function has no return value, whether or not the write succeeds.
    """
    try:
        wr.s3.to_parquet(df, path=path, **kwargs)
    except Exception as e:
        logging.error(f"Failed to write DataFrame to S3. Error: {e}")
    else:
        print(f"DataFrame successfully written to {path}")
    return

# Bucket and folder (key prefix) the prediction files are written to.
# The folder is only the prefix inside the bucket: the scheme and bucket name are
# added when the path is assembled below, so including them here as well would
# produce a path of the form "s3://<bucket>/s3://<bucket>/...".
bucket = 'dssgx-munich-2024-bavarian-forest'
preprocessed_data_folder = 'preprocessed_data'

# Write each target variable's predictions to S3 as Parquet files
for target in target_vars_et:
    formatted_pred_unseen = predictions_dict_et[f"extra_trees_{target}"]
    file_path = f"s3://{bucket}/{preprocessed_data_folder}/predictions_{target}.parquet"

    write_parquet_file_to_aws_s3(
        df=formatted_pred_unseen,
        path=file_path,
        # NOTE(review): the prediction frames built in this notebook are indexed
        # by timestamp; confirm the index is really not needed in the files.
        index=False
    )
