In [157]:
import pandas as pd
from pycaret.time_series import *
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3

boto3.setup_default_session(profile_name='patricio_ferreira_fellow_dssgx_24')

bucket = "dssgx-munich-2024-bavarian-forest"
raw_data_folder = "raw-data"
preprocessed_data_folder = "preprocessed_data"

def load_csv_files_from_aws_s3(path: str, **kwargs) -> pd.DataFrame:
    """Loads individual or multiple CSV files from an AWS S3 bucket.
    Args:
        path (str): The path to the CSV files on AWS S3.
        **kwargs: Additional arguments to pass to the read_csv function.
    Returns:
        pd.DataFrame: The DataFrame containing the data from the CSV files.
    """
    df = wr.s3.read_csv(path=path, **kwargs)
    return df
df = load_csv_files_from_aws_s3(
    path="s3://dssgx-munich-2024-bavarian-forest/preprocessed_data/joined_sensor_weather_visitorcenter_2016-2024.csv"
)
df.head()

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,...,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max,Total
0,2016-01-01 00:00:00,,,,,,,,,,...,,,,,,,,,,
1,2016-01-01 01:00:00,,,,,,,,,,...,,,,,,,,,,
2,2016-01-01 02:00:00,,,,,,,,,,...,,,,,,,,,,
3,2016-01-01 03:00:00,,,,,,,,,,...,,,,,,,,,,
4,2016-01-01 04:00:00,,,,,,,,,,...,,,,,,,,,,


# First Draft

- sensor on wich gfäl started working
- Target =  Traffic Abs


In [158]:
import numpy as np

df['Time']=pd.to_datetime(df['Time']) 

df = df.set_index('Time').sort_index()

complete_df = df.copy()

In [159]:
drop_metric_cols = ['traffic_abs', 'traffic_norm', 'occupancy_abs', 'occupancy_norm', 'sum_IN_norm', 'sum_IN_abs', 'sum_OUT_norm', 'sum_OUT_abs']

df.drop(columns = drop_metric_cols, inplace=True)

df.columns

Index(['Bayerisch Eisenstein IN', 'Bayerisch Eisenstein OUT',
       'Brechhäuslau IN', 'Brechhäuslau OUT', 'Deffernik IN', 'Deffernik OUT',
       'Diensthüttenstraße IN', 'Diensthüttenstraße OUT',
       'Felswandergebiet IN', 'Felswandergebiet OUT', 'Ferdinandsthal IN',
       'Ferdinandsthal OUT', 'Fredenbrücke IN', 'Fredenbrücke OUT', 'Gfäll IN',
       'Gfäll OUT', 'Gsenget IN', 'Gsenget OUT', 'Klingenbrunner Wald IN',
       'Klingenbrunner Wald OUT', 'Klosterfilz IN', 'Klosterfilz OUT',
       'Racheldiensthütte IN', 'Racheldiensthütte OUT', 'Sagwassersäge IN',
       'Sagwassersäge OUT', 'Scheuereck IN', 'Scheuereck OUT',
       'Schillerstraße IN', 'Schillerstraße OUT', 'Schwarzbachbrücke IN',
       'Schwarzbachbrücke OUT', 'Falkenstein 2 OUT', 'Falkenstein 2 IN',
       'Lusen 2 IN', 'Lusen 2 OUT', 'Lusen 3 IN', 'Lusen 3 OUT',
       'Waldhausreibe IN', 'Waldhausreibe OUT', 'Waldspielgelände IN',
       'Waldspielgelände OUT', 'Wistlberg IN', 'Wistlberg OUT',
       'Bucina

In [160]:
weather_cols = [col for col in df.columns if "IN" not in col and "OUT" not in col]
weather_cols

['working_sensors',
 'diff_norm',
 'diff_abs',
 'Temperature (°C)',
 'Relative Humidity (%)',
 'Precipitation (mm)',
 'Wind Speed (km/h)',
 'Sunshine Duration (min)',
 'Tag',
 'Monat',
 'Jahr',
 'Wochentag',
 'Wochenende',
 'Jahreszeit',
 'Laubfärbung',
 'Besuchszahlen_HEH',
 'Besuchszahlen_HZW',
 'Besuchszahlen_WGM',
 'Parkpl_HEH_PKW',
 'Parkpl_HEH_BUS',
 'Parkpl_HZW_PKW',
 'Parkpl_HZW_BUS',
 'Schulferien_Bayern',
 'Schulferien_CZ',
 'Feiertag_Bayern',
 'Feiertag_CZ',
 'HEH_geoeffnet',
 'HZW_geoeffnet',
 'WGM_geoeffnet',
 'Lusenschutzhaus_geoeffnet',
 'Racheldiensthuette_geoeffnet',
 'Waldschmidthaus_geoeffnet',
 'Falkensteinschutzhaus_geoeffnet',
 'Schwellhaeusl_geoeffnet',
 'Temperatur',
 'Niederschlagsmenge',
 'Schneehoehe',
 'GS mit',
 'GS max',
 'Total']

In [161]:
print("Number of sensors working: ", df.loc["2021-10-01 10:00:00", "working_sensors"])
df = df[df.index >= "2021-10-01 10:00:00"]
df.head()

Number of sensors working:  32.0


Unnamed: 0_level_0,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,Felswandergebiet OUT,...,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max,Total
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-01 10:00:00,,,6.0,40.0,16.0,2.0,,,4.0,0.0,...,True,326.0,True,True,9.538889,0.0,0.0,148.479167,598.0,1.0
2021-10-01 11:00:00,,,20.0,44.0,27.0,6.0,,,7.0,1.0,...,True,326.0,True,True,9.538889,0.0,0.0,148.479167,598.0,1.0
2021-10-01 12:00:00,,,29.0,42.0,2.0,4.0,,,6.0,7.0,...,True,326.0,True,True,9.538889,0.0,0.0,148.479167,598.0,1.0
2021-10-01 13:00:00,,,43.0,30.0,21.0,16.0,,,4.0,7.0,...,True,326.0,True,True,9.538889,0.0,0.0,148.479167,598.0,1.0
2021-10-01 14:00:00,,,36.0,12.0,7.0,16.0,,,2.0,0.0,...,True,326.0,True,True,9.538889,0.0,0.0,148.479167,598.0,1.0


In [162]:
# Filter the row corresponding to the given timestamp
row = df.loc["2021-10-01 10:00:00"]

# Get columns with non-null values
non_null_columns = row[row.notnull()].index.tolist()

sensor_cols = [col for col in df.columns if "IN" in col or "OUT" in col]



# Display the columns
selected_sensors_cols = [col for col in sensor_cols if col in non_null_columns]

In [163]:
df_16_sensors = df[selected_sensors_cols + weather_cols]
df_16_sensors.head()

Unnamed: 0_level_0,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Felswandergebiet IN,Felswandergebiet OUT,Ferdinandsthal IN,Ferdinandsthal OUT,Fredenbrücke IN,Fredenbrücke OUT,...,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max,Total
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-01 10:00:00,6.0,40.0,16.0,2.0,4.0,0.0,28.0,19.0,20.0,1.0,...,True,326.0,True,True,9.538889,0.0,0.0,148.479167,598.0,1.0
2021-10-01 11:00:00,20.0,44.0,27.0,6.0,7.0,1.0,5.0,0.0,12.0,4.0,...,True,326.0,True,True,9.538889,0.0,0.0,148.479167,598.0,1.0
2021-10-01 12:00:00,29.0,42.0,2.0,4.0,6.0,7.0,6.0,2.0,5.0,1.0,...,True,326.0,True,True,9.538889,0.0,0.0,148.479167,598.0,1.0
2021-10-01 13:00:00,43.0,30.0,21.0,16.0,4.0,7.0,10.0,12.0,6.0,10.0,...,True,326.0,True,True,9.538889,0.0,0.0,148.479167,598.0,1.0
2021-10-01 14:00:00,36.0,12.0,7.0,16.0,2.0,0.0,4.0,4.0,5.0,20.0,...,True,326.0,True,True,9.538889,0.0,0.0,148.479167,598.0,1.0


Create traffic for 16 sensors

In [164]:
df_16_sensors["traffic_abs"] = df_16_sensors[selected_sensors_cols].sum(axis=1)

## Comparition of normalized values from complete_df and 16_sensor_df

In [171]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

# Assuming `complete_df` and `df` have timestamps as their index
# Filter both DataFrames from the given timestamp
start_date = "2021-10-02 10:00:00"
end_date = "2024-03-31 10:00:00"
complete_df_filtered = complete_df.loc[start_date: end_date]
df_filtered = df_16_sensors.loc[start_date: end_date]

# Normalize the `traffic_abs` columns using MinMaxScaler
scaler = MinMaxScaler()

# Adding traffic_abs to avoid reshaping issues with indexes
complete_df_filtered['normalized_traffic_abs'] = scaler.fit_transform(complete_df_filtered[['traffic_abs']])
df_filtered['normalized_traffic_abs'] = scaler.fit_transform(df_filtered[['traffic_abs']])

# Combine data for plotting
combined_df = pd.concat([
    complete_df_filtered[['normalized_traffic_abs']].assign(Source='Complete'),
    df_filtered[['normalized_traffic_abs']].assign(Source='DF')
])

# Reset index for plotly compatibility
#combined_df.reset_index(inplace=True)

# Plot using Plotly Express
fig = px.line(
    combined_df,
    x=combined_df.index,
    y='normalized_traffic_abs',
    color='Source',
    title='Normalized Traffic Over Time',
    labels={'index': 'Timestamp', 'normalized_traffic_abs': 'Normalized Traffic'}
)

fig.show()

In [167]:
df_filtered.shape

(21865, 74)

In [144]:
complete_df_filtered

Unnamed: 0_level_0,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,Felswandergebiet OUT,...,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max,Total
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-02 10:00:00,,,7.0,40.0,17.0,2.0,,,6.0,0.0,...,True,516.0,True,True,12.560417,0.0,0.0,152.854167,627.0,1.0
2021-10-02 11:00:00,,,6.0,81.0,11.0,3.0,,,37.0,20.0,...,True,516.0,True,True,12.560417,0.0,0.0,152.854167,627.0,1.0
2021-10-02 12:00:00,,,38.0,48.0,15.0,16.0,,,0.0,1.0,...,True,516.0,True,True,12.560417,0.0,0.0,152.854167,627.0,1.0
2021-10-02 13:00:00,,,60.0,46.0,20.0,22.0,,,2.0,3.0,...,True,516.0,True,True,12.560417,0.0,0.0,152.854167,627.0,1.0
2021-10-02 14:00:00,,,60.0,33.0,6.0,29.0,,,2.0,3.0,...,True,516.0,True,True,12.560417,0.0,0.0,152.854167,627.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31 19:00:00,,,,,,,,,,,...,True,,True,True,,,,,,1.0
2024-12-31 20:00:00,,,,,,,,,,,...,True,,True,True,,,,,,1.0
2024-12-31 21:00:00,,,,,,,,,,,...,True,,True,True,,,,,,1.0
2024-12-31 22:00:00,,,,,,,,,,,...,True,,True,True,,,,,,1.0


In [124]:
# Specify the columns to use
columns_to_use = [
    'traffic_abs',
    'Temperature (°C)',
    'Relative Humidity (%)',
    'Precipitation (mm)',
    'Wind Speed (km/h)',
    'Sunshine Duration (min)',
    'Monat',
    'Wochentag',
    'Wochenende',
    'Jahreszeit',
    'Laubfärbung',
    'Feiertag_Bayern',
    'Feiertag_CZ',
    'HEH_geoeffnet',
    'HZW_geoeffnet',
    'WGM_geoeffnet',
    'Lusenschutzhaus_geoeffnet',
    'Racheldiensthuette_geoeffnet',
    'Falkensteinschutzhaus_geoeffnet',
    'Schwellhaeusl_geoeffnet',
    'Schulferien_Bayern',
    'Schulferien_CZ',
    'Jahr'
]

df_16_sensors = df_16_sensors[columns_to_use]

In [125]:
dtype_dict = {
    'traffic_abs': 'float64',
    'Temperature (°C)': 'float64',
    'Relative Humidity (%)': 'float64',
    'Precipitation (mm)': 'float64',
    'Wind Speed (km/h)': 'float64',
    'Sunshine Duration (min)': 'float64',
    'Monat': 'float64',
    'Wochentag': 'category',
    'Wochenende': 'category',
    'Jahreszeit': 'category',
    'Laubfärbung': 'category',
    'Feiertag_Bayern': 'category',
    'Feiertag_CZ': 'category',
    'HEH_geoeffnet': 'category',
    'HZW_geoeffnet': 'category',
    'WGM_geoeffnet': 'category',
    'Lusenschutzhaus_geoeffnet': 'category',
    'Racheldiensthuette_geoeffnet': 'category',
    'Falkensteinschutzhaus_geoeffnet': 'category',
    'Schwellhaeusl_geoeffnet': 'category',
    'Schulferien_Bayern': 'category',
    'Schulferien_CZ': 'category',
    'Jahr': 'float64'
}

# Apply the data types to the dataframe
df_16_sensors = df_16_sensors.astype(dtype_dict)

In [126]:
df_16_sensors = df_16_sensors.loc[:'2024-08-19']

In [127]:
df_16_sensors.isnull().sum()

traffic_abs                        0
Temperature (°C)                   0
Relative Humidity (%)              0
Precipitation (mm)                 0
Wind Speed (km/h)                  0
Sunshine Duration (min)            0
Monat                              0
Wochentag                          0
Wochenende                         0
Jahreszeit                         0
Laubfärbung                        0
Feiertag_Bayern                    0
Feiertag_CZ                        0
HEH_geoeffnet                      0
HZW_geoeffnet                      0
WGM_geoeffnet                      0
Lusenschutzhaus_geoeffnet          0
Racheldiensthuette_geoeffnet       0
Falkensteinschutzhaus_geoeffnet    0
Schwellhaeusl_geoeffnet            0
Schulferien_Bayern                 0
Schulferien_CZ                     0
Jahr                               0
dtype: int64

In [128]:
df_cleaned = df_16_sensors.asfreq('H')  

In [130]:

df_cleaned["Hour"] = df_cleaned.index.hour

In [131]:
# Import the time series module from PyCaret
from pycaret.time_series import setup, compare_models, save_model

from pycaret.regression import *

# Define the target variables
targets = ['traffic_abs']

numeric_features = ['Temperature (°C)',
       'Relative Humidity (%)', 'Precipitation (mm)', 'Wind Speed (km/h)',
       'Sunshine Duration (min)']
catgorical_features =['Hour','Monat', 'Wochentag', 'Wochenende',
       'Jahreszeit', 'Laubfärbung', 'Feiertag_Bayern', 'Feiertag_CZ',
       'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet',
       'Lusenschutzhaus_geoeffnet', 'Racheldiensthuette_geoeffnet',
       'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
       'Schulferien_Bayern', 'Schulferien_CZ', 'Jahr']

for catfeature in catgorical_features:
    df_cleaned[catfeature] = df_cleaned[catfeature].astype(str)



# Loop through each target
for target in targets:
    print(f"\nModeling for target: {target}\n")

    cols_for_modeling = [target] + numeric_features + catgorical_features
    
    # Initialize the PyCaret setup
    ts_setup = setup(
        data=df_cleaned[cols_for_modeling],
        target=target,
        train_size=0.9,
        session_id=42,  # For reproducibility
       # seasonal_period=24,  # Assumes daily seasonality for hourly data
       # fold_strategy='timeseries',  # Use time series cross-validation
        data_split_shuffle=True,
        fold=3,  # Number of folds in time series cross-validation
        #fh=24 * 14,  # Forecast horizon of 2 weeks (24 hours * 14 days)
        numeric_features=numeric_features,
        categorical_features=catgorical_features,
        verbose=False  # Suppress output for clarity
    )
    
    # Compare models and select the best one
    best_model = compare_models()
    
    # Save the best model
    save_model(best_model, f'best_model_{target}')

    print(f"Best model for {target} saved.\n")


Modeling for target: traffic_abs



Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,29.507,3316.6281,57.5876,0.8795,1.1356,1.4571,3.6333
lightgbm,Light Gradient Boosting Machine,31.1907,3828.5781,61.8716,0.8608,1.1322,1.4384,0.51
xgboost,Extreme Gradient Boosting,30.7878,3838.7603,61.9568,0.8605,0.9946,1.2321,0.55
et,Extra Trees Regressor,29.4366,4026.9704,63.4545,0.8537,0.6807,0.8225,5.2867
rf,Random Forest Regressor,32.1688,4618.0034,67.9428,0.8322,0.7216,0.9476,4.58
gbr,Gradient Boosting Regressor,41.8695,6103.7534,78.1254,0.7782,1.3797,2.2957,1.5567
lr,Linear Regression,64.9084,9742.3868,98.7002,0.646,2.1275,6.848,3.3
ridge,Ridge Regression,64.8961,9742.2755,98.6996,0.646,2.1273,6.8462,3.5567
br,Bayesian Ridge,64.8619,9742.2731,98.6996,0.646,2.1268,6.8411,0.5233
dt,Decision Tree Regressor,43.8498,9785.2053,98.9012,0.6446,0.8715,1.0856,0.46


Transformation Pipeline and Model Successfully Saved
Best model for traffic_abs saved.



In [None]:

plot_model(model, plot = 'feature_all')



NameError: name 'model' is not defined

In [None]:
pred_holdout = predict_model(model)
pred_holdout

NameError: name 'model' is not defined

In [None]:
import plotly.express as px

predictions_vs_real = pred_holdout[["traffic_abs", "prediction_label"]].sort_index(ascending=True)
px.line(predictions_vs_real)

NameError: name 'pred_holdout' is not defined