In [None]:
import pandas as pd
from pycaret.time_series import *
import matplotlib.pyplot as plt

In [None]:
# Load the joined sensor / weather / visitor-center table; 'Time' is parsed to datetime on read.
df = pd.read_csv(
    'data/cleaned_data/joined_sensor_weather_visitorcenter_2016-2024.csv',
    parse_dates=['Time'],
)


In [None]:
# Columns kept for modelling, grouped by role; the concatenation order is the column order.
columns_to_use = (
    # timestamp and the two traffic targets
    ['Time', 'traffic_norm', 'traffic_abs']
    # weather measurements
    + [
        'Temperature (°C)',
        'Relative Humidity (%)',
        'Precipitation (mm)',
        'Wind Speed (km/h)',
        'Sunshine Duration (min)',
    ]
    # calendar / season / public-holiday columns
    + ['Monat', 'Wochentag', 'Wochenende', 'Jahreszeit', 'Laubfärbung', 'Feiertag_Bayern', 'Feiertag_CZ']
    # "*_geoeffnet" columns
    + [
        'HEH_geoeffnet',
        'HZW_geoeffnet',
        'WGM_geoeffnet',
        'Lusenschutzhaus_geoeffnet',
        'Racheldiensthuette_geoeffnet',
        'Falkensteinschutzhaus_geoeffnet',
        'Schwellhaeusl_geoeffnet',
    ]
    # school-holiday columns and the year
    + ['Schulferien_Bayern', 'Schulferien_CZ', 'Jahr']
)


In [None]:
# Keep only the modelling columns, in the order given by columns_to_use.
df = df.loc[:, columns_to_use]

# Peek at the top rows as a sanity check on the load.
df.head()

In [None]:
# Target dtype for every kept column, built group by group.
dtype_dict = {
    'Time': 'datetime64[ns]',
    # columns stored as floats
    **dict.fromkeys(
        [
            'traffic_norm',
            'traffic_abs',
            'Temperature (°C)',
            'Relative Humidity (%)',
            'Precipitation (mm)',
            'Wind Speed (km/h)',
            'Sunshine Duration (min)',
            'Monat',
        ],
        'float64',
    ),
    # columns stored as pandas categoricals
    **dict.fromkeys(
        [
            'Wochentag',
            'Wochenende',
            'Jahreszeit',
            'Laubfärbung',
            'Feiertag_Bayern',
            'Feiertag_CZ',
            'HEH_geoeffnet',
            'HZW_geoeffnet',
            'WGM_geoeffnet',
            'Lusenschutzhaus_geoeffnet',
            'Racheldiensthuette_geoeffnet',
            'Falkensteinschutzhaus_geoeffnet',
            'Schwellhaeusl_geoeffnet',
            'Schulferien_Bayern',
            'Schulferien_CZ',
        ],
        'category',
    ),
    'Jahr': 'float64',
}

# Cast every column to its declared dtype, then make 'Time' the index.
df = df.astype(dtype_dict).set_index('Time')



In [None]:
# Slice the data from January 1, 2023, to August 19, 2024 (label slicing on the 'Time' index).
# .copy() detaches the result from the full frame: later cells add and overwrite columns
# on `df`, which would otherwise raise SettingWithCopyWarning on a slice.
df = df.loc['2023-01-01':'2024-08-19'].copy()
# Display the info to check data types
df.info()

In [None]:
# Display the date-sliced frame for a visual check.
df

In [None]:
# Count NaNs per column and report only the columns that contain at least one.
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

In [None]:
# Drop the normalised target so 'traffic_abs' is the only traffic column left.
df_cleaned = df.drop(columns=['traffic_norm'])
# Set the hourly frequency explicitly; timestamps absent from the index are inserted as NaN rows.
# Lowercase 'h' replaces the deprecated 'H' alias and matches the freq="1h" used for the
# inference index later in this notebook.
df_cleaned = df_cleaned.asfreq('h')
df_cleaned.index


In [None]:
# Inspect the first row of the hourly frame.
df_cleaned.iloc[0]

In [None]:
# List the columns that remain after dropping 'traffic_norm'.
df_cleaned.columns

In [None]:
# Time-series experiment: forecast 'traffic_abs' from the hourly frame `df_cleaned`.
# NOTE(review): this cell relies on `setup`, `compare_models` and `save_model` being the
# pycaret.time_series versions. A later cell star-imports pycaret.regression, which rebinds
# these names, so re-running this cell after that one would call the regression API instead.

# Define the target variable
target = 'traffic_abs'

# Initialize the PyCaret setup
ts_setup = setup(
    data=df_cleaned,  # Use the cleaned data
    target=target,
    session_id=42,  # For reproducibility
    fold=3,  # Number of cross-validation folds
    fh=24 * 14,  # Forecast horizon of 2 weeks (24 hours * 14 days)
    verbose=True,  # Show the setup output
)

# Compare models and select the best one
best_model = compare_models()

# Save under a time-series-specific file name: the regression cell further down also saves
# 'best_model_traffic_abs' and would otherwise overwrite this model on a full run.
save_model(best_model, f'best_ts_model_{target}')

print("Best model for traffic_abs saved.")

In [None]:
# List the columns of the working frame `df`.
df.columns

In [None]:
# Add the hour of day taken from the DatetimeIndex as a feature column.
# `assign` returns a new frame instead of writing into `df` in place, so the cell is safe to
# re-run and does not raise SettingWithCopyWarning on the date-sliced frame.
df = df.assign(Hour=df.index.hour)
df



In [None]:
from pycaret.regression import *

# NOTE: the star import above rebinds setup / compare_models / save_model (previously imported
# from pycaret.time_series), so from this cell onwards they refer to the regression API.

# Define the target variables
targets = ['traffic_norm', 'traffic_abs']

numeric_features = ['Temperature (°C)',
       'Relative Humidity (%)', 'Precipitation (mm)', 'Wind Speed (km/h)',
       'Sunshine Duration (min)']
catgorical_features =['Hour','Monat', 'Wochentag', 'Wochenende',
       'Jahreszeit', 'Laubfärbung', 'Feiertag_Bayern', 'Feiertag_CZ',
       'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet',
       'Lusenschutzhaus_geoeffnet', 'Racheldiensthuette_geoeffnet',
       'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
       'Schulferien_Bayern', 'Schulferien_CZ', 'Jahr']

# Cast every categorical feature to string in one pass; this builds a new frame rather than
# overwriting the columns of `df` one by one.
df = df.astype({catfeature: str for catfeature in catgorical_features})

# Fit one regression experiment per target and persist the best model of each.
for target in targets:
    print(f"\nModeling for target: {target}\n")

    cols_for_modeling = [target] + numeric_features + catgorical_features

    # Initialize the PyCaret setup
    # NOTE(review): rows are filtered on 'traffic_norm' for both targets; confirm this is
    # intended when modelling 'traffic_abs'.
    # NOTE(review): data_split_shuffle=True splits the hourly rows at random, so hold-out rows
    # are interleaved in time with training rows — confirm this is acceptable for evaluation.
    ts_setup = setup(
        data=df.dropna(subset="traffic_norm")[cols_for_modeling],
        target=target,
        train_size=0.9,  # 90 % training rows, the rest is the hold-out set
        session_id=42,  # For reproducibility
        data_split_shuffle=True,
        fold=3,  # Number of cross-validation folds
        numeric_features=numeric_features,
        categorical_features=catgorical_features,
        verbose=False  # Suppress output for clarity
    )

    # Compare models and select the best one
    best_model = compare_models()

    # Save the best model
    save_model(best_model, f'best_model_{target}')

    print(f"Best model for {target} saved.\n")

# After the loop the active experiment is the one for the last target ('traffic_abs').
# The Extra Trees model is created in the next cell, so it is not trained a second time here.


In [None]:
# Train an Extra Trees regressor (PyCaret model ID 'et') in the active experiment, i.e. the
# last one set up by the target loop (target 'traffic_abs').
model = create_model("et")

In [None]:
# Plot the importance of all features for the trained model.
plot_model(model, plot = 'feature_all')

In [None]:
# Show the table of models available in the active PyCaret experiment.
models()

In [None]:
# Score the model; no `data` argument is passed, so PyCaret evaluates on the hold-out split
# created by setup (presumably the 10 % left over from train_size=0.9 — TODO confirm).
pred_holdout = predict_model(model)
pred_holdout

In [None]:
import plotly.express as px

# Observed target and model prediction side by side, ordered by index, drawn as two lines.
predictions_vs_real = pred_holdout.loc[:, ["traffic_abs", "prediction_label"]].sort_index()
px.line(predictions_vs_real)

In [None]:
# Mean of the daily totals of observed traffic in the scored rows.
# It is computed directly from `predictions_vs_real` because `daily_prediction_comparison`
# is only created in the following cell; referencing it here fails with NameError on a
# fresh top-to-bottom run.
# NOTE(review): the totals are built from the rows in `pred_holdout` only, so they may not
# cover every hour of each day — confirm the wording of the message.
mean_daily_traffic = predictions_vs_real["traffic_abs"].resample("1d").sum().mean()
print(f"On average, {mean_daily_traffic} people are visiting the park daily.")

In [None]:
# Sum observed and predicted values per calendar day, then take the absolute daily gap.
daily_prediction_comparison = predictions_vs_real.resample("1d").sum()
daily_prediction_comparison["mae"] = (
    daily_prediction_comparison["traffic_abs"] - daily_prediction_comparison["prediction_label"]
).abs()

# Report the average daily error and plot both daily series.
print(f"The MAE on a daily basis is {daily_prediction_comparison['mae'].mean()}.")
px.line(daily_prediction_comparison.loc[:, ["traffic_abs", "prediction_label"]])

In [None]:
# Box plot of the daily absolute errors to show their spread and outliers.
px.box(daily_prediction_comparison["mae"])

In [None]:
# The 50 days with the largest absolute daily error, worst first.
high_error_dates = (
    daily_prediction_comparison["mae"]
    .sort_values(ascending=False)
    .iloc[:50]
)
high_error_dates

In [None]:
# Fetch the training features of the active experiment and keep their column names as a list.
X_train = get_config('X_train')
X_train_columns = list(X_train.columns)
X_train_columns

In [None]:
# Display the working frame `df` for comparison with the training features.
df

In [None]:
# Display the training feature frame retrieved from the PyCaret experiment.
X_train

In [None]:
# Hourly timestamps for the period to predict (both endpoints included).
start_date = "2024-08-30 00:00"
end_date = "2024-09-07 23:00"

# date_range parses the timestamp strings itself, so no explicit pd.to_datetime is needed.
inference_index = pd.date_range(start=start_date, end=end_date, freq="1h")

# Empty frame with the training feature columns; every cell starts as NaN.
inference_df = pd.DataFrame(index=inference_index, columns=X_train_columns)

# 'Hour' is in the categorical feature list and is cast to str before training, so the
# same string representation is used here rather than raw integers.
inference_df["Hour"] = inference_df.index.hour.astype(str)
# NOTE(review): all other feature columns are still NaN at this point and have to be filled
# (weather, calendar, opening flags) before predictions on this frame are meaningful.
inference_df



In [None]:
# Predict for the unseen inference window.
# NOTE(review): `inference_df` is created with NaN placeholders for every feature except
# 'Hour' — confirm the remaining features are filled before relying on these predictions.
pred_unseen = predict_model(model, data = inference_df)