In [None]:
import pandas as pd
from pycaret.time_series import *
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3

# Set up the default boto3 session for this notebook. The profile name is a
# placeholder: fill in your own AWS profile name before running.
boto3.setup_default_session(profile_name='') # Update with your profile name

# Name of the S3 bucket and of the folders holding raw and preprocessed data.
bucket = "dssgx-munich-2024-bavarian-forest"
raw_data_folder = "raw-data"
preprocessed_data_folder = "preprocessed_data"

def load_csv_files_from_aws_s3(path: str, **kwargs) -> pd.DataFrame:
    """Read CSV data stored on AWS S3 into a DataFrame.

    Thin wrapper around ``wr.s3.read_csv``.

    Args:
        path (str): S3 location of the CSV file(s); forwarded unchanged.
        **kwargs: Extra keyword arguments forwarded to ``wr.s3.read_csv``.

    Returns:
        pd.DataFrame: The data returned by ``wr.s3.read_csv`` for ``path``.
    """
    return wr.s3.read_csv(path=path, **kwargs)
# Build the S3 URI from the `bucket` / `preprocessed_data_folder` constants
# instead of repeating their values inside one hard-coded string, so that a
# change to either constant is picked up here as well.
joined_data_file = "joined_sensor_weather_visitorcenter_2016-2024.csv"
df = load_csv_files_from_aws_s3(
    path=f"s3://{bucket}/{preprocessed_data_folder}/{joined_data_file}"
)
df.head()

# First Draft

- Start from the date on which the Gfäll sensor started working
- Target = `traffic_abs`


In [158]:
import numpy as np

# Parse the 'Time' column as datetimes, use it as the index and order the
# rows chronologically.
df = (
    df.assign(Time=pd.to_datetime(df['Time']))
    .set_index('Time')
    .sort_index()
)

# Keep an independent copy of the full frame before columns/rows are removed.
complete_df = df.copy()

In [None]:
# Columns removed from `df` in this step; `complete_df` (copied earlier)
# still contains them.
drop_metric_cols = [
    'traffic_abs', 'traffic_norm',
    'occupancy_abs', 'occupancy_norm',
    'sum_IN_norm', 'sum_IN_abs',
    'sum_OUT_norm', 'sum_OUT_abs',
]

df = df.drop(columns=drop_metric_cols)

df.columns

In [None]:
# Every column whose name contains neither "IN" nor "OUT"
# (i.e. everything that is not selected as a sensor column further below).
weather_cols = [
    col for col in df.columns
    if not ("IN" in col or "OUT" in col)
]
weather_cols

In [None]:
# First timestamp kept for this draft; everything earlier is discarded.
cutoff = "2021-10-01 10:00:00"

print("Number of sensors working: ", df.loc[cutoff, "working_sensors"])
df = df.loc[df.index >= cutoff]
df.head()

In [162]:
# Observation at the cut-off timestamp
row = df.loc["2021-10-01 10:00:00"]

# Names of the columns that hold a value (are not null) at that timestamp
non_null_columns = row.dropna().index.tolist()

# Sensor columns are the ones whose name contains "IN" or "OUT"
sensor_cols = [
    col for col in df.columns
    if any(tag in col for tag in ("IN", "OUT"))
]

# Sensor columns that have a value at the cut-off timestamp
selected_sensors_cols = [
    col for col in sensor_cols
    if col in non_null_columns
]

In [None]:
# Selected sensor columns plus all non-sensor columns.
# .copy() makes this an independent frame, so that adding columns to it later
# does not raise SettingWithCopyWarning.
df_16_sensors = df[selected_sensors_cols + weather_cols].copy()
df_16_sensors.head()

Create `traffic_abs` as the row-wise sum of the 16 selected sensor columns

In [164]:
# Total traffic = row-wise sum over the selected sensor columns.
# .assign returns a new frame instead of writing into a column subset of
# `df`, which avoids pandas' SettingWithCopyWarning.
df_16_sensors = df_16_sensors.assign(
    traffic_abs=df_16_sensors[selected_sensors_cols].sum(axis=1)
)

## Comparison of normalized values from `complete_df` and `df_16_sensors`

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

# `complete_df` and `df_16_sensors` both use the timestamp as their index.
# Restrict both frames to the same time window.
start_date = "2021-10-02 10:00:00"
end_date = "2024-03-31 10:00:00"

# .copy() turns each window into an independent frame; assigning a new column
# to a bare .loc slice would raise SettingWithCopyWarning.
complete_df_filtered = complete_df.loc[start_date:end_date].copy()
df_filtered = df_16_sensors.loc[start_date:end_date].copy()

# Min-max scale `traffic_abs` of each frame separately (the scaler is refit on
# every fit_transform call). The input is passed as a one-column DataFrame
# because the scaler expects 2-D input; .ravel() flattens the result back to
# one value per row.
scaler = MinMaxScaler()
complete_df_filtered['normalized_traffic_abs'] = scaler.fit_transform(
    complete_df_filtered[['traffic_abs']]
).ravel()
df_filtered['normalized_traffic_abs'] = scaler.fit_transform(
    df_filtered[['traffic_abs']]
).ravel()

# Stack both series in long form, tagged with their origin, for plotting
combined_df = pd.concat([
    complete_df_filtered[['normalized_traffic_abs']].assign(Source='Complete'),
    df_filtered[['normalized_traffic_abs']].assign(Source='DF')
])

# Plot using Plotly Express. The index is turned into an explicitly named
# column so that the x-axis title does not depend on the name of the index
# (a `labels` entry keyed on 'index' is ignored when the index has a name).
fig = px.line(
    combined_df.rename_axis('Timestamp').reset_index(),
    x='Timestamp',
    y='normalized_traffic_abs',
    color='Source',
    title='Normalized Traffic Over Time',
    labels={'normalized_traffic_abs': 'Normalized Traffic'}
)

fig.show()

In [124]:
# Columns kept for modelling, in the order they should appear in the frame
columns_to_use = [
    # target
    'traffic_abs',
    # weather measurements
    'Temperature (°C)', 'Relative Humidity (%)', 'Precipitation (mm)',
    'Wind Speed (km/h)', 'Sunshine Duration (min)',
    # calendar / season columns
    'Monat', 'Wochentag', 'Wochenende', 'Jahreszeit', 'Laubfärbung',
    # public-holiday columns
    'Feiertag_Bayern', 'Feiertag_CZ',
    # "*_geoeffnet" (opened) columns
    'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet',
    'Lusenschutzhaus_geoeffnet', 'Racheldiensthuette_geoeffnet',
    'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
    # school-holiday columns
    'Schulferien_Bayern', 'Schulferien_CZ',
    # year
    'Jahr',
]

df_16_sensors = df_16_sensors.loc[:, columns_to_use]

In [125]:
# Columns stored as floating point numbers
float_cols = [
    'traffic_abs',
    'Temperature (°C)', 'Relative Humidity (%)', 'Precipitation (mm)',
    'Wind Speed (km/h)', 'Sunshine Duration (min)',
    'Monat', 'Jahr',
]

# Columns stored as pandas categoricals
category_cols = [
    'Wochentag', 'Wochenende', 'Jahreszeit', 'Laubfärbung',
    'Feiertag_Bayern', 'Feiertag_CZ',
    'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet',
    'Lusenschutzhaus_geoeffnet', 'Racheldiensthuette_geoeffnet',
    'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
    'Schulferien_Bayern', 'Schulferien_CZ',
]

# Column name -> dtype mapping built from the two groups above
dtype_dict = {
    **dict.fromkeys(float_cols, 'float64'),
    **dict.fromkeys(category_cols, 'category'),
}

# Apply the data types to the dataframe
df_16_sensors = df_16_sensors.astype(dtype_dict)

In [126]:
# Keep rows up to and including 2024-08-19 (label-based slice on the
# timestamp index; a date-only string is expected to cover that whole day).
df_16_sensors = df_16_sensors.loc[:'2024-08-19']

In [128]:
# Conform the frame to a regular hourly index. Per pandas' `asfreq`, hours
# that are absent from the data appear as rows filled with NaN.
df_cleaned = df_16_sensors.asfreq('H')  

In [130]:

# Hour of day taken from the timestamp index, added as an extra feature column
df_cleaned["Hour"] = df_cleaned.index.hour

In [None]:
# The modelling below uses PyCaret's *regression* API: the star import rebinds
# setup / compare_models / save_model (imported from pycaret.time_series on
# the line before it) to their regression versions, and it also provides
# names such as plot_model and predict_model.
from pycaret.time_series import setup, compare_models, save_model

from pycaret.regression import *

# Define the target variables
targets = ['traffic_abs']

numeric_features = [
    'Temperature (°C)', 'Relative Humidity (%)', 'Precipitation (mm)',
    'Wind Speed (km/h)', 'Sunshine Duration (min)',
]
categorical_features = [
    'Hour', 'Monat', 'Wochentag', 'Wochenende',
    'Jahreszeit', 'Laubfärbung', 'Feiertag_Bayern', 'Feiertag_CZ',
    'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet',
    'Lusenschutzhaus_geoeffnet', 'Racheldiensthuette_geoeffnet',
    'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
    'Schulferien_Bayern', 'Schulferien_CZ', 'Jahr',
]
# Backward-compatible alias for the original (misspelled) variable name
catgorical_features = categorical_features

# Categorical inputs are handed to PyCaret as strings; one astype call
# replaces the per-column loop.
# NOTE(review): astype(str) turns missing values into the literal string
# 'nan' - confirm this is intended for rows that have no data.
df_cleaned = df_cleaned.astype({feature: str for feature in categorical_features})

# Loop through each target
for target in targets:
    print(f"\nModeling for target: {target}\n")

    cols_for_modeling = [target] + numeric_features + categorical_features

    # Initialize the PyCaret (regression) setup.
    # The rows are ordered in time, so the data must not be shuffled: with a
    # shuffled split, hours adjacent to the test rows end up in the training
    # set and the hold-out / CV scores become overly optimistic. The hold-out
    # set is therefore the last 10% of the rows and cross-validation uses
    # time-ordered folds.
    ts_setup = setup(
        data=df_cleaned[cols_for_modeling],
        target=target,
        train_size=0.9,
        session_id=42,  # For reproducibility
        data_split_shuffle=False,  # Chronological train / hold-out split
        fold_strategy='timeseries',  # Time-ordered cross-validation folds
        fold=3,  # Number of cross-validation folds
        numeric_features=numeric_features,
        categorical_features=categorical_features,
        verbose=False  # Suppress output for clarity
    )

    # Compare models and select the best one.
    # `best_model` keeps the winner of the last processed target.
    best_model = compare_models()

    # Save the best model
    save_model(best_model, f'best_model_{target}')

    print(f"Best model for {target} saved.\n")

In [None]:

# Feature-importance plot for the model selected by compare_models().
# `model` is not assigned anywhere in the visible notebook cells, so this cell
# raised a NameError on a fresh kernel; `best_model` is the object created by
# the modelling cell.
plot_model(best_model, plot = 'feature_all')



In [None]:
# Predictions of the selected model. `best_model` (created by the modelling
# cell) replaces `model`, which is not assigned anywhere in the visible
# notebook cells. Without an explicit `data` argument, predict_model scores
# the hold-out set created by setup().
pred_holdout = predict_model(best_model)
pred_holdout

In [None]:
import plotly.express as px

# Real target next to the model prediction, ordered by the index,
# drawn as one line per column.
predictions_vs_real = (
    pred_holdout
    .loc[:, ["traffic_abs", "prediction_label"]]
    .sort_index()
)
px.line(predictions_vs_real)