In [6]:
import pandas as pd
from pycaret.time_series import *
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3

boto3.setup_default_session(profile_name='patricio_ferreira_fellow_dssgx_24')

bucket = "dssgx-munich-2024-bavarian-forest"
raw_data_folder = "raw-data"
preprocessed_data_folder = "preprocessed_data"

def load_csv_files_from_aws_s3(path: str, **kwargs) -> pd.DataFrame:
    """Loads individual or multiple CSV files from an AWS S3 bucket.
    Args:
        path (str): The path to the CSV files on AWS S3.
        **kwargs: Additional arguments to pass to the read_csv function.
    Returns:
        pd.DataFrame: The DataFrame containing the data from the CSV files.
    """
    df = wr.s3.read_csv(path=path, **kwargs)
    return df
df = load_csv_files_from_aws_s3(
    path="s3://dssgx-munich-2024-bavarian-forest/preprocessed_data/joined_sensor_weather_visitorcenter_2016-2024.csv"
)
df.head()

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,...,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max,Total
0,2016-01-01 00:00:00,,,,,,,,,,...,,,,,,,,,,
1,2016-01-01 01:00:00,,,,,,,,,,...,,,,,,,,,,
2,2016-01-01 02:00:00,,,,,,,,,,...,,,,,,,,,,
3,2016-01-01 03:00:00,,,,,,,,,,...,,,,,,,,,,
4,2016-01-01 04:00:00,,,,,,,,,,...,,,,,,,,,,


In [7]:
#drop old traffic columns

df.drop(columns= ['traffic_norm', 'traffic_abs', 'sum_IN_norm',
                  'sum_IN_abs', 'sum_OUT_norm', 'sum_OUT_abs',
                  'diff_norm', 'diff_abs'], 
                  inplace=True)



In [8]:
def process_sensor_data(df):
    """
    Processes sensor data by melting, splitting, and pivoting to separate IN/OUT directions, 
    and then merges back non-sensor columns.

    Args:
        df (pd.DataFrame): The input DataFrame containing sensor data and other related columns.

    Returns:
        pd.DataFrame: The processed DataFrame with separate IN and OUT columns for each sensor, 
        merged back with the original non-sensor data.
    """
    # Ensure the 'Time' column is in datetime format
    df['Time'] = pd.to_datetime(df['Time'])

    # List of sensor columns to melt with IN/OUT directions
    sensor_columns = [
        'Bayerisch Eisenstein', 'Brechhäuslau', 'Bucina', 'Deffernik', 'Diensthüttenstraße',
        'Falkenstein 1', 'Falkenstein 2', 'Felswandergebiet', 'Ferdinandsthal', 'Fredenbrücke',
        'Gfäll', 'Gsenget', 'Klingenbrunner Wald', 'Klosterfilz', 'Lusen 1', 'Lusen 2',
        'Lusen 3', 'Racheldiensthütte', 'Sagwassersäge', 'Scheuereck', 'Schillerstraße',
        'Schwarzbachbrücke', 'Trinkwassertalsperre', 'Waldhausreibe', 'Waldspielgelände',
        'Wistlberg'
    ]
    melt_columns = [f'{sensor} {direction}' for sensor in sensor_columns for direction in ['IN', 'OUT']]

    # Columns that will be preserved during melting
    id_vars = [
        'Time', 'Temperature (°C)', 'Relative Humidity (%)', 'Precipitation (mm)',
        'Wind Speed (km/h)', 'Sunshine Duration (min)', 'Tag', 'Monat', 'Jahr', 'Wochentag',
        'Wochenende', 'Jahreszeit', 'Laubfärbung', 'Besuchszahlen_HEH', 'Besuchszahlen_HZW',
        'Besuchszahlen_WGM', 'Parkpl_HEH_PKW', 'Parkpl_HEH_BUS', 'Parkpl_HZW_PKW', 'Parkpl_HZW_BUS',
        'Schulferien_Bayern', 'Schulferien_CZ', 'Feiertag_Bayern', 'Feiertag_CZ', 'HEH_geoeffnet',
        'HZW_geoeffnet', 'WGM_geoeffnet', 'Lusenschutzhaus_geoeffnet', 'Racheldiensthuette_geoeffnet',
        'Waldschmidthaus_geoeffnet', 'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
        'Temperatur', 'Niederschlagsmenge', 'Schneehoehe', 'GS mit', 'GS max'
    ]

    # Melt the DataFrame
    df_melted = df.melt(
        id_vars=id_vars,
        value_vars=melt_columns,
        var_name='Sensor_Direction',
        value_name='Count'
    )

    # Split Sensor_Direction into 'Sensor' and 'Direction'
    df_melted[['Sensor', 'Direction']] = df_melted['Sensor_Direction'].str.rsplit(' ', n=1, expand=True)
    df_melted.drop(columns=['Sensor_Direction'], inplace=True)

    # Extract hour from the 'Time' column
    df_melted['Hour'] = df_melted['Time'].dt.hour

    # Pivot the melted DataFrame to separate IN/OUT columns
    df_pivot = df_melted.pivot_table(
        index=['Time', 'Sensor', 'Hour'],
        columns='Direction',
        values='Count',
        aggfunc='sum'
    ).reset_index()

    # Merge back the non-sensor columns
    df_final = pd.merge(df_pivot, df[id_vars], on='Time', how='left')

    return df_final


In [9]:
df_long = process_sensor_data(df)

KeyError: "The following 'value_vars' are not present in the DataFrame: ['Bucina IN', 'Bucina OUT', 'Falkenstein 1 IN', 'Falkenstein 1 OUT', 'Lusen 1 IN', 'Lusen 1 OUT', 'Trinkwassertalsperre IN', 'Trinkwassertalsperre OUT']"