In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import awswrangler as wr
import boto3

boto3.setup_default_session(profile_name='patricio_ferreira_fellow_dssgx_24')

bucket = "dssgx-munich-2024-bavarian-forest"
raw_data_folder = "raw-data"
preprocessed_data_folder = "preprocessed_data"

def load_csv_files_from_aws_s3(path: str, **kwargs) -> pd.DataFrame:
    """Loads individual or multiple CSV files from an AWS S3 bucket.
    Args:
        path (str): The path to the CSV files on AWS S3.
        **kwargs: Additional arguments to pass to the read_csv function.
    Returns:
        pd.DataFrame: The DataFrame containing the data from the CSV files.
    """
    df = wr.s3.read_csv(path=path, **kwargs)
    return df
df = load_csv_files_from_aws_s3(
    path="s3://dssgx-munich-2024-bavarian-forest/preprocessed_data/joined_sensor_weather_visitorcenter_2016-2024.csv"
)
df.head()

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,...,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max,Total
0,2016-01-01 00:00:00,,,,,,,,,,...,,,,,,,,,,
1,2016-01-01 01:00:00,,,,,,,,,,...,,,,,,,,,,
2,2016-01-01 02:00:00,,,,,,,,,,...,,,,,,,,,,
3,2016-01-01 03:00:00,,,,,,,,,,...,,,,,,,,,,
4,2016-01-01 04:00:00,,,,,,,,,,...,,,,,,,,,,


In [20]:
df[df["Time"] == "2024-03-31 03:00:00"].describe()

Unnamed: 0,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,Felswandergebiet OUT,...,Parkpl_HEH_BUS,Parkpl_HZW_PKW,Parkpl_HZW_BUS,Waldschmidthaus_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max,Total
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,,,,,,,,,,,...,0.0,135.0,0.0,675.0,13.576389,0.0,0.0,97.361111,544.0,1.0
std,,,,,,,,,,,...,,,,,,,,,,
min,,,,,,,,,,,...,0.0,135.0,0.0,675.0,13.576389,0.0,0.0,97.361111,544.0,1.0
25%,,,,,,,,,,,...,0.0,135.0,0.0,675.0,13.576389,0.0,0.0,97.361111,544.0,1.0
50%,,,,,,,,,,,...,0.0,135.0,0.0,675.0,13.576389,0.0,0.0,97.361111,544.0,1.0
75%,,,,,,,,,,,...,0.0,135.0,0.0,675.0,13.576389,0.0,0.0,97.361111,544.0,1.0
max,,,,,,,,,,,...,0.0,135.0,0.0,675.0,13.576389,0.0,0.0,97.361111,544.0,1.0


In [7]:
#drop old traffic columns

df.drop(columns= ['traffic_norm', 'traffic_abs', 'sum_IN_norm',
                  'sum_IN_abs', 'sum_OUT_norm', 'sum_OUT_abs',
                  'diff_norm', 'diff_abs'], 
                  inplace=True)

In [11]:
import pandas as pd

def process_sensor_data(df):
    """
    Processes sensor data by melting, splitting, renaming, and pivoting to separate IN/OUT directions, 
    and then merges back non-sensor columns.

    Args:
        df (pd.DataFrame): The input DataFrame containing sensor data and other related columns.

    Returns:
        pd.DataFrame: The processed DataFrame with separate IN and OUT columns for each sensor, 
        merged back with the original non-sensor data.
    """
    # Ensure the 'Time' column is in datetime format
    df['Time'] = pd.to_datetime(df['Time'])

    # Define the renaming dictionary
    rename_dict = {
        "Bayerisch Eisenstein": ['Bayerisch Eisenstein IN', 'Bayerisch Eisenstein OUT'],
        "Brechhäuslau": ['Brechhäuslau IN', 'Brechhäuslau OUT'],
        "Bučina": ['Bucina MERGED IN', 'Bucina MERGED OUT'],
        "Deffernik": ['Deffernik IN', 'Deffernik OUT'],
        "Diensthüttenstraße": ['Diensthüttenstraße IN', 'Diensthüttenstraße OUT'],
        "Felswandergebiet": ['Felswandergebiet IN', 'Felswandergebiet OUT'],
        "Ferdinandsthal": ['Ferdinandsthal IN', 'Ferdinandsthal OUT'],
        "Fredenbrücke": ['Fredenbrücke IN', 'Fredenbrücke OUT'],
        "Gfäll": ['Gfäll IN', 'Gfäll OUT'],
        "Gsenget": ['Gsenget IN', 'Gsenget OUT'],
        "Klingenbrunner Wald": ['Klingenbrunner Wald IN', 'Klingenbrunner Wald OUT'],
        "Klosterfilz": ['Klosterfilz IN', 'Klosterfilz OUT'],
        "Racheldiensthütte": ['Racheldiensthütte IN', 'Racheldiensthütte OUT'],
        "Sagwassersäge": ['Sagwassersäge IN','Sagwassersäge OUT'],
        "Scheuereck": ['Scheuereck IN', 'Scheuereck OUT'],
        "Schillerstraße": ['Schillerstraße IN', 'Schillerstraße OUT'],
        "Schwarzbachbrücke": ['Schwarzbachbrücke IN', 'Schwarzbachbrücke OUT'],
        "TFG Falkenstein 1": ['Falkenstein 1 MERGED IN', 'Falkenstein 1 MERGED OUT'],
        "TFG Falkenstein 2": ['Falkenstein 2 IN', 'Falkenstein 2 OUT'],
        "TFG Lusen 1": ['Lusen 1 MERGED IN', 'Lusen 1 MERGED OUT'],
        "TFG Lusen 2": ['Lusen 2 IN', 'Lusen 2 OUT'],
        "TFG Lusen 3": ['Lusen 3 IN', 'Lusen 3 OUT'],
        "Trinkwassertalsperre": ['Trinkwassertalsperre MERGED IN', 'Trinkwassertalsperre MERGED OUT'],
        "Waldhausreibe": ['Waldhausreibe IN', 'Waldhausreibe OUT'],
        "Waldspielgelände": ['Waldspielgelände IN', 'Waldspielgelände OUT'],
        "Wistlberg": ['Wistlberg IN', 'Wistlberg OUT'],
    }

    # Flatten the renaming dictionary to map sensors to their IN/OUT variants
    rename_map = {sensor: name for name, sensors in rename_dict.items() for sensor in sensors}

    # List of all sensor columns
    melt_columns = list(rename_map.keys())

    # Columns that will be preserved during melting
    id_vars = [
        'Time', 'Temperature (°C)', 'Relative Humidity (%)', 'Precipitation (mm)',
        'Wind Speed (km/h)', 'Sunshine Duration (min)', 'Tag', 'Monat', 'Jahr', 'Wochentag',
        'Wochenende', 'Jahreszeit', 'Laubfärbung', 'Besuchszahlen_HEH', 'Besuchszahlen_HZW',
        'Besuchszahlen_WGM', 'Parkpl_HEH_PKW', 'Parkpl_HEH_BUS', 'Parkpl_HZW_PKW', 'Parkpl_HZW_BUS',
        'Schulferien_Bayern', 'Schulferien_CZ', 'Feiertag_Bayern', 'Feiertag_CZ', 'HEH_geoeffnet',
        'HZW_geoeffnet', 'WGM_geoeffnet', 'Lusenschutzhaus_geoeffnet', 'Racheldiensthuette_geoeffnet',
        'Waldschmidthaus_geoeffnet', 'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
        'Temperatur', 'Niederschlagsmenge', 'Schneehoehe', 'GS mit', 'GS max'
    ]

    # Melt the DataFrame
    df_melted = df.melt(
        id_vars=id_vars,
        value_vars=melt_columns,
        var_name='Sensor_Direction',
        value_name='Count'
    )

    # Split Sensor_Direction into 'Sensor' and 'Direction'
    df_melted[['Sensor', 'Direction']] = df_melted['Sensor_Direction'].str.rsplit(' ', n=1, expand=True)

    # Map the Sensor names to their desired format using rename_dict
    df_melted['Sensor'] = df_melted['Sensor_Direction'].map(rename_map)

    # Drop the original Sensor_Direction column
    df_melted.drop(columns=['Sensor_Direction'], inplace=True)

    # Extract hour from the 'Time' column
    df_melted['Hour'] = df_melted['Time'].dt.hour

    # Pivot the melted DataFrame to separate IN/OUT columns
    df_pivot = df_melted.pivot_table(
        index=['Time', 'Sensor', 'Hour'],
        columns='Direction',
        values='Count',
        aggfunc='sum'
    ).reset_index()

    # Merge back the non-sensor columns
    df_final = pd.merge(df_pivot, df[id_vars], on='Time', how='left')

    return df_final


In [12]:
df_long = process_sensor_data(df)

In [13]:
df_long.head()

Unnamed: 0,Time,Sensor,Hour,IN,OUT,Temperature (°C),Relative Humidity (%),Precipitation (mm),Wind Speed (km/h),Sunshine Duration (min),...,Lusenschutzhaus_geoeffnet,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max
0,2016-01-01,Bayerisch Eisenstein,0,0.0,0.0,-1.4,96.0,0.0,0.0,0.0,...,,,,,,,,,,
1,2016-01-01,Brechhäuslau,0,0.0,0.0,-1.4,96.0,0.0,0.0,0.0,...,,,,,,,,,,
2,2016-01-01,Bučina,0,0.0,0.0,-1.4,96.0,0.0,0.0,0.0,...,,,,,,,,,,
3,2016-01-01,Deffernik,0,0.0,0.0,-1.4,96.0,0.0,0.0,0.0,...,,,,,,,,,,
4,2016-01-01,Diensthüttenstraße,0,0.0,0.0,-1.4,96.0,0.0,0.0,0.0,...,,,,,,,,,,
