## 1. Import Necessary Libraries

In [14]:
import pandas as pd
from functools import reduce

## 2. Load Datasets
Specify the file paths to your CSV files and load them into DataFrames.


In [15]:
# Input CSV locations, relative to the working directory (adjust as needed)
manual_sensors_path = 'data/cleaned_data/normalized_merged_data_historic_sensors_2016_2024.csv'
weather_data_path = 'data/cleaned_data/processed_weather_data_2016-24_forecasted_imputed.csv'
# NOTE(review): this filename has a space before ".csv" — confirm it matches the file on disk
visitorcenter_data_path = 'data/cleaned_data/bf_visitcenters_hourly .csv'

# Read the three sources in order: sensors, weather, visitor centres
manual_sensors, weather_data, visitorcenter_data = [
    pd.read_csv(csv_path)
    for csv_path in (manual_sensors_path, weather_data_path, visitorcenter_data_path)
]

In [27]:
# Preview the first rows of the sensor data.
# NOTE(review): this cell's execution count (27) is out of sequence; the recorded output
# has 'Time' as the index, so it appears to have been run after prepare_data — confirm.
manual_sensors.head()

Unnamed: 0_level_0,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,Felswandergebiet OUT,...,Falkenstein 1 MERGED OUT,Lusen 1 MERGED IN,Lusen 1 MERGED OUT,Trinkwassertalsperre MERGED IN,Trinkwassertalsperre MERGED OUT,traffic,sum_IN,sum_OUT,diff,occupancy
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,,,,,,,,,,,...,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-01 01:00:00,,,,,,,,,,,...,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-01 02:00:00,,,,,,,,,,,...,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-01 03:00:00,,,,,,,,,,,...,,,,,,0.0,0.0,0.0,0.0,0.0
2016-01-01 04:00:00,,,,,,,,,,,...,,,,,,0.0,0.0,0.0,0.0,0.0


In [17]:
# List the visitor-centre column names to see which features are available.
visitorcenter_data.columns

Index(['Time', 'Tag', 'Monat', 'Jahr', 'Wochentag', 'Wochenende', 'Jahreszeit',
       'Laubfärbung', 'Besuchszahlen_HEH', 'Besuchszahlen_HZW',
       'Besuchszahlen_WGM', 'Parkpl_HEH_PKW', 'Parkpl_HEH_BUS',
       'Parkpl_HZW_PKW', 'Parkpl_HZW_BUS', 'Schulferien_Bayern',
       'Schulferien_CZ', 'Feiertag_Bayern', 'Feiertag_CZ', 'HEH_geoeffnet',
       'HZW_geoeffnet', 'WGM_geoeffnet', 'Lusenschutzhaus_geoeffnet',
       'Racheldiensthuette_geoeffnet', 'Waldschmidthaus_geoeffnet',
       'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
       'Temperatur', 'Niederschlagsmenge', 'Schneehoehe', 'GS mit', 'GS max',
       'Total'],
      dtype='object')

## 3. Data Preprocessing
Convert each dataset's timestamp column to datetime, set it as the index, and align the rows to an hourly frequency.

In [18]:
import pandas as pd

def prepare_data(df, timestamp_column='Time'):
    """
    Return a new DataFrame indexed by timestamp and aligned to an hourly grid.

    The input DataFrame is left untouched: the timestamp column is parsed and
    promoted to the index on a derived frame, never in place.

    Parameters:
    - df: DataFrame containing the data. Either it has a column named
      `timestamp_column`, or its index is already a DatetimeIndex.
    - timestamp_column: Name of the timestamp column to convert and set as the index.

    Returns:
    - DataFrame reindexed to a regular hourly frequency. Hours that are absent
      from the input appear as rows of NaN; no imputation is performed here.

    Raises:
    - TypeError: if the resulting index is not a DatetimeIndex.
    """
    # Promote the timestamp column to the index without mutating the caller's frame
    if timestamp_column in df.columns:
        timestamps = pd.to_datetime(df[timestamp_column])
        df = df.drop(columns=[timestamp_column]).set_index(timestamps)

    # Ensure the index is a DateTimeIndex
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError("Index must be a DateTimeIndex.")

    # Use an offset object rather than the 'H' string alias, which newer
    # pandas releases deprecate; the resulting hourly grid is the same.
    return df.asfreq(pd.offsets.Hour())

# Run every loaded dataset through prepare_data, rebinding each name to the result
manual_sensors, weather_data, visitorcenter_data = [
    prepare_data(frame)
    for frame in (manual_sensors, weather_data, visitorcenter_data)
]


## 4. Concatenate DataFrames
Combine all datasets based on the timestamp index.

In [19]:
# DataFrames to combine; column order in the result follows this list
data_frames = [manual_sensors, weather_data, visitorcenter_data]

# Align all frames on the timestamp index in a single outer join, so every
# timestamp present in any source is kept (absent values become NaN)
merged_data = pd.concat(data_frames, axis=1, join='outer')

# Missing values are left as NaN; no forward/backward fill is applied at this stage.

# Save the combined dataset. Forward slashes match the input paths above and
# work on Windows as well as POSIX systems (a backslash path does not).
merged_data.to_csv('data/cleaned_data/final_merged_dataset.csv')


## 5. Review the Combined Data
Check the first few rows of the merged DataFrame to ensure it looks correct.

In [20]:
# Preview the first rows of the merged dataset.
# NOTE(review): in the recorded output the trailing visitor-centre columns are all NaN for
# these first hours of 2016 — presumably that source starts later; confirm.
merged_data.head()


Unnamed: 0_level_0,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Brechhäuslau IN,Brechhäuslau OUT,Deffernik IN,Deffernik OUT,Diensthüttenstraße IN,Diensthüttenstraße OUT,Felswandergebiet IN,Felswandergebiet OUT,...,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max,Total
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2016-01-01 01:00:00,,,,,,,,,,,...,,,,,,,,,,
2016-01-01 02:00:00,,,,,,,,,,,...,,,,,,,,,,
2016-01-01 03:00:00,,,,,,,,,,,...,,,,,,,,,,
2016-01-01 04:00:00,,,,,,,,,,,...,,,,,,,,,,


In [21]:
# List every column of the merged dataset.
merged_data.columns

Index(['Bayerisch Eisenstein IN', 'Bayerisch Eisenstein OUT',
       'Brechhäuslau IN', 'Brechhäuslau OUT', 'Deffernik IN', 'Deffernik OUT',
       'Diensthüttenstraße IN', 'Diensthüttenstraße OUT',
       'Felswandergebiet IN', 'Felswandergebiet OUT', 'Ferdinandsthal IN',
       'Ferdinandsthal OUT', 'Fredenbrücke IN', 'Fredenbrücke OUT', 'Gfäll IN',
       'Gfäll OUT', 'Gsenget IN', 'Gsenget OUT', 'Klingenbrunner Wald IN',
       'Klingenbrunner Wald OUT', 'Klosterfilz IN', 'Klosterfilz OUT',
       'Racheldiensthütte IN', 'Racheldiensthütte OUT', 'Sagwassersäge IN',
       'Sagwassersäge OUT', 'Scheuereck IN', 'Scheuereck OUT',
       'Schillerstraße IN', 'Schillerstraße OUT', 'Schwarzbachbrücke IN',
       'Schwarzbachbrücke OUT', 'Falkenstein 2 OUT', 'Falkenstein 2 IN',
       'Lusen 2 IN', 'Lusen 2 OUT', 'Lusen 3 IN', 'Lusen 3 OUT',
       'Waldhausreibe IN', 'Waldhausreibe OUT', 'Waldspielgelände IN',
       'Waldspielgelände OUT', 'Wistlberg IN', 'Wistlberg OUT',
       'Bucina

In [22]:
# Check that the merged index kept its name (the recorded output is 'Time').
merged_data.index.name

'Time'

### Check for missing values and duplicate timestamps in the merged dataset

In [23]:


# Count the NaN entries in each column of the merged dataset
missing_values = merged_data.isna().sum()

# Count index entries that repeat an earlier timestamp
duplicate_index = merged_data.index.duplicated().sum()

# Report only the columns that actually contain gaps
print("Missing values per column:")
print(missing_values.loc[missing_values > 0])

if duplicate_index > 0:
    print(f"\nThere are {duplicate_index} rows with duplicate index values.")
    # keep=False selects every row involved in a duplicate, first occurrences included
    print("\nDuplicate rows based on the index:")
    print(merged_data[merged_data.index.duplicated(keep=False)])
else:
    print("\nNo duplicate index values found.")


Missing values per column:
Bayerisch Eisenstein IN     60053
Bayerisch Eisenstein OUT    60053
Brechhäuslau IN              8134
Brechhäuslau OUT             8134
Deffernik IN                51747
                            ...  
Niederschlagsmenge          13200
Schneehoehe                 13416
GS mit                      13200
GS max                      13248
Total                        8784
Length: 96, dtype: int64

No duplicate index values found.


## Identify Numerical Features

In [24]:
# Collect the names of columns stored as int64 or float64.
# NOTE(review): other numeric widths (e.g. int32/float32) would not be picked up by this
# filter; include='number' would cover them — confirm whether that matters here.
numerical_features = merged_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_features


['Bayerisch Eisenstein IN',
 'Bayerisch Eisenstein OUT',
 'Brechhäuslau IN',
 'Brechhäuslau OUT',
 'Deffernik IN',
 'Deffernik OUT',
 'Diensthüttenstraße IN',
 'Diensthüttenstraße OUT',
 'Felswandergebiet IN',
 'Felswandergebiet OUT',
 'Ferdinandsthal IN',
 'Ferdinandsthal OUT',
 'Fredenbrücke IN',
 'Fredenbrücke OUT',
 'Gfäll IN',
 'Gfäll OUT',
 'Gsenget IN',
 'Gsenget OUT',
 'Klingenbrunner Wald IN',
 'Klingenbrunner Wald OUT',
 'Klosterfilz IN',
 'Klosterfilz OUT',
 'Racheldiensthütte IN',
 'Racheldiensthütte OUT',
 'Sagwassersäge IN',
 'Sagwassersäge OUT',
 'Scheuereck IN',
 'Scheuereck OUT',
 'Schillerstraße IN',
 'Schillerstraße OUT',
 'Schwarzbachbrücke IN',
 'Schwarzbachbrücke OUT',
 'Falkenstein 2 OUT',
 'Falkenstein 2 IN',
 'Lusen 2 IN',
 'Lusen 2 OUT',
 'Lusen 3 IN',
 'Lusen 3 OUT',
 'Waldhausreibe IN',
 'Waldhausreibe OUT',
 'Waldspielgelände IN',
 'Waldspielgelände OUT',
 'Wistlberg IN',
 'Wistlberg OUT',
 'Bucina MERGED IN',
 'Bucina MERGED OUT',
 'Falkenstein 1 MERGED IN

## Identify Categorical Features

In [25]:
# Collect the names of columns stored as object or category dtype.
# NOTE(review): the recorded output lists flag-like columns (e.g. 'Wochenende',
# '*_geoeffnet') here, presumably because they are held as object dtype rather than
# bool — confirm and cast if a boolean type is wanted.
categorical_features = merged_data.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features


['Wochentag',
 'Wochenende',
 'Jahreszeit',
 'Laubfärbung',
 'Schulferien_Bayern',
 'Schulferien_CZ',
 'Feiertag_Bayern',
 'Feiertag_CZ',
 'HEH_geoeffnet',
 'HZW_geoeffnet',
 'WGM_geoeffnet',
 'Lusenschutzhaus_geoeffnet',
 'Racheldiensthuette_geoeffnet',
 'Falkensteinschutzhaus_geoeffnet',
 'Schwellhaeusl_geoeffnet']

## Identify Boolean Features

In [26]:
# Collect the names of columns stored as bool dtype.
# NOTE(review): the recorded output is an empty list, so no column currently has bool dtype.
boolean_features = merged_data.select_dtypes(include=['bool']).columns.tolist()
boolean_features


[]