# Import Packages

In [51]:
import pandas as pd
import numpy as np
from prettytable import PrettyTable
import plotly.graph_objects as go
import plotly.express as px
import plotly as plt
import pycaret as pyc
from plotly.subplots import make_subplots
from pycaret.regression import *

# Load and Examine Raw Data

In [52]:
# Import raw data
file_path = r'C:\Users\garov\OneDrive\Documents\GitHub\bavarian-forest-visitor-monitoring-dssgx-24\outputs\Raw Data\national-park-vacation-times-houses-opening-times.xlsx'

# Load the Excel file to a data frame
df_visitcenters = pd.read_excel(file_path)

# Display the first few rows of the DataFrame
df_visitcenters.head()

Unnamed: 0,Datum,Wochentag,Besuchszahlen_HEH,Besuchszahlen_HZW,Besuchszahlen_WGM,Parkpl_HEH_PKW,Parkpl_HEH_BUS,Parkpl_HZW_PKW,Parkpl_HZW_BUS,Schulferien_Bayern,...,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max,Laubfärbung
0,2017-01-01,TTTT,571.0,872.0,55.0,469,1.0,156.0,0.0,1.0,...,0.0,0,1.0,1.0,0.2,0.0,12.0,59.86,345.0,0.0
1,2017-01-02,TTTT,241.0,527.0,90.0,184,1.0,87.0,0.0,1.0,...,0.0,0,1.0,1.0,-4.9,1.8,12.0,16.78,113.0,0.0
2,2017-01-03,TTTT,355.0,1237.0,53.0,246,2.0,115.0,0.0,1.0,...,0.0,0,1.0,1.0,-5.1,0.5,15.0,12.01,81.0,0.0
3,2017-01-04,TTTT,138.0,373.0,88.0,74,1.0,49.0,0.0,1.0,...,0.0,0,1.0,1.0,-4.1,13.3,30.0,11.71,83.0,0.0
4,2017-01-05,TTTT,281.0,406.0,24.0,179,4.0,55.0,0.0,1.0,...,0.0,0,1.0,1.0,-7.8,4.0,40.0,25.53,230.0,0.0


In [53]:
# Print summaries of the data
print(df_visitcenters.info)
print(df_visitcenters.columns)
print(df_visitcenters.describe())
print(df_visitcenters['Wochentag'].unique())

<bound method DataFrame.info of           Datum Wochentag  Besuchszahlen_HEH  Besuchszahlen_HZW  \
0    2017-01-01      TTTT              571.0              872.0   
1    2017-01-02      TTTT              241.0              527.0   
2    2017-01-03      TTTT              355.0             1237.0   
3    2017-01-04      TTTT              138.0              373.0   
4    2017-01-05      TTTT              281.0              406.0   
...         ...       ...                ...                ...   
2918 2024-12-28   Samstag                NaN                NaN   
2919 2024-12-29   Sonntag                NaN                NaN   
2920 2024-12-30    Montag                NaN                NaN   
2921 2024-12-31  Dienstag                NaN                NaN   
2922        NaT       NaN                NaN                NaN   

      Besuchszahlen_WGM Parkpl_HEH_PKW  Parkpl_HEH_BUS  Parkpl_HZW_PKW  \
0                  55.0            469             1.0           156.0   
1              

# Clean Data - Change Data Types

In [54]:
# Modify data types

# Change all binary variables (0, 1) from float64 type to bool type
for column in df_visitcenters.columns:
    if df_visitcenters[column].isin([0, 1, np.nan]).all():  # Check if all values are 0, 1, or NaN
        df_visitcenters[column] = df_visitcenters[column].astype('bool')  # Convert to binary type


In [55]:
# Verify changes to variable types
print(df_visitcenters.dtypes)

Datum                              datetime64[ns]
Wochentag                                  object
Besuchszahlen_HEH                         float64
Besuchszahlen_HZW                         float64
Besuchszahlen_WGM                         float64
Parkpl_HEH_PKW                             object
Parkpl_HEH_BUS                            float64
Parkpl_HZW_PKW                            float64
Parkpl_HZW_BUS                            float64
Schulferien_Bayern                        float64
Schulferien_CZ                             object
Feiertag_Bayern                              bool
Feiertag_CZ                                  bool
HEH_geoeffnet                                bool
HZW_geoeffnet                                bool
WGM_geoeffnet                             float64
Lusenschutzhaus_geoeffnet                    bool
Racheldiensthuette_geoeffnet                 bool
Waldschmidthaus_geoeffnet                  object
Falkensteinschutzhaus_geoeffnet              bool


In [56]:
# Convert remaining object type variables to category type (categorical variables with >3 levels)
for col in df_visitcenters.select_dtypes(include=['object']).columns:
    df_visitcenters[col] = df_visitcenters[col].astype('category')

print(df_visitcenters.dtypes)

Datum                              datetime64[ns]
Wochentag                                category
Besuchszahlen_HEH                         float64
Besuchszahlen_HZW                         float64
Besuchszahlen_WGM                         float64
Parkpl_HEH_PKW                           category
Parkpl_HEH_BUS                            float64
Parkpl_HZW_PKW                            float64
Parkpl_HZW_BUS                            float64
Schulferien_Bayern                        float64
Schulferien_CZ                           category
Feiertag_Bayern                              bool
Feiertag_CZ                                  bool
HEH_geoeffnet                                bool
HZW_geoeffnet                                bool
WGM_geoeffnet                             float64
Lusenschutzhaus_geoeffnet                    bool
Racheldiensthuette_geoeffnet                 bool
Waldschmidthaus_geoeffnet                category
Falkensteinschutzhaus_geoeffnet              bool


In [57]:
# Change remaining object type variables to numeric (float64 type)
df_visitcenters['Parkpl_HEH_PKW'] = pd.to_numeric(df_visitcenters['Parkpl_HEH_PKW'], errors='coerce')
df_visitcenters['Waldschmidthaus_geoeffnet'] = pd.to_numeric(df_visitcenters['Parkpl_HEH_PKW'], errors='coerce')

# Confirm changes
print(df_visitcenters['Parkpl_HEH_PKW'].dtype) 
print(df_visitcenters['Waldschmidthaus_geoeffnet'].dtype) 

float64
float64


In [58]:
# Change school holiday binary variables to bool
df_visitcenters['Schulferien_Bayern'] = df_visitcenters['Schulferien_Bayern'].astype(bool)
df_visitcenters['Schulferien_CZ'] = df_visitcenters['Schulferien_CZ'].astype(bool)

In [59]:
# Replace duplicate date with appropriate date

# Find indices of the date '9/29/2021'
indices = df_visitcenters[df_visitcenters['Datum'] == '9/29/2021'].index

# Ensure there is a second instance
if len(indices) > 1:
    # Replace the second instance with '9/29/2023'
    df_visitcenters.at[indices[1], 'Datum'] = '9/29/2023'
else:
    print("There is no second instance of '9/29/2021' in the DataFrame.")

# Create New Variables for Modeling

In [60]:
# Create new date variables: day, month, and year in separate columns
df_visitcenters['Datum'] = pd.to_datetime(df_visitcenters['Datum'])

# Add new columns for day, month, and year
df_visitcenters['Tag'] = df_visitcenters['Datum'].dt.day
df_visitcenters['Monat'] = df_visitcenters['Datum'].dt.month
df_visitcenters['Jahr'] = df_visitcenters['Datum'].dt.year

# Change day, month, year type for modeling purposes
df_visitcenters['Tag'] = df_visitcenters['Tag'].astype('Int64')
df_visitcenters['Monat'] = df_visitcenters['Monat'].astype('category')
df_visitcenters['Jahr'] = df_visitcenters['Jahr'].astype('Int64')

# Verify changes
print(df_visitcenters['Datum'].dtype) 
print(df_visitcenters['Tag'].dtype)
print(df_visitcenters['Monat'].dtype) 
print(df_visitcenters['Jahr'].dtype) 


datetime64[ns]
Int64
category
Int64


In [61]:
# Create a season variable based on month variables

df_visitcenters['Jahreszeit'] = df_visitcenters['Monat'].apply(
    lambda x: 'Frühling' if x in [3, 4, 5] else
              'Sommer' if x in [6, 7, 8] else
              'Herbst' if x in [9, 10, 11] else
              'Winter' if x in [12, 1, 2] else
              NaN
)

# Make season variable category type
df_visitcenters['Jahreszeit'] = df_visitcenters['Jahreszeit'].astype('category')

In [62]:
# Create a new column 'Day_of_Week' that shows the day of the week
df_visitcenters['Wochentag2'] = df_visitcenters['Datum'].dt.day_name()
df_visitcenters['Wochentag2'] = df_visitcenters['Wochentag2'].astype('category')
print(df_visitcenters['Wochentag2'].dtype)

category


In [63]:
# Define the translation mapping from English to German
translation_map = {
    'Monday': 'Montag',
    'Tuesday': 'Dienstag',
    'Wednesday': 'Mittwoch',
    'Thursday': 'Donnerstag',
    'Friday': 'Freitag',
    'Saturday': 'Samstag',
    'Sunday': 'Sonntag'
}

# Replace the English day names in the 'Wochentag2' column with German names
df_visitcenters['Wochentag2'] = df_visitcenters['Wochentag2'].replace(translation_map)

# Remove the 'Wochentag' column from the DataFrame
df_visitcenters = df_visitcenters.drop(columns=['Wochentag'])

# Rename 'Wochentag2' to 'Wochentag'
df_visitcenters = df_visitcenters.rename(columns={'Wochentag2': 'Wochentag'})

In [64]:
# Create weekend binary variable
df_visitcenters['Wochenende'] = df_visitcenters['Wochentag'].apply(lambda x: x in ['Samstag', 'Sonntag'])
df_visitcenters['Wochenende'] = df_visitcenters['Wochenende'].astype(bool)

In [65]:
# Re-order variables to put date-related variables next to each other
df_visitcenters = df_visitcenters[['Datum', 'Tag', 'Monat', 'Jahr', 'Wochentag', 'Wochenende', 'Jahreszeit', 'Laubfärbung',
                      'Besuchszahlen_HEH', 'Besuchszahlen_HZW', 'Besuchszahlen_WGM', 
                     'Parkpl_HEH_PKW', 'Parkpl_HEH_BUS', 'Parkpl_HZW_PKW', 'Parkpl_HZW_BUS', 
                     'Schulferien_Bayern', 'Schulferien_CZ', 'Feiertag_Bayern', 'Feiertag_CZ', 
                     'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet', 'Lusenschutzhaus_geoeffnet', 
                     'Racheldiensthuette_geoeffnet', 'Waldschmidthaus_geoeffnet', 
                     'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet', 'Temperatur', 
                     'Niederschlagsmenge', 'Schneehoehe', 'GS mit', 'GS max']]

In [66]:
df_visitcenters.head()

Unnamed: 0,Datum,Tag,Monat,Jahr,Wochentag,Wochenende,Jahreszeit,Laubfärbung,Besuchszahlen_HEH,Besuchszahlen_HZW,...,Lusenschutzhaus_geoeffnet,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max
0,2017-01-01,1,1.0,2017,Sonntag,True,Winter,False,571.0,872.0,...,True,False,469.0,True,True,0.2,0.0,12.0,59.86,345.0
1,2017-01-02,2,1.0,2017,Montag,False,Winter,False,241.0,527.0,...,True,False,184.0,True,True,-4.9,1.8,12.0,16.78,113.0
2,2017-01-03,3,1.0,2017,Dienstag,False,Winter,False,355.0,1237.0,...,True,False,246.0,True,True,-5.1,0.5,15.0,12.01,81.0
3,2017-01-04,4,1.0,2017,Mittwoch,False,Winter,False,138.0,373.0,...,True,False,74.0,True,True,-4.1,13.3,30.0,11.71,83.0
4,2017-01-05,5,1.0,2017,Donnerstag,False,Winter,False,281.0,406.0,...,True,False,179.0,True,True,-7.8,4.0,40.0,25.53,230.0


# Final Data Cleaning - Correct Specific Variables/Values that are Strange

In [67]:
# Correct the typo in specific value for column Schulferien_Bayern (from `10` to `0`)
df_visitcenters.loc[df_visitcenters['Datum'] == '2017-04-30', 'Schulferien_Bayern'] = 0

# Change to bool
df_visitcenters['Schulferien_Bayern'] = df_visitcenters['Schulferien_Bayern'].astype(bool)

print(df_visitcenters['Schulferien_Bayern'].unique())

[ True False]


In [68]:
# Correct Besuchszahlen_HEH variable (should be counts and not have any decimals)

# Apply np.ceil() to round up values with non-zero fractional parts to nearest whole number
df_visitcenters['Besuchszahlen_HEH'] = df_visitcenters['Besuchszahlen_HEH'].apply(
    lambda x: np.ceil(x) if pd.notna(x) and x % 1 != 0 else x
)

# Convert 'Besuchszahlen_HEH' to Int64 to retain NaN values
df_visitcenters['Besuchszahlen_HEH'] = df_visitcenters['Besuchszahlen_HEH'].astype('Int64')


In [69]:
# Correct WGM_geoffnet variable: replace single value of 11 with 1
df_visitcenters['WGM_geoeffnet'] = df_visitcenters['WGM_geoeffnet'].replace(11, 1)

# Convert 'WGM_geoeffnet' column to boolean type
df_visitcenters['WGM_geoeffnet'] = df_visitcenters['WGM_geoeffnet'].astype(bool)

In [70]:
# Remove unnecessary last row (2923 row)
if len(df_visitcenters) == 2923:
    # Drop the last row
    df_visitcenters = df_visitcenters.iloc[:-1]
    
# Display the updated DataFrame's shape to verify the change
print(df_visitcenters.shape)

(2922, 32)


# Final Cleaned Data Set

In [71]:
# Save out cleaned csv file

# Specify file path
file_path = r'C:\Users\garov\OneDrive\Documents\GitHub\bavarian-forest-visitor-monitoring-dssgx-24\outputs\Cleaned Data\df_visitcenters.csv'

# Export the DataFrame to a CSV file to above destination
df_visitcenters.to_csv(file_path, index=False)

# Visualizations

In [72]:
# Crosstabs - Not Necessary but Keeping Code

# Total variable for crosstabs
df_visitcenters['Total'] = 1

# Define row variables
row_vars = ['Besuchszahlen_HEH', 'Besuchszahlen_HZW', 'Besuchszahlen_WGM',
            'Parkpl_HEH_PKW', 'Parkpl_HEH_BUS', 'Parkpl_HZW_PKW']

# Create DataFrames for storing means, standard deviations, and counts
means_df = pd.DataFrame()
stds_df = pd.DataFrame()
counts_df = pd.DataFrame()

# Calculate means, standard deviations, and counts for each row variable
for row_var in row_vars:
    mean_df = df_visitcenters.groupby('Jahreszeit')[row_var].mean().reset_index()
    mean_df.rename(columns={row_var: 'Mean_' + row_var}, inplace=True)
    mean_df.set_index('Jahreszeit', inplace=True)
    means_df = pd.concat([means_df, mean_df], axis=1)

    std_df = df_visitcenters.groupby('Jahreszeit')[row_var].std().reset_index()
    std_df.rename(columns={row_var: 'SD_' + row_var}, inplace=True)
    std_df.set_index('Jahreszeit', inplace=True)
    stds_df = pd.concat([stds_df, std_df], axis=1)

    count_df = df_visitcenters.groupby('Jahreszeit')[row_var].count().reset_index()
    count_df.rename(columns={row_var: 'Count_' + row_var}, inplace=True)
    count_df.set_index('Jahreszeit', inplace=True)
    counts_df = pd.concat([counts_df, count_df], axis=1)

# Add rows for total means, standard deviations, and counts across all variables
means_df.loc['Total'] = means_df.mean()
stds_df.loc['Total'] = stds_df.std()
counts_df.loc['Total'] = counts_df.sum()

# Transpose the DataFrames to switch rows and columns
means_df = means_df.T
stds_df = stds_df.T
counts_df = counts_df.T

# Sort the DataFrames by the 'Total' row in descending order
means_df = means_df.sort_values(by='Total', ascending=False)
stds_df = stds_df.sort_values(by='Total', ascending=False)
counts_df = counts_df.sort_values(by='Total', ascending=False)

# Round the values to the nearest hundredth
means_df = means_df.round(2)
stds_df = stds_df.round(2)

# Move 'Total' to the first column
columns = ['Total'] + [col for col in means_df.columns if col != 'Total']
means_df = means_df[columns]
stds_df = stds_df[columns]
counts_df = counts_df[columns]

# Remove 'Mean_', 'SD_', and 'Count_' prefixes from variable names in the index
means_df.index = [name.replace('Mean_', '') for name in means_df.index]
stds_df.index = [name.replace('SD_', '') for name in stds_df.index]
counts_df.index = [name.replace('Count_', '') for name in counts_df.index]

# Convert DataFrames to PrettyTable
table = PrettyTable()
# Add custom headers
table.field_names = ["Variable"] + list(means_df.columns)

# Add rows with variable names, means, standard deviations, and counts
for idx, row in means_df.iterrows():
    # First row with variable names
    table.add_row([idx] + [''] * len(row))
    # Second row with "Mean" and mean values
    table.add_row(["  Mean"] + list(row))
    # Third row with "SDs" and standard deviation values
    if idx in stds_df.index:
        table.add_row(["  Std. Dev. "] + list(stds_df.loc[idx]))
    # Fourth row with "Count" and count values
    if idx in counts_df.index:
        table.add_row(["  Count"] + list(counts_df.loc[idx]))

# Print the formatted table
print(table)


+-------------------+--------+----------+--------+--------+--------+
|      Variable     | Total  | Frühling | Herbst | Sommer | Winter |
+-------------------+--------+----------+--------+--------+--------+
| Besuchszahlen_HEH |        |          |        |        |        |
|         Mean      | 399.09 |  306.46  | 437.65 | 654.55 | 197.71 |
|      Std. Dev.    | 41.38  |  285.47  | 331.57 | 299.08 | 232.2  |
|        Count      |  2738  |   736    |  637   |  674   |  691   |
| Besuchszahlen_HZW |        |          |        |        |        |
|         Mean      | 250.03 |  220.64  | 234.65 | 357.08 | 187.77 |
|      Std. Dev.    | 26.31  |  189.2   | 198.76 | 172.94 | 235.08 |
|        Count      |  2761  |   736    |  637   |  697   |  691   |
|   Parkpl_HEH_PKW  |        |          |        |        |        |
|         Mean      | 240.5  |  193.17  | 261.5  | 392.5  | 114.82 |
|      Std. Dev.    | 36.62  |  194.56  | 211.06 | 179.27 | 126.47 |
|        Count      |  2749  |   7

In [73]:
# Trends - Visitor Centers Separated
df_visitcenters2 = df_visitcenters.copy()

# Identify categorical columns
categorical_columns = df_visitcenters2.select_dtypes(include=['category']).columns

# Temporarily convert categorical columns to strings
df_visitcenters2[categorical_columns] = df_visitcenters2[categorical_columns].astype(str)

# Fill NaN values
df_visitcenters2.fillna(0, inplace=True)

# Convert columns back to categorical (if they were categorical originally)
df_visitcenters2[categorical_columns] = df_visitcenters2[categorical_columns].astype('category')

# Ensure 'Datum' is in datetime format
df_visitcenters2['Datum'] = pd.to_datetime(df_visitcenters2['Datum'])


# List of visitor center columns
visitor_center_columns = [
    'Besuchszahlen_HEH',
    'Besuchszahlen_HZW',
    'Besuchszahlen_WGM',
    'Parkpl_HEH_PKW',
    'Parkpl_HEH_BUS',
    'Parkpl_HZW_PKW',
    'Parkpl_HZW_BUS'
]

# Create subplots
fig = make_subplots(
    rows=4, cols=2,
    subplot_titles=visitor_center_columns
)

# Add a line trace for each visitor center column
for i, column in enumerate(visitor_center_columns):
    row = i // 2 + 1
    col = i % 2 + 1
    fig.add_trace(
        go.Scatter(
            x=df_visitcenters2['Datum'],
            y=df_visitcenters2[column],
            mode='lines+markers',
            name=column
        ),
        row=row, col=col
    )

# Update the layout
fig.update_layout(
    title='Daily Counts of Visitors for Each Visit Center',
    xaxis_title='Date',
    yaxis_title='Visitor Count',
    showlegend=False,  # Hide legend as it's not necessary for each subplot
    template='plotly_white',
    height=1200  # Adjust height to fit all subplots nicely
)

# Update x-axis and y-axis labels for all subplots
for i in range(1, 5):  # 4 rows
    for j in range(1, 3):  # 2 columns
        fig.update_xaxes(title_text="Date", row=i, col=j)
        fig.update_yaxes(title_text="Visitor Count", row=i, col=j)

# Show the figure
fig.show()

In [74]:
# List of visitor center columns

# List of visitor center columns
visitor_center_columns = [
    'Besuchszahlen_HEH',
    'Besuchszahlen_HZW',
    'Besuchszahlen_WGM',
    'Parkpl_HEH_PKW',
    'Parkpl_HEH_BUS',
    'Parkpl_HZW_PKW',
    'Parkpl_HZW_BUS'
]

# Create a copy of the dataframe
df_MA12 = df_visitcenters2.copy()

# Ensure the 'Datum' column is in datetime format
df_MA12['Datum'] = pd.to_datetime(df_MA12['Datum'])

# Ensure the visitor center columns are numeric
for column in visitor_center_columns:
    df_MA12[column] = pd.to_numeric(df_MA12[column], errors='coerce')

# Create 12-month moving averages for each visitor center column
for column in visitor_center_columns:
    df_MA12[f'MA12_{column}'] = df_MA12[column].rolling(window=12).mean()

# Separate numeric columns and categorical columns
numeric_cols = df_MA12.select_dtypes(include=['number']).columns
categorical_cols = df_MA12.select_dtypes(include=['category']).columns

# Fill NaN values only in numeric columns
df_MA12[numeric_cols] = df_MA12[numeric_cols].fillna(0)

# Plot the data and moving averages for each visitor center
for column in visitor_center_columns:
    fig = px.line(df_MA12, x='Datum', y=[column, f'MA12_{column}'], template='plotly_dark', 
                  title=f'{column} and 12-Month Moving Average')
    fig.show()

# Prediction Models/Forecasting

In [75]:
# Make copy of data file for modeling
df_visitcenters3 = df_visitcenters2.copy()

# Step 2: Identify boolean columns
bool_columns = df_visitcenters3.select_dtypes(include='bool').columns

# Step 3: Convert boolean columns to categorical type
df_visitcenters3[bool_columns] = df_visitcenters3[bool_columns].astype('category')

# Recode categorical features with text values to numeric values
# Create a dictionary to map the weekdays to their corresponding numeric values
weekday_mapping = {
    'Sonntag': 1,
    'Montag': 2,
    'Dienstag': 3,
    'Mittwoch': 4,
    'Donnerstag': 5,
    'Freitag': 6,
    'Samstag': 7
}

# Apply the mapping to the 'Wochentag' column
df_visitcenters3['Wochentag'] = df_visitcenters3['Wochentag'].map(weekday_mapping)

# Recode categorical features with text values to numeric values
# Create a dictionary to map the seasons to their corresponding numeric values
season_mapping = {
    'Winter': 1,
    'Frühling': 2,
    'Sommer': 3,
    'Herbst': 4
}

# Apply the mapping to the 'Jahreszeit' column
df_visitcenters3['Jahreszeit'] = df_visitcenters3['Jahreszeit'].map(season_mapping)

# Verify Changes
df_visitcenters3['Jahreszeit'].unique()
df_visitcenters3['Wochentag'].unique()

print(df_visitcenters.columns)
df_visitcenters3.dtypes

Index(['Datum', 'Tag', 'Monat', 'Jahr', 'Wochentag', 'Wochenende',
       'Jahreszeit', 'Laubfärbung', 'Besuchszahlen_HEH', 'Besuchszahlen_HZW',
       'Besuchszahlen_WGM', 'Parkpl_HEH_PKW', 'Parkpl_HEH_BUS',
       'Parkpl_HZW_PKW', 'Parkpl_HZW_BUS', 'Schulferien_Bayern',
       'Schulferien_CZ', 'Feiertag_Bayern', 'Feiertag_CZ', 'HEH_geoeffnet',
       'HZW_geoeffnet', 'WGM_geoeffnet', 'Lusenschutzhaus_geoeffnet',
       'Racheldiensthuette_geoeffnet', 'Waldschmidthaus_geoeffnet',
       'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet',
       'Temperatur', 'Niederschlagsmenge', 'Schneehoehe', 'GS mit', 'GS max',
       'Total'],
      dtype='object')


Datum                              datetime64[ns]
Tag                                         Int64
Monat                                    category
Jahr                                        Int64
Wochentag                                category
Wochenende                               category
Jahreszeit                               category
Laubfärbung                              category
Besuchszahlen_HEH                           Int64
Besuchszahlen_HZW                         float64
Besuchszahlen_WGM                         float64
Parkpl_HEH_PKW                            float64
Parkpl_HEH_BUS                            float64
Parkpl_HZW_PKW                            float64
Parkpl_HZW_BUS                            float64
Schulferien_Bayern                       category
Schulferien_CZ                           category
Feiertag_Bayern                          category
Feiertag_CZ                              category
HEH_geoeffnet                            category


In [76]:
# split data into train-test set
train = df_visitcenters3[df_visitcenters3['Jahr'] < 2021]
test = df_visitcenters3[df_visitcenters3['Jahr'] >= 2021]

# check shape
train.shape, test.shape

((1461, 33), (1461, 33))

In [77]:
# Define outcome variables and their corresponding cut-off dates
outcome_dates = {
    'Besuchszahlen_HEH': '2024-06-30 00:00:00',
    'Besuchszahlen_HZW': '2024-07-23 00:00:00',
    'Besuchszahlen_WGM': '2024-06-16 00:00:00',
    'Parkpl_HEH_PKW': '2024-07-17 00:00:00',
    'Parkpl_HEH_BUS': '2024-07-17 00:00:00',
    'Parkpl_HZW_PKW': '2024-07-11 00:00:00'
}

for outcome_var, cut_off_date in outcome_dates.items():
    print(f"Processing {outcome_var} with cut-off date {cut_off_date}")
    
    # Initialize setup for the current outcome variable
    s = setup(
        data=train,
        test_data=test,
        target=outcome_var,
        fold_strategy='timeseries',
        categorical_features=[
            'Monat', 'Wochentag', 'Wochenende', 'Jahreszeit', 'Laubfärbung',
            'Schulferien_Bayern', 'Schulferien_CZ', 'Feiertag_Bayern', 'Feiertag_CZ',
            'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet', 'Lusenschutzhaus_geoeffnet',
            'Racheldiensthuette_geoeffnet', 'Falkensteinschutzhaus_geoeffnet',
            'Schwellhaeusl_geoeffnet'
        ],
        fold=5,
        transform_target=True,
        session_id=123,
        data_split_shuffle=False,
        fold_shuffle=False
    )

    # Compare models and select the best one based on MSE
    best_model = compare_models(sort='MSE')

    # Generate predictions on the original dataset
    predictions = predict_model(best_model, data=df_visitcenters3)

    # Line plot with vertical line to indicate test-set separation
    fig = px.line(predictions, x='Datum', y=[outcome_var, "prediction_label"], template='plotly_dark')

    # Adding vertical rectangle for test-set separation
    cut_off_date_pd = pd.to_datetime(cut_off_date)
    fig.add_vrect(x0=cut_off_date_pd - pd.DateOffset(days=1), x1=cut_off_date_pd + pd.DateOffset(days=1), fillcolor="grey", opacity=0.25, line_width=0)

    # Display the plot
    fig.show()


Processing Besuchszahlen_HEH with cut-off date 2024-06-30 00:00:00


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Besuchszahlen_HEH
2,Target type,Regression
3,Original data shape,"(2922, 33)"
4,Transformed data shape,"(2922, 55)"
5,Transformed train set shape,"(1461, 55)"
6,Transformed test set shape,"(1461, 55)"
7,Numeric features,15
8,Date features,1
9,Categorical features,16


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,127.6229,34060.7102,181.2207,0.6997,1.1799,0.3417,0.12
rf,Random Forest Regressor,128.352,35197.4519,184.1248,0.6889,0.9044,0.3565,0.146
lightgbm,Light Gradient Boosting Machine,132.6994,36935.2696,188.4073,0.6745,1.0099,0.3661,0.146
ada,AdaBoost Regressor,137.3243,36989.0968,189.1513,0.6768,1.1267,0.3872,0.08
et,Extra Trees Regressor,133.3581,37279.4647,189.4091,0.6711,0.9144,0.3824,0.114
br,Bayesian Ridge,140.6695,41222.5042,199.079,0.6396,1.4141,0.3884,0.05
ridge,Ridge Regression,143.486,42820.1265,202.5696,0.6255,1.4167,0.3949,0.418
lasso,Lasso Regression,145.653,44309.8976,208.1028,0.6138,1.6919,0.3305,0.432
llar,Lasso Least Angle Regression,146.5725,44555.5386,208.8234,0.6114,1.7112,0.3326,0.044
en,Elastic Net,148.8903,44743.8407,209.5939,0.6086,1.8381,0.3283,0.356


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,92.6227,19787.2041,140.667,0.8266,1.4039,0.4389


Processing Besuchszahlen_HZW with cut-off date 2024-07-23 00:00:00


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Besuchszahlen_HZW
2,Target type,Regression
3,Original data shape,"(2922, 33)"
4,Transformed data shape,"(2922, 55)"
5,Transformed train set shape,"(1461, 55)"
6,Transformed test set shape,"(1461, 55)"
7,Numeric features,15
8,Date features,1
9,Categorical features,16


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,72.9385,13591.7871,115.4089,0.6957,0.8112,0.3382,0.13
gbr,Gradient Boosting Regressor,75.4693,14938.8262,121.2482,0.6654,0.9187,0.3209,0.126
rf,Random Forest Regressor,76.0125,15293.7292,122.4799,0.651,0.8572,0.3458,0.168
lightgbm,Light Gradient Boosting Machine,79.2661,15972.7203,125.4794,0.6447,1.0007,0.3238,0.132
br,Bayesian Ridge,84.335,17161.5454,128.6596,0.5748,1.1654,0.3406,0.048
ada,AdaBoost Regressor,89.6506,17555.6241,131.9861,0.6221,1.6527,0.3768,0.082
ridge,Ridge Regression,86.3143,18482.6753,133.5799,0.5542,1.0756,0.3572,0.05
llar,Lasso Least Angle Regression,95.7323,20083.0431,138.6661,0.4853,1.8289,0.3498,0.052
en,Elastic Net,96.4733,21162.7928,141.2089,0.4464,1.74,0.3603,0.048
lasso,Lasso Regression,97.406,21364.9801,142.1469,0.4434,1.7673,0.3608,0.05


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,29.9844,3441.2572,58.6622,0.9234,1.1397,0.1784


Processing Besuchszahlen_WGM with cut-off date 2024-06-16 00:00:00


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Besuchszahlen_WGM
2,Target type,Regression
3,Original data shape,"(2922, 33)"
4,Transformed data shape,"(2922, 55)"
5,Transformed train set shape,"(1461, 55)"
6,Transformed test set shape,"(1461, 55)"
7,Numeric features,15
8,Date features,1
9,Categorical features,16


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,26.9737,3021.7815,51.4722,0.2112,1.1137,0.9865,0.046
ada,AdaBoost Regressor,32.7157,3043.78,53.0545,0.1252,1.3609,1.4836,0.084
rf,Random Forest Regressor,29.5312,3228.07,53.7332,0.1035,1.1107,1.4273,0.168
lightgbm,Light Gradient Boosting Machine,30.8846,3244.3149,54.2111,0.1,1.0771,1.4513,0.152
llar,Lasso Least Angle Regression,30.8588,3356.4705,55.3802,0.0691,1.584,0.905,0.052
en,Elastic Net,30.9122,3356.5947,55.306,0.0732,1.5709,0.8984,0.046
lasso,Lasso Regression,30.978,3361.7664,55.4155,0.0682,1.5861,0.9105,0.048
huber,Huber Regressor,31.0956,3381.2136,55.2111,0.0833,1.5796,0.8816,0.052
omp,Orthogonal Matching Pursuit,31.3289,3453.2316,56.2901,0.0349,1.5989,0.9388,0.05
dummy,Dummy Regressor,34.1112,3618.0649,57.881,-0.0297,1.7355,1.0843,0.048


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Bayesian Ridge,22.7363,2090.3695,45.7206,0.2428,1.2463,0.8174


Processing Parkpl_HEH_PKW with cut-off date 2024-07-17 00:00:00


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Parkpl_HEH_PKW
2,Target type,Regression
3,Original data shape,"(2922, 33)"
4,Transformed data shape,"(2922, 55)"
5,Transformed train set shape,"(1461, 55)"
6,Transformed test set shape,"(1461, 55)"
7,Numeric features,15
8,Date features,1
9,Categorical features,16


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,1.8563,11.3266,3.1952,0.9997,0.07,0.0186,0.126
rf,Random Forest Regressor,1.6709,18.4947,3.7435,0.9995,0.1552,0.0221,0.148
dt,Decision Tree Regressor,2.7827,76.3911,6.9022,0.9979,0.0659,0.0224,0.054
ada,AdaBoost Regressor,7.8148,142.5404,11.2598,0.9967,0.2427,0.0555,0.09
lightgbm,Light Gradient Boosting Machine,4.8467,167.2548,11.6241,0.996,0.4034,0.0483,0.15
et,Extra Trees Regressor,6.4497,230.8237,12.7217,0.9958,0.5128,0.0417,0.124
br,Bayesian Ridge,36.3654,2927.5274,53.0768,0.9324,1.0931,0.2045,0.048
ridge,Ridge Regression,36.9085,3079.7733,54.4862,0.9273,1.0729,0.1889,0.044
knn,K Neighbors Regressor,42.5564,3455.8737,56.9825,0.9271,1.1656,0.2279,0.05
en,Elastic Net,43.6788,4010.5616,62.0906,0.9097,1.1979,0.2331,0.046


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,1.227,4.1849,2.0457,0.9999,0.0136,0.0073


Processing Parkpl_HEH_BUS with cut-off date 2024-07-17 00:00:00


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Parkpl_HEH_BUS
2,Target type,Regression
3,Original data shape,"(2922, 33)"
4,Transformed data shape,"(2922, 55)"
5,Transformed train set shape,"(1461, 55)"
6,Transformed test set shape,"(1461, 55)"
7,Numeric features,15
8,Date features,1
9,Categorical features,16


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,1.9048,6.6184,2.5569,0.1852,0.6607,0.6944,0.164
rf,Random Forest Regressor,2.0064,7.4812,2.7036,-0.0549,0.679,0.782,0.216
gbr,Gradient Boosting Regressor,2.0412,7.959,2.7846,-0.1583,0.6891,0.8346,0.152
lightgbm,Light Gradient Boosting Machine,2.1086,8.3467,2.8544,-0.1702,0.7045,0.8586,0.176
ridge,Ridge Regression,2.1866,8.5762,2.8822,-0.3029,0.7522,0.9038,0.052
br,Bayesian Ridge,2.1852,8.768,2.9178,-0.2328,0.7291,0.8407,0.054
ada,AdaBoost Regressor,2.2397,8.7754,2.9428,-0.1791,0.7554,0.765,0.096
omp,Orthogonal Matching Pursuit,2.2882,9.1274,3.0056,-0.1427,0.7611,0.7536,0.058
llar,Lasso Least Angle Regression,2.3204,9.3774,3.0455,-0.2388,0.7668,0.8098,0.052
lasso,Lasso Regression,2.3213,9.3815,3.0462,-0.2392,0.7672,0.8101,0.058


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,1.0436,5.6856,2.3844,0.4963,0.6449,0.3368


Processing Parkpl_HZW_PKW with cut-off date 2024-07-11 00:00:00


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Parkpl_HZW_PKW
2,Target type,Regression
3,Original data shape,"(2922, 33)"
4,Transformed data shape,"(2922, 55)"
5,Transformed train set shape,"(1461, 55)"
6,Transformed test set shape,"(1461, 55)"
7,Numeric features,15
8,Date features,1
9,Categorical features,16


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
llar,Lasso Least Angle Regression,21.7684,1066.9191,30.1042,0.3908,1.6031,0.2621,0.058
lasso,Lasso Regression,21.2648,1069.0363,29.976,0.3884,1.542,0.2613,0.066
en,Elastic Net,21.2022,1069.7185,29.9349,0.387,1.5394,0.2621,0.058
huber,Huber Regressor,21.4102,1078.873,30.123,0.3818,1.5941,0.2652,0.058
omp,Orthogonal Matching Pursuit,22.3551,1095.5405,30.6662,0.3802,1.652,0.2648,0.058
rf,Random Forest Regressor,19.16,1100.7864,29.1707,0.344,1.2797,0.237,0.19
ada,AdaBoost Regressor,22.3325,1143.761,31.7545,0.3651,1.6462,0.2827,0.096
gbr,Gradient Boosting Regressor,19.3131,1165.4442,30.3454,0.3126,1.2468,0.231,0.148
br,Bayesian Ridge,21.43,1179.7825,30.4787,0.3054,1.5526,0.2618,0.054
knn,K Neighbors Regressor,21.7242,1281.9206,32.1191,0.2508,1.4134,0.2762,0.056


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Lasso Least Angle Regression,22.8439,1295.152,35.9882,0.464,1.2414,0.4058
