# Import Packages

In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

# Load and Examine Raw Data

In [2]:
# Import raw data
file_path = r'C:\Users\garov\OneDrive\Documents\GitHub\bavarian-forest-visitor-monitoring-dssgx-24\outputs\Raw Data\national-park-vacation-times-houses-opening-times.xlsx'

# Load the Excel file to a data frame
df_visitcenters = pd.read_excel(file_path)

# Display the first few rows of the DataFrame
df_visitcenters.head()

Unnamed: 0,Datum,Wochentag,Besuchszahlen_HEH,Besuchszahlen_HZW,Besuchszahlen_WGM,Parkpl_HEH_PKW,Parkpl_HEH_BUS,Parkpl_HZW_PKW,Parkpl_HZW_BUS,Schulferien_Bayern,...,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max,Laubfärbung
0,2017-01-01,TTTT,571.0,872.0,55.0,469,1.0,156.0,0.0,1.0,...,0.0,0,1.0,1.0,0.2,0.0,12.0,59.86,345.0,0.0
1,2017-01-02,TTTT,241.0,527.0,90.0,184,1.0,87.0,0.0,1.0,...,0.0,0,1.0,1.0,-4.9,1.8,12.0,16.78,113.0,0.0
2,2017-01-03,TTTT,355.0,1237.0,53.0,246,2.0,115.0,0.0,1.0,...,0.0,0,1.0,1.0,-5.1,0.5,15.0,12.01,81.0,0.0
3,2017-01-04,TTTT,138.0,373.0,88.0,74,1.0,49.0,0.0,1.0,...,0.0,0,1.0,1.0,-4.1,13.3,30.0,11.71,83.0,0.0
4,2017-01-05,TTTT,281.0,406.0,24.0,179,4.0,55.0,0.0,1.0,...,0.0,0,1.0,1.0,-7.8,4.0,40.0,25.53,230.0,0.0


In [3]:
# Print summaries of the data
print(df_visitcenters.info)
print(df_visitcenters.columns)
print(df_visitcenters.describe())
print(df_visitcenters['Wochentag'].unique())

<bound method DataFrame.info of           Datum Wochentag  Besuchszahlen_HEH  Besuchszahlen_HZW  \
0    2017-01-01      TTTT              571.0              872.0   
1    2017-01-02      TTTT              241.0              527.0   
2    2017-01-03      TTTT              355.0             1237.0   
3    2017-01-04      TTTT              138.0              373.0   
4    2017-01-05      TTTT              281.0              406.0   
...         ...       ...                ...                ...   
2918 2024-12-28   Samstag                NaN                NaN   
2919 2024-12-29   Sonntag                NaN                NaN   
2920 2024-12-30    Montag                NaN                NaN   
2921 2024-12-31  Dienstag                NaN                NaN   
2922        NaT       NaN                NaN                NaN   

      Besuchszahlen_WGM Parkpl_HEH_PKW  Parkpl_HEH_BUS  Parkpl_HZW_PKW  \
0                  55.0            469             1.0           156.0   
1              

# Clean Data - Change Data Types

In [4]:
# Modify data types

# Change all binary variables (0, 1) from float64 type to bool type
for column in df_visitcenters.columns:
    if df_visitcenters[column].isin([0, 1, np.nan]).all():  # Check if all values are 0, 1, or NaN
        df_visitcenters[column] = df_visitcenters[column].astype('bool')  # Convert to binary type


In [5]:
# Verify changes to variable types
print(df_visitcenters.dtypes)

Datum                              datetime64[ns]
Wochentag                                  object
Besuchszahlen_HEH                         float64
Besuchszahlen_HZW                         float64
Besuchszahlen_WGM                         float64
Parkpl_HEH_PKW                             object
Parkpl_HEH_BUS                            float64
Parkpl_HZW_PKW                            float64
Parkpl_HZW_BUS                            float64
Schulferien_Bayern                        float64
Schulferien_CZ                             object
Feiertag_Bayern                              bool
Feiertag_CZ                                  bool
HEH_geoeffnet                                bool
HZW_geoeffnet                                bool
WGM_geoeffnet                             float64
Lusenschutzhaus_geoeffnet                    bool
Racheldiensthuette_geoeffnet                 bool
Waldschmidthaus_geoeffnet                  object
Falkensteinschutzhaus_geoeffnet              bool


In [6]:
# Convert remaining object type variables to category type (categorical variables with >3 levels)
for col in df_visitcenters.select_dtypes(include=['object']).columns:
    df_visitcenters[col] = df_visitcenters[col].astype('category')

print(df_visitcenters.dtypes)

Datum                              datetime64[ns]
Wochentag                                category
Besuchszahlen_HEH                         float64
Besuchszahlen_HZW                         float64
Besuchszahlen_WGM                         float64
Parkpl_HEH_PKW                           category
Parkpl_HEH_BUS                            float64
Parkpl_HZW_PKW                            float64
Parkpl_HZW_BUS                            float64
Schulferien_Bayern                        float64
Schulferien_CZ                           category
Feiertag_Bayern                              bool
Feiertag_CZ                                  bool
HEH_geoeffnet                                bool
HZW_geoeffnet                                bool
WGM_geoeffnet                             float64
Lusenschutzhaus_geoeffnet                    bool
Racheldiensthuette_geoeffnet                 bool
Waldschmidthaus_geoeffnet                category
Falkensteinschutzhaus_geoeffnet              bool


In [7]:
# Change remaining object type variables to numeric (float64 type)
df_visitcenters['Parkpl_HEH_PKW'] = pd.to_numeric(df_visitcenters['Parkpl_HEH_PKW'], errors='coerce')
df_visitcenters['Waldschmidthaus_geoeffnet'] = pd.to_numeric(df_visitcenters['Parkpl_HEH_PKW'], errors='coerce')

# Confirm changes
print(df_visitcenters['Parkpl_HEH_PKW'].dtype) 
print(df_visitcenters['Waldschmidthaus_geoeffnet'].dtype) 

float64
float64


In [8]:
# Change school holiday binary variables to bool
df_visitcenters['Schulferien_Bayern'] = df_visitcenters['Schulferien_Bayern'].astype(bool)
df_visitcenters['Schulferien_CZ'] = df_visitcenters['Schulferien_CZ'].astype(bool)

In [9]:
# Replace duplicate date with appropriate date

# Find indices of the date '9/29/2021'
indices = df_visitcenters[df_visitcenters['Datum'] == '9/29/2021'].index

# Ensure there is a second instance
if len(indices) > 1:
    # Replace the second instance with '9/29/2023'
    df_visitcenters.at[indices[1], 'Datum'] = '9/29/2023'
else:
    print("There is no second instance of '9/29/2021' in the DataFrame.")

# Create New Variables for Modeling

In [10]:
# Create new date variables: day, month, and year in separate columns
df_visitcenters['Datum'] = pd.to_datetime(df_visitcenters['Datum'])

# Add new columns for day, month, and year
df_visitcenters['Tag'] = df_visitcenters['Datum'].dt.day
df_visitcenters['Monat'] = df_visitcenters['Datum'].dt.month
df_visitcenters['Jahr'] = df_visitcenters['Datum'].dt.year

# Change day, month, year type for modeling purposes
df_visitcenters['Tag'] = df_visitcenters['Tag'].astype('Int64')
df_visitcenters['Monat'] = df_visitcenters['Monat'].astype('category')
df_visitcenters['Jahr'] = df_visitcenters['Jahr'].astype('Int64')

# Verify changes
print(df_visitcenters['Datum'].dtype) 
print(df_visitcenters['Tag'].dtype)
print(df_visitcenters['Monat'].dtype) 
print(df_visitcenters['Jahr'].dtype) 


datetime64[ns]
Int64
category
Int64


In [11]:
# Create a season variable based on month variables

df_visitcenters['Jahreszeit'] = df_visitcenters['Monat'].apply(
    lambda x: 'Frühling' if x in [3, 4, 5] else
              'Sommer' if x in [6, 7, 8] else
              'Herbst' if x in [9, 10, 11] else
              'Winter' if x in [12, 1, 2] else
              NaN
)

# Make season variable category type
df_visitcenters['Jahreszeit'] = df_visitcenters['Jahreszeit'].astype('category')

In [12]:
# Create a new column 'Day_of_Week' that shows the day of the week
df_visitcenters['Wochentag2'] = df_visitcenters['Datum'].dt.day_name()
df_visitcenters['Wochentag2'] = df_visitcenters['Wochentag2'].astype('category')
print(df_visitcenters['Wochentag2'].dtype)

category


In [13]:
# Define the translation mapping from English to German
translation_map = {
    'Monday': 'Montag',
    'Tuesday': 'Dienstag',
    'Wednesday': 'Mittwoch',
    'Thursday': 'Donnerstag',
    'Friday': 'Freitag',
    'Saturday': 'Samstag',
    'Sunday': 'Sonntag'
}

# Replace the English day names in the 'Wochentag2' column with German names
df_visitcenters['Wochentag2'] = df_visitcenters['Wochentag2'].replace(translation_map)

# Remove the 'Wochentag' column from the DataFrame
df_visitcenters = df_visitcenters.drop(columns=['Wochentag'])

# Rename 'Wochentag2' to 'Wochentag'
df_visitcenters = df_visitcenters.rename(columns={'Wochentag2': 'Wochentag'})

In [14]:
# Create weekend binary variable
df_visitcenters['Wochenende'] = df_visitcenters['Wochentag'].apply(lambda x: x in ['Samstag', 'Sonntag'])
df_visitcenters['Wochenende'] = df_visitcenters['Wochenende'].astype(bool)

In [15]:
# Re-order variables to put date-related variables next to each other
df_visitcenters = df_visitcenters[['Datum', 'Tag', 'Monat', 'Jahr', 'Wochentag', 'Wochenende', 'Jahreszeit', 'Laubfärbung',
                      'Besuchszahlen_HEH', 'Besuchszahlen_HZW', 'Besuchszahlen_WGM', 
                     'Parkpl_HEH_PKW', 'Parkpl_HEH_BUS', 'Parkpl_HZW_PKW', 'Parkpl_HZW_BUS', 
                     'Schulferien_Bayern', 'Schulferien_CZ', 'Feiertag_Bayern', 'Feiertag_CZ', 
                     'HEH_geoeffnet', 'HZW_geoeffnet', 'WGM_geoeffnet', 'Lusenschutzhaus_geoeffnet', 
                     'Racheldiensthuette_geoeffnet', 'Waldschmidthaus_geoeffnet', 
                     'Falkensteinschutzhaus_geoeffnet', 'Schwellhaeusl_geoeffnet', 'Temperatur', 
                     'Niederschlagsmenge', 'Schneehoehe', 'GS mit', 'GS max']]

In [16]:
df_visitcenters.head()

Unnamed: 0,Datum,Tag,Monat,Jahr,Wochentag,Wochenende,Jahreszeit,Laubfärbung,Besuchszahlen_HEH,Besuchszahlen_HZW,...,Lusenschutzhaus_geoeffnet,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max
0,2017-01-01,1,1.0,2017,Sonntag,True,Winter,False,571.0,872.0,...,True,False,469.0,True,True,0.2,0.0,12.0,59.86,345.0
1,2017-01-02,2,1.0,2017,Montag,False,Winter,False,241.0,527.0,...,True,False,184.0,True,True,-4.9,1.8,12.0,16.78,113.0
2,2017-01-03,3,1.0,2017,Dienstag,False,Winter,False,355.0,1237.0,...,True,False,246.0,True,True,-5.1,0.5,15.0,12.01,81.0
3,2017-01-04,4,1.0,2017,Mittwoch,False,Winter,False,138.0,373.0,...,True,False,74.0,True,True,-4.1,13.3,30.0,11.71,83.0
4,2017-01-05,5,1.0,2017,Donnerstag,False,Winter,False,281.0,406.0,...,True,False,179.0,True,True,-7.8,4.0,40.0,25.53,230.0


# Final Data Cleaning - Correct Specific Variables/Values that are Strange

In [17]:
# Correct the typo in specific value for column Schulferien_Bayern (from `10` to `0`)
df_visitcenters.loc[df_visitcenters['Datum'] == '2017-04-30', 'Schulferien_Bayern'] = 0

# Change Schulferien_Bayern to bool type
df_visitcenters['Schulferien_Bayern'] = df_visitcenters['Schulferien_Bayern'].astype(bool)

print(df_visitcenters['Schulferien_Bayern'].unique())

[ True False]


  df_visitcenters.loc[df_visitcenters['Datum'] == '2017-04-30', 'Schulferien_Bayern'] = 0


In [18]:
# Correct Besuchszahlen_HEH variable (should be counts and not have any decimals)

# Apply np.ceil() to round up values with non-zero fractional parts to nearest whole number
df_visitcenters['Besuchszahlen_HEH'] = df_visitcenters['Besuchszahlen_HEH'].apply(
    lambda x: np.ceil(x) if pd.notna(x) and x % 1 != 0 else x
)

# Convert 'Besuchszahlen_HEH' to Int64 to retain NaN values
df_visitcenters['Besuchszahlen_HEH'] = df_visitcenters['Besuchszahlen_HEH'].astype('Int64')


In [19]:
# Correct WGM_geoffnet variable: replace single value of 11 with 1
df_visitcenters['WGM_geoeffnet'] = df_visitcenters['WGM_geoeffnet'].replace(11, 1)

# Convert 'WGM_geoeffnet' column to boolean type
df_visitcenters['WGM_geoeffnet'] = df_visitcenters['WGM_geoeffnet'].astype(bool)

In [20]:
# Remove unnecessary last row (2923 row)
if len(df_visitcenters) == 2923:
    # Drop the last row
    df_visitcenters = df_visitcenters.iloc[:-1]
    
# Display the updated DataFrame's shape to verify the change
print(df_visitcenters.shape)

(2922, 32)


In [21]:
# Detect Outliers in Outcomes of Interests (i.e., the different visitor center counts)

# List of columns to check for outliers
columns_to_check = [
    'Besuchszahlen_HEH',
    'Besuchszahlen_HZW',
    'Besuchszahlen_WGM',
    'Parkpl_HEH_PKW',
    'Parkpl_HEH_BUS',
    'Parkpl_HZW_PKW',
    'Parkpl_HZW_BUS'
]

# Initialize a dictionary to store the outliers
outliers = {}

# Function to detect outliers using the standard deviation method
def detect_outliers_std(df, column, num_sd=7):
    mean = df[column].mean()
    std_dev = df[column].std()
    
    # Define the bounds for outliers
    lower_bound = mean - num_sd * std_dev
    upper_bound = mean + num_sd * std_dev
    
    # Identify outliers
    outliers_mask = (df[column] < lower_bound) | (df[column] > upper_bound)
    return df[outliers_mask][['Datum', column]]

# Apply the function to each column and store the results
for column in columns_to_check:
    outliers[column] = detect_outliers_std(df_visitcenters, column)

# Display the Datum column and the specific column with outliers
for column, outliers_df in outliers.items():
    print(f"Outliers detected in {column}:")
    print(outliers_df)
    print("\n")


Outliers detected in Besuchszahlen_HEH:
Empty DataFrame
Columns: [Datum, Besuchszahlen_HEH]
Index: []


Outliers detected in Besuchszahlen_HZW:
          Datum  Besuchszahlen_HZW
728  2018-12-30             1723.0
1092 2019-12-29             2263.0


Outliers detected in Besuchszahlen_WGM:
         Datum  Besuchszahlen_WGM
139 2017-05-20              429.0
245 2017-09-03              700.0
489 2018-05-05              535.0
609 2018-09-02              750.0
651 2018-10-14              428.0
874 2019-05-25              513.0
973 2019-09-01              850.0


Outliers detected in Parkpl_HEH_PKW:
Empty DataFrame
Columns: [Datum, Parkpl_HEH_PKW]
Index: []


Outliers detected in Parkpl_HEH_BUS:
Empty DataFrame
Columns: [Datum, Parkpl_HEH_BUS]
Index: []


Outliers detected in Parkpl_HZW_PKW:
          Datum  Parkpl_HZW_PKW
2316 2023-05-06           511.0


Outliers detected in Parkpl_HZW_BUS:
          Datum  Parkpl_HZW_BUS
206  2017-07-26             7.0
262  2017-09-20             7.0
501

In [22]:
for column in columns_to_check:
    # Use the outliers mask to identify the positions of the outliers and replace them with nan
    mean = df_visitcenters[column].mean()
    std_dev = df_visitcenters[column].std()
    
    lower_bound = mean - 7 * std_dev
    upper_bound = mean + 7 * std_dev
    
    df_visitcenters.loc[(df_visitcenters[column] < lower_bound) | (df_visitcenters[column] > upper_bound), column] = np.nan


In [23]:
# Created hourly level dataframe

# Generate a new DataFrame where each day is expanded into 24 rows (one per hour)
df_visitcenters_hourly= df_visitcenters.loc[df_visitcenters.index.repeat(24)].copy()

# Create the hourly timestamps by adding hours to the 'Datum' column
df_visitcenters_hourly['Datum'] = df_visitcenters_hourly['Datum'] + pd.to_timedelta(df_visitcenters_hourly.groupby(df_visitcenters_hourly.index).cumcount(), unit='h')

# Rename
df_visitcenters_hourly = df_visitcenters_hourly.rename(columns=lambda x: x.strip())
df_visitcenters_hourly = df_visitcenters_hourly.rename(columns={'Datum': 'Time'})

df_visitcenters_hourly.head()

Unnamed: 0,Time,Tag,Monat,Jahr,Wochentag,Wochenende,Jahreszeit,Laubfärbung,Besuchszahlen_HEH,Besuchszahlen_HZW,...,Lusenschutzhaus_geoeffnet,Racheldiensthuette_geoeffnet,Waldschmidthaus_geoeffnet,Falkensteinschutzhaus_geoeffnet,Schwellhaeusl_geoeffnet,Temperatur,Niederschlagsmenge,Schneehoehe,GS mit,GS max
0,2017-01-01 00:00:00,1,1.0,2017,Sonntag,True,Winter,False,571,872.0,...,True,False,469.0,True,True,0.2,0.0,12.0,59.86,345.0
0,2017-01-01 01:00:00,1,1.0,2017,Sonntag,True,Winter,False,571,872.0,...,True,False,469.0,True,True,0.2,0.0,12.0,59.86,345.0
0,2017-01-01 02:00:00,1,1.0,2017,Sonntag,True,Winter,False,571,872.0,...,True,False,469.0,True,True,0.2,0.0,12.0,59.86,345.0
0,2017-01-01 03:00:00,1,1.0,2017,Sonntag,True,Winter,False,571,872.0,...,True,False,469.0,True,True,0.2,0.0,12.0,59.86,345.0
0,2017-01-01 04:00:00,1,1.0,2017,Sonntag,True,Winter,False,571,872.0,...,True,False,469.0,True,True,0.2,0.0,12.0,59.86,345.0


# Final Cleaned Data Sets

In [24]:
# Rename 'Datum' column to 'Time'
df_visitcenters.rename(columns={'Datum': 'time'}, inplace=True)

# Convert 'Time' column to datetime
df_visitcenters['time'] = pd.to_datetime(df_visitcenters['time'])

In [25]:
# Save out cleaned csv file

# Daily-level Dataset
# Specify file path
file_path = r'C:\Users\garov\OneDrive\Documents\GitHub\bavarian-forest-visitor-monitoring-dssgx-24\outputs\Cleaned Data\df_visitcenters_daily.csv'

# Export the DataFrame to a CSV file to above destination
df_visitcenters.to_csv(file_path, index=False)

# Hourly-level Dataset
# Specify file path
file_path = r'C:\Users\garov\OneDrive\Documents\GitHub\bavarian-forest-visitor-monitoring-dssgx-24\outputs\Cleaned Data\df_visitcenters_hourly.csv'

# Export the DataFrame to a CSV file to above destination
df_visitcenters_hourly.to_csv(file_path, index=False)



In [26]:
csv_file_path = "outputs\Cleaned Data\df_visitcenters_daily.csv"

# Export the DataFrame to a CSV file to above destination
df_visitcenters.to_csv(csv_file_path, index=False)

df_visitcenters.set_index('time', inplace=True)

table = pa.Table.from_pandas(df_visitcenters,preserve_index=True)

parquet_file_path = "outputs\Cleaned Data\df_visitcenters_daily.parquet"

pq.write_table(table, parquet_file_path)

In [27]:
check_df  = pd.read_parquet(parquet_file_path, engine='pyarrow')

print(check_df.head())

            Tag  Monat  Jahr   Wochentag  Wochenende Jahreszeit  Laubfärbung  \
time                                                                           
2017-01-01    1    1.0  2017     Sonntag        True     Winter        False   
2017-01-02    2    1.0  2017      Montag       False     Winter        False   
2017-01-03    3    1.0  2017    Dienstag       False     Winter        False   
2017-01-04    4    1.0  2017    Mittwoch       False     Winter        False   
2017-01-05    5    1.0  2017  Donnerstag       False     Winter        False   

            Besuchszahlen_HEH  Besuchszahlen_HZW  Besuchszahlen_WGM  ...  \
time                                                                 ...   
2017-01-01                571              872.0               55.0  ...   
2017-01-02                241              527.0               90.0  ...   
2017-01-03                355             1237.0               53.0  ...   
2017-01-04                138              373.0           