In [8]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import numpy as np
import os

In [9]:
# check installed version
import pycaret
pycaret.__version__

'3.3.2'

In [10]:
# check installed version
import pyarrow
pyarrow.__version__

'15.0.0'

## Data Loading

In [11]:
import pandas as pd

def load_and_clean_visitor_count_data(filepath):
    """
    Load the CSV file, remove the last column, and return the cleaned DataFrame.
    
    Parameters:
    - filepath: str, path to the input CSV file.
    
    Returns:
    - DataFrame: The cleaned DataFrame without the last column.
    """
    try:
        # Load the CSV file into a DataFrame
        df = pd.read_csv(filepath, index_col='Time' , sep=',', parse_dates=True, low_memory=False)
        
        return df
    
    except FileNotFoundError:
        print(f"Error: The file at {filepath} was not found.")
    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")
    except pd.errors.ParserError:
        print("Error: There was a problem parsing the file.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Example usage
filepath = os.path.join('data', 'visitor_count', 'aggregated_historic_visitor_count.csv')
df = load_and_clean_visitor_count_data(filepath)


In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from pycaret.time_series import *

# Assuming 'df' is the dataframe with time as index and 95 sensors as columns (hourly frequency)
# Ensure index is DateTime and frequency is set correctly
df.index = pd.to_datetime(df.index)
df = df.asfreq('H')

# List to store forecast results for each sensor
forecasts = {}

# Loop through each sensor (column) to run the PyCaret model for the next year
for sensor in df.columns:
    print(f"Running forecast for {sensor}")
    
    # Setup PyCaret for the current sensor's data
    exp = setup(data=df[[sensor]], target=sensor, session_id=123, fold=3, fh=8760,  # Forecast horizon for 8760 hours (1 year)
                seasonal_period='H', verbose=False)
    
    # Compare models to find the best for this sensor
    best_model = compare_models()
    
    # Create the model for the sensor
    model = create_model(best_model)
    
    # Forecast the next 8760 hours (1 year) for this sensor
    forecast = predict_model(model, fh=8760)
    
    # Store the forecasted values
    forecast_index = pd.date_range(start=df.index[-1], periods=8761, freq='H')[1:]  # Generating 8760 hours from the last index
    forecast_series = pd.Series(forecast['Label'].values, index=forecast_index)
    forecasts[sensor] = forecast_series
    
    # Plot actual vs forecast for the sensor (last 100 data points and future 8760 points)
    plt.figure(figsize=(12, 6))
    plt.plot(df.index[-100:], df[sensor].tail(100), label='Actual', color='blue')
    plt.plot(forecast_series.index, forecast_series, label='Forecast (Next Year)', color='red')
    plt.title(f'Visitor Count Forecast for {sensor} - Next Year')
    plt.xlabel('Date')
    plt.ylabel('Number of Visitors')
    plt.legend()
    plt.grid(True)
    plt.show()



ValueError: cannot reindex on an axis with duplicate labels

In [None]:
duplicates = df.index.duplicated()
print(df[duplicates])


                     Unnamed: 0  Bayerisch Eisenstein IN  \
Time                                                       
2018-03-25 03:00:00        1995                      NaN   
2019-03-31 03:00:00        2139                      NaN   
2020-03-29 03:00:00        2115                      NaN   
2021-03-28 03:00:00        2067                      NaN   
2022-03-27 03:00:00        2043                      NaN   
...                         ...                      ...   
NaT                        8755                      NaN   
NaT                        8756                      NaN   
NaT                        8757                      NaN   
NaT                        8758                      NaN   
NaT                        8759                      NaN   

                     Bayerisch Eisenstein OUT  \
Time                                            
2018-03-25 03:00:00                       NaN   
2019-03-31 03:00:00                       NaN   
2020-03-29 03:00:00    