In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [26]:
# Read the sensor readings and the holiday calendar; in both files the first
# CSV column is used as the row index.
selected_sensors_df, kazakhstan_holidays = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../data/selected_sensors2_cleaned.csv",
        "../data/kazakhstan_holidays.csv",
    )
)
selected_sensors_df

Unnamed: 0,full_date,2,5,6
0,2017-03-22,40.683844,39.022917,33.550382
1,2017-03-23,29.237465,24.606322,23.765278
2,2017-03-24,43.675636,32.662021,24.127526
3,2017-03-25,58.792217,73.688502,57.902710
4,2017-03-26,48.348401,41.901811,28.811111
...,...,...,...,...
1291,2020-10-03,50.642450,43.423905,28.360000
1292,2020-10-04,30.410000,17.520000,21.340000
1293,2020-10-05,29.590000,16.530000,20.040000
1294,2020-10-06,26.380000,13.220000,17.600000


In [17]:
# Single-sensor frame: keep the date column and sensor "2", rename them to
# `ds` / `y`, and tag every row with a constant series id.
df = (
    selected_sensors_df[['full_date', '2']]
    .rename(columns={"full_date": "ds", "2": "y"})
    .assign(unique_id=2)
)

## Prophet winter overfit 

In [18]:
def train_test_split(series, test_size=0.2):
    """
    Split a series into a leading train part and a trailing test part.

    The order of the elements is preserved: the last ``int(len(series) * test_size)``
    elements form the test set and everything before them forms the train set.

    Parameters:
        series (array-like): Sliceable sequence that supports ``len()``.
        test_size (float): Proportion of the series to include in the test set (default: 0.2).
            The resulting element count is clamped to the range ``[0, len(series)]``.

    Returns:
        tuple: (train_series, test_series). If the series cannot be measured or
        sliced, a warning is issued and ``(series, None)`` is returned instead.
    """
    # Imported locally so the function works even when the notebook's import
    # cell does not bring in `warnings`.
    import warnings

    try:
        n = len(series)
        test_count = min(max(int(n * test_size), 0), n)
        # Slice from the front using an explicit split position. Negative
        # slicing breaks when test_count == 0: series[:-0] is empty and
        # series[-0:] is the whole series, which swaps train and test.
        split_at = n - test_count
        train_series = series[:split_at]
        test_series = series[split_at:]
        return train_series, test_series
    except Exception as e:
        warnings.warn(f"Failed to split series into train and test sets: {e}")
        return series, None


In [25]:
# Evaluation window: train on everything before `test_date`, test on
# [test_date, test_end_date). The earlier "2020-03-01" assignment was dead code
# (immediately overwritten) and has been removed.
test_date = "2018-09-01"
test_end_date = "2019-04-01"

# `ds` still holds date strings here, so these are string comparisons; they
# order correctly only because the dates are formatted as YYYY-MM-DD.
df_train = df[df.ds < test_date].copy()
df_test = df[(df.ds >= test_date) & (df.ds < test_end_date)].copy()

df_train.shape, df_test.shape

((528, 3), (212, 3))

In [27]:
# Parse the `ds` column of both splits into datetimes.
for _frame in (df_train, df_test):
    _frame['ds'] = pd.to_datetime(_frame['ds'])

In [None]:
from prophet import Prophet
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Prepare the training data for Prophet. `df_train` already carries the `ds`
# and `y` columns Prophet requires (they were renamed when `df` was built), so
# the previous rename of 'date_time'/'value' was a no-op and has been dropped.
df_prophet = df_train.reset_index()

# Define the Prophet model: multiplicative seasonality with yearly and weekly
# components, the holiday calendar, and an extra ~monthly component.
# NOTE(review): Prophet expects the holidays frame to have `holiday` and `ds`
# columns — confirm the schema of kazakhstan_holidays.csv.
model = Prophet(
    seasonality_mode='multiplicative',
    yearly_seasonality=True,
    weekly_seasonality=True,
    holidays=kazakhstan_holidays,
)
model.add_seasonality(name='monthly', period=30.5, fourier_order=8)

# make_future_dataframe() extends the history stored on a *fitted* model, so the
# model has to be fitted first; it also takes no `df` argument.
model.fit(df_prophet)

# One future row per test observation (the test frame has a `y` column, not `value`).
future = model.make_future_dataframe(periods=len(df_test), freq='D')

In [None]:
from prophet import Prophet
from sklearn.metrics import mean_absolute_error
import pandas as pd
import itertools

# Candidate values for each tunable setting.
param_grid = {
    'changepoint_prior_scale': [0.01, 0.1, 0.5],
    'seasonality_prior_scale': [1.0, 10.0],
    'fourier_order': [5, 10, 15, 20, 25, 30],
    'use_holidays': [True, False]
}

# Cartesian product of the grid: one {setting: value} dict per combination.
all_params = [
    {name: value for name, value in zip(param_grid.keys(), combo)}
    for combo in itertools.product(*param_grid.values())
]

# Hold out the final six months of observations as the validation set.
# NOTE(review): the original comment labelled this window "winter", but the
# displayed frame ends on 2020-10-06, which would put the window at roughly
# April-October — confirm the intended validation period.
df['ds'] = pd.to_datetime(df['ds'])
df = df.sort_values(by='ds')
train_cutoff = df['ds'].max() - pd.DateOffset(months=6)
df_train = df.loc[df['ds'].lt(train_cutoff)]
df_valid = df.loc[df['ds'].ge(train_cutoff)]

# Store results: one {'params', 'mape'} record per parameter combination.
results = []

# Forecast only the validation dates. This frame does not depend on the
# parameters, so it is built once, outside the loop. Predicting on the observed
# dates (instead of history + N generated days) keeps the forecast aligned with
# the actuals even if the series has gaps.
future = df_valid[['ds']]

for params in all_params:
    print(f"Trying params: {params}")

    m = Prophet(
        changepoint_prior_scale=params['changepoint_prior_scale'],
        seasonality_prior_scale=params['seasonality_prior_scale'],
        yearly_seasonality=False,  # We'll define it manually
        holidays=kazakhstan_holidays if params['use_holidays'] else None
    )

    # Manual yearly component so its Fourier order can be tuned.
    m.add_seasonality(name='yearly', period=365.25, fourier_order=params['fourier_order'])

    m.fit(df_train)
    forecast = m.predict(future)

    # Merge forecast with actuals. The inner join keeps only dates that have
    # both a prediction and an observed `y`; a left join onto a forecast that
    # includes training dates would leave NaNs in `y` and make the metric raise.
    forecast_valid = (
        forecast[['ds', 'yhat']]
        .set_index('ds')
        .join(df_valid.set_index('ds'), how='inner')
    )

    # Calculate MAPE on the full validation set.
    mape = mean_absolute_percentage_error(forecast_valid['y'], forecast_valid['yhat'])

    results.append({
        'params': params,
        'mape': mape
    })

# Results sorted by best performance (lowest MAPE first)
results_df = pd.DataFrame(results).sort_values('mape')

In [None]:
from itertools import product

# Define parameter grid
param_grid = {
    'changepoint_prior_scale': np.arange(0.01, 0.25, 0.05),
    'seasonality_prior_scale': np.arange(0.5, 5, 0.5)
}

# Hold-out data for scoring. `test_data_prophet` was never defined in the
# notebook, so it is derived here from `df_test`, the window that directly
# follows the `df_train` slice `df_prophet` was built from.
# NOTE(review): confirm `df_test` is the intended evaluation window.
test_data_prophet = df_test[['ds', 'y']].sort_values('ds').reset_index(drop=True)

# Run a grid search, keeping the combination with the lowest MAPE.
best_mape = float('inf')
best_params = None

for params in product(param_grid['changepoint_prior_scale'], param_grid['seasonality_prior_scale']):
    model = Prophet(
        changepoint_prior_scale=params[0],
        seasonality_prior_scale=params[1],
        yearly_seasonality=True,
        weekly_seasonality=True,
        seasonality_mode='multiplicative',
    )
    model.fit(df_prophet)

    # Predict exactly the hold-out dates rather than a `future` frame left over
    # from an earlier cell; the forecast rows then line up one-to-one with
    # `test_data_prophet`, which is sorted by date above.
    forecast = model.predict(test_data_prophet[['ds']])

    # Evaluate (MAPE expressed in percent)
    mape = mean_absolute_percentage_error(test_data_prophet['y'], forecast['yhat']) * 100

    if mape < best_mape:
        best_mape = mape
        best_params = params

print(f"Best Params: {best_params}, Best MAPE: {best_mape}%")
