In [4]:

# General packages
import os
from multiprocessing import Pool, cpu_count
from time import time
import shutil
from datetime import datetime
import glob
import re

# Data processing and visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import h5py

# Machine learning and forecasting
import darts
from darts.dataprocessing.transformers import Scaler
from darts import TimeSeries
from darts.models import TransformerModel, ExponentialSmoothing
from darts.metrics import mape
from darts.utils.statistics import check_seasonality, plot_acf

# Time series forecasting with Prophet
import prophet
from prophet import Prophet

# PySpark for distributed processing
import pyspark
from pyspark.sql.types import *
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Deep learning with Torch
import torch
import torch.nn as nn
import torch.optim as optim

# Machine learning and clustering
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import BisectingKMeans
from tslearn.clustering import silhouette_score, TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Custom functions
%run functions.py

### Looping the cluster code over all cluster data sets


In [34]:
# Folders holding the per-cluster input data and the generated forecasts
data_folder_path = '/work/Data-Science-Liv/clustered_data'
forecast_folder_path = '/work/Data-Science-Liv/cluster_forecasts'

# Create the folders if they don't exist. exist_ok=True makes this a single
# call with no window between an existence check and the creation.
for folder_path in (data_folder_path, forecast_folder_path):
    os.makedirs(folder_path, exist_ok=True)

# Define the file pattern to match
file_pattern = 'long_cluster_*_full.csv'

# Get a list of file paths that match the pattern. glob returns matches in
# arbitrary order, so sort them (lexicographically) to process the clusters in
# the same order on every run.
data_file_paths = sorted(glob.glob(os.path.join(data_folder_path, file_pattern)))

# Initialize an empty dictionary to store the forecasts for each cluster,
# keyed by cluster number
forecasts_clusters = {}

# Pattern for a cluster file name such as 'long_cluster_3_full.csv'. The dot is
# escaped and the pattern is anchored at the end so that only the literal
# '.csv' suffix is accepted; it is compiled once, outside the loop.
cluster_file_regex = re.compile(r'long_cluster_(\d+)_full\.csv$')

# Iterate over the file paths, producing one forecast DataFrame and one CSV
# file per cluster.
# NOTE(review): create_splits, create_representative_train,
# create_representative_test, create_adjustment_series and
# get_linear_model_pred are not defined in this cell -- presumably they come
# from functions.py; confirm their contracts there.
for data_file_path in data_file_paths:

    # Extract the cluster number from the file name only (not from any
    # directory component); skip files whose name does not carry one
    match = cluster_file_regex.search(os.path.basename(data_file_path))
    if match is None:
        continue
    cluster = int(match.group(1))

    df = pd.read_csv(data_file_path, parse_dates=['Year'])

    # Create test and train splits
    train_df, test_df = create_splits(df)

    # Obtain the representative train ts via yearly aggregation across all individual time series in the current cluster
    representative_df_train = create_representative_train(train_df)

    # Define the model to be fit to the representative (linear trend, all seasonalities disabled)
    model = Prophet(
        growth='linear',
        seasonality_mode='additive',
        yearly_seasonality=False,
        weekly_seasonality=False,
        daily_seasonality=False
    )

    # Fit model to representative train ts
    model.fit(representative_df_train)

    # Obtain the representative test ts
    representative_df_test = create_representative_test(test_df)

    # Generate forecasts on the test period for the representative test ts
    # (the actual values 'y' are dropped before predicting)
    representative_forecasts = model.predict(df=representative_df_test.drop('y', axis=1))

    # Create the adjustment series and combine together train, test and all adjustment series
    df_actual_all = create_adjustment_series(train_df, test_df, representative_df_train, representative_df_test, representative_forecasts)

    #### Generate forecasts for each country's time series

    # Create adjustment forecasts by fitting a simple linear model to the adjustment series
    adjustment_forecasts_df = df_actual_all.groupby(['Country']).apply(lambda x: get_linear_model_pred(x, representative_df_test)).reset_index(drop=False)
    adjustment_forecasts_df = adjustment_forecasts_df.rename(columns={'Year': 'ds'})

    # Merge all data so far with the adjustment forecasts
    final_output = pd.merge(df_actual_all, adjustment_forecasts_df, on=['Country', 'ds'], how='left')

    # Create forecasts for all individual time series in a cluster by adding together the representative forecasts and adjustment forecasts
    final_output['final_prediction'] = final_output['yhat'] + final_output['adjustment_Forecasts_test']

    # Store forecasts in a dictionary based on the current cluster
    forecasts_clusters[cluster] = final_output[['Country', 'ds', 'final_prediction']]

    # Save the current forecast DataFrame as a CSV file
    file_name = f'forecasts_cluster{cluster}.csv'
    file_path = os.path.join(forecast_folder_path, file_name)
    forecasts_clusters[cluster].to_csv(file_path, index=False)


17:28:44 - cmdstanpy - INFO - Chain [1] start processing
17:28:44 - cmdstanpy - INFO - Chain [1] done processing
17:28:45 - cmdstanpy - INFO - Chain [1] start processing
17:28:45 - cmdstanpy - INFO - Chain [1] done processing
17:28:46 - cmdstanpy - INFO - Chain [1] start processing
17:28:46 - cmdstanpy - INFO - Chain [1] done processing
17:28:47 - cmdstanpy - INFO - Chain [1] start processing
17:28:47 - cmdstanpy - INFO - Chain [1] done processing
