# Exponential Smoothing in der Bottom-Up-Variante (ES_bu)



Lösung erzielte folgenden Score:


| Score            |
|------------------|
| 0.67139          |



statsmodels installieren

In [1]:
!pip install statsmodels==0.13.2

Collecting statsmodels==0.13.2
  Downloading statsmodels-0.13.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Downloading statsmodels-0.13.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: statsmodels
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.14.1
    Uninstalling statsmodels-0.14.1:
      Successfully uninstalled statsmodels-0.14.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
plotnine 0.13.2 requires statsmodels>=0.14.0, but you have statsmodels 0.13.2 which is incompatible.
ydata-profiling 4.6.4 requires numpy<1.26,>=1.16.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSucc

In [12]:
from multiprocessing import Pool, cpu_count
from pathlib import Path
from typing import Tuple
import gc

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.base import BaseEstimator
from statsmodels.tsa.holtwinters import ExponentialSmoothing


class SklearnExponentialSmoothing(BaseEstimator):
    """Scikit-learn style wrapper around statsmodels' ``ExponentialSmoothing``.

    The constructor only stores the configuration. The statsmodels model is
    built and fitted in :meth:`fit`; the fitted result is kept in ``self.model``.
    """

    def __init__(
        self,
        trend: str = None,
        damped_trend: bool = False,
        seasonal: str = None,
        seasonal_periods: int = None,
    ) -> None:
        """Store the options that are forwarded to ``ExponentialSmoothing``.

        Args:
            trend (str): forwarded as ``trend``.
            damped_trend (bool): forwarded as ``damped_trend``.
            seasonal (str): forwarded as ``seasonal``.
            seasonal_periods (int): forwarded as ``seasonal_periods``.
        """
        self.trend = trend
        self.damped_trend = damped_trend
        self.seasonal = seasonal
        self.seasonal_periods = seasonal_periods

        # Holds the fitted result; stays None until fit() has run.
        self.model = None

    def fit(self, X, y=None):
        """Build the smoothing model on ``X`` and fit it.

        Args:
            X: the observations handed to ``ExponentialSmoothing``.
            y: unused; present for scikit-learn API compatibility.

        Returns:
            SklearnExponentialSmoothing: this instance, to allow chaining.
        """
        smoothing_options = dict(
            trend=self.trend,
            damped_trend=self.damped_trend,
            seasonal=self.seasonal,
            seasonal_periods=self.seasonal_periods,
        )
        unfitted = ExponentialSmoothing(X, **smoothing_options)
        self.model = unfitted.fit()
        return self

    def predict(self, start, end):
        """Delegate to the fitted model's ``predict(start, end)``."""
        fitted = self.model
        return fitted.predict(start, end)

    def forecast(self, horizon: int) -> np.array:
        """Forecast the next `horizon` values after the training data.

        Args:
            horizon (int): number of steps to forecast.

        Returns:
            np.array: the values returned by the fitted model's ``forecast``.
        """
        fitted = self.model
        return fitted.forecast(horizon)


class MultiOutputExponentialSmoothing(BaseEstimator):
    """Fits one exponential smoothing model per row of a DataFrame.

    Every row of the frame passed to :meth:`fit` is treated as one time series.
    The fitted models are stored in ``self.models`` as ``(row label, model)``
    tuples, in row order.
    """

    def __init__(self, params: dict = None):
        """Store the keyword arguments used for every per-series model.

        Args:
            params (dict): keyword arguments for ``SklearnExponentialSmoothing``.
                ``None`` selects additive trend and additive seasonality with a
                seasonal period of 7.
        """
        # A dict literal in the signature would be created once and shared by all
        # instances; build the default per instance instead.
        if params is None:
            params = {"seasonal": "add", "seasonal_periods": 7, "trend": "add"}
        self.params = params

        self.models = [None]
        self.horizon = None

    def fit(self, X, y=None):
        """Fits models for each time series expressed as a row.

        Args:
            X (pd.DataFrame): one time series per row; only the columns whose
                name starts with ``"d_"`` are used as observations.
            y: unused; present for scikit-learn API compatibility.

        Returns:
            MultiOutputExponentialSmoothing: this instance.
        """
        n_workers = cpu_count()
        print(f"Fitting using {n_workers} CPUs in the system.")
        rows = list(X.iterrows())
        # The context manager shuts the worker processes down once the blocking
        # map() has returned, so repeated fits do not accumulate workers.
        with Pool(n_workers) as pool:
            self.models = pool.map(self.fit_one_model, tqdm(rows))
        return self

    def fit_one_model(
        self, id_and_series: Tuple[int, pd.Series]
    ) -> Tuple[int, SklearnExponentialSmoothing]:
        """Fit a single model on one row.

        Args:
            id_and_series: ``(row label, row)`` pair as produced by
                ``DataFrame.iterrows()``.

        Returns:
            Tuple[int, SklearnExponentialSmoothing]: the row label and the model
            fitted on the row's ``d_*`` entries.
        """
        model_id, time_series = id_and_series
        # Keep only the day columns. A prefix test avoids matching unrelated
        # labels that merely contain "d_" and tolerates non-string labels.
        idx_days = [
            idx
            for idx in time_series.index
            if isinstance(idx, str) and idx.startswith("d_")
        ]
        observations = time_series[idx_days].to_numpy()
        model = SklearnExponentialSmoothing(**self.params)
        model.fit(observations)
        return model_id, model

    def forecast(self, horizon: int) -> pd.DataFrame:
        """Predicts `horizon` forecasts.

        Args:
            horizon (int): number of steps to forecast for every fitted series.

        Returns:
            pd.DataFrame: one row per fitted model with the column ``id`` (the
            row label seen during fit) followed by ``F1`` .. ``F<horizon>``.
            The frame has a fresh 0..n-1 index and is empty, with the same
            columns, when no models were fitted.
        """
        columns = [f"F{i+1}" for i in range(horizon)]
        ids = []
        forecasts = []
        for model_id, model in tqdm(self.models):
            ids.append(model_id)
            forecasts.append(model.forecast(horizon))
        # Build the frame once instead of concatenating one frame per model.
        result = pd.DataFrame(forecasts, columns=columns)
        result.insert(0, "id", ids)
        return result


In [9]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the smallest signed /
      unsigned integer type that holds their min and max (bounds inclusive).
    * Float columns are cast to float16 or float32 when their min and max lie
      strictly inside that type's finite range. Note that this can reduce
      precision. Columns that fit neither are left unchanged.
    * All other columns (bool, category, datetime, extension dtypes, ...) are
      left untouched, so the function can safely be called more than once.

    Args:
        df (pd.DataFrame): frame to shrink; it is modified in place.

    Returns:
        pd.DataFrame: the same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        if col_type == object:
            df[col] = series.astype('category')
            continue

        # Only plain numpy integer/float columns are downcast; everything else
        # has no meaningful numeric min/max for this purpose.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue
        if series.empty:
            continue

        c_min = series.min()
        c_max = series.max()
        target = None

        if col_type.kind == 'f':
            # NaN or infinite extremes fail every comparison, leaving the
            # column as it is.
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            lowest, highest = int(c_min), int(c_max)
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= lowest and highest <= limits.max:
                    target = candidate
                    break

        if target is not None:
            df[col] = series.astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a frame that reports zero bytes.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [10]:
# Directory that holds the input CSV files
path = Path("../input/m5-forecasting-accuracy")

# Read the validation and the evaluation data sets
validation = pd.read_csv(path.joinpath("sales_train_validation.csv"))
evaluation = pd.read_csv(path.joinpath("sales_train_evaluation.csv"))

In [5]:
# Show the first and the last rows of the validation frame.
display(validation.head(), validation.tail())

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,2,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,2,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3
30489,FOODS_3_827_WI_3_validation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Show the first and the last rows of the evaluation frame.
display(evaluation.head(), evaluation.tail())

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1,1,1,4,6,0,1,1,1,0
30489,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1,2,0,5,4,0,2,2,5,1


In [13]:
# Fit one model per row of the validation frame, using the class's default parameters.
model = MultiOutputExponentialSmoothing()
model.fit(validation)

Fitting using 4 CPUs in the system.
X_train_iter <generator object DataFrame.iterrows at 0x7ae589fbaff0>


 25%|██▌       | 7624/30490 [00:23<00:36, 627.30it/s]Process ForkPoolWorker-24:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Process ForkPoolWorker-23:
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-21:
Process ForkPoolWorker-22:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/opt/conda/lib/python3.10/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.10/mult

KeyboardInterrupt: 

  File "/opt/conda/lib/python3.10/site-packages/statsmodels/tsa/holtwinters/model.py", line 81, in f
    err = func(*args, **kwargs)
KeyboardInterrupt
  File "/opt/conda/lib/python3.10/site-packages/statsmodels/tsa/holtwinters/model.py", line 837, in _optimize_parameters
    params = self._get_starting_values(
  File "/opt/conda/lib/python3.10/site-packages/statsmodels/tsa/holtwinters/model.py", line 743, in _get_starting_values
    val = opt(point, hw_args)
KeyboardInterrupt
 31%|███▏      | 9530/30490 [21:04<1:23:06,  4.20it/s]
100%|██████████| 30490/30490 [23:31<00:00, 21.61it/s]  [A
 50%|█████     | 15248/30490 [21:23<18:25, 13.79it/s] 

In [None]:
# Forecast 28 steps per series, then overwrite the "id" column with the ids
# of the validation frame (same row order).
sub_validation = model.forecast(28).assign(id=validation["id"].values)

In [None]:
# Drop the validation frame and its fitted models, then trigger garbage collection.
del validation, model
gc.collect()

In [None]:
# Downcast the column dtypes of the evaluation frame to lower its memory usage.
evaluation = reduce_mem_usage(evaluation)

In [None]:
# Fit a fresh set of per-row models on the evaluation frame.
model = MultiOutputExponentialSmoothing()
model.fit(evaluation)

In [None]:
# Forecast 28 steps ahead for every fitted evaluation series.
sub_evaluation = model.forecast(28)

In [None]:
# forecast() fills "id" with the row labels seen during fit; overwrite them with
# the ids from the evaluation frame (relies on the row order being unchanged).
sub_evaluation["id"] = evaluation["id"].values

In [None]:
# Stack the validation and evaluation forecasts and write them to the
# submission file without the index column.
pd.concat([sub_validation, sub_evaluation]).to_csv("submission.csv", index=False)