In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv
/kaggle/input/tabular-playground-series-jan-2022/train.csv
/kaggle/input/tabular-playground-series-jan-2022/test.csv


In [None]:
# Ask conda which statsmodels package(s) it reports for this environment.
!conda list statsmodels

In [None]:
!pip uninstall statsmodels -y
!pip install -U statsmodels

In [7]:
from pathlib import Path
from itertools import cycle
from random import randint

from statsmodels.tsa.api import STLForecast
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

from deap import creator as ga_cr, base as ga_b, algorithms as ga_algo, tools as ga_t

from sklearn.metrics import mean_squared_error

In [8]:
# Make every top-level expression in a cell display its value, not only the
# last one (later cells rely on this, e.g. showing `train` and `test` together).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [9]:
# The input directory sits next to the current working directory.
path = Path.cwd().parent / 'input'
# Collect every entry (directories and files) found recursively below it.
files = [entry for entry in path.rglob('*')]
files

[PosixPath('/kaggle/input/tabular-playground-series-jan-2022'),
 PosixPath('/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv'),
 PosixPath('/kaggle/input/tabular-playground-series-jan-2022/train.csv'),
 PosixPath('/kaggle/input/tabular-playground-series-jan-2022/test.csv')]

In [10]:
def _read_indexed_csv(csv_path):
    """Read one competition CSV and index it by its parsed ``date`` column.

    Unparseable dates become ``NaT`` (``errors='coerce'``).
    """
    frame = pd.read_csv(csv_path)
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
    return frame.set_index('date')


# Select the CSVs by file name: positions inside `files` depend on the order in
# which the directory listing happens to be returned, so `files[2]` is fragile.
csv_by_name = {entry.name: entry for entry in files}

train = _read_indexed_csv(csv_by_name['train.csv'])
test = _read_indexed_csv(csv_by_name['test.csv'])

train
test

Unnamed: 0_level_0,row_id,country,store,product,num_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-01,0,Finland,KaggleMart,Kaggle Mug,329
2015-01-01,1,Finland,KaggleMart,Kaggle Hat,520
2015-01-01,2,Finland,KaggleMart,Kaggle Sticker,146
2015-01-01,3,Finland,KaggleRama,Kaggle Mug,572
2015-01-01,4,Finland,KaggleRama,Kaggle Hat,911
...,...,...,...,...,...
2018-12-31,26293,Sweden,KaggleMart,Kaggle Hat,823
2018-12-31,26294,Sweden,KaggleMart,Kaggle Sticker,250
2018-12-31,26295,Sweden,KaggleRama,Kaggle Mug,1004
2018-12-31,26296,Sweden,KaggleRama,Kaggle Hat,1441


Unnamed: 0_level_0,row_id,country,store,product
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,26298,Finland,KaggleMart,Kaggle Mug
2019-01-01,26299,Finland,KaggleMart,Kaggle Hat
2019-01-01,26300,Finland,KaggleMart,Kaggle Sticker
2019-01-01,26301,Finland,KaggleRama,Kaggle Mug
2019-01-01,26302,Finland,KaggleRama,Kaggle Hat
...,...,...,...,...
2019-12-31,32863,Sweden,KaggleMart,Kaggle Hat
2019-12-31,32864,Sweden,KaggleMart,Kaggle Sticker
2019-12-31,32865,Sweden,KaggleRama,Kaggle Mug
2019-12-31,32866,Sweden,KaggleRama,Kaggle Hat


In [11]:
# Checking

def train_valid(df_wide, train_frac=0.8):
    """Split each (country, store, product) sales series into train/validation parts.

    For every combination of the unique ``country``, ``store`` and ``product``
    values, the matching ``num_sold`` rows are resampled to daily sums and cut
    chronologically: the first ``train_frac`` share of the days goes to the
    training series, the remainder to the validation series.

    Parameters
    ----------
    df_wide : pandas.DataFrame
        Frame with ``country``, ``store``, ``product`` and ``num_sold`` columns.
        Its index must support ``resample('D')`` (i.e. be datetime-like).
    train_frac : float, default 0.8
        Share of each daily series assigned to the training part.

    Returns
    -------
    (dict, dict)
        ``train_dfs`` and ``valid_dfs``; both map the concatenated key
        ``country + store + product`` to a Series named with that key.

    Notes
    -----
    For each series one boolean is printed: whether the number of source rows
    equals the number of resampled days (``False`` hints at missing or
    duplicated dates). Combinations without any rows are skipped.
    """
    train_dfs, valid_dfs = {}, {}
    for country in df_wide['country'].unique():
        for store in df_wide['store'].unique():
            for product in df_wide['product'].unique():
                mask = ((df_wide['country'] == country)
                        & (df_wide['store'] == store)
                        & (df_wide['product'] == product))
                n_rows = int(mask.sum())
                if n_rows == 0:
                    # This combination does not occur in the data.
                    continue
                key = country + store + product
                # The mask already isolates one series, so resample the target
                # column directly. No groupby and no squeeze(): squeeze() would
                # turn a one-day series into a scalar.
                series = df_wide.loc[mask, 'num_sold'].resample('D').sum().rename(key)
                thresh = int(len(series) * train_frac)
                train_dfs[key] = series.iloc[:thresh]
                valid_dfs[key] = series.iloc[thresh:]
                print(n_rows == len(series))
    return train_dfs, valid_dfs

train_dfs, valid_dfs = train_valid(train)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [12]:
class GA_Statsmodels():
    """Genetic-algorithm search over a discrete parameter grid, built on DEAP.

    An individual is a list with one integer gene per parameter; each gene is
    an index into that parameter's tuple of candidate values. All candidate
    tuples are padded (by cycling) to the same length so that any gene value is
    a valid index for any parameter - this is what allows the shuffle-indexes
    mutation to move genes between positions.

    Parameters
    ----------
    params : dict
        Maps parameter name -> non-empty sized iterable of candidate values.
    eval_func : callable
        Called as ``eval_func(individual, padded_params=..., train_df=...,
        valid_df=...)``; must return a tuple of fitness values.
    eval_weights : tuple
        Passed to the DEAP ``Fitness`` class as ``weights``.
    train_df, valid_df
        Forwarded unchanged to ``eval_func``.
    sel_tournsize : int
        Tournament size for selection.
    cx_uniform_prob : float
        Per-gene swap probability of the uniform crossover.
    mut_shuffle_idx_prob : float
        Per-gene probability of the shuffle-indexes mutation.
    n_pop, n_gen, n_hof : int
        Population size, number of generations, hall-of-fame size.
    cx_prob, mut_prob : float
        Probability of mating / mutating an individual in each generation.
    n_jobs : int
        When greater than 1, evaluations are mapped over a multiprocessing pool
        with ``n_jobs`` workers; call :meth:`close` when done.
    seed : int or None
        When given, ``random.seed(seed)`` is called at the start of every
        :meth:`run_ga_search` so a run can be repeated.
    """

    def __init__(self,
                 params,
                 eval_func,
                 eval_weights,
                 #
                 train_df,
                 valid_df,
                 #
                 sel_tournsize=2,
                 cx_uniform_prob=0.5,
                 mut_shuffle_idx_prob=0.1,
                 n_pop=10,
                 n_gen=10,
                 n_hof=1,
                 cx_prob=0.5,
                 mut_prob=0.1,
                 n_jobs=1,
                 seed=None
                ):
        self.params = params
        self.eval_func = eval_func
        self.eval_weights = eval_weights

        self.train_df = train_df
        self.valid_df = valid_df

        self.sel_tournsize = sel_tournsize
        self.cx_uniform_prob = cx_uniform_prob
        self.mut_shuffle_idx_prob = mut_shuffle_idx_prob
        self.n_pop = n_pop
        self.n_gen = n_gen
        self.n_hof = n_hof
        self.cx_prob = cx_prob
        self.mut_prob = mut_prob

        self.n_jobs = n_jobs
        self.seed = seed
        self._pool = None

        self._pad_params()
        self._create_fitness_and_indiv()
        self._register_indiv_and_pop_generators()
        self._register_eval_func()
        self._register_selection_crossover_mutation_methods()

    def _pad_params(self):
        """Pad every candidate list to the longest one by cycling its values.

        Sets ``self.padded_params`` (name -> tuple of candidates, all tuples of
        equal length).

        Raises
        ------
        AssertionError
            If ``self.params`` is not a dict.
        ValueError
            If ``self.params`` is empty or a parameter has no candidates.
        """
        assert isinstance(self.params, dict), 'Params must be a dict, i.e. estimator.get_params()'
        if len(self.params) == 0:
            raise ValueError('Params must contain at least one parameter')
        # len() rather than truthiness: candidates may be numpy arrays.
        lengths = {name: len(values) for name, values in self.params.items()}
        empty = [name for name, count in lengths.items() if count == 0]
        if empty:
            raise ValueError('Params without candidate values: {}'.format(empty))
        max_length = max(lengths.values())
        # zip() against a finite range stops the otherwise endless cycle().
        self.padded_params = {
            name: tuple(value for value, _ in zip(cycle(values), range(max_length)))
            for name, values in self.params.items()
        }
        print('Params padded')

    def _create_fitness_and_indiv(self):
        """Create GA individual and fitness entities (classes)"""
        # Drop classes left over from a previous instance first; otherwise
        # creator.create() warns that it is overwriting an existing class.
        for name in ('Individual', 'Fitness'):
            if hasattr(ga_cr, name):
                delattr(ga_cr, name)
        ga_cr.create('Fitness', ga_b.Fitness, weights=self.eval_weights)
        ga_cr.create('Individual', list, fitness=ga_cr.Fitness)
        print('GA entities created')

    def _gen_params_to_ga(self):
        """Generate one random candidate index per parameter (the genes)."""
        n_genes = len(self.padded_params)
        n_candidates = len(next(iter(self.padded_params.values())))
        return [randint(0, n_candidates - 1) for _ in range(n_genes)]

    def _register_indiv_and_pop_generators(self):
        """Register GA individual and population generators"""
        self.tb = ga_b.Toolbox()

        if self.n_jobs > 1:
            from multiprocessing import Pool
            # Honour n_jobs and keep a handle so close() can release the workers.
            self._pool = Pool(processes=self.n_jobs)
            self.tb.register("map", self._pool.map)

        self.tb.register("individual", ga_t.initIterate, ga_cr.Individual, self._gen_params_to_ga)
        self.tb.register("population", ga_t.initRepeat, list, self.tb.individual)
        print('GA entities\' methods registered')

    def _register_eval_func(self):
        """Set GA evaluate individual function"""
        self.tb.register("evaluate",
                         self.eval_func,
                         padded_params=self.padded_params,
                         train_df=self.train_df,
                         valid_df=self.valid_df,
                         )
        print('GA eval function registered')

    def _register_selection_crossover_mutation_methods(self):
        """Register tournament selection, uniform crossover and shuffle-indexes mutation."""
        self.tb.register("select", ga_t.selTournament, tournsize=self.sel_tournsize)
        self.tb.register("mate", ga_t.cxUniform, indpb=self.cx_uniform_prob)
        self.tb.register("mutate", ga_t.mutShuffleIndexes, indpb=self.mut_shuffle_idx_prob)
        print('GA sel-cx-mut methods registered')

    def run_ga_search(self):
        """Run the GA and return ``(pop, log, hof_)``.

        ``pop`` and ``log`` are the final population and logbook returned by
        ``eaSimple``. ``hof_`` maps ``'hof_<i>'`` to the decoded parameter dict
        of the i-th hall-of-fame individual; it holds as many entries as the
        hall of fame actually contains, which can be fewer than ``n_hof``.
        """
        if self.seed is not None:
            import random
            random.seed(self.seed)

        pop = self.tb.population(n=self.n_pop)
        hof = ga_t.HallOfFame(self.n_hof)

        # Per-generation statistics printed by eaSimple (verbose=True).
        stats = ga_t.Statistics(lambda ind: ind.fitness.values)
        stats = ga_t.MultiStatistics(accuracy=stats)
        stats.register("avg", np.mean)

        # GA Run
        pop, log = ga_algo.eaSimple(pop, self.tb, cxpb=self.cx_prob,
                                    mutpb=self.mut_prob, ngen=self.n_gen,
                                    stats=stats, halloffame=hof, verbose=True)

        # Convert gene indices back to parameter values. Iterate over the hall
        # of fame itself: indexing range(n_hof) fails when it holds fewer items.
        hof_ = {'hof_' + str(i): self._ga_to_params(best) for i, best in enumerate(hof)}

        return pop, log, hof_

    def _ga_to_params(self, idx_params):
        """Convert back idx to params"""
        return {
            name: candidates[idx]
            for (name, candidates), idx in zip(self.padded_params.items(), idx_params)
        }

    def close(self):
        """Release the worker pool created for ``n_jobs > 1`` (no-op otherwise)."""
        pool = getattr(self, '_pool', None)
        if pool is not None:
            pool.close()
            pool.join()
            self._pool = None
            # Fall back to the built-in map so the toolbox stays usable.
            self.tb.register("map", map)

In [13]:
# 20 evenly spaced points between 3 and 50, truncated to integers; only the odd
# ones are kept below as candidates for the `seasonal` setting.
helper = np.linspace(3, 50, 20).astype(int)

# Search grid for the GA: ARIMA order components 0..19 plus odd seasonal lengths.
ts_params = dict(
    ar=np.arange(20),
    d=np.arange(20),
    ma=np.arange(20),
    seasonal=helper[helper % 2 == 1],
)

def ts_eval_indiv(individual, padded_params, train_df, valid_df, penalty=1000000):
    """Evaluate individual's genes (estimator's params)

    Decodes the genes into ``ar``, ``d``, ``ma`` and ``seasonal`` values, fits
    an ``STLForecast`` wrapping ``ARIMA`` on ``train_df``, forecasts as many
    steps as ``valid_df`` has rows and scores the forecast against ``valid_df``
    with the mean squared error on their shared index.

    Parameters
    ----------
    individual : sequence of int
        One index per parameter, in the key order of ``padded_params``.
    padded_params : dict
        Maps parameter name -> indexable sequence of candidate values; must
        provide the keys ``'ar'``, ``'d'``, ``'ma'`` and ``'seasonal'``.
    train_df, valid_df : pandas.Series
        Training series (needs a ``name``) and validation series.
    penalty : number, default 1000000
        Fitness returned when fitting, forecasting or scoring fails.

    Returns
    -------
    tuple
        ``(mse,)`` on success, ``(penalty,)`` on failure (the error is printed).
    """
    # Params: gene i is an index into the candidates of the i-th parameter.
    p = {k: v[idx] for (k, v), idx in zip(padded_params.items(), individual)}

    # Model
    try:
        res = STLForecast(train_df,
                          ARIMA,
                          model_kwargs={'order': (p['ar'], p['d'], p['ma'])},
                          seasonal=p['seasonal']
                         ).fit()
        # Forecast exactly the validation window instead of a fixed day count.
        forec = res.forecast(len(valid_df))
        forec.name = train_df.name + '_F'
        # Inner join on the index keeps only dates present in both series.
        valid = pd.merge(left=valid_df, right=forec, left_index=True, right_index=True)
        # TODO errors metrics
        err = mean_squared_error(valid.iloc[:, 0], valid.iloc[:, 1])

    except Exception as e:
        # Exception, not BaseException: Ctrl-C / kernel interrupt must still
        # be able to stop a long GA run.
        print('=> Error:', e)
        return (penalty,)

    return (err,)

# Passed to GA_Statsmodels as `eval_weights`, which forwards it to the DEAP
# Fitness class as `weights`; one entry for the single value ts_eval_indiv returns.
# NOTE(review): the negative sign presumably marks the error as a quantity to
# minimise - confirm against the DEAP documentation.
ts_weights = (-1,)

In [None]:
# Smoke test: run the GA search on the first train/validation series only.
temp = train_dfs[list(train_dfs)[0]]

names = list(valid_dfs)
temp_ = valid_dfs[names[0]]
# TODO loop each train dataset

ga_ts = GA_Statsmodels(
    params=ts_params,
    eval_func=ts_eval_indiv,
    eval_weights=ts_weights,
    train_df=temp,
    valid_df=temp_,
)
pop, log, hof = ga_ts.run_ga_search()

Params padded
GA entities created
GA entities' methods registered
GA eval function registered
GA sel-cx-mut methods registered


  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'


In [None]:
# Parameter values decoded from the first hall-of-fame individual of the GA search.
list(hof['hof_0'].values())

In [None]:
# Refit the STL + ARIMA forecaster with the best parameter set found by the GA
# and plot its one-year forecast next to the train and validation series.
best_params = hof['hof_0']
list(best_params.values())

# Read the values by key instead of by position in the dict.
res = STLForecast(temp,
                  ARIMA,
                  model_kwargs={'order': (best_params['ar'], best_params['d'], best_params['ma'])},
                  seasonal=best_params['seasonal'],
                 ).fit()
forec = res.forecast(365)
forec.name = temp.name + '_f'

# `plt` is never imported in this notebook, so plot with the pandas plotting
# API (pandas is already imported) instead of plt.subplots().
ax = temp.plot(figsize=(12, 6), label='train')
fig = ax.get_figure()
_ = temp_.plot(ax=ax, label='valid')
_ = forec.plot(ax=ax, label='forecast')
_ = ax.set_title(temp.name)
_ = ax.legend()

In [None]:
# Disabled scratch cell: the `if False:` guard means nothing below ever runs.
if False:
    # NOTE(review): a positional order, fit(disp=0) and the three-value unpacking
    # of forecast() look like a legacy ARIMA interface rather than the
    # statsmodels.tsa.arima.model.ARIMA imported above - confirm before re-enabling.
    res = ARIMA(temp, (2,2,0)).fit(disp=0)
    forec_periods = 24
    forec, stderr, conf_int = res.forecast(forec_periods)
    # NOTE(review): the forecast index advances in monthly steps although the
    # series is resampled to daily frequency in train_valid - confirm intent.
    forec = pd.Series(forec, index=[temp.index.max()+pd.DateOffset(months=i+1) for i in range(forec_periods)])
    forec

    # NOTE(review): `plt` is not imported in any cell shown in this notebook.
    fig, ax = plt.subplots(figsize=(12,6))
    _ = ax.plot(temp)
    _ = ax.plot(forec)
    
    # Scratch check of the odd-value filter, using the same linspace expression
    # as `helper` in the ts_params cell.
    a = np.linspace(3, 50, 20).astype(int)
    a
    np.column_stack((
        a,
        (a%2)==True,
        a//2,
        (a//2)%2,
        ((a//2)%2)==True,
    ))
    a[(a%2)==True]