## Cut last entries from all timeseries
To test our algorithms, we are going to try predict **n** last entries given to us in the training dataset. For each timeseries, **n** is the horizon we will need to predict for M4 submission (Taken from *M4-info.csv*).

For each type of timeseries (Daily, Hourly, etc.) we are going to create train dataframe and test dataframe. Test dataframe will contain the last **n** entries we are going to predict when validating, train would be original training data without last **n** entries.

In [39]:
import multiprocessing as mp
import os
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import math

def count_nonnan(row):
    return sum(1 for c in row[1:] if not math.isnan(c))

sources = {
    'Daily': './data/Daily-train.csv',
    'Hourly': './data/Hourly-train.csv',
    'Monthly': './data/Monthly-train.csv',
    'Quarterly': './data/Quarterly-train.csv',
    'Weekly': './data/Weekly-train.csv',
    'Yearly': './data/Yearly-train.csv',
}

def cut(params):
    print()
    i, (sp, path) = params
    
    info = pd.read_csv('./data/M4-info.csv')
    sp_info = info.loc[info['SP'] == sp]
    
    timeseries = pd.read_csv(path)
    test = []
    for i in tqdm(range(timeseries.shape[0]), position=i, desc=sp):
        horizon = sp_info.iloc[i]['Horizon']
        length = count_nonnan(timeseries.iloc[i])
        test.append(timeseries.iloc[i, (length-horizon+1):(length+1)])
        timeseries.iloc[i, (length-horizon+1):(length+1)] = np.nan
    os.makedirs('./cut/full/train', exist_ok=True)
    os.makedirs('./cut/full/test', exist_ok=True)
    timeseries.to_csv('cut/full/train/{}.csv'.format(sp), index=False)
    
    test_df = pd.DataFrame(np.array(test))
    test_df.to_csv('cut/full/test/{}.csv'.format(sp), index=False)
    
    return True
        
        
with mp.Pool() as pool:
    list(tqdm(pool.imap(cut, enumerate(sources.items())), total=len(sources)))
        
        




A Jupyter Widget






A Jupyter Widget




A Jupyter Widget





A Jupyter Widget




A Jupyter Widget




A Jupyter Widget




A Jupyter Widget




Let's check the shapes of all training datasets as well

In [85]:
for _, path in sources.items():
    print(pd.read_csv(path).shape)

(4227, 9920)
(414, 961)
(48000, 2795)
(24000, 867)
(359, 2598)
(23000, 836)


Let's see that we cut hourly database correctly.

In [41]:
original = pd.read_csv('./data/Hourly-train.csv')
train = pd.read_csv('./cut/train/Hourly_full.csv')
test = pd.read_csv('./cut/test/Hourly_full.csv')
original.iloc[-5:,-49:]

Unnamed: 0,V913,V914,V915,V916,V917,V918,V919,V920,V921,V922,...,V952,V953,V954,V955,V956,V957,V958,V959,V960,V961
409,56.0,37.0,30.0,33.0,26.0,32.0,42.0,121.0,189.0,175.0,...,132.0,165.0,191.0,186.0,119.0,108.0,70.0,72.0,79.0,77.0
410,63.0,29.0,26.0,23.0,20.0,33.0,79.0,155.0,289.0,211.0,...,122.0,195.0,166.0,154.0,132.0,76.0,66.0,67.0,51.0,42.0
411,27.0,30.0,17.0,14.0,19.0,15.0,29.0,61.0,116.0,116.0,...,111.0,166.0,225.0,278.0,144.0,71.0,70.0,73.0,39.0,36.0
412,20.0,22.0,16.0,16.0,20.0,18.0,15.0,21.0,21.0,24.0,...,77.0,91.0,76.0,68.0,68.0,59.0,58.0,53.0,38.0,46.0
413,68.0,31.0,20.0,20.0,16.0,16.0,20.0,29.0,50.0,89.0,...,89.0,117.0,128.0,105.0,65.0,48.0,41.0,35.0,26.0,17.0


In [32]:
test.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
409,37.0,30.0,33.0,26.0,32.0,42.0,121.0,189.0,175.0,119.0,...,132.0,165.0,191.0,186.0,119.0,108.0,70.0,72.0,79.0,77.0
410,29.0,26.0,23.0,20.0,33.0,79.0,155.0,289.0,211.0,138.0,...,122.0,195.0,166.0,154.0,132.0,76.0,66.0,67.0,51.0,42.0
411,30.0,17.0,14.0,19.0,15.0,29.0,61.0,116.0,116.0,72.0,...,111.0,166.0,225.0,278.0,144.0,71.0,70.0,73.0,39.0,36.0
412,22.0,16.0,16.0,20.0,18.0,15.0,21.0,21.0,24.0,24.0,...,77.0,91.0,76.0,68.0,68.0,59.0,58.0,53.0,38.0,46.0
413,31.0,20.0,20.0,16.0,16.0,20.0,29.0,50.0,89.0,96.0,...,89.0,117.0,128.0,105.0,65.0,48.0,41.0,35.0,26.0,17.0


In [46]:
train.iloc[-5:,-49:]

Unnamed: 0,V913,V914,V915,V916,V917,V918,V919,V920,V921,V922,...,V952,V953,V954,V955,V956,V957,V958,V959,V960,V961
409,56.0,,,,,,,,,,...,,,,,,,,,,
410,63.0,,,,,,,,,,...,,,,,,,,,,
411,27.0,,,,,,,,,,...,,,,,,,,,,
412,20.0,,,,,,,,,,...,,,,,,,,,,
413,68.0,,,,,,,,,,...,,,,,,,,,,


Now, let's create datasets of 1000 and 10000 rows. For this we are going to sample from original dataset while preserving the original proportions of dataset types (Daily, Weekly, etc.):

In [103]:
from sklearn.model_selection import train_test_split


def get_stratified_sample(size, info):
    return train_test_split(info, train_size=size, test_size=None, stratify=info.SP)[0].sort_index()



# Those two functions not used for now
def get_group_sizes(size, num_groups):
    group_sizes = [math.floor(size/num_groups)] * num_groups

    for i in range(num_groups):
        if sum(group_sizes) < size:
            group_sizes[i % num_groups] += 1
        else:
            break
    return group_sizes
def get_evenly_distributed_sample(size, info):
    num_groups = len(info.SP.unique())
    group_sizes_iter = iter(get_group_sizes(size, num_groups) + [None])
    return info.groupby('SP').apply(lambda group: group.sample(n=next(group_sizes_iter)))


In [104]:
size = 1000
info = pd.read_csv('./data/M4-info.csv')

stratified_sample = get_stratified_sample(size, info)
evenly_distributed_sample = get_evenly_distributed_sample(size, info)

print('Original counts:')
print(info.SP.value_counts())

print('\nStratified counts:')
print(stratified_sample.SP.value_counts())

print('\nEvenly distributed counts:')
sample.SP.value_counts()

Original counts:
Monthly      48000
Quarterly    24000
Yearly       23000
Daily         4227
Hourly         414
Weekly         359
Name: SP, dtype: int64

Stratified counts:
Monthly      480
Quarterly    240
Yearly       230
Daily         42
Hourly         4
Weekly         4
Name: SP, dtype: int64

Evenly distributed counts:


Monthly      167
Daily        167
Hourly       167
Quarterly    166
Weekly       166
Yearly         1
Name: SP, dtype: int64

In [90]:
stratified_sample

Unnamed: 0,M4id,category,Frequency,Horizon,SP
11,Y12,Macro,1,6,Yearly
146,Y147,Macro,1,6,Yearly
156,Y157,Macro,1,6,Yearly
190,Y191,Macro,1,6,Yearly
236,Y237,Macro,1,6,Yearly
266,Y267,Macro,1,6,Yearly
333,Y334,Macro,1,6,Yearly
410,Y411,Macro,1,6,Yearly
563,Y564,Macro,1,6,Yearly
691,Y692,Macro,1,6,Yearly


Let's regenerate the 1000 and 10000 samples for all dataset types and save them to CSV.

In [111]:
def get_indexes(sample, source):
    filtered_sample = sample[sample.SP == source]
    return pd.to_numeric(filtered_sample.M4id.str.slice(1), errors='coerce') - 1

def write_samples():
    for size in tqdm([1000, 10000]):
        sample = get_stratified_sample(size, info)
        os.makedirs('./cut/{}/train'.format(str(size)), exist_ok=True)
        os.makedirs('./cut/{}/test'.format(str(size)), exist_ok=True)
        sample.to_csv('./cut/{}/M4-info.csv'.format(size))
        for source in tqdm(sources.keys(), total=len(sources)):
            train = pd.read_csv('./cut/full/train/{}.csv'.format(source))
            test = pd.read_csv('./cut/full/test/{}.csv'.format(source))
            
            indexes = get_indexes(sample, source)
            sample_train = train.iloc[indexes.values]
            sample_test = test.iloc[indexes.values]
            sample_train.to_csv('./cut/{}/train/{}.csv'.format(size, source))
            sample_test.to_csv('./cut/{}/test/{}.csv'.format(size, source))
        
        
write_samples() 

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget




Let's validate generated files:

In [133]:
for size in [1000, 10000]:
    info = pd.read_csv('./cut/{}/M4-info.csv'.format(size), index_col=0)
    for source in sources.keys():
        df_index = (source[0] + (pd.read_csv('./cut/{}/train/{}.csv'.format(size, source), index_col=0).index .to_series() + 1).astype('str')).values
        m4id = info[info.SP == source].M4id.values
        assert (df_index == m4id).all()