# Import Packages

In [6]:
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In the implementation of tsfresh.feature_extraction.settings, MinimalFCParameters extends ComprehensiveFCParameters and only retains those params whose attributes have the flag minimal

# Time series creation

In [11]:
monthly_variation = [0.0, 0.0, 0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 0.4, 0.2, 0.05, 0.0]
weekly_sales = [20, 25, 30, 25, 20, 30, 0]
constant_sales = [10, 10, 10, 10, 10, 10, 10]

In [12]:
product_0 = pd.DataFrame({'date': pd.date_range(start = '01/04/2021', end = '01/02/2022'),
                          'value': np.array(weekly_sales * 52) + np.random.normal(0,2,364),
                         'unique_id': [0]*364})

In [13]:
product_1 = pd.DataFrame({'date': pd.date_range(start = '01/04/2021', end = '01/02/2022'),
                          'value': np.array(constant_sales * 52) + np.random.normal(0,2,364),
                         'unique_id': [1]*364})

In [16]:
product_2 = pd.DataFrame({'date': pd.date_range(start = '01/04/2021', end = '01/02/2022'),
                          'value': np.array(weekly_sales * 52) + np.random.normal(0,2,364),
                         'unique_id': [2]*364})

In [55]:
product_3 = pd.DataFrame({'date': pd.date_range(start = '01/04/2021', end = '01/02/2022'),
                          'value': np.array(constant_sales * 52) + np.random.normal(0,2,364),
                         'unique_id': [3]*364})

Product 0 and 1 will be the training data and product 2 and 3 will be the test data for weekly varying and constant sales respectively.

In [32]:
X = pd.concat([product_0, product_1])

In [56]:
test = pd.concat([product_2, product_3])

Extract features using the tsfresh package

In [34]:
features = extract_features(timeseries_container = X,
                           column_id = 'unique_id',
                           column_value = 'value',
                           column_sort = 'date',
                           default_fc_parameters = MinimalFCParameters())

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.62s/it]


In [35]:
features

Unnamed: 0,value__sum_values,value__median,value__mean,value__length,value__standard_deviation,value__variance,value__root_mean_square,value__maximum,value__absolute_maximum,value__minimum
0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635
1,3629.25164,9.891223,9.970472,364.0,1.969193,3.877719,10.163071,15.242831,15.242831,4.64051


In [36]:
features = features.rename_axis('unique_id').reset_index()

In [37]:
X = X.merge(features, on = 'unique_id')
X.head()

Unnamed: 0,date,value,unique_id,value__sum_values,value__median,value__mean,value__length,value__standard_deviation,value__variance,value__root_mean_square,value__maximum,value__absolute_maximum,value__minimum
0,2021-01-04,20.446972,0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635
1,2021-01-05,24.933529,0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635
2,2021-01-06,26.406647,0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635
3,2021-01-07,24.667227,0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635
4,2021-01-08,17.722797,0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635


In [38]:
X['day_of_week'] = X['date'].apply(lambda x: x.day_of_week)
X.head()

Unnamed: 0,date,value,unique_id,value__sum_values,value__median,value__mean,value__length,value__standard_deviation,value__variance,value__root_mean_square,value__maximum,value__absolute_maximum,value__minimum,day_of_week
0,2021-01-04,20.446972,0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635,0
1,2021-01-05,24.933529,0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635,1
2,2021-01-06,26.406647,0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635,2
3,2021-01-07,24.667227,0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635,3
4,2021-01-08,17.722797,0,7813.117427,24.060229,21.464608,364.0,9.752139,95.10421,23.576124,34.991762,34.991762,-3.988635,4


In [64]:
features = extract_features(timeseries_container = test,
                           column_id = 'unique_id',
                           column_value = 'value',
                           column_sort = 'date',
                           default_fc_parameters = MinimalFCParameters())

features = features.rename_axis('unique_id').reset_index()
test = test.merge(features, on = 'unique_id')
test['day_of_week'] = test['date'].apply(lambda x: x.day_of_week)
test.head()

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.62s/it]


Unnamed: 0,date,value,unique_id,value__sum_values_x,value__median_x,value__mean_x,value__length_x,value__standard_deviation_x,value__variance_x,value__root_mean_square_x,...,value__sum_values,value__median,value__mean,value__length,value__standard_deviation,value__variance,value__root_mean_square,value__maximum,value__absolute_maximum,value__minimum
0,2021-01-04,19.237755,2,7830.947806,23.823858,21.513593,364.0,9.844854,96.921145,23.659159,...,7830.947806,23.823858,21.513593,364.0,9.844854,96.921145,23.659159,35.241459,35.241459,-3.654935
1,2021-01-05,26.128039,2,7830.947806,23.823858,21.513593,364.0,9.844854,96.921145,23.659159,...,7830.947806,23.823858,21.513593,364.0,9.844854,96.921145,23.659159,35.241459,35.241459,-3.654935
2,2021-01-06,31.066775,2,7830.947806,23.823858,21.513593,364.0,9.844854,96.921145,23.659159,...,7830.947806,23.823858,21.513593,364.0,9.844854,96.921145,23.659159,35.241459,35.241459,-3.654935
3,2021-01-07,28.818512,2,7830.947806,23.823858,21.513593,364.0,9.844854,96.921145,23.659159,...,7830.947806,23.823858,21.513593,364.0,9.844854,96.921145,23.659159,35.241459,35.241459,-3.654935
4,2021-01-08,19.938313,2,7830.947806,23.823858,21.513593,364.0,9.844854,96.921145,23.659159,...,7830.947806,23.823858,21.513593,364.0,9.844854,96.921145,23.659159,35.241459,35.241459,-3.654935


# Training 

In [61]:
def train_and_predict(flist, test_set):
    train_data = lgb.Dataset(X[flist], label = X['value'])
    model = lgb.train({}, train_data)
    pred = model.predict(test_set[flist])
    
    return pred

In [59]:
feature_list = features.columns.to_list() + ['day_of_week']

pred = train_and_predict(feature_list, test[test['unique_id'] == 2])

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 36
[LightGBM] [Info] Number of data points in the train set: 728, number of used features: 11
[LightGBM] [Info] Start training from score 15.717540


In [73]:
print(pred[:7])
print(test['value'][test['unique_id'] == 2].to_list()[:7])

[10.22829098  9.47956852  9.91327795  9.90118681  9.80190599 10.41837645
 10.05176254]
[19.23775549139016, 26.12803859259153, 31.06677474489474, 28.818511745910744, 19.938313404275373, 32.08586000840996, 0.30925257987842336]


In [68]:
pred = train_and_predict(feature_list, test[test['unique_id'] == 3])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 36
[LightGBM] [Info] Number of data points in the train set: 728, number of used features: 11
[LightGBM] [Info] Start training from score 15.717540


In [71]:
print(pred[:7])
print(test[test['unique_id'] == 3]['value'].to_list()[:7])

[10.22829098  9.47956852  9.91327795  9.90118681  9.80190599 10.41837645
 10.05176254]
[7.727262192382208, 9.105461999810144, 10.970659371222489, 9.40033103388227, 9.887822956883964, 7.6760717400737635, 8.989685424363786]
