# Import Packages

In [2]:
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In `tsfresh.feature_extraction.settings`, `MinimalFCParameters` extends `ComprehensiveFCParameters` and keeps only those feature calculators whose function carries the `minimal` attribute.

# Time series creation

In [3]:
# Fixed seed so the synthetic noise drawn in the cells below is identical on
# every Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)

# Twelve weights; presumably one per calendar month (not referenced by the
# cells visible in this notebook) -- TODO confirm intended use.
monthly_variation = [0.0, 0.0, 0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 0.4, 0.2, 0.05, 0.0]
# Seven-day sales profile; the series below start on 2021-01-04 (a Monday),
# so the first entry lines up with Monday and the last with Sunday.
weekly_sales = [20, 25, 30, 25, 20, 30, 0]
# Seven-day profile for the product with no underlying sales.
constant_sales = [0, 0, 0, 0, 0, 0, 0]

In [4]:
# Training series for the weekly-varying product: the 7-day profile repeated
# for 52 weeks (364 days) plus Gaussian noise with mean 0 and std 2.
product_0 = pd.DataFrame({'date': pd.date_range(start='01/04/2021', end='01/02/2022')})
product_0['value'] = np.array(weekly_sales * 52) + np.random.normal(loc=0, scale=2, size=364)
product_0['unique_id'] = [0] * 364

In [5]:
# Training series for the constant (zero) sales product: 364 days of pure
# Gaussian noise with mean 0 and std 2 around the all-zero profile.
product_1 = pd.DataFrame({'date': pd.date_range(start='01/04/2021', end='01/02/2022')})
product_1['value'] = np.array(constant_sales * 52) + np.random.normal(loc=0, scale=2, size=364)
product_1['unique_id'] = [1] * 364

In [6]:
# Test series for the weekly-varying product: same profile and date range as
# product_0, with an independent draw of the noise.
product_0_test = pd.DataFrame({'date': pd.date_range(start='01/04/2021', end='01/02/2022')})
product_0_test['value'] = np.array(weekly_sales * 52) + np.random.normal(loc=0, scale=2, size=364)
product_0_test['unique_id'] = [0] * 364

In [7]:
# Test series for the constant (zero) sales product: same profile and date
# range as product_1, with an independent draw of the noise.
product_1_test = pd.DataFrame({'date': pd.date_range(start='01/04/2021', end='01/02/2022')})
product_1_test['value'] = np.array(constant_sales * 52) + np.random.normal(loc=0, scale=2, size=364)
product_1_test['unique_id'] = [1] * 364

`product_0` (weekly-varying sales) and `product_1` (constant sales) form the training data; `product_0_test` and `product_1_test` are the corresponding test series.

In [8]:
# Stack both training series into one long frame. ignore_index=True gives the
# result a fresh 0..N-1 index instead of repeating each product's 0..363 labels.
X = pd.concat([product_0, product_1], ignore_index=True)

In [10]:
# Stack both test series into one long frame. ignore_index=True gives the
# result a fresh 0..N-1 index instead of repeating each product's 0..363 labels.
test = pd.concat([product_0_test, product_1_test], ignore_index=True)

# Extract features using the tsfresh package

In [11]:
# One row of aggregate features per series id, restricted to the calculators
# selected by MinimalFCParameters.
features = extract_features(
    X,
    column_id='unique_id',
    column_sort='date',
    column_value='value',
    default_fc_parameters=MinimalFCParameters(),
)

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.58s/it]


In [18]:
# Same extraction without `default_fc_parameters`, so tsfresh uses its default
# calculator set.
# NOTE(review): this rebinds `features`, replacing the result of the minimal
# extraction in the previous cell.
features = extract_features(
    X,
    column_id='unique_id',
    column_sort='date',
    column_value='value',
)

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.72s/it]


In [20]:
# List every extracted feature name, one per line.
for feature_name in features.columns:
    print(feature_name)

value__variance_larger_than_standard_deviation
value__has_duplicate_max
value__has_duplicate_min
value__has_duplicate
value__sum_values
value__abs_energy
value__mean_abs_change
value__mean_change
value__mean_second_derivative_central
value__median
value__mean
value__length
value__standard_deviation
value__variation_coefficient
value__variance
value__skewness
value__kurtosis
value__root_mean_square
value__absolute_sum_of_changes
value__longest_strike_below_mean
value__longest_strike_above_mean
value__count_above_mean
value__count_below_mean
value__last_location_of_maximum
value__first_location_of_maximum
value__last_location_of_minimum
value__first_location_of_minimum
value__percentage_of_reoccurring_values_to_all_values
value__percentage_of_reoccurring_datapoints_to_all_datapoints
value__sum_of_reoccurring_values
value__sum_of_reoccurring_data_points
value__ratio_value_number_to_time_series_length
value__sample_entropy
value__maximum
value__absolute_maximum
value__minimum
value__benfor

In [13]:
# Expose the per-series index of the feature table as a 'unique_id' column,
# then attach the per-series aggregates to every daily row of the training frame.
features = features.rename_axis(index='unique_id').reset_index()
X = pd.merge(X, features, on='unique_id')
X.head()

Unnamed: 0,date,value,unique_id,value__sum_values,value__median,value__mean,value__length,value__standard_deviation,value__variance,value__root_mean_square,value__maximum,value__absolute_maximum,value__minimum
0,2021-01-04,20.889788,0,7757.977359,23.6562,21.313125,364.0,9.94216,98.846545,23.517989,35.443102,35.443102,-4.734496
1,2021-01-05,22.079658,0,7757.977359,23.6562,21.313125,364.0,9.94216,98.846545,23.517989,35.443102,35.443102,-4.734496
2,2021-01-06,27.503202,0,7757.977359,23.6562,21.313125,364.0,9.94216,98.846545,23.517989,35.443102,35.443102,-4.734496
3,2021-01-07,25.665027,0,7757.977359,23.6562,21.313125,364.0,9.94216,98.846545,23.517989,35.443102,35.443102,-4.734496
4,2021-01-08,19.615717,0,7757.977359,23.6562,21.313125,364.0,9.94216,98.846545,23.517989,35.443102,35.443102,-4.734496


In [14]:
# Weekday of each observation (Monday = 0 ... Sunday = 6), taken with the
# vectorized datetime accessor instead of a per-row Python lambda.
X['day_of_week'] = X['date'].dt.day_of_week
X.head()

Unnamed: 0,date,value,unique_id,value__sum_values,value__median,value__mean,value__length,value__standard_deviation,value__variance,value__root_mean_square,value__maximum,value__absolute_maximum,value__minimum,day_of_week
0,2021-01-04,20.889788,0,7757.977359,23.6562,21.313125,364.0,9.94216,98.846545,23.517989,35.443102,35.443102,-4.734496,0
1,2021-01-05,22.079658,0,7757.977359,23.6562,21.313125,364.0,9.94216,98.846545,23.517989,35.443102,35.443102,-4.734496,1
2,2021-01-06,27.503202,0,7757.977359,23.6562,21.313125,364.0,9.94216,98.846545,23.517989,35.443102,35.443102,-4.734496,2
3,2021-01-07,25.665027,0,7757.977359,23.6562,21.313125,364.0,9.94216,98.846545,23.517989,35.443102,35.443102,-4.734496,3
4,2021-01-08,19.615717,0,7757.977359,23.6562,21.313125,364.0,9.94216,98.846545,23.517989,35.443102,35.443102,-4.734496,4


In [15]:
# Aggregate features for the test series, using the same minimal calculator
# set; this rebinds `features` to the test-set table.
features = extract_features(
    test,
    column_id='unique_id',
    column_sort='date',
    column_value='value',
    default_fc_parameters=MinimalFCParameters(),
)

features

Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.53s/it]


Unnamed: 0,value__sum_values,value__median,value__mean,value__length,value__standard_deviation,value__variance,value__root_mean_square,value__maximum,value__absolute_maximum,value__minimum
0,7818.328851,24.022405,21.478925,364.0,9.758545,95.229205,23.591809,34.528338,34.528338,-5.882289
1,91.916279,0.243607,0.252517,364.0,2.059564,4.241806,2.074987,5.91084,5.91084,-5.233828


In [16]:
# Expose the per-series index of the feature table as a 'unique_id' column,
# then attach the per-series aggregates to every daily row of the test frame.
features = features.rename_axis('unique_id').reset_index()
test = test.merge(features, on='unique_id')
# Weekday of each observation (Monday = 0 ... Sunday = 6), taken with the
# vectorized datetime accessor instead of a per-row Python lambda.
test['day_of_week'] = test['date'].dt.day_of_week
test.head()

Unnamed: 0,date,value,unique_id,value__sum_values,value__median,value__mean,value__length,value__standard_deviation,value__variance,value__root_mean_square,value__maximum,value__absolute_maximum,value__minimum,day_of_week
0,2021-01-04,21.67537,0,7818.328851,24.022405,21.478925,364.0,9.758545,95.229205,23.591809,34.528338,34.528338,-5.882289,0
1,2021-01-05,25.201087,0,7818.328851,24.022405,21.478925,364.0,9.758545,95.229205,23.591809,34.528338,34.528338,-5.882289,1
2,2021-01-06,30.552643,0,7818.328851,24.022405,21.478925,364.0,9.758545,95.229205,23.591809,34.528338,34.528338,-5.882289,2
3,2021-01-07,27.699566,0,7818.328851,24.022405,21.478925,364.0,9.758545,95.229205,23.591809,34.528338,34.528338,-5.882289,3
4,2021-01-08,21.382687,0,7818.328851,24.022405,21.478925,364.0,9.758545,95.229205,23.591809,34.528338,34.528338,-5.882289,4


# Training 

In [17]:
def train_and_predict(flist, test_set, train_set=None, label='value', params=None):
    """Fit a LightGBM model on the training frame and predict for ``test_set``.

    A new model is trained on every call.

    Parameters
    ----------
    flist : list of str
        Names of the columns used as model inputs; they must exist in both
        the training frame and ``test_set``.
    test_set : pandas.DataFrame
        Rows to score. Only the ``flist`` columns are passed to the model.
    train_set : pandas.DataFrame, optional
        Training frame. Defaults to the notebook-level ``X`` so existing
        two-argument calls behave as before.
    label : str, optional
        Name of the target column in the training frame (default ``'value'``).
    params : dict, optional
        Parameters forwarded to ``lgb.train``. Defaults to
        ``{'min_data_in_bin': 1, 'min_data_in_leaf': 1}``.

    Returns
    -------
    The output of ``model.predict`` for ``test_set[flist]``.
    """
    if train_set is None:
        # Backward-compatible fallback to the global training frame.
        train_set = X
    if params is None:
        params = {'min_data_in_bin': 1, 'min_data_in_leaf': 1}

    train_data = lgb.Dataset(train_set[flist], label=train_set[label])
    model = lgb.train(params, train_data)
    return model.predict(test_set[flist])

In [18]:
# Model inputs: every column of the feature table except the series
# identifier, plus the weekday.
feature_list = [col for col in features.columns if col != 'unique_id'] + ['day_of_week']

# Score the held-out rows of series 0 (not the training frame), so the
# predictions can be compared with the test values of that series.
pred = train_and_predict(feature_list, test[test['unique_id'] == 0])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 34
[LightGBM] [Info] Number of data points in the train set: 728, number of used features: 10
[LightGBM] [Info] Start training from score 10.684583


In [19]:
# Display the list of model input columns.
feature_list

['value__sum_values',
 'value__median',
 'value__mean',
 'value__length',
 'value__standard_deviation',
 'value__variance',
 'value__root_mean_square',
 'value__maximum',
 'value__absolute_maximum',
 'value__minimum',
 'day_of_week']

In [20]:
# First seven predictions, followed by the first seven test values of series 0.
is_series_0 = test['unique_id'] == 0
print(pred[:7])
print(test.loc[is_series_0, 'value'].to_list()[:7])

[19.98347907 24.98660205 30.18533155 24.55543974 19.73459804 30.24742645
 -0.50298124]
[21.67536976730733, 25.201086795736142, 30.552643213716532, 27.69956590800705, 21.382686603896257, 27.47814409702308, 0.07110490568158105]


In [21]:
# Retrain and score the held-out rows of series 1 (the constant-sales product).
pred = train_and_predict(feature_list, test.loc[test['unique_id'] == 1])

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34
[LightGBM] [Info] Number of data points in the train set: 728, number of used features: 10
[LightGBM] [Info] Start training from score 10.684583


In [23]:
# First seven predictions, followed by the first seven test values of series 1.
is_series_1 = test['unique_id'] == 1
print(pred[:7])
print(test.loc[is_series_1, 'value'].to_list()[:7])

[ 0.55432464 -0.21288249 -0.07582725  0.0744885   0.02923533 -0.3805973
  0.40551855]
[-0.694383720106086, -0.8239007584586375, 2.581935649092304, 1.8782133237320426, 4.547755218379606, 3.9848309390159926, -2.1502782705686077]


# Conclusion

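The values predicted by the LightGBM model are close to the test values generated by the true underlying model: the weekly pattern is recovered for product 0, and the predictions stay near zero for the constant-sales product 1, whose test values are pure noise around zero.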