In [62]:
import datetime
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics, preprocessing, svm
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.model_selection import train_test_split

from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.feature_extraction.settings import from_columns
from tsfresh.utilities.dataframe_functions import impute

In [73]:
# Load the historical data, rename the `day` column to `date`,
# parse it as datetimes and use it as the frame's index.
df = (
    pd.read_excel('data/hist_data_nm_cat.xlsx')
    .rename(columns={'day': 'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

In [65]:
# Feature columns passed to the model, in this exact order.
# NOTE(review): presumably this matches the column order the pickled model
# was trained with -- confirm against the training notebook.
MODEL_FEATURE_COLUMNS = [
    'cashback__kurtosis',
    'cashback__mean_second_derivative_central',
    'cashback__fft_aggregated__aggtype_"variance"',
    'cashback__agg_autocorrelation__f_agg_"var"__maxlag_40',
    'cashback__fft_aggregated__aggtype_"centroid"',
    'cashback__cid_ce__normalize_True',
    'cashback__cid_ce__normalize_False',
    'cashback__variation_coefficient',
    'cashback__skewness'
]


def model_predict(data, model):
    """Extract tsfresh features from a cashback series and return the model's prediction.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame containing the columns ``id``, ``date`` and
        ``cashback`` (the names handed to ``extract_features``).
    model : object
        Fitted estimator exposing ``predict(X)``.

    Returns
    -------
    int
        Prediction for the first row of the feature matrix; ``int()``
        truncates the value toward zero.
    """
    # Ask tsfresh for just the features the model consumes instead of
    # computing the full ComprehensiveFCParameters set and discarding
    # almost all of it. `from_columns` rebuilds the calculator settings
    # from the feature column names.
    feature_settings = from_columns(MODEL_FEATURE_COLUMNS)

    X = extract_features(
        data,
        column_id='id',
        column_sort='date',
        column_value='cashback',
        kind_to_fc_parameters=feature_settings,
        # impute replaces NaN/inf feature values so the model never sees them
        impute_function=impute,
    )

    # Re-select explicitly so the column order does not depend on tsfresh.
    X = X[MODEL_FEATURE_COLUMNS]

    return int(model.predict(X)[0])

In [67]:
# Load the fitted model from disk and show its coefficients.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a model.pkl that comes from a trusted source.
# No placeholder estimator is created first: if loading fails, `model` must
# not silently remain bound to an unfitted object.
with open('model.pkl', 'rb') as inp:
    model = pickle.load(inp)

print(model.coef_)

[-8.15309652e+02  2.23446194e+01 -5.58261884e+00 -2.08953885e+04
  5.99008623e+02 -2.35407339e+03  6.61915608e-01 -1.67611587e+04
  8.64370158e+03]


In [72]:
# Get some test data
start, end = 10, 43
horizon = 5  # number of rows right after the input window summed into the target
test_name = 'Delta Sirius'

# Filter the merchant's rows once instead of re-filtering the whole frame
# on every loop iteration.
merchant_rows = df[df['merchant_name'] == test_name]
if len(merchant_rows) < end + horizon:
    # Fail loudly rather than hitting an opaque IndexError or summing a
    # shorter-than-intended target window.
    raise ValueError(
        f"{test_name!r} has {len(merchant_rows)} rows; "
        f"need at least {end + horizon}"
    )

# Actual total cashback over the `horizon` rows following the input window.
test_output = int(merchant_rows.iloc[end:end + horizon]['cashback'].sum())

# Input window (positions start..end-1) in the long format that
# model_predict passes to tsfresh: one series, so every row gets id 1.
window = merchant_rows.iloc[start:end]
test_data = pd.DataFrame({
    'id': 1,
    'cashback': window['cashback'].astype(int).to_numpy(),
    'date': window.index.to_numpy(),
})
test_data

Unnamed: 0,id,cashback,date
0,1,3418,2022-01-11
1,1,3507,2022-01-12
2,1,530,2022-01-13
3,1,1254,2022-01-14
4,1,1717,2022-01-15
5,1,2364,2022-01-16
6,1,3577,2022-01-17
7,1,3907,2022-01-18
8,1,2283,2022-01-19
9,1,2902,2022-01-20


In [71]:
# Print the actual cashback total next to the model's forecast
forecast = model_predict(data=test_data, model=model)
print(test_output, forecast)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.10s/it]

18476 23294



