In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import datetime
import pickle
import requests

from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.model_selection import train_test_split 
from sklearn import metrics, preprocessing, svm

from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

In [2]:
# Load the training CSV and drop the merchant_name and category columns
# in a single chained call; the resulting frame is displayed by the cell.
df = pd.read_csv('data/train_data.csv').drop(columns=['merchant_name', 'category'])
df

Unnamed: 0,date,cashback,id
0,2022-02-10,681.27,0
1,2022-02-11,1171.52,0
2,2022-02-12,1068.24,0
3,2022-02-13,1023.07,0
4,2022-02-14,559.56,0
...,...,...,...
27297,2022-10-29,195.97,2547
27298,2022-10-30,223.59,2547
27299,2022-10-31,153.72,2547
27300,2022-11-01,387.80,2547


In [3]:
def model_predict(data, model):
    """Return one integer prediction from `model` for the series in `data`.

    Features are computed with tsfresh's `extract_features` (series keyed by
    the ``id`` column, ordered by ``date``, values read from ``cashback``)
    using the comprehensive parameter set. Only the nine feature columns
    listed below are passed to ``model.predict``; the first predicted value
    is truncated to ``int`` and returned.
    """
    # Column order matters: it is the order in which features reach the model.
    selected_features = [
        'cashback__kurtosis',
        'cashback__mean_second_derivative_central',
        'cashback__fft_aggregated__aggtype_"variance"',
        'cashback__agg_autocorrelation__f_agg_"var"__maxlag_40',
        'cashback__fft_aggregated__aggtype_"centroid"',
        'cashback__cid_ce__normalize_True',
        'cashback__cid_ce__normalize_False',
        'cashback__variation_coefficient',
        'cashback__skewness',
    ]

    feature_table = extract_features(
        data,
        column_id='id',
        column_sort='date',
        column_value='cashback',
        default_fc_parameters=ComprehensiveFCParameters(),
        # impute cleans up NaN feature values automatically
        impute_function=impute,
    )

    predictions = model.predict(feature_table[selected_features])
    return int(predictions[0])

In [4]:
def test_model(all_data, model, tests=10):
    s_mae = 0
    s_mse = 0
    s_tinkoff_metric = 0

    s_tests = 0
    while s_tests < tests:
        rid = int(np.random.random()*100)%len(set(all_data['id']))
        dx = all_data.loc[all_data['id'] == rid]
        if len(dx.index) > 10:
            r_ind = 1+int(np.random.random()*100)%(len(dx.index)-6)
            test_data = dx[:r_ind]
            test_out = dx[r_ind:r_ind+5]['cashback'].sum()
            pred = model_predict(test_data, model)
            
            s_mae += abs(test_out-pred)
            s_mse += (test_out-pred)**2
            s_tinkoff_metric += (test_out - 5*pred)
            s_tests += 1
            
    return (tests, s_mae/tests, s_mse**0.5/tests, s_tinkoff_metric/tests)
        

In [82]:
def test_site(all_data, tests=30):
    s_tinkoff_spent = 0 #сумма выплат банка
    s_budget = 0 #сумма бюджетов всех акций
    s_res = 0 #сумма итоговых баллов
    
    s_tests = 0
    batch_size = 3

    URL = 'http://127.0.0.1:8080'

    while s_tests < tests:
        rid = int(np.random.random()*100)%len(set(all_data['id']))
        dx = all_data.loc[all_data['id'] == rid]

        budget = (dx['cashback'].sum())*np.random.random() + sum([dx.loc[i].cashback for i in list(dx.index)[:10]])
        s_budget += budget
        
        response = requests.post(f"{URL}/api/partners", json={"name": f"portner{rid}", "budget": budget})
        idPOnSite = response.json()['id']

        
        if len(dx.index) > 10:
            for i in dx.index:
                requests.put(f"{URL}/api/partners/{idPOnSite}/cashback", json={"date": dx.loc[i]['date'], "name": f"portner{rid}", "cashback": dx.loc[i]['cashback']})
                # PUT cashback from dx[i:i+batch_size]
                # GET partner info
            s_tests+=1
            # GET partner info -> spent_budget
            response = requests.get(f"{URL}/api/partners/{idPOnSite}")
            spent_budget = response.json()['spent_budget']
    
            tinkoff_spent = max(spent_budget - budget, 0)
            s_tinkoff_spent += tinkoff_spent
            res = spent_budget - 5 * tinkoff_spent
            s_res += res
            print(f'res={round(res, 2)}, budget={round(budget, 2)}, spent_budget={round(spent_budget, 2)}, tinkoff_spent={round(tinkoff_spent, 2)}')
    return s_res / s_tests, s_tinkoff_spent / s_tests, s_budget / s_tests

In [6]:
#test_site(df)

In [10]:
# import model
# The estimator comes entirely from the pickle file, so no placeholder
# instance is created first.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that come from a trusted source.
with open('models/model_2.0.pkl', 'rb') as inp:
    model = pickle.load(inp)

# Show the fitted coefficients as a quick check that the expected model loaded.
print(model.coef_)

[ 3.33953680e+02 -2.59930007e+01 -4.15590525e+00 -1.22823178e+04
 -6.13099526e+02  4.62742613e+02  1.34316447e+00  5.50078129e+03
 -5.73484366e+03]


In [83]:
# Replay the loaded data against the service with the default settings;
# the service at test_site's URL must be running before this cell is executed.
test_site(df)

res=78139.98, budget=156078.54, spent_budget=78139.98, tinkoff_spent=0
res=107535.97, budget=149948.0, spent_budget=160551.01, tinkoff_spent=10603.01
res=166401.11, budget=166676.04, spent_budget=166401.11, tinkoff_spent=0
res=1229.37, budget=6641.63, spent_budget=1229.37, tinkoff_spent=0
res=4503.66, budget=10297.41, spent_budget=4503.66, tinkoff_spent=0
res=1229.37, budget=7914.25, spent_budget=1229.37, tinkoff_spent=0
res=331346.34, budget=522198.88, spent_budget=569912.02, tinkoff_spent=47713.14
res=98354.85, budget=136340.03, spent_budget=145836.32, tinkoff_spent=9496.29
res=646.49, budget=2941.13, spent_budget=646.49, tinkoff_spent=0
res=781.98, budget=1922.6, spent_budget=781.98, tinkoff_spent=0
res=116696.68, budget=151780.14, spent_budget=160551.01, tinkoff_spent=8770.87
res=25882.88, budget=34607.79, spent_budget=25882.88, tinkoff_spent=0
res=654808.6, budget=788651.0, spent_budget=654808.6, tinkoff_spent=0
res=-122724.46, budget=255379.72, spent_budget=349905.77, tinkoff_spe

(66857.69303315257, 16139.26546003615, 210689.10467609178)