# Linear Models

## Import libraries

In [1]:
#!pip install matplotlib seaborn scikit-learn

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [3]:
from clickhouse_driver import Client


user_name = 'user_name'
pwd = 'password'

# creating connection ClickHouse
client = Client(host='clickhouse.lab.karpov.courses', port=9000,
                user=user_name, password=pwd, database='hardda')

# checking connection
result = client.execute("SELECT * FROM hardda.user_dm_events LIMIT 1")

# showing the result
for row in result[0:1]:
    print(row)

(datetime.date(2022, 2, 1), datetime.date(2022, 1, 31), 'android', 'f7411212fd0e2523e126cbfdd3f226c211212', '4beb10e1-aeeb-4c52-acd2-ce1ddbc1fc24b10e1', 22, 11, 3, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0)


## Tasks

### Task 1.

 **Write an SQL query to load the data for all available payment_date dates for transactions of types (`'basic sale'`, `'fast sale'`, `'quick sale'`). Group -sum(amount) by the payment_date field. Note that amount in the table is negative, since money is debited from the account, so we multiply by -1.**

In [4]:
query = '''
SELECT 
  payment_date, 
  -SUM(amount)
FROM 
  user_transactions
WHERE 
  type IN ('basic sale', 'fast sale', 'quick sale')
GROUP BY 
  payment_date
'''

In [5]:
result = client.execute(query)

In [6]:
len(result)

511

In [7]:
df = pd.DataFrame(result, columns=['payment_date', 'volume'])

df

Unnamed: 0,payment_date,volume
0,2021-02-02,3396867
1,2021-02-03,3504675
2,2021-02-04,3321098
3,2021-02-05,3317318
4,2021-02-06,2763316
...,...,...
506,2023-01-27,7606314
507,2023-01-28,6243591
508,2023-01-29,6866600
509,2023-01-30,8337223


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   payment_date  511 non-null    object
 1   volume        511 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 8.1+ KB


In [9]:
df['payment_date'] = pd.to_datetime(df['payment_date'], format='%Y-%m-%d')

In [10]:
df = df.sort_values('payment_date').reset_index(drop=True)

In [11]:
df

Unnamed: 0,payment_date,volume
0,2021-02-02,3396867
1,2021-02-03,3504675
2,2021-02-04,3321098
3,2021-02-05,3317318
4,2021-02-06,2763316
...,...,...
506,2023-01-27,7606314
507,2023-01-28,6243591
508,2023-01-29,6866600
509,2023-01-30,8337223


Checking for missing dates. 

In [12]:
(df.payment_date - df.payment_date.shift(1)).mean()

Timedelta('1 days 10:15:31.764705882')

We can see that there are some missing dates in our dataset. And we have to fix it. 

In [13]:
date_range = pd.date_range(start=df.payment_date.min(), 
                           end=df.payment_date.max(),
                           freq='D')

In [14]:
len(date_range)

729

In [15]:
df_date_range = pd.DataFrame({'payment_date': date_range})

In [16]:
df_date_range

Unnamed: 0,payment_date
0,2021-02-02
1,2021-02-03
2,2021-02-04
3,2021-02-05
4,2021-02-06
...,...
724,2023-01-27
725,2023-01-28
726,2023-01-29
727,2023-01-30


In [17]:
df_full = pd.merge(df_date_range, df, how='left', on='payment_date')

In [18]:
df_full

Unnamed: 0,payment_date,volume
0,2021-02-02,3396867.0
1,2021-02-03,3504675.0
2,2021-02-04,3321098.0
3,2021-02-05,3317318.0
4,2021-02-06,2763316.0
...,...,...
724,2023-01-27,7606314.0
725,2023-01-28,6243591.0
726,2023-01-29,6866600.0
727,2023-01-30,8337223.0


In [19]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729 entries, 0 to 728
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   payment_date  729 non-null    datetime64[ns]
 1   volume        511 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 17.1 KB


Let's impute missing values with zeroes. 

In [20]:
df_full.fillna(0, inplace=True)

In [21]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729 entries, 0 to 728
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   payment_date  729 non-null    datetime64[ns]
 1   volume        729 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 17.1 KB


### Task 2.  

**Generate additional features for our dataset, using 30, 61, 91 days lag and exponential weighted moving average with 7, 30, 91 days window size. Also create a binary feature was it a work day or a weekend.** 

In [22]:
def generate_features_for_series(
    data: pd.DataFrame,
    date_colname='payment_date',
    value_colname= 'volume'
):

    data_ = data.copy()
    data_.sort_values(date_colname, inplace=True)
   
    # is_holiday feature
    data_['weekday'] = data_[date_colname].dt.weekday
    data_.loc[data_['weekday'].isin([5, 6]), 'is_holiday'] = 1
    data_['is_holiday'] = data_['is_holiday'].fillna(0)
    data_.drop('weekday', axis=1, inplace=True)

    # lagging days features
    data_['lag_30d'] = data_[value_colname].shift(30)
    data_['lag_61d'] = data_[value_colname].shift(61)
    data_['lag_91d'] = data_[value_colname].shift(91)

    # exponential weighted moving avereage features
    data_['rolling_7d'] = data_['lag_30d'].ewm(span=7).mean()
    data_['rolling_30d'] = data_['lag_30d'].ewm(span=30).mean()
    data_['rolling_91d'] = data_['lag_30d'].ewm(span=91).mean()
   
    # removing NaN values
    data_.dropna(inplace=True)
   
    return data_

In [23]:
data = generate_features_for_series(df_full)

In [24]:
data

Unnamed: 0,payment_date,volume,is_holiday,lag_30d,lag_61d,lag_91d,rolling_7d,rolling_30d,rolling_91d
91,2021-05-04,4585501.0,0.0,0.0,3298677.0,3396867.0,1.052007e+06,1.837514e+06,2.137795e+06
92,2021-05-05,4842622.0,0.0,3707972.0,2632327.0,3504675.0,1.715998e+06,1.960023e+06,2.183332e+06
93,2021-05-06,4289897.0,0.0,3929579.0,2489894.0,3321098.0,2.269393e+06,2.088896e+06,2.233610e+06
94,2021-05-07,0.0,0.0,3719324.0,2604353.0,3317318.0,2.631876e+06,2.195482e+06,2.276087e+06
95,2021-05-08,0.0,1.0,0.0,2141489.0,2763316.0,1.973907e+06,2.052080e+06,2.211456e+06
...,...,...,...,...,...,...,...,...,...
724,2023-01-27,7606314.0,0.0,5576730.0,5560131.0,6605566.0,4.042208e+06,4.284195e+06,4.362926e+06
725,2023-01-28,6243591.0,1.0,4879158.0,6110948.0,5888153.0,4.251446e+06,4.322580e+06,4.374148e+06
726,2023-01-29,6866600.0,1.0,3997769.0,6505339.0,6081790.0,4.188027e+06,4.301624e+06,4.365966e+06
727,2023-01-30,8337223.0,0.0,2698006.0,0.0,0.0,3.815521e+06,4.198165e+06,4.329706e+06


### Task 3. 

**Train a linear regression model and measure the quality of your model's performance.** 

In [25]:
X_ = data.drop(['payment_date', 'volume'], axis=1)
Y_ = data.volume

model = LinearRegression()

model.fit(X_, Y_)

In [26]:
data['preds'] = model.predict(X_)

In [27]:
MSE = mean_squared_error(Y_, data.preds)
MAE = mean_absolute_error(Y_, data.preds)

print(f'MSE: {MSE:.0f}, MAE: {MAE:.0f}')

MSE: 7659274556248, MAE: 2454155


### Task 4. 

**In machine learning, it can be useful to use asymmetric metrics. Especially in cases where we want to give only a lower or upper bound on the prediction. For example, a great example of such a metric is the use of quadratic error for over-prediction, and absolute error for under-prediction on each object (or vice versa). And then - averaging the losses calculated for each object.**

Asymmetric = [Prediction > Target] (Prediction - Target)^2 + [Prediction < Target] |Prediction - Target|

In [28]:
preds_lst = data.preds.to_list()

sum_ = 0
for idx in range(len(preds_lst)):
    if preds_lst[idx] > Y_.values[idx]:
        sum_ += (preds_lst[idx] - Y_.values[idx])**2
    else:
        sum_ += np.abs(preds_lst[idx] - Y_.values[idx])
        
print(f'Assymetric error value: {sum_ / len(preds_lst):.0f}')

Assymetric error value: 4838181948427


### Task 5. 

**Calculate another asymmetric metric.**

Asymmetric = [Prediction < Target] (Prediction - Target)^2 + [Prediction > Target] |Prediction - Target|

In [29]:
preds_lst = data.preds.to_list()

sum_ = 0
for idx in range(len(preds_lst)):
    if preds_lst[idx] < Y_.values[idx]:
        sum_ += (preds_lst[idx] - Y_.values[idx])**2
    else:
        sum_ += np.abs(preds_lst[idx] - Y_.values[idx])
        
print(f'Assymetric error value: {sum_ / len(preds_lst):.0f}')

Assymetric error value: 2821095061977


### Task 6. 

**Select top-3 features having the most significant negative impact on our predicted values.**

In [30]:
betas, beta0 = [round(x,3) for x in model.coef_], model.intercept_

print(f'y =')
print(f'{beta0} +')
for i in range(len(betas)):
    print(f'{betas[i]} * {X_.columns[i]} +')

y =
1741555.5688577197 +
-255284.496 * is_holiday +
-0.025 * lag_30d +
-0.054 * lag_61d +
-0.073 * lag_91d +
0.321 * rolling_7d +
-1.581 * rolling_30d +
2.043 * rolling_91d +


- is_holiday
- rolling_30d
- lag_91d

### Task 7. 

**Calculate what the forecast of your model will be if all the features that we counted have a value of 1 for a certain day?**

In [31]:
ans = 0

for i in range(len(model.coef_)):
    ans += model.coef_[i]
    
round(model.intercept_ + ans, 3)

1486271.702

### Task 8. 

**What will be the forecast of the model if the given day is a working day.**

In [32]:
ans = 0

for i in range(len(model.coef_)):
    ans += model.coef_[i]
    
round(model.intercept_ + ans - model.coef_[0], 3)

1741556.199

### Task 9. 

**Let's try to use segmentation approach to predict time series. Load data using SQL query aggregated by payment_date and type.**

In [33]:
query = '''
SELECT 
  payment_date,
  type,
  -SUM(amount)
FROM 
  user_transactions
WHERE 
  type IN ('basic sale', 'fast sale', 'quick sale')
GROUP BY 
  payment_date,
  type
'''

In [34]:
result = client.execute(query)

In [35]:
len(result)

1531

In [36]:
df = pd.DataFrame(result, columns=['payment_date', 'sale_type', 'volume'])

df

Unnamed: 0,payment_date,sale_type,volume
0,2021-04-27,quick sale,1566030
1,2022-07-11,fast sale,2461446
2,2021-12-29,quick sale,1237786
3,2022-11-27,fast sale,1771166
4,2023-01-08,quick sale,1436429
...,...,...,...
1526,2022-11-20,fast sale,1534786
1527,2021-04-26,quick sale,1752833
1528,2021-10-21,fast sale,1880598
1529,2022-09-28,basic sale,2617040


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1531 entries, 0 to 1530
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   payment_date  1531 non-null   object
 1   sale_type     1531 non-null   object
 2   volume        1531 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 36.0+ KB


In [38]:
df['payment_date'] = pd.to_datetime(df['payment_date'], format='%Y-%m-%d')

In [39]:
df = df.sort_values('payment_date').reset_index(drop=True)

In [40]:
df

Unnamed: 0,payment_date,sale_type,volume
0,2021-02-02,quick sale,1214139
1,2021-02-02,basic sale,830390
2,2021-02-02,fast sale,1352338
3,2021-02-03,quick sale,1274827
4,2021-02-03,fast sale,1438638
...,...,...,...
1526,2023-01-30,fast sale,2896360
1527,2023-01-30,basic sale,3261225
1528,2023-01-31,basic sale,3314615
1529,2023-01-31,quick sale,2603906


In [41]:
df_full.fillna(0, inplace=True)

### Task 10. 

**Forecast volume for each sale type and then calculate the overall forecast by summing those forecasts.**

Forecast for `quick sale`.

In [42]:
df_quick = df[df['sale_type'] == 'quick sale'].reset_index(drop=True)

In [43]:
df_quick

Unnamed: 0,payment_date,sale_type,volume
0,2021-02-02,quick sale,1214139
1,2021-02-03,quick sale,1274827
2,2021-02-04,quick sale,1272215
3,2021-02-05,quick sale,1131727
4,2021-02-06,quick sale,961952
...,...,...,...
505,2023-01-27,quick sale,2184930
506,2023-01-28,quick sale,1550079
507,2023-01-29,quick sale,1898020
508,2023-01-30,quick sale,2179638


In [44]:
df_quick_full = pd.merge(df_date_range, df_quick, how='left', on='payment_date')

In [45]:
df_quick_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   payment_date  729 non-null    datetime64[ns]
 1   sale_type     510 non-null    object        
 2   volume        510 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 22.8+ KB


In [46]:
df_quick_full.fillna(0, inplace=True)

In [47]:
df_quick_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   payment_date  729 non-null    datetime64[ns]
 1   sale_type     729 non-null    object        
 2   volume        729 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 22.8+ KB


In [48]:
data_quick = generate_features_for_series(df_quick_full)

In [49]:
data_quick

Unnamed: 0,payment_date,sale_type,volume,is_holiday,lag_30d,lag_61d,lag_91d,rolling_7d,rolling_30d,rolling_91d
91,2021-05-04,quick sale,1611785.0,0.0,0.0,1127938.0,1214139.0,3.539594e+05,6.331820e+05,7.440603e+05
92,2021-05-05,quick sale,1999040.0,0.0,1352975.0,883341.0,1274827.0,6.037133e+05,6.803261e+05,7.617196e+05
93,2021-05-06,quick sale,1499572.0,0.0,1419857.0,960981.0,1272215.0,8.077492e+05,7.287155e+05,7.806688e+05
94,2021-05-07,0,0.0,0.0,1293265.0,922074.0,1131727.0,9.291282e+05,7.656216e+05,7.953241e+05
95,2021-05-08,0,0.0,1.0,0.0,758255.0,961952.0,6.968461e+05,7.156137e+05,7.727402e+05
...,...,...,...,...,...,...,...,...,...,...
724,2023-01-27,quick sale,2184930.0,0.0,1461917.0,1376615.0,1474592.0,1.058102e+06,1.139190e+06,1.183268e+06
725,2023-01-28,quick sale,1550079.0,1.0,1160699.0,1729728.0,1357773.0,1.083752e+06,1.140578e+06,1.182778e+06
726,2023-01-29,quick sale,1898020.0,1.0,1075921.0,1799433.0,1454473.0,1.081794e+06,1.136406e+06,1.180455e+06
727,2023-01-30,quick sale,2179638.0,0.0,565812.0,0.0,0.0,9.527984e+05,1.099594e+06,1.167093e+06


In [50]:
X_ = data_quick.drop(['payment_date', 'volume', 'sale_type'], axis=1)
Y_ = data_quick.volume

model = LinearRegression()

model.fit(X_, Y_)

In [51]:
data_quick['quick_preds'] = model.predict(X_)
data_quick['quick_Y'] = Y_

Forecast for `basic sale`.

In [52]:
df_basic = df[df['sale_type'] == 'basic sale'].reset_index(drop=True)

In [53]:
df_basic

Unnamed: 0,payment_date,sale_type,volume
0,2021-02-02,basic sale,830390
1,2021-02-03,basic sale,791210
2,2021-02-04,basic sale,819105
3,2021-02-05,basic sale,749195
4,2021-02-06,basic sale,746440
...,...,...,...
506,2023-01-27,basic sale,3063495
507,2023-01-28,basic sale,2707335
508,2023-01-29,basic sale,2754180
509,2023-01-30,basic sale,3261225


In [54]:
df_basic_full = pd.merge(df_date_range, df_basic, how='left', on='payment_date')

In [55]:
df_basic_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   payment_date  729 non-null    datetime64[ns]
 1   sale_type     511 non-null    object        
 2   volume        511 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 22.8+ KB


In [56]:
df_basic_full.fillna(0, inplace=True)

In [57]:
df_basic_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   payment_date  729 non-null    datetime64[ns]
 1   sale_type     729 non-null    object        
 2   volume        729 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 22.8+ KB


In [58]:
data_basic = generate_features_for_series(df_basic_full)

In [59]:
data_basic

Unnamed: 0,payment_date,sale_type,volume,is_holiday,lag_30d,lag_61d,lag_91d,rolling_7d,rolling_30d,rolling_91d
91,2021-05-04,basic sale,1129570.0,0.0,0.0,948405.0,830390.0,2.689877e+05,4.809985e+05,5.573742e+05
92,2021-05-05,basic sale,1148880.0,0.0,1004845.0,758465.0,791210.0,4.529520e+05,5.153087e+05,5.703514e+05
93,2021-05-06,basic sale,1087360.0,0.0,1091265.0,730705.0,819105.0,6.125302e+05,5.529950e+05,5.853496e+05
94,2021-05-07,0,0.0,0.0,993930.0,761665.0,749195.0,7.078802e+05,5.818201e+05,5.970311e+05
95,2021-05-08,0,0.0,1.0,0.0,599720.0,746440.0,5.309101e+05,5.438176e+05,5.800778e+05
...,...,...,...,...,...,...,...,...,...,...
724,2023-01-27,basic sale,3063495.0,0.0,2510225.0,2412350.0,2648040.0,1.746218e+06,1.814747e+06,1.779634e+06
725,2023-01-28,basic sale,2707335.0,1.0,2182915.0,2383960.0,2456890.0,1.855392e+06,1.838500e+06,1.788401e+06
726,2023-01-29,basic sale,2754180.0,1.0,1696135.0,2677635.0,2501290.0,1.815578e+06,1.829315e+06,1.786395e+06
727,2023-01-30,basic sale,3261225.0,0.0,1132610.0,0.0,0.0,1.644836e+06,1.784366e+06,1.772183e+06


In [60]:
X_ = data_basic.drop(['payment_date', 'volume', 'sale_type'], axis=1)
Y_ = data_basic.volume

model = LinearRegression()

model.fit(X_, Y_)

In [61]:
data_basic['basic_preds'] = model.predict(X_)
data_basic['basic_Y'] = Y_

Forecast for `fast sale`.

In [62]:
df_fast = df[df['sale_type'] == 'fast sale'].reset_index(drop=True)

In [63]:
df_fast

Unnamed: 0,payment_date,sale_type,volume
0,2021-02-02,fast sale,1352338
1,2021-02-03,fast sale,1438638
2,2021-02-04,fast sale,1229778
3,2021-02-05,fast sale,1436396
4,2021-02-06,fast sale,1054924
...,...,...,...
505,2023-01-27,fast sale,2357889
506,2023-01-28,fast sale,1986177
507,2023-01-29,fast sale,2214400
508,2023-01-30,fast sale,2896360


In [64]:
df_fast_full = pd.merge(df_date_range, df_fast, how='left', on='payment_date')

In [65]:
df_fast_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   payment_date  729 non-null    datetime64[ns]
 1   sale_type     510 non-null    object        
 2   volume        510 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 22.8+ KB


In [66]:
df_fast_full.fillna(0, inplace=True)

In [67]:
df_fast_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729 entries, 0 to 728
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   payment_date  729 non-null    datetime64[ns]
 1   sale_type     729 non-null    object        
 2   volume        729 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 22.8+ KB


In [68]:
data_fast = generate_features_for_series(df_fast_full)

In [69]:
data_fast

Unnamed: 0,payment_date,sale_type,volume,is_holiday,lag_30d,lag_61d,lag_91d,rolling_7d,rolling_30d,rolling_91d
91,2021-05-04,fast sale,1844146.0,0.0,0.0,1222334.0,1352338.0,4.290599e+05,7.233335e+05,8.363605e+05
92,2021-05-05,fast sale,1694702.0,0.0,1350152.0,990521.0,1438638.0,6.593329e+05,7.643881e+05,8.512611e+05
93,2021-05-06,fast sale,1702965.0,0.0,1418457.0,798208.0,1229778.0,8.491139e+05,8.071855e+05,8.675919e+05
94,2021-05-07,0,0.0,0.0,1432129.0,920614.0,1436396.0,9.948677e+05,8.480397e+05,8.837323e+05
95,2021-05-08,0,0.0,1.0,0.0,783514.0,1054924.0,7.461508e+05,7.926485e+05,8.586379e+05
...,...,...,...,...,...,...,...,...,...,...
724,2023-01-27,fast sale,2357889.0,0.0,1604588.0,1771166.0,2482934.0,1.237888e+06,1.330258e+06,1.400023e+06
725,2023-01-28,fast sale,1986177.0,1.0,1535544.0,1997260.0,2073490.0,1.312302e+06,1.343502e+06,1.402969e+06
726,2023-01-29,fast sale,2214400.0,1.0,1225713.0,2028271.0,2126027.0,1.290655e+06,1.335903e+06,1.399116e+06
727,2023-01-30,fast sale,2896360.0,0.0,999584.0,0.0,0.0,1.217887e+06,1.314205e+06,1.390430e+06


In [70]:
X_ = data_fast.drop(['payment_date', 'volume', 'sale_type'], axis=1)
Y_ = data_fast.volume

model = LinearRegression()

model.fit(X_, Y_)

In [71]:
data_fast['fast_preds'] = model.predict(X_)
data_fast['fast_Y'] = Y_

Merging all predictions. 

In [72]:
data_quick.head(5)

Unnamed: 0,payment_date,sale_type,volume,is_holiday,lag_30d,lag_61d,lag_91d,rolling_7d,rolling_30d,rolling_91d,quick_preds,quick_Y
91,2021-05-04,quick sale,1611785.0,0.0,0.0,1127938.0,1214139.0,353959.397373,633181.97576,744060.335095,973198.9,1611785.0
92,2021-05-05,quick sale,1999040.0,0.0,1352975.0,883341.0,1274827.0,603713.30139,680326.088977,761719.598347,995310.9,1999040.0
93,2021-05-06,quick sale,1499572.0,0.0,1419857.0,960981.0,1272215.0,807749.228101,728715.517565,780668.77019,1017302.0,1499572.0
94,2021-05-07,0,0.0,0.0,1293265.0,922074.0,1131727.0,929128.171995,765621.633433,795324.138793,1042241.0,0.0
95,2021-05-08,0,0.0,1.0,0.0,758255.0,961952.0,696846.127678,715613.725645,772740.176041,948451.9,0.0


In [73]:
data_basic.head(5)

Unnamed: 0,payment_date,sale_type,volume,is_holiday,lag_30d,lag_61d,lag_91d,rolling_7d,rolling_30d,rolling_91d,basic_preds,basic_Y
91,2021-05-04,basic sale,1129570.0,0.0,0.0,948405.0,830390.0,268987.659335,480998.477641,557374.169111,972433.0,1129570.0
92,2021-05-05,basic sale,1148880.0,0.0,1004845.0,758465.0,791210.0,452951.996976,515308.729237,570351.365516,985302.8,1148880.0
93,2021-05-06,basic sale,1087360.0,0.0,1091265.0,730705.0,819105.0,612530.249342,552995.044195,585349.574022,1002958.0,1087360.0
94,2021-05-07,0,0.0,0.0,993930.0,761665.0,749195.0,707880.187728,581820.146468,597031.081606,1017753.0,0.0
95,2021-05-08,0,0.0,1.0,0.0,599720.0,746440.0,530910.139792,543817.552283,580077.833175,1018234.0,0.0


In [74]:
data_fast.head(5)

Unnamed: 0,payment_date,sale_type,volume,is_holiday,lag_30d,lag_61d,lag_91d,rolling_7d,rolling_30d,rolling_91d,fast_preds,fast_Y
91,2021-05-04,fast sale,1844146.0,0.0,0.0,1222334.0,1352338.0,429059.884901,723333.484315,836360.530042,1160097.0,1844146.0
92,2021-05-05,fast sale,1694702.0,0.0,1350152.0,990521.0,1438638.0,659332.916774,764388.068168,851261.105357,1181756.0,1694702.0
93,2021-05-06,fast sale,1702965.0,0.0,1418457.0,798208.0,1229778.0,849113.939495,807185.497453,867591.878307,1221565.0,1702965.0
94,2021-05-07,0,0.0,0.0,1432129.0,920614.0,1436396.0,994867.705725,848039.732615,883732.26157,1211104.0,0.0
95,2021-05-08,0,0.0,1.0,0.0,783514.0,1054924.0,746150.777882,792648.543419,858637.8686,1075536.0,0.0


In [75]:
merged_df = pd.merge(data_quick[['payment_date', 'quick_preds', 'quick_Y']],
                     data_basic[['payment_date', 'basic_preds', 'basic_Y']],
                     how='inner',
                     on='payment_date')

In [76]:
merged_df = pd.merge(merged_df,
                     data_fast[['payment_date', 'fast_preds', 'fast_Y']],
                     how='inner',
                     on='payment_date')

In [77]:
merged_df['sum_preds'] = merged_df.quick_preds + merged_df.basic_preds + merged_df.fast_preds
merged_df['sum_Y'] = merged_df.quick_Y + merged_df.basic_Y + merged_df.fast_Y

In [78]:
merged_df

Unnamed: 0,payment_date,quick_preds,quick_Y,basic_preds,basic_Y,fast_preds,fast_Y,sum_preds,sum_Y
0,2021-05-04,9.731989e+05,1611785.0,9.724330e+05,1129570.0,1.160097e+06,1844146.0,3.105729e+06,4585501.0
1,2021-05-05,9.953109e+05,1999040.0,9.853028e+05,1148880.0,1.181756e+06,1694702.0,3.162369e+06,4842622.0
2,2021-05-06,1.017302e+06,1499572.0,1.002958e+06,1087360.0,1.221565e+06,1702965.0,3.241824e+06,4289897.0
3,2021-05-07,1.042241e+06,0.0,1.017753e+06,0.0,1.211104e+06,0.0,3.271099e+06,0.0
4,2021-05-08,9.484519e+05,0.0,1.018234e+06,0.0,1.075536e+06,0.0,3.042222e+06,0.0
...,...,...,...,...,...,...,...,...,...
633,2023-01-27,1.169690e+06,2184930.0,1.637704e+06,3063495.0,1.398825e+06,2357889.0,4.206219e+06,7606314.0
634,2023-01-28,1.075667e+06,1550079.0,1.669148e+06,2707335.0,1.270688e+06,1986177.0,4.015502e+06,6243591.0
635,2023-01-29,1.068181e+06,1898020.0,1.661635e+06,2754180.0,1.265178e+06,2214400.0,3.994994e+06,6866600.0
636,2023-01-30,1.330423e+06,2179638.0,2.036925e+06,3261225.0,1.659732e+06,2896360.0,5.027081e+06,8337223.0


In [79]:
MSE = mean_squared_error(merged_df.sum_Y, merged_df.sum_preds)
MAE = mean_absolute_error(merged_df.sum_Y, merged_df.sum_preds)

print(f'MSE: {MSE:.0f}, MAE: {MAE:.0f}')

MSE: 7648099477525, MAE: 2454731
