# Linear Models

## Import libraries

In [1]:
#!pip install matplotlib seaborn scikit-learn

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
from getpass import getpass

from clickhouse_driver import Client


# Credentials are never stored in the notebook: they are read from the
# CLICKHOUSE_USER / CLICKHOUSE_PASSWORD environment variables, and the
# notebook asks for them interactively when a variable is not set.
user_name = os.environ.get('CLICKHOUSE_USER') or input('ClickHouse user: ')
pwd = os.environ.get('CLICKHOUSE_PASSWORD') or getpass('ClickHouse password: ')

# create the ClickHouse connection
client = Client(host='clickhouse.lab.karpov.courses', port=9000,
                user=user_name, password=pwd, database='hardda')

# check the connection by fetching a single row
result = client.execute("SELECT * FROM hardda.user_dm_events LIMIT 1")

# show the result (the slice keeps this safe when the table is empty)
for row in result[0:1]:
    print(row)

(datetime.date(2022, 2, 1), datetime.date(2022, 1, 31), 'android', 'f7411212fd0e2523e126cbfdd3f226c211212', '4beb10e1-aeeb-4c52-acd2-ce1ddbc1fc24b10e1', 22, 11, 3, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0)


## Tasks

### Task 1.

 **Write an SQL query that loads the data for every available `payment_date` for transactions of type `'basic sale'`, `'fast sale'` or `'quick sale'`. Aggregate `-SUM(amount)` grouped by the `payment_date` field. Note that `amount` is stored as a negative number, because the money is debited from the account, so the sum is multiplied by -1.**

In [4]:
# Daily sales volume per payment_date for the three sale transaction types.
# `amount` is negative for debits (see the task statement), hence the minus sign.
# ORDER BY makes the row order deterministic; without it the order in which
# the server returns the groups is unspecified.
query = '''
SELECT
  payment_date,
  -SUM(amount) AS volume
FROM
  user_transactions
WHERE
  type IN ('basic sale', 'fast sale', 'quick sale')
GROUP BY
  payment_date
ORDER BY
  payment_date
'''

In [5]:
result = client.execute(query)

In [6]:
len(result)

511

In [7]:
df = pd.DataFrame(result, columns=['payment_date', 'volume'])

df

Unnamed: 0,payment_date,volume
0,2021-02-02,3396867
1,2021-02-03,3504675
2,2021-02-04,3321098
3,2021-02-05,3317318
4,2021-02-06,2763316
...,...,...
506,2023-01-27,7606314
507,2023-01-28,6243591
508,2023-01-29,6866600
509,2023-01-30,8337223


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   payment_date  511 non-null    object
 1   volume        511 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 8.1+ KB


In [9]:
df['payment_date'] = pd.to_datetime(df['payment_date'], format='%Y-%m-%d')

In [10]:
df = df.sort_values('payment_date').reset_index(drop=True)

In [11]:
df

Unnamed: 0,payment_date,volume
0,2021-02-02,3396867
1,2021-02-03,3504675
2,2021-02-04,3321098
3,2021-02-05,3317318
4,2021-02-06,2763316
...,...,...
506,2023-01-27,7606314
507,2023-01-28,6243591
508,2023-01-29,6866600
509,2023-01-30,8337223


Check for missing dates: if there were no gaps, the average difference between consecutive dates would be exactly one day.

In [12]:
# Average spacing between consecutive dates; anything above one day signals gaps.
df['payment_date'].diff().mean()

Timedelta('1 days 10:15:31.764705882')

The average gap is larger than one day, so some dates are missing from the dataset. We fix this by joining the data onto a complete daily date range.

In [13]:
date_range = pd.date_range(start=df.payment_date.min(), 
                           end=df.payment_date.max(),
                           freq='D')

In [14]:
len(date_range)

729

In [15]:
df_date_range = pd.DataFrame({'payment_date': date_range})

In [16]:
df_date_range

Unnamed: 0,payment_date
0,2021-02-02
1,2021-02-03
2,2021-02-04
3,2021-02-05
4,2021-02-06
...,...
724,2023-01-27
725,2023-01-28
726,2023-01-29
727,2023-01-30


In [17]:
df_full = pd.merge(df_date_range, df, how='left', on='payment_date')

In [18]:
df_full

Unnamed: 0,payment_date,volume
0,2021-02-02,3396867.0
1,2021-02-03,3504675.0
2,2021-02-04,3321098.0
3,2021-02-05,3317318.0
4,2021-02-06,2763316.0
...,...,...
724,2023-01-27,7606314.0
725,2023-01-28,6243591.0
726,2023-01-29,6866600.0
727,2023-01-30,8337223.0


In [19]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729 entries, 0 to 728
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   payment_date  729 non-null    datetime64[ns]
 1   volume        511 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 17.1 KB


Let's impute the missing values with zeros, i.e. treat the days that are absent from the query result as days with zero sales volume.

In [20]:
# Days that were absent from the query result (NaN after the left join) get zero volume.
df_full = df_full.fillna(0)

In [21]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729 entries, 0 to 728
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   payment_date  729 non-null    datetime64[ns]
 1   volume        729 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 17.1 KB


### Task 2.  

**Generate additional features for the dataset: lags of 30, 61 and 91 days, and exponentially weighted moving averages with spans of 7, 30 and 91 days. Also create a binary feature that indicates whether the day is a working day or a weekend.**

In [22]:
def generate_features_for_series(
    data: pd.DataFrame,
    date_colname: str = 'payment_date',
    value_colname: str = 'volume',
    lags=(30, 61, 91),
    ewm_spans=(7, 30, 91),
) -> pd.DataFrame:
    """Build weekend, lag and EWM features for a daily time series.

    The input frame is not modified; a new, date-sorted frame is returned.
    Lags are positional (``shift`` moves values by rows), so a lag of ``n``
    only means "n days ago" when every row is exactly one calendar day.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing ``date_colname`` (datetime-like, the ``.dt``
        accessor is used on it) and ``value_colname``.
    date_colname : str
        Name of the date column used for sorting and for the weekend flag.
    value_colname : str
        Name of the column the lag features are derived from.
    lags : sequence of int
        Row offsets; each one produces a ``lag_{n}d`` column.
    ewm_spans : sequence of int
        Spans of the exponentially weighted means; each one produces a
        ``rolling_{n}d`` column.

    Returns
    -------
    pd.DataFrame
        The sorted data plus ``is_holiday`` (1.0 on Saturday/Sunday, else
        0.0), the ``lag_{n}d`` columns and the ``rolling_{n}d`` columns.
        Rows with a NaN in any column are dropped, which removes the warm-up
        rows at the start of the series. The original index labels are kept.

    Raises
    ------
    ValueError
        If ``lags`` is empty (the EWM features need a lagged series).
    """
    if not lags:
        raise ValueError('lags must contain at least one value')

    # sort_values returns a new frame, so the caller's data stays untouched
    data_ = data.sort_values(date_colname)

    # is_holiday feature: weekday 5 and 6 are Saturday and Sunday
    data_['is_holiday'] = (
        data_[date_colname].dt.weekday.isin([5, 6]).astype(float)
    )

    # lagging days features
    for lag in lags:
        data_[f'lag_{lag}d'] = data_[value_colname].shift(lag)

    # exponentially weighted moving average features; they smooth the
    # shortest-lag series rather than the raw values, so every average is
    # built only from values that are at least that many rows old
    shortest_lag = data_[f'lag_{min(lags)}d']
    for span in ewm_spans:
        data_[f'rolling_{span}d'] = shortest_lag.ewm(span=span).mean()

    # removing NaN values (mostly the warm-up rows created by the shifts)
    return data_.dropna()

In [23]:
data = generate_features_for_series(df_full)

In [24]:
data

Unnamed: 0,payment_date,volume,is_holiday,lag_30d,lag_61d,lag_91d,rolling_7d,rolling_30d,rolling_91d
91,2021-05-04,4585501.0,0.0,0.0,3298677.0,3396867.0,1.052007e+06,1.837514e+06,2.137795e+06
92,2021-05-05,4842622.0,0.0,3707972.0,2632327.0,3504675.0,1.715998e+06,1.960023e+06,2.183332e+06
93,2021-05-06,4289897.0,0.0,3929579.0,2489894.0,3321098.0,2.269393e+06,2.088896e+06,2.233610e+06
94,2021-05-07,0.0,0.0,3719324.0,2604353.0,3317318.0,2.631876e+06,2.195482e+06,2.276087e+06
95,2021-05-08,0.0,1.0,0.0,2141489.0,2763316.0,1.973907e+06,2.052080e+06,2.211456e+06
...,...,...,...,...,...,...,...,...,...
724,2023-01-27,7606314.0,0.0,5576730.0,5560131.0,6605566.0,4.042208e+06,4.284195e+06,4.362926e+06
725,2023-01-28,6243591.0,1.0,4879158.0,6110948.0,5888153.0,4.251446e+06,4.322580e+06,4.374148e+06
726,2023-01-29,6866600.0,1.0,3997769.0,6505339.0,6081790.0,4.188027e+06,4.301624e+06,4.365966e+06
727,2023-01-30,8337223.0,0.0,2698006.0,0.0,0.0,3.815521e+06,4.198165e+06,4.329706e+06


### Task 3. 

**Train a linear regression model and measure the quality of your model's performance.** 

In [25]:
# Feature matrix: every engineered column except the date and the target.
# 'preds' is excluded as well (when present) because a later cell writes the
# predictions back into `data`; without this, re-running the cell would feed
# the model its own predictions as a feature.
X_ = data.drop(columns=['payment_date', 'volume', 'preds'], errors='ignore')
Y_ = data.volume

model = LinearRegression()

model.fit(X_, Y_)

In [26]:
data['preds'] = model.predict(X_)

In [27]:
# Score the fitted model on the same rows it was trained on (in-sample errors).
MSE = mean_squared_error(y_true=Y_, y_pred=data['preds'])
MAE = mean_absolute_error(y_true=Y_, y_pred=data['preds'])

print(f'MSE: {MSE:.0f}, MAE: {MAE:.0f}')

MSE: 7659274556248, MAE: 2454155


### Task 4. 

**In machine learning it can be useful to use asymmetric metrics, especially when we only want to give a lower or an upper bound on the prediction. A good example of such a metric applies the quadratic error to each over-predicted object and the absolute error to each under-predicted object (or vice versa), and then averages the losses calculated for each object.**

$$\text{Asymmetric} = [\text{Prediction} > \text{Target}]\,(\text{Prediction} - \text{Target})^2 + [\text{Prediction} < \text{Target}]\,\lvert\text{Prediction} - \text{Target}\rvert$$

In [28]:
def asymmetric_error(y_true, y_pred) -> float:
    """Mean asymmetric loss over all objects.

    An over-prediction (prediction > target) is penalised with the squared
    error, anything else with the absolute error; the per-object losses are
    then averaged.

    Parameters
    ----------
    y_true : array-like of numbers
        Target values.
    y_pred : array-like of numbers
        Predicted values, aligned position by position with ``y_true``.

    Returns
    -------
    float
        The average of the per-object losses.
    """
    diff = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    # diff > 0 is an over-prediction; an exact hit contributes 0 in either branch
    per_object_loss = np.where(diff > 0, diff ** 2, np.abs(diff))
    return float(per_object_loss.mean())


asymmetric_value = asymmetric_error(Y_, data.preds)

print(f'Assymetric error value: {asymmetric_value:.0f}')

Assymetric error value: 4838181948427


### Task 5. 

To be continued…