# ML Pipelines

## Import libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

## Tasks

### Task 1.

 **Load data from a file.**

In [2]:
df = pd.read_csv('premium_by_passports.csv')

In [3]:
df

Unnamed: 0,payment_date,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,2021-04-20,premium,140980663,2021-03-29 23:33:14,profi,cars_seller,1370
1,2022-11-07,premium,141788719,2021-07-16 14:25:39,simple_user,,785
2,2022-11-29,premium,140458955,2021-01-16 00:12:07,simple_user,cars_simple,985
3,2022-07-03,premium,143665334,2022-05-31 21:26:31,simple_user,,785
4,2022-11-02,premium,143267208,2022-03-20 21:17:48,simple_user,cars_simple,1105
...,...,...,...,...,...,...,...
61241,2021-12-09,premium,140416941,2021-01-10 16:12:06,simple_user,cars_simple,785
61242,2021-11-15,premium,142016280,2021-08-22 06:22:22,simple_user,,785
61243,2021-08-20,premium,141195529,2021-04-27 00:23:22,profi,,885
61244,2021-08-05,premium,140782851,2021-02-28 17:21:46,simple_user,,1570


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61246 entries, 0 to 61245
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   payment_date         61246 non-null  object
 1   type                 61246 non-null  object
 2   passport_id          61246 non-null  int64 
 3   created_at           61246 non-null  object
 4   user_type_name       58184 non-null  object
 5   user_type_cars_name  27561 non-null  object
 6   revenue              61246 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 3.3+ MB


In [5]:
df['created_at'] = pd.to_datetime(df.created_at)

In [6]:
df['payment_date'] = pd.to_datetime(df.payment_date, format='%Y-%m-%d')

In [7]:
df.head()

Unnamed: 0,payment_date,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,2021-04-20,premium,140980663,2021-03-29 23:33:14,profi,cars_seller,1370
1,2022-11-07,premium,141788719,2021-07-16 14:25:39,simple_user,,785
2,2022-11-29,premium,140458955,2021-01-16 00:12:07,simple_user,cars_simple,985
3,2022-07-03,premium,143665334,2022-05-31 21:26:31,simple_user,,785
4,2022-11-02,premium,143267208,2022-03-20 21:17:48,simple_user,cars_simple,1105


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61246 entries, 0 to 61245
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   payment_date         61246 non-null  datetime64[ns]
 1   type                 61246 non-null  object        
 2   passport_id          61246 non-null  int64         
 3   created_at           61246 non-null  datetime64[ns]
 4   user_type_name       58184 non-null  object        
 5   user_type_cars_name  27561 non-null  object        
 6   revenue              61246 non-null  int64         
dtypes: datetime64[ns](2), int64(2), object(3)
memory usage: 3.3+ MB


### Task 2.  

**Calculate monthly revenue and store calculated values in a dictionary.** 

In [9]:
df['payment_month'] = df.payment_date.dt.to_period('M')

In [10]:
monthly_revenue_df = df.groupby('payment_month', as_index=False).agg({'revenue': 'sum'})

In [11]:
monthly_revenue_df.head()

Unnamed: 0,payment_month,revenue
0,2021-02,378875
1,2021-03,703240
2,2021-04,969820
3,2021-05,1232885
4,2021-06,1486855


In [12]:
monthly_revenue_df = monthly_revenue_df.astype({'payment_month': 'str'})

In [13]:
temp_dct = monthly_revenue_df.set_index('payment_month').to_dict(orient='dict')

In [14]:
quasi_external = temp_dct['revenue']

In [15]:
quasi_external

{'2021-02': 378875,
 '2021-03': 703240,
 '2021-04': 969820,
 '2021-05': 1232885,
 '2021-06': 1486855,
 '2021-07': 1417360,
 '2021-08': 1595315,
 '2021-09': 1378065,
 '2021-10': 1758175,
 '2021-11': 1815995,
 '2021-12': 1747880,
 '2022-01': 2304820,
 '2022-02': 3440030,
 '2022-03': 2791920,
 '2022-04': 3918770,
 '2022-05': 4230780,
 '2022-06': 6537800,
 '2022-07': 5779295,
 '2022-08': 4945030,
 '2022-09': 5234725,
 '2022-10': 5052215,
 '2022-11': 7116800,
 '2022-12': 4635815,
 '2023-01': 4275530}

### Task 3. 

**We need to get a table where each row is a pair of object-target of the form (`passport_id` - `payment amount` after one month).** 

In [16]:
df.head()

Unnamed: 0,payment_date,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue,payment_month
0,2021-04-20,premium,140980663,2021-03-29 23:33:14,profi,cars_seller,1370,2021-04
1,2022-11-07,premium,141788719,2021-07-16 14:25:39,simple_user,,785,2022-11
2,2022-11-29,premium,140458955,2021-01-16 00:12:07,simple_user,cars_simple,985,2022-11
3,2022-07-03,premium,143665334,2022-05-31 21:26:31,simple_user,,785,2022-07
4,2022-11-02,premium,143267208,2022-03-20 21:17:48,simple_user,cars_simple,1105,2022-11


In [17]:
df_temp = df[(df['payment_date'] - df['created_at']).dt.days <= 30]

In [18]:
df_filt = df_temp.groupby(['type', 'passport_id', 'created_at',
                 'user_type_name', 'user_type_cars_name'], dropna=False) \
    .agg({'revenue': 'sum'}).reset_index()

In [19]:
df_filt

Unnamed: 0,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,premium,140366939,2021-01-04 11:09:27,simple_user,,1370
1,premium,140371571,2021-01-04 20:36:56,simple_user,,685
2,premium,140383147,2021-01-06 09:01:34,simple_user,cars_simple,685
3,premium,140386549,2021-01-06 16:54:42,simple_user,,2055
4,premium,140387667,2021-01-06 19:07:18,simple_user,,1370
...,...,...,...,...,...,...
10203,premium,144970910,2023-01-28 18:48:05,simple_user,,1105
10204,premium,144972314,2023-01-28 22:51:49,profi,,1100
10205,premium,144974954,2023-01-29 15:12:38,simple_user,,785
10206,premium,144979202,2023-01-30 10:24:40,simple_user,cars_simple,785


### Task 4. 

**Calculate the values of the 2.5 and 97.5 quantiles for the target. Remove all records from the data whose values of the target variable are outside the specified quantiles.**

In [20]:
np.percentile(df_filt.revenue, 2.5)

685.0

In [21]:
np.percentile(df_filt.revenue, 97.5)

3540.0

In [22]:
df_filt = df_filt[(df_filt['revenue'] > np.percentile(df_filt.revenue, 2.5)) \
                  & (df_filt['revenue'] < np.percentile(df_filt.revenue, 97.5))]

In [23]:
df_filt

Unnamed: 0,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,premium,140366939,2021-01-04 11:09:27,simple_user,,1370
3,premium,140386549,2021-01-06 16:54:42,simple_user,,2055
4,premium,140387667,2021-01-06 19:07:18,simple_user,,1370
6,premium,140390659,2021-01-07 01:08:09,simple_user,cars_simple,1370
7,premium,140391889,2021-01-07 11:29:07,simple_user,cars_simple,1370
...,...,...,...,...,...,...
10203,premium,144970910,2023-01-28 18:48:05,simple_user,,1105
10204,premium,144972314,2023-01-28 22:51:49,profi,,1100
10205,premium,144974954,2023-01-29 15:12:38,simple_user,,785
10206,premium,144979202,2023-01-30 10:24:40,simple_user,cars_simple,785


### Task 5. 

**Define columns with missing values.**

In [24]:
df_filt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9304 entries, 0 to 10207
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   type                 9304 non-null   object        
 1   passport_id          9304 non-null   int64         
 2   created_at           9304 non-null   datetime64[ns]
 3   user_type_name       8724 non-null   object        
 4   user_type_cars_name  4342 non-null   object        
 5   revenue              9304 non-null   int64         
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 508.8+ KB


### Task 6. 

**Code a custom class `FilteringSelector` that leaves only those columns that have no more than t=40% missing values.**

In [25]:
class FilteringSelector:

    def __init__(self, t=0.4):
        self.t = t
        self.nan_features = None
        
    def __call__(self, df):
        if not hasattr(df, 'iloc'):
            raise ValueError(
                'FilteringSelector class can only be applied to pandas dataframes'
            )
        
        nan_ratio = df.isna().sum() / df.shape[0]
        self.nan_features = nan_ratio[nan_ratio < self.t].index.to_list()
        
        return self.nan_features

In [26]:
col_selector = ColumnTransformer(
    transformers=[
        ('selector', 'passthrough', FilteringSelector())
    ],
    verbose_feature_names_out=False  
).set_output(transform='pandas')

In [27]:
df_filt

Unnamed: 0,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,premium,140366939,2021-01-04 11:09:27,simple_user,,1370
3,premium,140386549,2021-01-06 16:54:42,simple_user,,2055
4,premium,140387667,2021-01-06 19:07:18,simple_user,,1370
6,premium,140390659,2021-01-07 01:08:09,simple_user,cars_simple,1370
7,premium,140391889,2021-01-07 11:29:07,simple_user,cars_simple,1370
...,...,...,...,...,...,...
10203,premium,144970910,2023-01-28 18:48:05,simple_user,,1105
10204,premium,144972314,2023-01-28 22:51:49,profi,,1100
10205,premium,144974954,2023-01-29 15:12:38,simple_user,,785
10206,premium,144979202,2023-01-30 10:24:40,simple_user,cars_simple,785


In [28]:
col_selector.fit_transform(df_filt)

Unnamed: 0,type,passport_id,created_at,user_type_name,revenue
0,premium,140366939,2021-01-04 11:09:27,simple_user,1370
3,premium,140386549,2021-01-06 16:54:42,simple_user,2055
4,premium,140387667,2021-01-06 19:07:18,simple_user,1370
6,premium,140390659,2021-01-07 01:08:09,simple_user,1370
7,premium,140391889,2021-01-07 11:29:07,simple_user,1370
...,...,...,...,...,...
10203,premium,144970910,2023-01-28 18:48:05,simple_user,1105
10204,premium,144972314,2023-01-28 22:51:49,profi,1100
10205,premium,144974954,2023-01-29 15:12:38,simple_user,785
10206,premium,144979202,2023-01-30 10:24:40,simple_user,785


### Task 7. 

**Write a class `AddColumnsTransformer` that does the following**:
- Extracts the quarter from the profile creation date - column `Quarter` in the format '2023Q3'
- Creates the feature `last_month_pmts`, which is the sum of transactions in the previous month, using external data. If there is no data on payments for the previous month, then take the average for all available months
- Removes the key columns: `passport_id` (the name of this column can be hardcoded) and `created_at`

In [29]:
class AddColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, created_at_column = 'created_at', payments_by_month = quasi_external):
        self.created_at_column = created_at_column
        self.payments_by_month = payments_by_month

    def fit(self, df, y= None):
        return self
    
    def transform(self, df, y=None):
        if not hasattr(df, 'iloc'):
            raise ValueError(
                'CustomTransformer can only be applied to pandas dataframes in df argument'
            )
        df_copy= df.copy()
        
        df_copy['Quarter'] = (
                    pd.PeriodIndex(df_copy[self.created_at_column], freq='Q')
                    .to_series()
                    .apply(lambda x: f'{x.year}Q{x.quarter}')
                    .values
        )
        
        quasi_df = pd.DataFrame(list(self.payments_by_month.items()), columns=['payment_month', 'last_month_pmts'])
        df_copy['payment_month'] =  (df_copy['created_at'] - pd.DateOffset(months=1)).dt.strftime('%Y-%m')
        
        df_copy = pd.merge(df_copy, quasi_df, on='payment_month', how='left')
        df_copy.drop('payment_month', axis=1, inplace=True)
        mean_values = quasi_df['last_month_pmts'].mean()
        df_copy['last_month_pmts'].fillna(mean_values, inplace = True)
        
        df_copy.drop('passport_id', axis=1, inplace=True)
        df_copy.drop(self.created_at_column, axis=1, inplace=True)

        return df_copy

In [30]:
df_filt

Unnamed: 0,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,premium,140366939,2021-01-04 11:09:27,simple_user,,1370
3,premium,140386549,2021-01-06 16:54:42,simple_user,,2055
4,premium,140387667,2021-01-06 19:07:18,simple_user,,1370
6,premium,140390659,2021-01-07 01:08:09,simple_user,cars_simple,1370
7,premium,140391889,2021-01-07 11:29:07,simple_user,cars_simple,1370
...,...,...,...,...,...,...
10203,premium,144970910,2023-01-28 18:48:05,simple_user,,1105
10204,premium,144972314,2023-01-28 22:51:49,profi,,1100
10205,premium,144974954,2023-01-29 15:12:38,simple_user,,785
10206,premium,144979202,2023-01-30 10:24:40,simple_user,cars_simple,785


In [31]:
transformer = AddColumnsTransformer()
df_mod = transformer.transform(df_filt)

In [32]:
df_mod

Unnamed: 0,type,user_type_name,user_type_cars_name,revenue,Quarter,last_month_pmts
0,premium,simple_user,,1370,2021Q1,3.114500e+06
1,premium,simple_user,,2055,2021Q1,3.114500e+06
2,premium,simple_user,,1370,2021Q1,3.114500e+06
3,premium,simple_user,cars_simple,1370,2021Q1,3.114500e+06
4,premium,simple_user,cars_simple,1370,2021Q1,3.114500e+06
...,...,...,...,...,...,...
9299,premium,simple_user,,1105,2023Q1,4.635815e+06
9300,premium,profi,,1100,2023Q1,4.635815e+06
9301,premium,simple_user,,785,2023Q1,4.635815e+06
9302,premium,simple_user,cars_simple,785,2023Q1,4.635815e+06


### Task 8. 

**Prepare the `final_process_pipe`, which will consist of the following steps:**
- Application of a custom selector to automatically select columns with a small number of missing values
- Creating new features using a custom transformer
- Filling in missing values with the mean/mode for real/categorical columns, respectively
- MeanTargetEcnoder for all categorical features
- StandardScaler on the remaining dataframe

In [33]:
col_selector = ColumnTransformer(
    transformers=[
        ('selector', 'passthrough', FilteringSelector())
    ],
    verbose_feature_names_out=False   
).set_output(transform='pandas')      


col_imputer = ColumnTransformer(
    transformers=[
        ('impute_num', SimpleImputer(strategy='mean'), selector(dtype_include='number')),
        ('impute_cat', SimpleImputer(strategy='most_frequent'), selector(dtype_exclude='number'))
    ],
    verbose_feature_names_out=False   
).set_output(transform='pandas')      


col_transformer_with_selector = ColumnTransformer(
    transformers=[
        ('MeanTargetEncoder', TargetEncoder(target_type='continuous', shuffle=False), selector(dtype_exclude='number'))
    ],
    remainder='passthrough',          
    verbose_feature_names_out=False   
).set_output(transform='pandas')      


num_scaler = ColumnTransformer(
    transformers=[
        ('StandardScaler', StandardScaler(), selector(dtype_include='number'))
    ],
    verbose_feature_names_out=False   
).set_output(transform='pandas')      


final_process_pipe = Pipeline(
    [
        ('col_selector', col_selector),
        ('add_columns_transformer', AddColumnsTransformer()),
        ('col_imputer', col_imputer),
        ('col_transformer_with_selector', col_transformer_with_selector),
        ('num_scaler', num_scaler)
    ]
)

In [34]:
X_ = df_filt.drop(['type', 'revenue'], axis=1).reset_index(drop=True)
y_ = df_filt[['revenue']].reset_index(drop=True)

In [35]:
final_processed_data = final_process_pipe.fit_transform(X_,y_)
final_processed_data.to_csv('final_processed_data.csv', index=False)

In [36]:
final_processed_data

Unnamed: 0,user_type_name,Quarter,last_month_pmts
0,-0.369776,-0.142148,-0.227864
1,-0.369776,-0.142148,-0.227864
2,-0.369776,-0.142148,-0.227864
3,-0.369776,-0.142148,-0.227864
4,-0.369776,-0.142148,-0.227864
...,...,...,...
9299,-0.891653,-0.562629,0.577659
9300,2.421688,-0.562629,0.577659
9301,-0.891653,-0.562629,0.577659
9302,-0.891653,-0.562629,0.577659


### Task 9.

**Prepare a splitter for data validation. Use the TimeSeriesSplit strategy.**

In [37]:
splitter = TimeSeriesSplit(
    n_splits=4,
    max_train_size=None, 
    test_size=None,      
    gap=0                
)

print(splitter)
print(splitter.split(X_.sort_values('created_at', inplace=True)))

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=4, test_size=None)
<generator object TimeSeriesSplit.split at 0x7fed6ba76120>


### Task 10. 

**Now, let's try using classical linear regression, Lasso, and Ridge. Train all of these models and measure the best parameters on the validation set, initialized earlier in the variable splitter. For classical LR, try a model without and with a free coefficient β. The key metric is MAE. Choose from these two models the one that gives the best quality.**

Regression model without intercept. 

In [38]:
splitter = TimeSeriesSplit(n_splits=4)

df_filt_sorted = df_filt.sort_values('created_at').reset_index(drop=True)
X_sorted = df_filt_sorted.drop(['type', 'revenue'], axis=1).reset_index(drop=True)
y_sorted = df_filt_sorted[['revenue']].reset_index(drop=True)

final_pipe_model = Pipeline(
    [
        ('col_selector', col_selector),
        ('add_columns_transformer', AddColumnsTransformer()),
        ('col_imputer', col_imputer),
        ('col_transformer_with_selector', col_transformer_with_selector),
        ('num_scaler', num_scaler),
        ('simple_model', LinearRegression(fit_intercept=False))
    ]
)

In [39]:
errors_lst = []
for fold in splitter.split(X_sorted):
    train_index, val_index = fold[0], fold[1]
    
    X_to_train = X_sorted[X_sorted.index.isin(train_index)]
    X_to_val = X_sorted[X_sorted.index.isin(val_index)]
    
    y_to_train = y_sorted[y_sorted.index.isin(train_index)]
    y_to_val = y_sorted[y_sorted.index.isin(val_index)]
    
    eval_instance = final_pipe_model
    eval_instance.fit(X_to_train, y_to_train)
    
    pred = eval_instance.predict(X_to_val)
    error = mean_absolute_error(y_to_val, pred)
    
    print(error)
    errors_lst.append(error)
    
print(f'\nAverage MAE: {np.mean(errors_lst)}')

1128.7227137906016
1157.0504051954172
1177.0311082486073
1286.1987299721404

Average MAE: 1187.2507393016917


Regression model with intercept. 

In [40]:
splitter = TimeSeriesSplit(n_splits=4)

df_filt_sorted = df_filt.sort_values('created_at').reset_index(drop=True)
X_sorted = df_filt_sorted.drop(['type', 'revenue'], axis=1).reset_index(drop=True)
y_sorted = df_filt_sorted[['revenue']].reset_index(drop=True)

final_pipe_model = Pipeline(
    [
        ('col_selector', col_selector),
        ('add_columns_transformer', AddColumnsTransformer()),
        ('col_imputer', col_imputer),
        ('col_transformer_with_selector', col_transformer_with_selector),
        ('num_scaler', num_scaler),
        ('simple_model', LinearRegression(fit_intercept=True))
    ]
)

In [41]:
errors_lst = []
for fold in splitter.split(X_sorted):
    train_index, val_index = fold[0], fold[1]
    
    X_to_train = X_sorted[X_sorted.index.isin(train_index)]
    X_to_val = X_sorted[X_sorted.index.isin(val_index)]
    
    y_to_train = y_sorted[y_sorted.index.isin(train_index)]
    y_to_val = y_sorted[y_sorted.index.isin(val_index)]
    
    eval_instance = final_pipe_model
    eval_instance.fit(X_to_train, y_to_train)
    
    pred = eval_instance.predict(X_to_val)
    error = mean_absolute_error(y_to_val, pred)
    
    print(error)
    errors_lst.append(error)
    
print(f'\nAverage MAE: {np.mean(errors_lst)}')

465.3564377009648
453.7285941666168
447.13746992543594
442.14916950050434

Average MAE: 452.09291782338045


### Task 11. 

**Train Lasso and Ridge regression models using GridSearchCV to find best possible parameters.**

In [42]:
lasso_pipe = Pipeline(
    [
        ('col_selector', col_selector),
        ('add_columns_transformer', AddColumnsTransformer()),
        ('col_imputer', col_imputer),
        ('col_transformer_with_selector', col_transformer_with_selector),
        ('num_scaler', num_scaler),
        ('simple_model', Lasso())
    ]
)

In [43]:
ridge_pipe = Pipeline(
    [
        ('col_selector', col_selector),
        ('add_columns_transformer', AddColumnsTransformer()),
        ('col_imputer', col_imputer),
        ('col_transformer_with_selector', col_transformer_with_selector),
        ('num_scaler', num_scaler),
        ('simple_model', Ridge())
    ]
)

In [44]:
param_grid = {
    'simple_model__alpha': np.linspace(start=0.1, stop=10000, num=20),
    'simple_model__max_iter': [100, 1000]
}

for model in [lasso_pipe, ridge_pipe]:
    search = GridSearchCV(model, param_grid, 
                          cv=splitter, scoring='neg_mean_absolute_error')
    
    search.fit(X_sorted, y_sorted)
    
    print(f'Best parameter (CV score={search.best_score_:.5f}):')
    print(search.best_params_)

Best parameter (CV score=-447.48965):
{'simple_model__alpha': 526.4105263157895, 'simple_model__max_iter': 100}
Best parameter (CV score=-447.16723):
{'simple_model__alpha': 10000.0, 'simple_model__max_iter': 100}
