# ML Pipelines

## Import libraries

In [1]:
#!pip install matplotlib seaborn scikit-learn openpyxl

In [2]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

## Tasks

### Task 1.

 **Load data from a file.**

In [3]:
df = pd.read_csv('premium_by_passports.csv')

In [4]:
df

Unnamed: 0,payment_date,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,2021-04-20,premium,140980663,2021-03-29 23:33:14,profi,cars_seller,1370
1,2022-11-07,premium,141788719,2021-07-16 14:25:39,simple_user,,785
2,2022-11-29,premium,140458955,2021-01-16 00:12:07,simple_user,cars_simple,985
3,2022-07-03,premium,143665334,2022-05-31 21:26:31,simple_user,,785
4,2022-11-02,premium,143267208,2022-03-20 21:17:48,simple_user,cars_simple,1105
...,...,...,...,...,...,...,...
61241,2021-12-09,premium,140416941,2021-01-10 16:12:06,simple_user,cars_simple,785
61242,2021-11-15,premium,142016280,2021-08-22 06:22:22,simple_user,,785
61243,2021-08-20,premium,141195529,2021-04-27 00:23:22,profi,,885
61244,2021-08-05,premium,140782851,2021-02-28 17:21:46,simple_user,,1570


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61246 entries, 0 to 61245
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   payment_date         61246 non-null  object
 1   type                 61246 non-null  object
 2   passport_id          61246 non-null  int64 
 3   created_at           61246 non-null  object
 4   user_type_name       58184 non-null  object
 5   user_type_cars_name  27561 non-null  object
 6   revenue              61246 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 3.3+ MB


In [6]:
df['created_at'] = pd.to_datetime(df.created_at)

In [7]:
df['payment_date'] = pd.to_datetime(df.payment_date, format='%Y-%m-%d')

In [8]:
df.head()

Unnamed: 0,payment_date,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,2021-04-20,premium,140980663,2021-03-29 23:33:14,profi,cars_seller,1370
1,2022-11-07,premium,141788719,2021-07-16 14:25:39,simple_user,,785
2,2022-11-29,premium,140458955,2021-01-16 00:12:07,simple_user,cars_simple,985
3,2022-07-03,premium,143665334,2022-05-31 21:26:31,simple_user,,785
4,2022-11-02,premium,143267208,2022-03-20 21:17:48,simple_user,cars_simple,1105


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61246 entries, 0 to 61245
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   payment_date         61246 non-null  datetime64[ns]
 1   type                 61246 non-null  object        
 2   passport_id          61246 non-null  int64         
 3   created_at           61246 non-null  datetime64[ns]
 4   user_type_name       58184 non-null  object        
 5   user_type_cars_name  27561 non-null  object        
 6   revenue              61246 non-null  int64         
dtypes: datetime64[ns](2), int64(2), object(3)
memory usage: 3.3+ MB


### Task 2.  

**Calculate monthly revenue and store calculated values in a dictionary.** 

In [10]:
df['payment_month'] = df.payment_date.dt.to_period('M')

In [11]:
monthly_revenue_df = df.groupby('payment_month', as_index=False).agg({'revenue': 'sum'})

In [12]:
monthly_revenue_df.head()

Unnamed: 0,payment_month,revenue
0,2021-02,378875
1,2021-03,703240
2,2021-04,969820
3,2021-05,1232885
4,2021-06,1486855


In [13]:
monthly_revenue_df = monthly_revenue_df.astype({'payment_month': 'str'})

In [14]:
temp_dct = monthly_revenue_df.set_index('payment_month').to_dict(orient='dict')

In [15]:
quasi_external = temp_dct['revenue']

In [16]:
quasi_external

{'2021-02': 378875,
 '2021-03': 703240,
 '2021-04': 969820,
 '2021-05': 1232885,
 '2021-06': 1486855,
 '2021-07': 1417360,
 '2021-08': 1595315,
 '2021-09': 1378065,
 '2021-10': 1758175,
 '2021-11': 1815995,
 '2021-12': 1747880,
 '2022-01': 2304820,
 '2022-02': 3440030,
 '2022-03': 2791920,
 '2022-04': 3918770,
 '2022-05': 4230780,
 '2022-06': 6537800,
 '2022-07': 5779295,
 '2022-08': 4945030,
 '2022-09': 5234725,
 '2022-10': 5052215,
 '2022-11': 7116800,
 '2022-12': 4635815,
 '2023-01': 4275530}

### Task 3. 

**We need to get a table where each row is a pair of object-target of the form (`passport_id` - `payment amount` after one month).** 

In [17]:
df.head()

Unnamed: 0,payment_date,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue,payment_month
0,2021-04-20,premium,140980663,2021-03-29 23:33:14,profi,cars_seller,1370,2021-04
1,2022-11-07,premium,141788719,2021-07-16 14:25:39,simple_user,,785,2022-11
2,2022-11-29,premium,140458955,2021-01-16 00:12:07,simple_user,cars_simple,985,2022-11
3,2022-07-03,premium,143665334,2022-05-31 21:26:31,simple_user,,785,2022-07
4,2022-11-02,premium,143267208,2022-03-20 21:17:48,simple_user,cars_simple,1105,2022-11


In [18]:
df_temp = df[(df['payment_date'] - df['created_at']).dt.days <= 30]

In [19]:
df_filt = df_temp.groupby(['type', 'passport_id', 'created_at',
                 'user_type_name', 'user_type_cars_name'], dropna=False) \
    .agg({'revenue': 'sum'}).reset_index()

In [20]:
df_filt

Unnamed: 0,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,premium,140366939,2021-01-04 11:09:27,simple_user,,1370
1,premium,140371571,2021-01-04 20:36:56,simple_user,,685
2,premium,140383147,2021-01-06 09:01:34,simple_user,cars_simple,685
3,premium,140386549,2021-01-06 16:54:42,simple_user,,2055
4,premium,140387667,2021-01-06 19:07:18,simple_user,,1370
...,...,...,...,...,...,...
10203,premium,144970910,2023-01-28 18:48:05,simple_user,,1105
10204,premium,144972314,2023-01-28 22:51:49,profi,,1100
10205,premium,144974954,2023-01-29 15:12:38,simple_user,,785
10206,premium,144979202,2023-01-30 10:24:40,simple_user,cars_simple,785


### Task 4. 

**Calculate the values of the 2.5 and 97.5 quantiles for the target. Remove all records from the data whose values of the target variable are outside the specified quantiles.**

In [21]:
np.percentile(df_filt.revenue, 2.5)

685.0

In [22]:
np.percentile(df_filt.revenue, 97.5)

3540.0

In [23]:
df_filt = df_filt[(df_filt['revenue'] > np.percentile(df_filt.revenue, 2.5)) \
                  & (df_filt['revenue'] < np.percentile(df_filt.revenue, 97.5))]

In [24]:
df_filt

Unnamed: 0,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,premium,140366939,2021-01-04 11:09:27,simple_user,,1370
3,premium,140386549,2021-01-06 16:54:42,simple_user,,2055
4,premium,140387667,2021-01-06 19:07:18,simple_user,,1370
6,premium,140390659,2021-01-07 01:08:09,simple_user,cars_simple,1370
7,premium,140391889,2021-01-07 11:29:07,simple_user,cars_simple,1370
...,...,...,...,...,...,...
10203,premium,144970910,2023-01-28 18:48:05,simple_user,,1105
10204,premium,144972314,2023-01-28 22:51:49,profi,,1100
10205,premium,144974954,2023-01-29 15:12:38,simple_user,,785
10206,premium,144979202,2023-01-30 10:24:40,simple_user,cars_simple,785


### Task 5. 

**Define columns with missing values.**

In [25]:
df_filt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9304 entries, 0 to 10207
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   type                 9304 non-null   object        
 1   passport_id          9304 non-null   int64         
 2   created_at           9304 non-null   datetime64[ns]
 3   user_type_name       8724 non-null   object        
 4   user_type_cars_name  4342 non-null   object        
 5   revenue              9304 non-null   int64         
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 508.8+ KB


### Task 6. 

**Code a custom class FilteringSelector that leaves only those columns that have no more than t=40% missing values.**

In [26]:
class FilteringSelector:

    def __init__(self, t=0.4):
        self.t = t
        self.nan_features = None
        
    def __call__(self, df):
        if not hasattr(df, 'iloc'):
            raise ValueError(
                'FilteringSelector class can only be applied to pandas dataframes'
            )
        
        nan_ratio = df.isna().sum() / df.shape[0]
        self.nan_features = nan_ratio[nan_ratio < self.t].index.to_list()
        
        return self.nan_features

In [27]:
col_selector = ColumnTransformer(
    transformers=[
        ('selector', 'passthrough', FilteringSelector())
    ],
    verbose_feature_names_out=False  
).set_output(transform='pandas')

In [28]:
df_filt

Unnamed: 0,type,passport_id,created_at,user_type_name,user_type_cars_name,revenue
0,premium,140366939,2021-01-04 11:09:27,simple_user,,1370
3,premium,140386549,2021-01-06 16:54:42,simple_user,,2055
4,premium,140387667,2021-01-06 19:07:18,simple_user,,1370
6,premium,140390659,2021-01-07 01:08:09,simple_user,cars_simple,1370
7,premium,140391889,2021-01-07 11:29:07,simple_user,cars_simple,1370
...,...,...,...,...,...,...
10203,premium,144970910,2023-01-28 18:48:05,simple_user,,1105
10204,premium,144972314,2023-01-28 22:51:49,profi,,1100
10205,premium,144974954,2023-01-29 15:12:38,simple_user,,785
10206,premium,144979202,2023-01-30 10:24:40,simple_user,cars_simple,785


In [29]:
col_selector.fit_transform(df_filt)

Unnamed: 0,type,passport_id,created_at,user_type_name,revenue
0,premium,140366939,2021-01-04 11:09:27,simple_user,1370
3,premium,140386549,2021-01-06 16:54:42,simple_user,2055
4,premium,140387667,2021-01-06 19:07:18,simple_user,1370
6,premium,140390659,2021-01-07 01:08:09,simple_user,1370
7,premium,140391889,2021-01-07 11:29:07,simple_user,1370
...,...,...,...,...,...
10203,premium,144970910,2023-01-28 18:48:05,simple_user,1105
10204,premium,144972314,2023-01-28 22:51:49,profi,1100
10205,premium,144974954,2023-01-29 15:12:38,simple_user,785
10206,premium,144979202,2023-01-30 10:24:40,simple_user,785


### Task 7. 

**tbc..**