# Data pipelines. Sklearn examples

**Внимание**: Здесь я удалил большую часть классов, чтобы не давать полного варианта решения тем, кто решит взять дефолтный датасет. Поэтому ноутбук полностью не запустится. Возьмите его просто как примеры того, как создавать классы препроцессинга. Полную версию смогу скинуть после выполнения всеми домашней работы.

#### Necessary imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics

import warnings
warnings.simplefilter('ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# Default font sizes for axis labels and for x/y tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

#### Custom classes for pipeline construction

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that narrows a dataset down to a fixed set of columns.
    """

    def __init__(self, feature_names):
        """
        Remember which columns should be picked from the dataset.

        Parameters
        ----------
        feature_names:
            column label(s) used to index X in `transform`.
        """
        super().__init__()
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; the method exists only
        so the class can be used as a pipeline step.
        Return self.
        """
        return self

    def transform(self, X, y=None):
        """
        Index X with the stored `feature_names`.
        Return the resulting column-subset of X.
        """
        selected = X[self.feature_names]
        return selected

In [None]:
class DateTransformer(BaseEstimator, TransformerMixin):
    """
    The class provides functionality for converting date columns to numeric.
    Converts dates to a number indicating the amount of time
    that has elapsed from a certain point in time.
    """

    def __init__(self, timepoint, transform_to, drop):
        """
        Initialize class instance by setting convert options.

        Parameters
        ----------
        timepoint : pandas.Timestamp,
            the time point from which the count is taken.
        transform_to: str,
            unit of time to use for calculating the result.
            options:
            - 'y' -- years (365 days);
            - 'm' -- months (30 days);
            - 'w' -- weeks (7 days);
            - 'd' -- days.
            Any other value falls back to days.
        drop: bool,
            if True, remove the original columns from the result.
        """
        super().__init__()
        self.timepoint = timepoint
        self.transform_to = transform_to
        self.drop = drop

    def fit(self, X, y=None):
        """
        Fit DateTransformer to X, but really do nothing.
        Return self.
        """
        return self

    def transform(self, X, y=None):
        """
        Transform X using the parameters set in the constructor.

        For every column `col` of X a new column `{col}_{transform_to}`
        is added, holding `(timepoint - value).days` divided by the
        number of days in the chosen unit.

        The input dataframe is left untouched; a transformed copy
        is returned.
        """
        options = dict(d=1, w=7, m=30, y=365)
        div = options.get(self.transform_to, 1)

        # Work on a copy so the caller's dataframe is never modified.
        result = X.copy()
        # Snapshot the source columns before new ones are appended.
        source_columns = list(X.columns)
        for col in source_columns:
            new_col_name = f'{col}_{self.transform_to}'
            result[new_col_name] = X[col].apply(
                lambda x: (self.timepoint - x).days / div)
        if self.drop:
            result = result.drop(source_columns, axis=1)
        return result

In [None]:
class ColumnTranslation:
    """
    Plain container describing how the values of a single column
    should be translated.
    """

    def __init__(self, column_name, to_save, default='Other'):
        """
        Store the translation settings for one column.

        Parameters
        ----------
        column_name: str,
            name of the column the translation applies to.
        to_save: list,
            values that are kept as they are (no translation needed).
        default: str,
            replacement written for every value missing from `to_save`.
        """
        self.column_name, self.to_save, self.default = (
            column_name, to_save, default)


class Translator(BaseEstimator, TransformerMixin):
    """
    The class provides functionality for translating column values
    to a defined range of values.
    """

    def __init__(self, translations):
        """
        Initialize class instance.

        Parameters
        ----------
        translations: list of ColumnTranslation's,
            objects that provide column_name, to_save and default properties.
        """
        super().__init__()
        # Keep the argument exactly as passed: sklearn's clone() checks that
        # constructor parameters are stored unmodified (identity check), so
        # copying the list here would make the transformer impossible to clone.
        self.translations = translations

    def fit(self, X, y=None):
        """
        Fit Translator to X, but really does nothing.
        Return self.
        """
        return self

    def transform(self, X, y=None):
        """
        Transform X.
        For each column from the list `translations`:
        the values from the `to_save` list are kept intact;
        default values are written in the remaining rows.

        The input dataframe is left untouched. Returns a new dataframe
        containing only the translated columns, in the order given
        by `translations`.
        """
        # Work on a copy so the caller's dataframe is never modified.
        result = X.copy()
        for tr in self.translations:
            # Bind the current settings as defaults so the lambda does not
            # depend on the loop variable.
            result[tr.column_name] = result[tr.column_name].apply(
                lambda x, keep=tr.to_save, fallback=tr.default:
                    x if x in keep else fallback)
        return result[[tr.column_name for tr in self.translations]]

#### Constructing pipeline

In [None]:
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
# in 1.4. Try the new keyword first and fall back for older releases, so a
# dense encoder is created on either version.
try:
    translate_ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    translate_ohe = OneHotEncoder(sparse=False)

# Categorical branch: select the columns, collapse every value outside the
# `to_save` lists into a default label, then one-hot encode the result.
translate_pipeline = Pipeline(
    steps=[
        ('translate_selector', FeatureSelector(['STATUS', 'ASSET_TYPE_LAST',
                                                'DEVICE_TYPE_BUS', 'USAGE_AREA'])),
        ('translate_transformer', Translator(
            [ColumnTranslation(column_name='STATUS',
                               to_save=['D', 'F', 'R', 'W'],
                               default='U'),
             ColumnTranslation(column_name='ASSET_TYPE_LAST',
                               to_save=['Smartphone', 'Tablet']),
             ColumnTranslation(column_name='DEVICE_TYPE_BUS',
                               to_save=['Smartphone', 'Tablet', 'Undefined']),
             ColumnTranslation(column_name='USAGE_AREA',
                               to_save=['Minsk', 'Undefined'])])),
        ('translate_encoder', translate_ohe)
    ]
)

# Numeric branch: take the listed numeric columns as they are and
# replace missing values with the constant 0.0.
numeric_pipeline = Pipeline([
    ('num_selector', FeatureSelector([
        'OBLIG_NUM', 'TP_CHANGES_NUM',
        'REVENUE_OCT_16', 'REVENUE_NOV_16',
        'REFILL_OCT_16', 'REFILL_NOV_16',
        'GPRS_OCT_16', 'GPRS_NOV_16',
        'OUTGOING_OCT_16', 'OUTGOING_NOV_16',
    ])),
    ('num_imputer', SimpleImputer(strategy='constant', fill_value=0.0)),
])

# Trend branch: select the October/November column pairs, pass them to
# TrendGenerator (not defined in the visible part of this notebook) and
# replace missing values with the constant 0.0.
trend_pipeline = Pipeline([
    ('trend_selector', FeatureSelector([
        'REVENUE_OCT_16', 'REVENUE_NOV_16',
        'REFILL_OCT_16', 'REFILL_NOV_16',
        'GPRS_OCT_16', 'GPRS_NOV_16',
        'OUTGOING_OCT_16', 'OUTGOING_NOV_16',
    ])),
    ('trend_creator', TrendGenerator(
        ['REVENUE', 'REFILL', 'GPRS', 'OUTGOING'])),
    ('trend_imputer', SimpleImputer(strategy='constant', fill_value=0.0)),
])

# Date branch: convert the two date columns into years elapsed until
# 2016-12-01, drop the original columns and fill gaps with the median.
date_pipeline = Pipeline([
    ('date_selector', FeatureSelector(['ACT_DATE', 'BIRTHDAY'])),
    ('date_transformer', DateTransformer(
        timepoint=pd.Timestamp(2016, 12, 1),
        transform_to='y',
        drop=True,
    )),
    ('date_imputer', SimpleImputer(strategy='median')),
])

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
# in 1.4. Try the new keyword first and fall back for older releases, so a
# dense encoder is created on either version.
try:
    gender_ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    gender_ohe = OneHotEncoder(sparse=False)

# Gender branch: select GENDER together with the outgoing-traffic columns,
# pass them to GenderImputer (not defined in the visible part of this
# notebook) and one-hot encode its output.
gender_pipeline = Pipeline(
    steps=[
        ('gen_selector', FeatureSelector(
            ['GENDER', 'OUTGOING_OCT_16', 'OUTGOING_NOV_16'])),
        ('gen_imputer', GenderImputer()),
        ('gen_encoder', gender_ohe)
    ]
)

# Boolean branch: select MLLS_STATE and hand it to BooleanTransformer with
# 'Active' and 'Begin' listed as true values. BooleanTransformer and
# BooleanColumn are not defined in the visible part of this notebook.
bool_pipeline = Pipeline([
    ('bool_selector', FeatureSelector(['MLLS_STATE'])),
    ('bool_transformer', BooleanTransformer(
        [BooleanColumn('MLLS_STATE', true_values=['Active', 'Begin'])])),
])

In [None]:
# Combine all feature branches: FeatureUnion concatenates the outputs of
# the individual pipelines side by side, in the order listed here.
full_pipeline = FeatureUnion([
    ('num', numeric_pipeline),
    ('translate', translate_pipeline),
    ('trend', trend_pipeline),
    ('date', date_pipeline),
    ('gender', gender_pipeline),
    ('bool', bool_pipeline),
])

#### Helper function for running a grid search and reporting the results

In [None]:
def gc(method, ctor_params, params, preparation):
    """
    Function builds pipeline of full data preparation and launches grid search
    with parameters passed in `params`.

    The pipeline is fitted on `X_train`/`y_train` and evaluated on
    `X_test`/`y_test`; these four names are read from the enclosing
    (global) scope and must be defined before the call.
    A classification report and the best parameters found are printed.

    Parameters
    ----------
    method: <class 'type'>,
        estimator class (e.g. a boosting classifier) to be tuned.
    ctor_params: dict,
        parameters for `method` initialization (__init__ method).
        `random_state` defaults to 29 unless given here.
    params: list of dict,
        grid definition for GridSearchCV.
    preparation: pipeline, transformer,
        preparation step for pipeline.

    Returns
    -------
    pip: Pipeline,
        fitted pipeline with steps 'preparation' and 'gc'
        (the GridSearchCV instance).
    """
    # Merge instead of passing `random_state=29` next to `**ctor_params`:
    # that form raises TypeError (duplicate keyword) as soon as the caller
    # supplies their own `random_state`.
    estimator_kwargs = {'random_state': 29, **ctor_params}
    search = GridSearchCV(method(**estimator_kwargs), params, n_jobs=-1,
                          scoring='accuracy', cv=5, refit=True, verbose=2)

    pip = Pipeline(
        steps=[
            ('preparation', preparation),
            ('gc', search)
        ]
    )

    pip.fit(X_train, y_train)
    y_true, y_pred = y_test, pip.predict(X_test)

    print(metrics.classification_report(y_true, y_pred))
    print('Best params found:\n', pip.named_steps['gc'].best_params_)
    return pip