In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
import dill
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import logging
import gc

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

#### Представим, что у нас есть несколько сырых parquet-файлов, по которым нужно предсказать кредитную историю (я сохранил один случайный parquet-файл из исходной выборки). Вся предобработка прописана в модуле dd: он сразу кодирует признаки и сохраняет датасет, который затем подаётся в модель для получения предсказаний (функция modify_data_pipe из модуля dd).

#### Промежуточные шаги вроде энкодинга в пайплайн не включены: они изначально выполняются в функции dd.modify_data(), с помощью которой тренировочный датафрейм был сохранён на этапе моделирования. Предсказание реализовано отдельной функцией.

In [2]:
def pipeline_fit(data_path: str = 'train_data_for_pipe.csv',
                 target: str = 'flag',
                 path_to_save: str = 'pickle_model') -> 'Pipeline':
    """Pick the best of three pickled classifiers by ROC AUC, fit it and save it.

    The training frame is read from ``data_path``; every column except
    ``target`` is used as a feature. Each candidate classifier is wrapped in a
    pipeline with a most-frequent imputer and scored with 3-fold
    cross-validation (``roc_auc``). The pipeline with the highest mean score is
    fitted on the full data and dumped to ``<path_to_save>/best_model_ever.pkl``.

    Args:
        data_path: CSV file with the training data (first column is the index).
        target: Name of the label column inside the CSV.
        path_to_save: Directory that holds the pickled candidates and receives
            the fitted winner.

    Returns:
        The fitted winning pipeline (imputer + classifier), i.e. exactly the
        object that was cross-validated.

    Raises:
        RuntimeError: If no candidate produced a comparable (non-NaN) score.
    """
    # Load the training sample used to fit the candidates and choose the best one.
    data = pd.read_csv(data_path, index_col=0)

    X = data.drop(target, axis=1)
    y = data[target]

    # The full frame is no longer needed once features and labels are split.
    del data

    candidate_files = (
        'XGBClassifier_cr_sc.pkl',
        'CatClassifier_cr_sc.pkl',
        'LGBMClassifier_cr_sc.pkl',
    )

    models = []
    for candidate_file in candidate_files:
        # NOTE(security): dill.load can execute arbitrary code while unpickling;
        # only load model files that come from a trusted source.
        with open(f'{path_to_save}/{candidate_file}', 'rb') as file:
            models.append(dill.load(file))

    best_score = float('-inf')
    best_pipe = None
    best_model = None

    for model in models:

        pipe = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('classifier', model)
        ])

        mean_score = cross_val_score(pipe, X, y, cv=3, scoring='roc_auc').mean()

        # A NaN mean never compares greater, so failed candidates are skipped.
        if mean_score > best_score:
            best_score = mean_score
            # Keep the whole pipeline: the imputer is part of what was scored.
            best_pipe = pipe
            best_model = model

    if best_pipe is None:
        raise RuntimeError('No candidate model produced a valid roc_auc score')

    # Report the winner, not whichever model the loop happened to end on.
    logging.info(f'Best Model: {best_model}, roc_auc_mean: {best_score}')

    print(f'Best Model: {best_model}, roc_auc_mean: {best_score}')

    best_pipe.fit(X, y)

    model_filename_save = f'{path_to_save}/best_model_ever.pkl'

    with open(model_filename_save, 'wb') as file:
        dill.dump(best_pipe, file)

    return best_pipe

In [3]:
class PipeLine(object):
    """Select, fit, persist and apply the best of three pickled classifiers.

    Attributes are set in ``__init__``:
        pipe: The fitted winning pipeline, or None until ``pipeline_fit`` ran.
        target: Name of the label column in the training CSV.
        path: Path to the training CSV.
        path_to_save: Directory with the pickled candidates; the fitted winner
            is written there as ``best_model_ever.pkl``.
        best_score: Mean cross-validated ROC AUC of the winner, or None until
            ``pipeline_fit`` ran.
    """

    # File names of the pickled candidate classifiers inside ``path_to_save``.
    _CANDIDATE_FILES = (
        'XGBClassifier_cr_sc.pkl',
        'CatClassifier_cr_sc.pkl',
        'LGBMClassifier_cr_sc.pkl',
    )

    def __init__(self, target, path, path_to_save_model):
        """Store the configuration; nothing is loaded or fitted here."""
        self.pipe = None
        self.target = target
        self.path = path
        self.path_to_save = path_to_save_model
        self.best_score = None

    def print_info(self):
        """Print the configured target column and training-data path."""
        print(self.target, self.path)

    def _load_candidates(self):
        """Unpickle the candidate classifiers from ``self.path_to_save``."""
        models = []
        for candidate_file in self._CANDIDATE_FILES:
            # NOTE(security): dill.load can execute arbitrary code while
            # unpickling; only load model files from a trusted source.
            with open(f'{self.path_to_save}/{candidate_file}', 'rb') as file:
                models.append(dill.load(file))
        return models

    def pipeline_fit(self) -> 'Pipeline':
        """Cross-validate the candidates, fit the best one and save it.

        Each candidate is wrapped in a pipeline with a most-frequent imputer
        and scored with 3-fold cross-validation (``roc_auc``). The pipeline
        with the highest mean score is fitted on the full training data,
        stored in ``self.pipe`` (its score in ``self.best_score``) and dumped
        to ``<path_to_save>/best_model_ever.pkl``.

        Returns:
            The fitted winning pipeline (imputer + classifier).

        Raises:
            RuntimeError: If no candidate produced a comparable (non-NaN) score.
        """
        print(f'Starting fitting for {self.target} in {self.path}')

        data = pd.read_csv(self.path, index_col=0)

        X = data.drop(self.target, axis=1)
        y = data[self.target]

        # The full frame is no longer needed once features and labels are split.
        del data

        best_score = float('-inf')
        best_pipe = None
        best_model = None

        for model in self._load_candidates():

            pipe = Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('classifier', model)
            ])

            mean_score = cross_val_score(pipe, X, y, cv=3, scoring='roc_auc').mean()

            # A NaN mean never compares greater, so failed candidates are skipped.
            if mean_score > best_score:
                best_score = mean_score
                # Keep the whole pipeline: the imputer is part of what was scored.
                best_pipe = pipe
                best_model = model

        if best_pipe is None:
            raise RuntimeError('No candidate model produced a valid roc_auc score')

        # Report the winner, not whichever model the loop happened to end on.
        print(f'Best Model: {best_model}, roc_auc_mean: {best_score}')

        best_pipe = best_pipe.fit(X, y)

        self.pipe = best_pipe
        self.best_score = best_score

        print(f'Model {best_model} fitting done')

        model_filename_save = f'{self.path_to_save}/best_model_ever.pkl'

        with open(model_filename_save, 'wb') as file:
            dill.dump(best_pipe, file)

        return self.pipe

    def predict(self, path_to_predict_data, save_to_path, target_path, path_to_save):
        """Preprocess raw data with ``dd.modify_data`` and predict with the fitted model.

        Args:
            path_to_predict_data: Passed to ``dd.modify_data`` as ``path_to_dataset``.
            save_to_path: Forwarded to ``dd.modify_data`` unchanged.
            target_path: Forwarded to ``dd.modify_data`` unchanged.
            path_to_save: Forwarded to ``dd.modify_data`` unchanged.

        Returns:
            Whatever ``self.pipe.predict`` returns for the preprocessed frame
            with the ``id`` column removed.

        Raises:
            RuntimeError: If called before ``pipeline_fit``.
        """
        # Fail early and clearly instead of after the expensive preprocessing.
        if self.pipe is None:
            raise RuntimeError('Call pipeline_fit() before predict(): no fitted model is available')

        # NOTE(review): `dd` is not imported anywhere in the visible notebook;
        # the project module must be imported before this method is called.
        predict_data = dd.modify_data(path_to_dataset=path_to_predict_data,
                                      num_parts_to_preprocess_at_once=1,
                                      num_parts_total=3,
                                      save_to_path=save_to_path,
                                      target_path=target_path,
                                      path_to_save=path_to_save)

        # `id` is an identifier, not a feature the model was trained on.
        predict_data = predict_data.drop(['id'], axis=1)

        display(predict_data.head(5))

        predictions = self.pipe.predict(predict_data)

        return predictions
        

        
        
        
        

In [4]:
# Configure the wrapper: label column, training CSV and the directory holding the pickled models.
pipe = PipeLine(
    target='flag',
    path='train_data_for_pipe.csv',
    path_to_save_model='pickle_model',
)

In [None]:
# Cross-validate the candidate models, fit the winner and keep the returned fitted estimator.
best_best = pipe.pipeline_fit()

Starting fitting for flag in train_data_for_pipe.csv


  mask |= (ar1 == a)
Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 363ms	remaining: 30m 13s
1000:	total: 2m 56s	remaining: 11m 43s
2000:	total: 5m 37s	remaining: 8m 25s
3000:	total: 7m 58s	remaining: 5m 18s
4000:	total: 10m 14s	remaining: 2m 33s
4999:	total: 12m 34s	remaining: 0us


In [None]:
# Predict through the PipeLine wrapper: it preprocesses the raw data and then
# applies the fitted model. `best_pipe` is never defined at notebook scope, and
# these keyword arguments belong to PipeLine.predict, not to an estimator.
prediction = pipe.predict(path_to_predict_data='data_for_pipe/train_data',
                          save_to_path='data_for_pipe/preprocess_train_data',
                          target_path='data_for_pipe/process_data',
                          path_to_save='data_for_pipe/preprocess_train_data')