## Check Preprocessing Pipeline Module

In [83]:
import sys

# Put the project root on the import path so the `src.*` imports used by the
# following cells can be resolved from inside the notebook.
# NOTE(review): hard-coded, machine-specific absolute path — adjust when
# running on another machine.
sys.path.append('/Users/manueljohn/Training/github-projects/bike-demand-prediction/')

In [56]:
import os
from src.components.data_cleaner import clean_col_names, NullValueImputer

from src.components.feature_extractor import SkewDiscretizer, CategoricalEncoder, LagFeatureCreator
from src.components.feature_extractor import extract_date_features, remove_multicollinear_features

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer 
from sklearn.base import BaseEstimator, TransformerMixin


class PreProcessPipeline:
    """Assemble the data-cleaning and feature-engineering pipelines.

    Each transformer and each pipeline is kept on the instance after it has
    been built, so the individual pieces stay reachable by attribute name.
    Every attribute is ``None`` until the corresponding ``create_*`` /
    ``get_*`` method has run.
    """

    def __init__(self):
        # Cleaning-stage transformers.
        self.clean_col_transformer = self.null_value_imputer = None

        # Feature-engineering transformers.
        self.skew_discretizer = self.multicollinear_transformer = None
        self.categorical_encoder = self.date_features_transformer = None
        self.lag_features_transformer = None

        # Assembled pipelines.
        self.cleaning_pipeline = self.feature_transformer_pipeline = None
        self.preprocessing_pipeline = None

    def create_cleaning_pipeline(self):
        """Build, store and return the two-step cleaning pipeline.

        Steps, in order: ``clean_col_names`` wrapped in a
        ``FunctionTransformer``, then a ``NullValueImputer``.
        """
        col_cleaner = FunctionTransformer(func=clean_col_names)
        imputer = NullValueImputer()

        self.clean_col_transformer = col_cleaner
        self.null_value_imputer = imputer

        cleaning_steps = [
            ('clean_col_transformer', col_cleaner),
            ('imputer', imputer),
        ]
        self.cleaning_pipeline = Pipeline(cleaning_steps)
        return self.cleaning_pipeline

    def create_feature_pipeline(self):
        """Build, store and return the feature-transformation pipeline.

        The five steps run in the order listed below; the lag step is
        created with ``lag_hours=24``.
        """
        feature_steps = [
            ('skew_discretizer', SkewDiscretizer()),
            ('multicollinear_transformer',
             FunctionTransformer(func=remove_multicollinear_features)),
            ('categorical_encoder', CategoricalEncoder()),
            ('date_features_transformer',
             FunctionTransformer(func=extract_date_features)),
            ('lag_features_transformer', LagFeatureCreator(lag_hours=24)),
        ]

        # Each step name is also the name of the instance attribute that
        # holds the transformer, so the same list drives both.
        for step_name, transformer in feature_steps:
            setattr(self, step_name, transformer)

        self.feature_transformer_pipeline = Pipeline(feature_steps)
        return self.feature_transformer_pipeline

    def get_preprocessing_pipeline(self):
        """Rebuild both sub-pipelines and chain them into one pipeline.

        Returns the combined pipeline (cleaning first, then feature
        transformation); it is also stored as ``preprocessing_pipeline``.
        """
        cleaning = self.create_cleaning_pipeline()
        features = self.create_feature_pipeline()

        self.preprocessing_pipeline = Pipeline([
            ('cleaning_pipeline', cleaning),
            ('feature_transform_pipeline', features),
        ])
        return self.preprocessing_pipeline

In [84]:
import cloudpickle

def save_pipeline_components(pipeline_obj, transformer_root, pipeline_root):
    """Pickle every instance attribute of ``pipeline_obj`` to its own file.

    Attributes that are ``Pipeline`` instances are written to
    ``pipeline_root``; every other attribute is written to
    ``transformer_root``.  Each file is named ``<attribute name>.pkl`` and is
    serialized with ``cloudpickle``.

    Args:
        pipeline_obj: Object whose instance attributes (``vars(pipeline_obj)``)
            are saved, one file per attribute.
        transformer_root: Directory for non-``Pipeline`` attributes.  Created
            if missing; a trailing path separator is optional.
        pipeline_root: Directory for ``Pipeline`` attributes.  Created if
            missing; a trailing path separator is optional.

    Returns:
        None.

    Note:
        Attributes are saved whatever their value, so an attribute that is
        still ``None`` is pickled as ``None`` under ``transformer_root``.
    """
    # Create the target directories themselves.  Going through
    # os.path.dirname() would only create the *parent* whenever a root is
    # passed without a trailing separator, making the open() below fail.
    os.makedirs(transformer_root, exist_ok=True)
    os.makedirs(pipeline_root, exist_ok=True)

    for name, component in vars(pipeline_obj).items():
        if isinstance(component, Pipeline):
            target_root = pipeline_root
        else:
            target_root = transformer_root

        with open(os.path.join(target_root, f"{name}.pkl"), 'wb') as f:
            cloudpickle.dump(component, f)



import pandas as pd

# Load the raw training data.
# NOTE(review): hard-coded, machine-specific absolute path.
data = pd.read_csv('/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/raw_data/train_data.csv')
# Keep the builder object around: its attributes hold every individual
# transformer and sub-pipeline, which are saved to disk further down.
feature_pipeObj = PreProcessPipeline()
feature_pipe = feature_pipeObj.get_preprocessing_pipeline()
# Shape before preprocessing.
print(data.shape)

# Fit the full pipeline on the training data and replace `data` with the
# transformed result.
data = feature_pipe.fit_transform(data)

# Shape after preprocessing.
print(data.shape)


# Persist the (now fitted) transformers and pipelines, one pickle per attribute
# of `feature_pipeObj`.
save_pipeline_components(feature_pipeObj, pipeline_root='/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/pipeline-components/'
                         , transformer_root='/Users/manueljohn/Training/github-projects/bike-demand-prediction/artifacts/transformer-components/')

(7008, 14)
(6984, 40)
