In [26]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from datetime import datetime
import dill
from sklearn import set_config
import pickle

In [27]:
# Load the training split and separate the target column from the features.
train_data = pd.read_csv('../../data/raw/train.csv', encoding='unicode_escape')
y_train = train_data["Unusual"]                      # labels
X_train = train_data.drop(columns=["Unusual"])

# Load data.csv only to enumerate the CellName values; the resulting array is
# handed to the one-hot encoding step so its output columns are fixed up front.
Data_df = pd.read_csv('../../data/raw/data.csv', encoding='unicode_escape')
Data = Data_df.drop(columns=["Unusual"])

# Sorted unique values of the 'CellName' column as a NumPy array.
categories = np.unique(Data['CellName'])

In [28]:
# Define custom transformers
class NullEncoder(BaseEstimator, TransformerMixin):
    """Replace the '#¡VALOR!' placeholder strings with NaN.

    Two spellings are handled; the second looks like the first after a
    UTF-8/Latin-1 decoding mix-up (NOTE(review): presumably spreadsheet
    error markers - confirm against the raw data).

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self`` so the step can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the placeholder strings set to NaN.

        The input frame is left untouched, so running the step repeatedly
        on the same data never alters the caller's copy.
        """
        return X.replace(['#¡VALOR!', '#Â¡VALOR!'], np.nan)
    
class TimeConversation(BaseEstimator, TransformerMixin):
    """Convert the ``Time`` column into a number of seconds.

    Each value is cast to ``str``, suffixed with ``':00'``, parsed with
    ``pd.to_timedelta`` and expressed as total seconds.
    (NOTE(review): assumes the raw values look like 'HH:MM' - confirm.)

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self`` so the step can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new DataFrame whose ``Time`` column holds seconds.

        The input frame is not modified. Overwriting ``X['Time']`` in place
        would make the step non-repeatable: a second run on the same frame
        would see already-converted floats instead of the original strings.
        """
        as_duration = pd.to_timedelta(X['Time'].astype(str) + ':00')
        return X.assign(Time=as_duration.dt.total_seconds())
    
class ObjectToFloat(BaseEstimator, TransformerMixin):
    """Cast the ``maxUE_UL+DL`` column to ``float``.

    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self`` so the step can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new DataFrame with ``maxUE_UL+DL`` converted to float.

        The input frame is left unmodified. The column name is not a valid
        Python identifier, hence the ``**{...}`` form of ``assign``.
        """
        column = 'maxUE_UL+DL'
        return X.assign(**{column: X[column].astype(float)})
    
class OneHotEncoding(BaseEstimator, TransformerMixin):
    """One-hot encode the ``CellName`` column against a fixed category list.

    Parameters
    ----------
    categories : array-like
        The category values passed to ``OneHotEncoder`` for ``CellName``;
        they determine the generated indicator columns.
    """

    def __init__(self, categories):
        self.categories = categories

    def fit(self, X, y=None):
        """No-op; the categories are supplied at construction time."""
        return self

    def transform(self, X):
        """Return ``X`` with ``CellName`` replaced by its indicator columns.

        The encoder is built from ``self.categories`` (not from a notebook
        global), so the transformer keeps working after it has been
        serialized and loaded in a session where no such global exists.
        """
        # The category list is fixed, so fitting here only validates the
        # values present in X; the output columns do not depend on X.
        encoder = OneHotEncoder(categories=[self.categories])
        encoded = encoder.fit_transform(X[['CellName']])

        # Reuse X's index: with a default RangeIndex the concat below would
        # align on labels and introduce NaN rows whenever X is not 0..n-1.
        encoded_df = pd.DataFrame(
            encoded.toarray(),
            columns=encoder.get_feature_names_out(['CellName']),
            index=X.index,
        )

        return pd.concat([X.drop(columns=['CellName']), encoded_df], axis=1)

class Scaling(BaseEstimator, TransformerMixin):
    """Standardize the float columns of a DataFrame with ``StandardScaler``.

    ``fit`` records which columns are float and learns their scaling
    statistics; ``transform`` then applies those same statistics to any
    frame it receives, so new data is scaled consistently with the data
    the step was fitted on.

    Attributes set by ``fit``
    -------------------------
    float_cols_ : list of column labels selected by
        ``select_dtypes(include=['float'])``.
    scaler_ : fitted ``StandardScaler``, or ``None`` when there were no
        float columns.
    """

    def fit(self, X, y=None):
        """Learn the float columns of ``X`` and their scaling statistics."""
        self.float_cols_ = X.select_dtypes(include=['float']).columns.tolist()
        # StandardScaler rejects an empty feature set, so skip it entirely.
        self.scaler_ = (
            StandardScaler().fit(X[self.float_cols_]) if self.float_cols_ else None
        )
        return self

    def transform(self, X):
        """Return a scaled copy of ``X``; the input frame is not modified.

        If ``fit`` was never called, the step falls back to its previous
        behaviour and standardizes ``X`` using statistics computed from
        ``X`` itself, so existing direct ``transform`` calls keep working.
        """
        X = X.copy()

        if hasattr(self, 'float_cols_'):
            if self.float_cols_:
                X[self.float_cols_] = self.scaler_.transform(X[self.float_cols_])
            return X

        # Legacy path for an unfitted instance.
        float_cols = X.select_dtypes(include=['float']).columns.tolist()
        if float_cols:
            X[float_cols] = StandardScaler().fit_transform(X[float_cols])
        return X

In [29]:
# Preprocessing steps, listed in the order they run. Scaling comes before the
# one-hot step, so the indicator columns it adds are not standardized.
pipe = Pipeline(
    steps=[
        ("SetNull", NullEncoder()),
        ("TimeConversion", TimeConversation()),
        ("ObjectToFloat", ObjectToFloat()),
        ("Scaling", Scaling()),
        ("OneHotEncoding", OneHotEncoding(categories=categories)),
    ]
)
pipe.fit(X_train)

Pipeline(steps=[('SetNull', NullEncoder()),
                ('TimeConversion', TimeConversation()),
                ('ObjectToFloat', ObjectToFloat()), ('Scaling', Scaling()),
                ('OneHotEncoding',
                 OneHotEncoding(categories=array(['10ALTE', '10BLTE', '10CLTE', '1ALTE', '1BLTE', '1CLTE', '2ALTE',
       '3ALTE', '3BLTE', '3CLTE', '4ALTE', '4BLTE', '4CLTE', '5ALTE',
       '5BLTE', '5CLTE', '6ALTE', '6BLTE', '6CLTE', '6ULTE', '6VLTE',
       '6WLTE', '7ALTE', '7BLTE', '7CLTE', '7ULTE', '7VLTE', '7WLTE',
       '8ALTE', '8BLTE', '8CLTE', '9ALTE', '9BLTE'], dtype=object)))])

In [30]:
# Serialize the pipeline and the category array together, as a single
# (pipe, categories) tuple, using dill.
with open('PreprocessingPipeline2.0.pkl', 'wb') as pkl_file:
    dill.dump((pipe, categories), pkl_file)

In [None]:
# Fit the pipeline on the training features and keep the transformed result;
# the bare name on the last line shows it as the cell output.
data = pipe.fit_transform(X_train)
data

In [None]:
# Switch scikit-learn's estimator display to "diagram" mode, then show the
# pipeline as the cell output.
set_config(display="diagram")
pipe

In [None]:
# NOTE(review): this cell repeats the earlier `data = pipe.fit_transform(X_train)`
# cell verbatim and refits the pipeline on the same frame; consider removing
# one of the two.
data = pipe.fit_transform(X_train)
data

In [None]:
# Re-save the pipeline after the fit_transform run above.
# `dump` was never imported in this notebook, so the bare call raised
# NameError; use dill with an explicit binary file handle instead.
# The payload is the same (pipe, categories) tuple that the earlier save cell
# writes to this path, so code loading the file sees one consistent layout.
# NOTE(review): the original call passed only `pipe` - confirm the tuple is
# what the loader expects.
with open('PreprocessingPipeline2.0.pkl', 'wb') as f:
    dill.dump((pipe, categories), f)