In [349]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from datetime import datetime
import dill
from sklearn import set_config
from joblib import dump

In [350]:
# Read the raw training set, then separate the "Unusual" label column from the features.
train_data = pd.read_csv(
    '../../data/raw/train.csv',
    encoding='unicode_escape',
)
y_train = train_data.loc[:, "Unusual"]           # labels
X_train = train_data.drop(columns=["Unusual"])   # everything except the label
X_train.head()

Unnamed: 0,Time,CellName,PRBUsageUL,PRBUsageDL,meanThr_DL,meanThr_UL,maxThr_DL,maxThr_UL,meanUE_DL,meanUE_UL,maxUE_DL,maxUE_UL,maxUE_UL+DL
0,8:30,9ALTE,3.537,0.808,0.148,0.013,4.236,0.111,1.051,1.021,3.0,2.0,5
1,18:15,4CLTE,15.259,1.819,0.457,0.039,82.104,0.56,1.172,1.112,5.0,4.0,9
2,7:15,6ALTE,3.335,0.909,0.448,0.032,31.147,0.849,1.071,0.01,3.0,2.0,5
3,21:30,7WLTE,2.728,5.154,0.786,0.085,19.737,1.408,1.374,0.01,6.0,4.0,10
4,18:45,6ULTE,2.526,5.558,1.936,0.116,37.587,1.253,1.445,0.01,6.0,4.0,10


In [351]:
# Define custom transformers
class NullEncoder(BaseEstimator, TransformerMixin):
    """Replace every cell equal to the marker string ``'#Â¡VALOR!'`` with ``NaN``.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only compares cells against the marker.

    NOTE(review): the marker looks like the mis-decoded form of ``'#¡VALOR!'``;
    presumably it has to match whatever encoding the data was read with --
    confirm before changing either one.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which marker cells are ``NaN``.

        Parameters
        ----------
        X : array-like supporting ``.copy()`` and boolean-mask assignment
            (e.g. a pandas DataFrame).

        Returns
        -------
        Same type as ``X``; the object passed in is left unmodified.
        """
        # Work on a copy: the original implementation wrote NaNs straight into
        # the caller's object, silently altering the raw data on every call.
        X = X.copy()
        X[X == '#Â¡VALOR!'] = np.nan
        return X
    
class TimeConversation(BaseEstimator, TransformerMixin):
    """Placeholder pipeline step for the ``Time`` column.

    Parsing of ``Time`` (``pd.to_datetime`` with format ``'%H:%M'``) is
    currently disabled, so this step hands the data through untouched.
    It takes no constructor arguments.
    """

    def fit(self, X, y=None):
        """Stateless: there is nothing to learn."""
        return self

    def transform(self, X):
        """Return ``X`` exactly as received."""
        return X
    
class ObjectToFloat(BaseEstimator, TransformerMixin):
    """Cast the ``'maxUE_UL+DL'`` column to ``float``.

    The transformer is stateless; ``fit`` learns nothing.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``'maxUE_UL+DL'`` column is float.

        Parameters
        ----------
        X : pandas.DataFrame containing a ``'maxUE_UL+DL'`` column.

        Returns
        -------
        pandas.DataFrame
            A new frame; the frame passed in is left unmodified.

        Raises
        ------
        KeyError
            If the column is missing.
        ValueError
            If a value in the column cannot be converted by ``astype(float)``.
        """
        # Copy first: the original implementation overwrote the column on the
        # caller's frame, changing the raw data as a side effect.
        X = X.copy()
        X['maxUE_UL+DL'] = X['maxUE_UL+DL'].astype(float)
        return X
    
class OneHotEncoding(BaseEstimator, TransformerMixin):
    """Replace the ``'CellName'`` column with one-hot indicator columns.

    The category set is learned in ``fit`` and reused by ``transform``, so
    every frame transformed afterwards gets the same indicator columns in the
    same order. Categories not seen during ``fit`` produce an all-zero row
    (``handle_unknown='ignore'``) instead of an error.

    Attributes
    ----------
    encoder_ : OneHotEncoder
        The encoder fitted on ``X[['CellName']]``; set by ``fit``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the ``'CellName'`` categories present in ``X``."""
        self.encoder_ = OneHotEncoder(handle_unknown='ignore')
        self.encoder_.fit(X[['CellName']])
        return self

    def transform(self, X):
        """Return ``X`` without ``'CellName'`` plus its one-hot columns.

        Parameters
        ----------
        X : pandas.DataFrame containing a ``'CellName'`` column.

        Returns
        -------
        pandas.DataFrame
            The remaining original columns followed by the indicator columns,
            indexed like ``X``. The frame passed in is left unmodified.
        """
        encoder = getattr(self, 'encoder_', None)
        if encoder is None:
            # Backward compatibility: earlier versions encoded inside
            # ``transform`` without requiring ``fit``. Keep that working by
            # fitting a throw-away encoder on the frame at hand.
            encoder = OneHotEncoder().fit(X[['CellName']])

        indicators = encoder.transform(X[['CellName']])

        # Reuse X's index: ``pd.concat(axis=1)`` aligns on index labels, so a
        # default RangeIndex would misalign rows (and introduce NaNs) whenever
        # X is not indexed 0..n-1, e.g. after a train/test split.
        indicator_df = pd.DataFrame(
            indicators.toarray(),
            columns=encoder.get_feature_names_out(['CellName']),
            index=X.index,
        )

        return pd.concat([X.drop(columns=['CellName']), indicator_df], axis=1)

class Scaling(BaseEstimator, TransformerMixin):
    """Standardize the float columns of a DataFrame with ``StandardScaler``.

    The float columns and their scaling statistics are learned in ``fit`` and
    reused by ``transform``, so new data is scaled with the statistics of the
    data the step was fitted on rather than with its own.

    Attributes
    ----------
    float_cols_ : list
        Names of the float columns found during ``fit``.
    scaler_ : StandardScaler or None
        Scaler fitted on those columns; ``None`` when there were none.
    """

    def __init__(self):
        pass

    @staticmethod
    def _learn(X):
        """Return ``(float column names, scaler fitted on them or None)``."""
        float_cols = X.select_dtypes(include=['float']).columns.tolist()
        # StandardScaler rejects input with zero features, so skip it then.
        scaler = StandardScaler().fit(X[float_cols]) if float_cols else None
        return float_cols, scaler

    def fit(self, X, y=None):
        """Record the float columns of ``X`` and fit the scaler on them."""
        self.float_cols_, self.scaler_ = self._learn(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the float columns standardized.

        Parameters
        ----------
        X : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
            A new frame; the frame passed in is left unmodified. When there
            are no float columns the copy is returned unchanged.
        """
        if hasattr(self, 'float_cols_'):
            float_cols, scaler = self.float_cols_, self.scaler_
        else:
            # Backward compatibility: earlier versions scaled inside
            # ``transform`` without requiring ``fit``; fall back to statistics
            # computed from the frame at hand.
            float_cols, scaler = self._learn(X)

        # Copy first: the original implementation overwrote the columns on the
        # caller's frame.
        X = X.copy()
        if float_cols:
            X[float_cols] = scaler.transform(X[float_cols])
        return X

In [352]:
# Step order matters: Scaling standardizes every float column, and the one-hot
# indicator columns are floats too, so OneHotEncoding has to come after Scaling
# for the indicators to stay 0/1.
pipe = Pipeline([
    ("SetNull", NullEncoder()),
    ("TimeConversion", TimeConversation()),
    ("ObjectToFloat", ObjectToFloat()),
    ("Scaling", Scaling()),
    ("OneHotEncoding", OneHotEncoding()),
])
# Fit on a copy so the raw X_train stays untouched regardless of whether a
# step modifies the frame it is given.
pipe.fit(X_train.copy())

In [353]:
# Save the fitted pipeline to disk with joblib's `dump`; the call returns the
# list of file names it wrote, which is what the cell displays.
# NOTE(review): the custom transformer classes are defined in this notebook, so
# loading the file elsewhere presumably needs the same class definitions to be
# available -- confirm before relying on it outside this notebook.
dump(pipe, 'PreprocessingPipeline2.0.pkl')

['PreprocessingPipeline2.0.pkl']

In [354]:
# Switch scikit-learn's estimator display setting to "diagram", then show the
# pipeline as the cell output.
set_config(display="diagram")
pipe

In [355]:
# Fit and apply the whole pipeline to the training features and preview the result.
# A copy is passed so the raw X_train stays untouched regardless of whether a
# step modifies the frame it is given.
data = pipe.fit_transform(X_train.copy())
data

Unnamed: 0,Time,PRBUsageUL,PRBUsageDL,meanThr_DL,meanThr_UL,maxThr_DL,maxThr_UL,meanUE_DL,meanUE_UL,maxUE_DL,maxUE_UL,maxUE_UL+DL,CellName_10ALTE,CellName_10BLTE,CellName_10CLTE,CellName_1ALTE,CellName_1BLTE,CellName_1CLTE,CellName_2ALTE,CellName_3ALTE,CellName_3BLTE,CellName_3CLTE,CellName_4ALTE,CellName_4BLTE,CellName_4CLTE,CellName_5ALTE,CellName_5BLTE,CellName_5CLTE,CellName_6ALTE,CellName_6BLTE,CellName_6CLTE,CellName_6ULTE,CellName_6VLTE,CellName_6WLTE,CellName_7ALTE,CellName_7BLTE,CellName_7CLTE,CellName_7ULTE,CellName_7VLTE,CellName_7WLTE,CellName_8ALTE,CellName_8BLTE,CellName_8CLTE,CellName_9ALTE,CellName_9BLTE
0,8:30,-0.508567,-0.600417,-0.578625,-0.312281,-0.860451,-0.336988,-0.569604,0.665886,-0.669510,-0.765168,-0.736426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,18:15,0.889801,-0.127242,-0.143745,-0.164396,4.092399,-0.247547,-0.003490,0.835839,0.457255,0.676191,0.572713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7:15,-0.532664,-0.553146,-0.156412,-0.204211,0.851243,-0.189979,-0.476031,-1.222273,-0.669510,-0.765168,-0.736426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,21:30,-0.605076,1.433628,0.319282,0.097245,0.125501,-0.078626,0.941593,-1.222273,1.020637,0.676191,0.899998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,18:45,-0.629173,1.622711,1.937767,0.273569,1.260863,-0.109502,1.273776,-1.222273,1.020637,0.676191,0.899998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25827,19:45,2.036934,0.325340,0.343208,0.137060,1.063177,-0.140776,0.749769,-1.222273,1.584020,0.676191,1.227283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25828,15:30,0.407613,0.345466,0.645794,-0.062015,1.230078,-0.187987,-0.097062,0.759267,0.457255,-0.044488,0.245429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25829,20:45,-0.195181,-0.600417,-0.394259,-0.158709,-0.418137,-0.250336,-0.335672,0.684562,-0.106127,-0.765168,-0.409141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25830,19:30,0.009767,-0.505407,-0.501219,-0.221275,0.535313,0.123561,-0.190635,0.665886,-0.106127,-0.765168,-0.409141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
