In [346]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from datetime import datetime
import dill
from sklearn import set_config
import pickle

In [363]:
train_data = pd.read_csv('../../data/raw/train.csv',  encoding= 'unicode_escape')
y_train = train_data["Unusual"]                      #defining the labels
X_train = train_data.drop(["Unusual"], axis=1)
X_train.head()

Data_df = pd.read_csv('../../data/raw/data.csv',  encoding= 'unicode_escape')
Data = Data_df.drop(["Unusual"], axis=1)
Data['CellName'].unique()


array(['3BLTE', '1BLTE', '9BLTE', '4ALTE', '10BLTE', '9ALTE', '4BLTE',
       '4CLTE', '6CLTE', '5CLTE', '7BLTE', '8CLTE', '7ULTE', '6WLTE',
       '7VLTE', '7WLTE', '5ALTE', '6ALTE', '6ULTE', '3CLTE', '5BLTE',
       '8ALTE', '8BLTE', '6BLTE', '10CLTE', '7CLTE', '3ALTE', '1CLTE',
       '2ALTE', '10ALTE', '1ALTE', '6VLTE', '7ALTE'], dtype=object)

In [364]:
# Define custom transformers
class NullEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X.replace(['#¡VALOR!'], np.nan, inplace=True)
        X.replace(['#Â¡VALOR!'], np.nan, inplace=True)
        return X
    
class TimeConversation(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # convert time to seconds
        X['Time'] = X['Time'].astype(str) + ':00'
        X['Time'] = pd.to_timedelta(X['Time']).dt.total_seconds()
        return X
    
class ObjectToFloat(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X['maxUE_UL+DL'] = X['maxUE_UL+DL'].astype(float)
        return X
    
class OneHotEncoding(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # get unique values of column 'B' as a NumPy array
        categories = np.unique(Data['CellName'])
        # Initialize the OneHotEncoder object
        one_hot_encoder = OneHotEncoder(categories=[categories])

        # Fit and transform the categorical variable using OneHotEncoder
        one_hot_encoded = one_hot_encoder.fit_transform(X[['CellName']])

        # Convert the sparse matrix to a dense array and create a new dataframe
        one_hot_encoded_df = pd.DataFrame(one_hot_encoded.toarray(), columns=one_hot_encoder.get_feature_names_out(['CellName']))

        # Concatenate the one hot encoded dataframe with the original dataframe
        X = pd.concat([X, one_hot_encoded_df], axis=1)

        return X.drop(["CellName"], axis=1)

class Scaling(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Initialize the StandardScaler object
        scaler = StandardScaler()

        # Select only the float columns to scale
        float_cols = X.select_dtypes(include=['float']).columns.tolist()

        # Scale the float columns using StandardScaler and replace the original data with the scaled data
        X[float_cols] = scaler.fit_transform(X[float_cols])
        return X

In [365]:
pipe = Pipeline([
("SetNull",NullEncoder()),
("TimeConversion",TimeConversation()),
("ObjectToFloat",ObjectToFloat()),
("Scaling",Scaling()),
("OneHotEncoding",OneHotEncoding())  
])
pipe.fit(X_train)

In [366]:
data = pipe.fit_transform(X_train)
data

Unnamed: 0,Time,PRBUsageUL,PRBUsageDL,meanThr_DL,meanThr_UL,maxThr_DL,maxThr_UL,meanUE_DL,meanUE_UL,maxUE_DL,...,CellName_7BLTE,CellName_7CLTE,CellName_7ULTE,CellName_7VLTE,CellName_7WLTE,CellName_8ALTE,CellName_8BLTE,CellName_8CLTE,CellName_9ALTE,CellName_9BLTE
0,0.004636,-0.508567,-0.600417,-0.578625,-0.312281,-0.860451,-0.336988,-0.569604,0.665886,-0.669510,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.005244,0.889801,-0.127242,-0.143745,-0.164396,4.092399,-0.247547,-0.003490,0.835839,0.457255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.004558,-0.532664,-0.553146,-0.156412,-0.204211,0.851243,-0.189979,-0.476031,-1.222273,-0.669510,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.547800,-0.605076,1.433628,0.319282,0.097245,0.125501,-0.078626,0.941593,-1.222273,1.020637,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.547629,-0.629173,1.622711,1.937767,0.273569,1.260863,-0.109502,1.273776,-1.222273,1.020637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25827,1.547691,2.036934,0.325340,0.343208,0.137060,1.063177,-0.140776,0.749769,-1.222273,1.584020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25828,0.005073,0.407613,0.345466,0.645794,-0.062015,1.230078,-0.187987,-0.097062,0.759267,0.457255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25829,1.547753,-0.195181,-0.600417,-0.394259,-0.158709,-0.418137,-0.250336,-0.335672,0.684562,-0.106127,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25830,1.547675,0.009767,-0.505407,-0.501219,-0.221275,0.535313,0.123561,-0.190635,0.665886,-0.106127,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [367]:
#dill.dump(pipe, 'PreprocessingPipeline2.0.joblib')

with open('PreprocessingPipeline2.0.pkl', 'wb') as f:
    dill.dump(pipe, f)

In [328]:
dump(pipe, 'PreprocessingPipeline2.0.pkl')

['PreprocessingPipeline2.0.pkl']

In [101]:
set_config(display="diagram")
pipe

In [17]:
data = pipe.fit_transform(X_train)
data

        Time CellName  PRBUsageUL  PRBUsageDL  meanThr_DL  meanThr_UL  \
0       8:30    9ALTE   -0.508567   -0.600417   -0.578625   -0.312281   
1      18:15    4CLTE    0.889801   -0.127242   -0.143745   -0.164396   
2       7:15    6ALTE   -0.532664   -0.553146   -0.156412   -0.204211   
3      21:30    7WLTE   -0.605076    1.433628    0.319282    0.097245   
4      18:45    6ULTE   -0.629173    1.622711    1.937767    0.273569   
...      ...      ...         ...         ...         ...         ...   
25827  19:45    6BLTE    2.036934    0.325340    0.343208    0.137060   
25828  15:30    3CLTE    0.407613    0.345466    0.645794   -0.062015   
25829  20:45    5ALTE   -0.195181   -0.600417   -0.394259   -0.158709   
25830  19:30    4ALTE    0.009767   -0.505407   -0.501219   -0.221275   
25831   3:45    3BLTE   -0.443790   -0.000406    0.495204   -0.181460   

       maxThr_DL  maxThr_UL  meanUE_DL  meanUE_UL  maxUE_DL  maxUE_UL  \
0      -0.860451  -0.336988  -0.569604   0.665886 

Unnamed: 0,Time,PRBUsageUL,PRBUsageDL,meanThr_DL,meanThr_UL,maxThr_DL,maxThr_UL,meanUE_DL,meanUE_UL,maxUE_DL,...,CellName_7BLTE,CellName_7CLTE,CellName_7ULTE,CellName_7VLTE,CellName_7WLTE,CellName_8ALTE,CellName_8BLTE,CellName_8CLTE,CellName_9ALTE,CellName_9BLTE
0,8:30,-0.508567,-0.600417,-0.578625,-0.312281,-0.860451,-0.336988,-0.569604,0.665886,-0.669510,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,18:15,0.889801,-0.127242,-0.143745,-0.164396,4.092399,-0.247547,-0.003490,0.835839,0.457255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7:15,-0.532664,-0.553146,-0.156412,-0.204211,0.851243,-0.189979,-0.476031,-1.222273,-0.669510,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,21:30,-0.605076,1.433628,0.319282,0.097245,0.125501,-0.078626,0.941593,-1.222273,1.020637,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,18:45,-0.629173,1.622711,1.937767,0.273569,1.260863,-0.109502,1.273776,-1.222273,1.020637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25827,19:45,2.036934,0.325340,0.343208,0.137060,1.063177,-0.140776,0.749769,-1.222273,1.584020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25828,15:30,0.407613,0.345466,0.645794,-0.062015,1.230078,-0.187987,-0.097062,0.759267,0.457255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25829,20:45,-0.195181,-0.600417,-0.394259,-0.158709,-0.418137,-0.250336,-0.335672,0.684562,-0.106127,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25830,19:30,0.009767,-0.505407,-0.501219,-0.221275,0.535313,0.123561,-0.190635,0.665886,-0.106127,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
