In [1]:
import numpy as np
# import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2
import pickle
from sklearn import set_config
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
train_data = pd.read_csv('../../data/raw/train.csv',  encoding= 'unicode_escape')
train_data = train_data.loc[:, ~train_data.columns.str.contains('^Unnamed')]
y_train = train_data["Unusual"]                      #defining the labels
X_train = train_data.drop(["Unusual"], axis=1)

test_data = pd.read_csv('../../data/raw/test.csv',  encoding= 'unicode_escape')
test_data = test_data.loc[:, ~test_data.columns.str.contains('^Unnamed')]
y_test = test_data["Unusual"]                      #defining the labels
X_test = test_data.drop(["Unusual"], axis=1)
X_train

Unnamed: 0.1,Unnamed: 0,Time,CellName,PRBUsageUL,PRBUsageDL,meanThr_DL,meanThr_UL,maxThr_DL,maxThr_UL,meanUE_DL,meanUE_UL,maxUE_DL,maxUE_UL,maxUE_UL+DL
0,35217,8:30,9ALTE,3.537,0.808,0.148,0.013,4.236,0.111,1.051,1.021,3.0,2.0,5
1,507,18:15,4CLTE,15.259,1.819,0.457,0.039,82.104,0.560,1.172,1.112,5.0,4.0,9
2,6074,7:15,6ALTE,3.335,0.909,0.448,0.032,31.147,0.849,1.071,0.010,3.0,2.0,5
3,12206,21:30,7WLTE,2.728,5.154,0.786,0.085,19.737,1.408,1.374,0.010,6.0,4.0,10
4,29773,18:45,6ULTE,2.526,5.558,1.936,0.116,37.587,1.253,1.445,0.010,6.0,4.0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25827,32081,19:45,6BLTE,24.875,2.786,0.803,0.092,34.479,1.096,1.333,0.010,7.0,4.0,11
25828,7259,15:30,3CLTE,11.217,2.829,1.018,0.057,37.103,0.859,1.152,1.071,5.0,3.0,8
25829,21584,20:45,5ALTE,6.164,0.808,0.279,0.040,11.190,0.546,1.101,1.031,4.0,2.0,6
25830,36543,19:30,4ALTE,7.882,1.011,0.203,0.029,26.180,2.423,1.132,1.021,4.0,2.0,6


In [44]:
# Define custom transformers
class CustomTransformer:
    def __init__(self, parameter):
        self.parameter = parameter
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # perform custom transformation
        return X
# Define a function that converts the 'Time' column to datetime type
def convert_to_datetime(X_train):
    X_train['Time'] = pd.to_datetime(X_train['Time']).dt.time
    return X_train
#     def convert_to_float(dt):
#         return dt.timestamp()

# apply the conversion function to the date_time column
#    X_train['Time'] = X_train['date_time'].apply(convert_to_float)
#    X_train.drop(['date_time'],inplace=True)
#    return X_train

In [55]:
import pandas as pd

def modify_time_column(df):
    df['Time'] = pd.to_datetime(df['Time']).dt.time
    return df

modified_df = modify_time_column(X_train)
X_train

Unnamed: 0.1,Unnamed: 0,Time,CellName,PRBUsageUL,PRBUsageDL,meanThr_DL,meanThr_UL,maxThr_DL,maxThr_UL,meanUE_DL,meanUE_UL,maxUE_DL,maxUE_UL,maxUE_UL+DL
0,35217,08:30:00,9ALTE,3.537,0.808,0.148,0.013,4.236,0.111,1.051,1.021,3.0,2.0,5
1,507,18:15:00,4CLTE,15.259,1.819,0.457,0.039,82.104,0.560,1.172,1.112,5.0,4.0,9
2,6074,07:15:00,6ALTE,3.335,0.909,0.448,0.032,31.147,0.849,1.071,0.010,3.0,2.0,5
3,12206,21:30:00,7WLTE,2.728,5.154,0.786,0.085,19.737,1.408,1.374,0.010,6.0,4.0,10
4,29773,18:45:00,6ULTE,2.526,5.558,1.936,0.116,37.587,1.253,1.445,0.010,6.0,4.0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25827,32081,19:45:00,6BLTE,24.875,2.786,0.803,0.092,34.479,1.096,1.333,0.010,7.0,4.0,11
25828,7259,15:30:00,3CLTE,11.217,2.829,1.018,0.057,37.103,0.859,1.152,1.071,5.0,3.0,8
25829,21584,20:45:00,5ALTE,6.164,0.808,0.279,0.040,11.190,0.546,1.101,1.031,4.0,2.0,6
25830,36543,19:30:00,4ALTE,7.882,1.011,0.203,0.029,26.180,2.423,1.132,1.021,4.0,2.0,6


In [45]:
from sklearn.preprocessing import FunctionTransformer


dt_feature = ['Time']
date_time_transformer = Pipeline([
    ('datetime_conversion', FunctionTransformer(convert_to_datetime))
])

In [46]:
numeric_features = X_train.select_dtypes(exclude=['object']).columns.tolist()

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

In [47]:
# Create a list of all possible categories across both datasets
categories = pd.concat([X_train, X_test])['CellName'].unique()
categorical_features = ['CellName']
categorical_transformer = Pipeline(
    steps=[
        #("encoder", OneHotEncoder(handle_unknown="ignore"))
        ('encoder', OneHotEncoder(categories=[categories]))
    ]
)

In [48]:
# time = ['Time']
# text_transformer = Pipeline(steps=[
#     ('countvec', CountVectorizer())
# ])

In [49]:
preprocessor = ColumnTransformer(
    transformers=[
        ("dt", date_time_transformer, dt_feature),
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        
    ],
    remainder='drop'
)

In [50]:
# fit the preprocessor on the data
preprocessor.fit(X_train)
print(preprocessor)

TypeError: float() argument must be a string or a number, not 'datetime.time'

In [None]:
with open('PreprocessingPipeline.pkl', 'wb') as f:
    # write the preprocessor object to the file
    pickle.dump(preprocessor, f)

In [None]:
set_config(display="diagram")
preprocessor

In [None]:
#X_processed = preprocess(X_test)
#print(X_processed.shap)
#column_names = pipe.named_steps['preprocessor'].transformers_[1][1]\
#   .named_steps["encoder"].get_feature_names(categorical_features)
# Create new pandas dataframe with transformed data and column names
#df_transformed = pd.DataFrame(X_processed, columns=np.concatenate((column_names, numeric_features)))