In [None]:
import numpy as np
# import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2
import pickle
from sklearn import set_config
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the raw training split and separate the "Unusual" label from the features.
train_data = pd.read_csv('../../data/raw/train.csv', encoding='unicode_escape')
X_train = train_data.drop(columns=["Unusual"])
y_train = train_data["Unusual"]

# Same label/feature separation for the test split.
test_data = pd.read_csv('../../data/raw/test.csv', encoding='unicode_escape')
X_test = test_data.drop(columns=["Unusual"])
y_test = test_data["Unusual"]

# Show the training features as this cell's output.
X_train

In [None]:
# Define custom transformers
class CustomTransformer:
    """Placeholder transformer exposing ``fit`` and ``transform``.

    It stores one constructor argument and passes data through untouched.
    """

    def __init__(self, parameter):
        # Kept on the instance; nothing in this class reads it back.
        self.parameter = parameter

    def fit(self, X, y=None):
        """Learn nothing from ``X``/``y`` and return this same instance."""
        return self

    def transform(self, X):
        """Return ``X`` itself; the custom transformation is still a stub."""
        return X
# Define a function that converts the 'Time' column from date strings to numeric timestamps
def convert_to_datetime(X_train):
    """Replace the ``'Time'`` column with POSIX timestamps (float seconds).

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame containing a ``'Time'`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X_train`` in which ``'Time'`` holds the value of
        ``Timestamp.timestamp()`` for each parsed entry (pandas treats
        timezone-naive timestamps as UTC there). All other columns are
        carried over unchanged and the input frame is not modified.

    Raises
    ------
    KeyError
        If ``X_train`` has no ``'Time'`` column.
    """
    def convert_to_float(dt):
        return dt.timestamp()

    # Work on a copy so the caller's frame is never altered as a side effect.
    converted = X_train.copy()

    # Parse and convert in one step, without a temporary 'date_time' column.
    # The earlier implementation staged the parsed values in such a column and
    # removed it with drop(['date_time'], inplace=True); with no axis given
    # that call looks for a *row* labelled 'date_time' and raises KeyError.
    converted['Time'] = pd.to_datetime(converted['Time']).apply(convert_to_float)
    return converted

In [None]:
from sklearn.preprocessing import FunctionTransformer


# Column(s) routed through the datetime conversion step.
dt_feature = ['Time']

# Single-step pipeline that applies convert_to_datetime to the selected column(s).
date_time_transformer = Pipeline(
    steps=[
        ('datetime_conversion', FunctionTransformer(convert_to_datetime)),
    ]
)

In [None]:
# Every column whose dtype is not ``object`` is treated as numeric.
numeric_features = (
    X_train
    .select_dtypes(exclude=['object'])
    .columns
    .tolist()
)

# Numeric columns: median imputation first, then StandardScaler.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [None]:
# Collect every CellName value seen in either split so the encoder's output
# columns are fixed up front. Only the one column is concatenated, not the
# two full frames.
categories = pd.concat([X_train['CellName'], X_test['CellName']]).unique()
categorical_features = ['CellName']
categorical_transformer = Pipeline(
    steps=[
        # handle_unknown="ignore": a CellName that is not in `categories`
        # is encoded as an all-zero row instead of raising at transform time.
        # Output for the known categories is unchanged.
        ('encoder', OneHotEncoder(categories=[categories], handle_unknown="ignore"))
    ]
)

In [None]:
# Column list for the 'Time' field.
time = ['Time']

# Single-step pipeline wrapping a default CountVectorizer.
text_transformer = Pipeline(
    steps=[
        ('countvec', CountVectorizer()),
    ]
)

In [None]:
# Route each group of columns through its own pipeline. Columns that belong
# to none of the groups are discarded (remainder='drop').
preprocessor = ColumnTransformer(
    [
        ("dt", date_time_transformer, dt_feature),
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

In [None]:
# Fit the preprocessor on the training features. fit() hands back the fitted
# estimator itself, so its text summary is printed in the same statement.
print(preprocessor.fit(X_train))

In [None]:
# Serialise the preprocessor object into PreprocessingPipeline.pkl.
# NOTE(review): the pipeline wraps the notebook-defined convert_to_datetime and
# pickle stores functions by reference, so whoever loads this file presumably
# needs that function importable under the same module path - confirm.
with open('PreprocessingPipeline.pkl', 'wb') as pickle_file:
    pickle.dump(preprocessor, pickle_file)

In [None]:
# Switch scikit-learn's estimator display mode to "diagram", then leave the
# preprocessor as the last expression so it becomes this cell's output.
set_config(display="diagram")
preprocessor

In [None]:
#X_processed = preprocess(X_test)
#print(X_processed.shape)
#column_names = pipe.named_steps['preprocessor'].transformers_[1][1]\
#   .named_steps["encoder"].get_feature_names(categorical_features)
# Create new pandas dataframe with transformed data and column names
#df_transformed = pd.DataFrame(X_processed, columns=np.concatenate((column_names, numeric_features)))