In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


In [3]:
# Load the raw CSV into `df`.
# NOTE(review): later cells read from `df_transactions`, which is not assigned in any
# cell visible here — presumably it is meant to be this frame; confirm and unify the names
# so the notebook survives Restart & Run All.
df = pd.read_csv("../data/raw/data.csv")


In [4]:
class TransactionAggregator(BaseEstimator, TransformerMixin):
    """Collapse a frame to one row per group with summary statistics.

    Rows are grouped by ``group_key``; ``Amount`` is summarised with
    sum/mean/count/std and ``Value`` with sum/mean/std.
    """

    def __init__(self, group_key='CustomerId'):
        self.group_key = group_key

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X):
        """Return the per-group statistics with flat ``<column>_<stat>`` column names."""
        stats_per_column = {
            'Amount': ['sum', 'mean', 'count', 'std'],
            'Value': ['sum', 'mean', 'std'],
        }
        summary = X.groupby(self.group_key).agg(stats_per_column)
        # The aggregation yields two-level column labels; join each pair into one name.
        summary.columns = [f'{column}_{stat}' for column, stat in summary.columns]
        # Move the group key out of the index and back into a regular column.
        return summary.reset_index()


In [5]:
class TimeFeatureExtractor(BaseEstimator, TransformerMixin):
    """Derive hour/day/month/year columns from a timestamp column."""

    def __init__(self, time_column='TransactionStartTime'):
        self.time_column = time_column

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X):
        """Return the parsed timestamp column plus the four ``transaction_*`` parts."""
        parts = ('hour', 'day', 'month', 'year')
        frame = X.copy()  # work on a copy so the caller's frame is not modified
        frame[self.time_column] = pd.to_datetime(frame[self.time_column])
        stamps = frame[self.time_column].dt
        for part in parts:
            frame[f'transaction_{part}'] = getattr(stamps, part)
        selected = [self.time_column] + [f'transaction_{part}' for part in parts]
        return frame[selected]


In [6]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode selected DataFrame columns and append the indicator columns.

    Parameters
    ----------
    columns : list of str, str or None, default=None
        Columns to encode. When None, every ``object``-dtype column of the
        frame passed to ``fit`` is encoded.

    Attributes
    ----------
    columns_ : list of str
        Columns actually encoded, resolved on each call to ``fit``.
    encoder : OneHotEncoder or None
        The fitted encoder; None until ``fit`` has been called.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.encoder = None

    def fit(self, X, y=None):
        """Resolve the columns to encode and fit the one-hot encoder on them."""
        # Keep the constructor argument untouched and store the resolved list in a
        # fitted attribute; otherwise a second fit on a different frame would
        # silently reuse the columns auto-detected by the first fit.
        if self.columns is None:
            self.columns_ = X.select_dtypes(include='object').columns.tolist()
        elif isinstance(self.columns, str):
            self.columns_ = [self.columns]
        else:
            self.columns_ = list(self.columns)
        # `sparse_output` replaces the older `sparse` keyword of OneHotEncoder.
        self.encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        self.encoder.fit(X[self.columns_])
        return self

    def transform(self, X):
        """Return ``X`` without the encoded columns, followed by the indicator columns.

        The result carries a fresh 0..n-1 index; the index of ``X`` is not kept.
        """
        encoded = self.encoder.transform(X[self.columns_])
        encoded_df = pd.DataFrame(
            encoded,
            columns=self.encoder.get_feature_names_out(self.columns_),
            index=X.index,
        )
        remaining = X.drop(columns=self.columns_).reset_index(drop=True)
        return pd.concat([remaining, encoded_df.reset_index(drop=True)], axis=1)


In [7]:
class MissingValueHandler(BaseEstimator, TransformerMixin):
    """Fill missing values with a ``SimpleImputer`` and hand back a DataFrame."""

    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.imputer = None

    def fit(self, X, y=None):
        """Create a fresh imputer using ``strategy`` and fit it on ``X``."""
        fresh_imputer = SimpleImputer(strategy=self.strategy)
        self.imputer = fresh_imputer
        fresh_imputer.fit(X)
        return self

    def transform(self, X):
        """Impute ``X`` and re-attach its column labels and index."""
        filled_values = self.imputer.transform(X)
        return pd.DataFrame(filled_values, columns=X.columns, index=X.index)

In [8]:
class Normalizer(BaseEstimator, TransformerMixin):
    """Scale all columns of a DataFrame with a standard or min-max scaler.

    Parameters
    ----------
    method : {'standard', 'minmax'}, default='standard'
        ``'standard'`` uses ``StandardScaler``; ``'minmax'`` uses ``MinMaxScaler``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'standard'`` nor ``'minmax'`` (raised at
        construction and again at ``fit`` if the parameter was changed).
    """

    def __init__(self, method='standard'):
        # The constructor argument must be stored under its own name: BaseEstimator
        # reads it back via getattr in get_params(), which repr() and clone() rely on.
        self.method = method
        self.scaler = self._build_scaler(method)

    @staticmethod
    def _build_scaler(method):
        """Return a new, unfitted scaler for ``method``."""
        if method == 'standard':
            return StandardScaler()
        if method == 'minmax':
            return MinMaxScaler()
        raise ValueError("method must be 'standard' or 'minmax'")

    def fit(self, X, y=None):
        """Fit a fresh scaler of the configured kind on ``X``."""
        # Rebuild from the current parameter so set_params(method=...) takes effect.
        self.scaler = self._build_scaler(self.method)
        self.scaler.fit(X)
        return self

    def transform(self, X):
        """Return the scaled values as a DataFrame with the labels and index of ``X``."""
        return pd.DataFrame(self.scaler.transform(X), columns=X.columns, index=X.index)


In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode selected DataFrame columns and append the indicator columns.

    Parameters
    ----------
    columns : list of str, str or None, default=None
        Columns to encode. When None, every ``object``- or ``category``-dtype
        column of the frame passed to ``fit`` is encoded.

    Attributes
    ----------
    columns_ : list of str
        Columns actually encoded, resolved on each call to ``fit``.
    encoder : OneHotEncoder or None
        The fitted encoder; None until ``fit`` has been called.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.encoder = None

    def fit(self, X, y=None):
        """Resolve the columns to encode and fit the one-hot encoder on them."""
        # Keep the constructor argument untouched and store the resolved list in a
        # fitted attribute; otherwise a second fit on a different frame would
        # silently reuse the columns auto-detected by the first fit.
        if self.columns is None:
            self.columns_ = X.select_dtypes(include=['object', 'category']).columns.tolist()
        elif isinstance(self.columns, str):
            self.columns_ = [self.columns]
        else:
            self.columns_ = list(self.columns)
        self.encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        self.encoder.fit(X[self.columns_])
        return self

    def transform(self, X):
        """Return ``X`` without the encoded columns, followed by the indicator columns.

        The result carries a fresh 0..n-1 index; the index of ``X`` is not kept.
        """
        encoded = self.encoder.transform(X[self.columns_])
        encoded_df = pd.DataFrame(
            encoded,
            columns=self.encoder.get_feature_names_out(self.columns_),
            index=X.index
        )
        remaining = X.drop(columns=self.columns_)
        return pd.concat([remaining.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)


In [12]:
# One-hot encode the channel and product-category columns, tag every encoded row with
# its customer, then average the encoded columns per customer.
cat_enc = CategoricalEncoder()
encoded_df = (
    cat_enc.fit_transform(df_transactions[['ChannelId', 'ProductCategory']])
    # .values bypasses index alignment: the encoder output has a fresh 0..n-1 index
    .assign(CustomerId=df_transactions['CustomerId'].values)
    .groupby('CustomerId')
    .mean()
    .reset_index()
)


In [13]:
# Count the null entries in each column
df_transactions.isna().sum()



TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
FraudResult             0
dtype: int64

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

class MissingValueImputer(BaseEstimator, TransformerMixin):
    """Fill missing values column by column: mean for numeric columns, mode otherwise.

    Attributes
    ----------
    fill_values : dict
        Maps column name to the value used to fill that column; rebuilt from
        scratch on every call to ``fit``.
    """

    def __init__(self):
        self.fill_values = {}

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``."""
        # Build into a new dict so a refit never keeps entries from an earlier frame.
        learned = {}
        for col in X.columns:
            series = X[col]
            dtype = series.dtype
            # Any plain numpy float/int/uint width counts as numeric (not only
            # float64/int64); extension dtypes keep the mode-based path.
            if isinstance(dtype, np.dtype) and dtype.kind in ('f', 'i', 'u'):
                learned[col] = series.mean()
            else:
                modes = series.mode()
                # mode() is empty when the column has no non-null value; in that case
                # there is nothing to fill with, so the column is left out.
                if not modes.empty:
                    learned[col] = modes.iloc[0]
        self.fill_values = learned
        return self

    def transform(self, X):
        """Return a copy of ``X`` with nulls replaced by the learned per-column values."""
        return X.fillna(self.fill_values)


In [15]:
# Fit the imputer on the transaction frame and fill its missing values in one step.
# The fitted `imputer` keeps the learned per-column fill values in `imputer.fill_values`.
imputer = MissingValueImputer()
df_transactions_imputed = imputer.fit_transform(df_transactions)


In [16]:
from sklearn.preprocessing import StandardScaler

class NumericScaler(BaseEstimator, TransformerMixin):
    """Standardise the float64/int64 columns of a DataFrame; other columns pass through."""

    def __init__(self):
        self.scaler = StandardScaler()
        self.columns = None

    def fit(self, X, y=None):
        """Record which columns are float64/int64 and fit the scaler on them."""
        numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns
        self.columns = numeric_columns
        self.scaler.fit(X[numeric_columns])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose recorded numeric columns hold the scaled values."""
        scaled_values = self.scaler.transform(X[self.columns])
        result = X.copy()  # the caller's frame is left unmodified
        result[self.columns] = scaled_values
        return result


In [17]:
# Scale numerical features: fit the scaler on the imputed frame and transform it in one step.
# Only float64/int64 columns are scaled (see NumericScaler.fit); other columns are carried over.
scaler = NumericScaler()
df_scaled = scaler.fit_transform(df_transactions_imputed)


In [19]:
from sklearn.base import BaseEstimator, TransformerMixin

class TimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Build hour/day/month/year features keyed by an identifier column.

    Parameters
    ----------
    time_column : str, default='TransactionStartTime'
        Column holding the timestamps to decompose.
    id_column : str, default='CustomerId'
        Identifier column carried through to the output unchanged.
    """

    def __init__(self, time_column='TransactionStartTime', id_column='CustomerId'):
        self.time_column = time_column
        self.id_column = id_column

    def fit(self, X, y=None):
        """Stateless: nothing is learned from ``X``."""
        return self

    def transform(self, X):
        """Return ``id_column`` plus ``transaction_hour/day/month/year``.

        Values that cannot be parsed as timestamps become NaT (``errors='coerce'``),
        so the derived columns are missing for those rows instead of raising.
        """
        # Parse into a separate Series; only one column is needed, so the input frame
        # is neither copied in full nor modified.
        timestamps = pd.to_datetime(X[self.time_column], errors='coerce')
        time_features = X[[self.id_column]].copy()
        time_features['transaction_hour'] = timestamps.dt.hour
        time_features['transaction_day'] = timestamps.dt.day
        time_features['transaction_month'] = timestamps.dt.month
        time_features['transaction_year'] = timestamps.dt.year
        return time_features


In [23]:
# Single end-to-end feature pipeline. This cell only constructs it; nothing is fitted here.
# NOTE(review): as ordered this looks unrunnable. TimeFeaturesExtractor.transform returns
# only CustomerId plus the four transaction_* columns, so the 'Amount' and 'Value' columns
# that TransactionAggregator aggregates are no longer present by the 'aggregator' step.
# Confirm the intended step order / inputs before fitting.
feature_pipeline = Pipeline([
    ('temporal', TimeFeaturesExtractor()),           # Extract time features first
    ('categorical', CategoricalEncoder()),            # Encode categorical features at transaction level
    ('imputer', MissingValueImputer()),               # Impute missing transaction data if needed
    ('aggregator', TransactionAggregator()),          # Then aggregate at customer level
    ('scaler', NumericScaler())                        # Finally scale aggregated features
])


In [26]:
# Transaction-level pipeline: time features -> one-hot encoding -> imputation,
# fitted and applied to the transaction frame in one call.
# NOTE(review): CategoricalEncoder() is given no columns, so it encodes whichever
# object/category columns come out of the 'temporal' step. That step outputs only
# CustomerId and the numeric transaction_* parts, so if CustomerId is a string column
# it is CustomerId itself that gets one-hot encoded — confirm this is intended.
transaction_pipeline = Pipeline([
    ('temporal', TimeFeaturesExtractor()),
    ('categorical', CategoricalEncoder()),
    ('imputer', MissingValueImputer()),
])
transaction_features = transaction_pipeline.fit_transform(df_transactions)


In [27]:
# Transaction-level pipeline
# NOTE(review): this cell repeats the preceding cell verbatim; running it rebuilds and
# refits the same three-step pipeline and overwrites `transaction_pipeline` and
# `transaction_features` with equivalent objects. Consider keeping only one copy.
transaction_pipeline = Pipeline([
    ('temporal', TimeFeaturesExtractor()),
    ('categorical', CategoricalEncoder()),
    ('imputer', MissingValueImputer()),
])
transaction_features = transaction_pipeline.fit_transform(df_transactions)
