# In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def feature_engineering(df):
    """Build a scaled, one-hot-encoded feature matrix from raw transactions.

    Adds per-customer aggregates (sum, mean and standard deviation of
    ``Amount``; count of ``TransactionId``) and the hour, day, month and
    year of ``TransactionStartTime``. Every numeric column is then
    standardised and ``ProductCategory`` and ``ChannelId`` are one-hot
    encoded.

    Args:
        df: Transactions containing at least the columns ``CustomerId``,
            ``Amount``, ``TransactionId``, ``TransactionStartTime``,
            ``ProductCategory`` and ``ChannelId``. The frame is left
            unmodified; all work happens on a copy.

    Returns:
        A new ``pandas.DataFrame`` with one row per input row, a fresh
        default index and positional (0..n-1) column labels: the scaled
        numeric columns first, followed by the one-hot columns. Columns
        that are neither numeric nor one of the two categorical features
        are dropped by the ``ColumnTransformer``.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame does not silently gain columns
    # (and so a sliced input does not trigger SettingWithCopyWarning).
    df = df.copy()

    # Per-customer aggregates, broadcast back onto every transaction row.
    customer_amounts = df.groupby('CustomerId')['Amount']
    df['Total_Transaction_Amount'] = customer_amounts.transform('sum')
    df['Average_Transaction_Amount'] = customer_amounts.transform('mean')
    df['Transaction_Count'] = df.groupby('CustomerId')['TransactionId'].transform('count')
    # The sample standard deviation is NaN for a customer with a single
    # transaction, and StandardScaler passes NaN straight through to the
    # output. Record "no spread" as 0.0 instead.
    df['Transaction_Std_Dev'] = customer_amounts.transform('std').fillna(0.0)

    # Parse the timestamp once rather than once per derived column.
    started_at = pd.to_datetime(df['TransactionStartTime'])
    df['Transaction_Hour'] = started_at.dt.hour
    df['Transaction_Day'] = started_at.dt.day
    df['Transaction_Month'] = started_at.dt.month
    df['Transaction_Year'] = started_at.dt.year

    categorical_features = ['ProductCategory', 'ChannelId']
    # 'number' instead of ['float64', 'int64']: pandas >= 2.0 returns int32
    # from the .dt accessors, so the narrower filter silently excluded the
    # four calendar features from the transformed output.
    numerical_features = df.select_dtypes(include='number').columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(), categorical_features),
        ],
        # OneHotEncoder emits a sparse matrix by default. With the default
        # threshold the combined result may stay sparse, which
        # pd.DataFrame() cannot expand into one column per feature.
        # A threshold of 0 always yields a dense array.
        sparse_threshold=0,
    )

    # NOTE(review): column labels are positional; if named columns are
    # wanted downstream, preprocessor.get_feature_names_out() provides them.
    return pd.DataFrame(preprocessor.fit_transform(df))

if __name__ == "__main__":
    # Script entry point: read the raw transactions, engineer features and
    # write the model-ready matrix. Paths are resolved against the current
    # working directory.
    import os

    raw_path = '../data/raw/data.csv'
    processed_path = '../data/processed/processed_data.csv'

    df = pd.read_csv(raw_path)
    processed_df = feature_engineering(df)

    # to_csv does not create missing directories; make sure the target
    # folder exists so the run does not fail after all the work is done.
    os.makedirs(os.path.dirname(processed_path), exist_ok=True)
    processed_df.to_csv(processed_path, index=False)