In [4]:
import pandas as pd
import numpy as np
import gc

# Read the processed train/test splits from their parquet files
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('../data/processed/train.parquet',
                         '../data/processed/test.parquet')
)

# Keep each frame's column index at load time, for later alignment
train_cols, test_cols = train.columns, test.columns

In [5]:
import datetime

# Reference: the first TransactionDT value is 86400 (one day in seconds), so
# TransactionDT is treated as an offset in seconds. The real calendar origin is
# not known; START_DATE is an assumption used only to derive cyclic features.
START_DATE = '2025-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

for df in [train, test]:
    # Fractional part of the amount expressed in thousandths (67.95 -> 950).
    # Scale and round first, then take the remainder: subtracting the integer
    # part and truncating turns an amount stored as 67.949997 into 949.
    # np.fmod keeps the sign of the amount, like the truncating subtraction did.
    amt_thousandths = (df['TransactionAmt'].astype('float64') * 1000).round()
    df['TransactionAmt_decimal'] = np.fmod(amt_thousandths, 1000).astype(int)

    # Build the timestamps in one vectorized step (no per-row lambda) and keep
    # them in a local variable so no helper column has to be added and dropped.
    transaction_time = pd.to_timedelta(df['TransactionDT'], unit='s') + startdate

    # Cyclic time features
    df['hour'] = transaction_time.dt.hour
    df['day_of_week'] = transaction_time.dt.dayofweek

In [6]:
# Per-card aggregate features, grouped by 'card1' (the anonymized card ID).
# NOTE: the statistics are computed inside each frame separately, so the train
# and test versions of a feature are based on different sets of rows.
for col in ['TransactionAmt', 'id_02', 'D15']:
    for df in [train, test]:
        # Build the grouped view once and reuse it for both statistics
        per_card = df.groupby(['card1'])[col]
        card_mean = per_card.transform('mean')
        card_std = per_card.transform('std')

        df[f'{col}_card1_mean'] = card_mean
        df[f'{col}_card1_std'] = card_std

        # Ratio (current / card mean). A card whose mean is 0 would yield
        # +/-inf for non-zero values; store NaN instead so the feature stays
        # finite-or-missing.
        ratio = df[col] / card_mean
        df[f'{col}_to_mean_card1'] = ratio.replace([np.inf, -np.inf], np.nan)

gc.collect()

852

In [7]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns (per the dataset description) to be integer-encoded
cat_cols = [
    'ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
    'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29',
    'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
    'DeviceType', 'DeviceInfo',
]

for col in cat_cols:
    # Both frames are cast to str first, so missing values become ordinary
    # string labels and receive their own code.
    train_labels = train[col].astype(str)
    test_labels = test[col].astype(str)

    # One encoder per column, fitted on train + test together so that every
    # label seen in either frame has a code.
    encoder = LabelEncoder()
    encoder.fit([*train_labels.values, *test_labels.values])

    train[col] = encoder.transform(train_labels)
    test[col] = encoder.transform(test_labels)

In [8]:
# Persist the engineered frames.
# NOTE(review): these are the same paths the notebook loads from at the top,
# so the input files are overwritten; re-running from a fresh kernel would then
# start from already-transformed data. Confirm this is intended.
for frame, out_path in ((train, '../data/processed/train.parquet'),
                        (test, '../data/processed/test.parquet')):
    frame.to_parquet(out_path)

In [10]:
# Sanity check: show the first rows of each frame, then their shapes
for frame in (train, test):
    display(frame.head())

for frame in (train, test):
    print(frame.shape)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,day_of_week,TransactionAmt_card1_mean,TransactionAmt_card1_std,TransactionAmt_to_mean_card1,id_02_card1_mean,id_02_card1_std,id_02_to_mean_card1,D15_card1_mean,D15_card1_std,D15_to_mean_card1
0,2987000,0,86400,68.5,4,13926,,150.0,2,142.0,...,1,351.931152,371.141266,0.19464,153111.0,96778.708402,,82.441177,182.60791,0.0
1,2987001,0,86401,29.0,4,2755,404.0,150.0,3,102.0,...,1,234.292755,460.356964,0.123777,153593.109375,189083.778339,,143.879532,199.316864,0.0
2,2987002,0,86469,59.0,4,4663,490.0,150.0,4,166.0,...,1,97.015541,100.12886,0.60815,104099.445312,48882.364809,,125.070343,170.176773,2.518583
3,2987003,0,86499,50.0,4,18132,567.0,150.0,3,117.0,...,1,123.416344,192.717422,0.405133,87683.242188,92841.790305,,201.718323,213.243607,0.550272
4,2987004,0,86506,50.0,1,4497,514.0,150.0,3,102.0,...,1,96.972221,56.629452,0.515612,92559.5,40373.557902,0.764773,199.428574,222.739838,


Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,day_of_week,TransactionAmt_card1_mean,TransactionAmt_card1_std,TransactionAmt_to_mean_card1,id_02_card1_mean,id_02_card1_std,id_02_to_mean_card1,D15_card1_mean,D15_card1_std,D15_to_mean_card1
0,3663549,18403224,31.950001,4,10409,111.0,150.0,4,226.0,3,...,3,94.155457,122.719391,0.339332,151720.828125,97031.570735,,383.65332,245.556641,1.066067
1,3663550,18403263,49.0,4,4272,111.0,150.0,4,226.0,3,...,3,146.92926,364.93335,0.333494,135119.078125,132205.036469,,294.168091,309.619476,2.15523
2,3663551,18403310,171.0,4,4476,574.0,150.0,4,226.0,3,...,3,115.112427,90.200775,1.485504,46141.25,22712.107115,,346.575745,285.755798,0.279881
3,3663552,18403310,284.950012,4,10989,360.0,150.0,4,166.0,3,...,3,95.953964,148.880264,2.969653,127429.632812,105637.699887,,153.292358,214.154984,1.578683
4,3663553,18403317,67.949997,4,18018,452.0,150.0,3,117.0,3,...,3,119.764473,219.086746,0.567364,200599.4375,232347.508938,,256.19342,291.927246,0.085873


(590540, 446)
(506691, 445)
