In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
import os
import gc
import math
import json
import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

import lightgbm as lgb


# -------------------------------
# Config
# -------------------------------
DATA_DIR = "/kaggle/input/ieee-fraud-detection"
TRAIN_TRANS = f"{DATA_DIR}/train_transaction.csv"
TEST_TRANS = f"{DATA_DIR}/test_transaction.csv"
TRAIN_ID = f"{DATA_DIR}/train_identity.csv"
TEST_ID = f"{DATA_DIR}/test_identity.csv"
SAMPLE_SUB = f"{DATA_DIR}/sample_submission.csv"


SEED = 42
N_FOLDS = 5 # time‑ordered folds
PURGE_DAYS = 1 # optional gap (in days) between train and valid to reduce leakage
np.random.seed(SEED)

In [3]:
# -------------------------------
# Utilities
# -------------------------------

def reduce_mem_usage(df: pd.DataFrame, verbose=True):
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object and str(col_type) != 'category':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                df[col] = df[col].astype(np.float32)
    # leave objects as is
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        print(f"Mem. usage decreased to {end_mem:5.2f} Mb ({100*(start_mem-end_mem)/start_mem:.1f}% reduction)")
    return df


# Basic cleaners for messy strings
EMAIL_MAP = {
'gmail.com': 'gmail', 'gmail': 'gmail',
'yahoo.com': 'yahoo', 'yahoo': 'yahoo',
'hotmail.com': 'hotmail', 'outlook.com': 'outlook', 'live.com': 'microsoft',
'aol.com': 'aol', 'icloud.com': 'apple', 'me.com': 'apple', 'mac.com': 'apple'
}

def clean_email_domain(s: pd.Series) -> pd.Series:
    s = s.fillna('')
    s = s.str.lower()
    s = s.replace(EMAIL_MAP)
    s = s.where(s != '', np.nan)
    return s


def split_device_info(dev_series: pd.Series):
    # DeviceInfo examples like "Windows; Chrome 70" or vendor/version strings
    base = dev_series.fillna('').str.lower()
    brand = base.str.extract(r'^([a-z0-9_\-\s]+)')[0].str.strip()
    # Keep only first token as brand proxy
    brand = brand.str.split(' ').str[0]
    brand = brand.replace({'samsungsm': 'samsung', 'samsung': 'samsung', 'windows': 'windows',
    'iphone': 'iphone', 'ipad': 'ipad', 'mac': 'mac', 'huawei': 'huawei'})
    brand = brand.where(brand != '', np.nan)
    return brand


def make_uid(df: pd.DataFrame):
    # Stable-ish user key; you can try richer combos later
    uid = df['card1'].astype('float32').astype('Int32').astype('string') + '_' + df['addr1'].astype('float32').astype('Int32').astype('string')
    uid = uid.replace({'<NA>_': np.nan, '_<NA>': np.nan, '<NA>_<NA>': np.nan})
    return uid


def add_time_features(df: pd.DataFrame):
    # TransactionDT is seconds from a reference point; derive day/hour
    df['DT_day'] = (df['TransactionDT'] / (24*60*60)).astype(np.int32)
    df['DT_hour'] = (df['TransactionDT'] / (60*60)).astype(np.int32)
    df['DT_dayofweek'] = (df['DT_day'] % 7).astype(np.int8)
    df['DT_week'] = (df['DT_day'] // 7).astype(np.int32)
    return df


def frequency_encode(train: pd.DataFrame, test: pd.DataFrame, cols):
    for c in cols:
        freq = train[c].value_counts(dropna=False)
        train[f'{c}_freq'] = train[c].map(freq).astype(np.float32)
        test[f'{c}_freq'] = test[c].map(freq).astype(np.float32)
    return train, test

In [4]:
# -------------------------------
# Read & Merge
# -------------------------------
print("Reading data…")
train_tr = pd.read_csv(TRAIN_TRANS)
train_id = pd.read_csv(TRAIN_ID)

test_tr = pd.read_csv(TEST_TRANS)
test_id = pd.read_csv(TEST_ID)

print("Merging identity…")
train = train_tr.merge(train_id, how='left', on='TransactionID')
train.drop(columns=['TransactionID'], inplace=True)

test = test_tr.merge(test_id, how='left', on='TransactionID')
# Keep for submission later
test_transaction_ids = test_tr['TransactionID'].values

# Reduce mem a bit before feature work
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

del train_tr, train_id, test_tr, test_id
gc.collect()

Reading data…
Merging identity…
Mem. usage decreased to 1654.18 Mb (35.4% reduction)
Mem. usage decreased to 1436.05 Mb (35.1% reduction)


0

In [5]:
# -------------------------------
# Light Cleaning & Feature Engineering (SAFE)
# -------------------------------
print("Feature engineering…")

import re

def normalize_id_columns(df):
    """Rename id-* / id_* columns to a unified id_XX format."""
    mapping = {}
    for c in df.columns:
        m = re.match(r'^id[-_](\d{2})$', c)
        if m:
            mapping[c] = f'id_{m.group(1)}'
    if mapping:
        df.rename(columns=mapping, inplace=True)

# normalize both train and test once
normalize_id_columns(train)
normalize_id_columns(test)


# Helper: present columns
def present_cols(df, cols):
    return [c for c in cols if c in df.columns]

ALL_ID_COLS = [f'id_{i:02d}' for i in list(range(1,21)) + [28,29,30,31,32,33,34,35,36,37,38]]
ID_TRAIN = present_cols(train, ALL_ID_COLS)
ID_TEST  = present_cols(test,  ALL_ID_COLS)


# --- A) De-categorize columns we will mutate (avoid 'new category' errors) ---
cols_we_mutate = [
    'P_emaildomain','R_emaildomain','DeviceInfo','DeviceType','DeviceBrand',
    'card4','card6','M1','M2','M3','M4','M5','M6','M7','M8','M9'
]
for c in cols_we_mutate:
    if c in train.columns: train[c] = train[c].astype('object')
    if c in test.columns:  test[c]  = test[c].astype('object')

# --- B) Email + Device cleanup ---
for col in ['P_emaildomain', 'R_emaildomain']:
    if col in train.columns: train[col] = clean_email_domain(train[col])
    if col in test.columns:  test[col]  = clean_email_domain(test[col])

if 'DeviceInfo' in train.columns:
    train['DeviceBrand'] = split_device_info(train['DeviceInfo'])
if 'DeviceInfo' in test.columns:
    test['DeviceBrand'] = split_device_info(test['DeviceInfo'])

# --- C) Time features from TransactionDT ---
train = add_time_features(train)
test  = add_time_features(test)

# --- D) Stable user key ---
train['uid'] = make_uid(train)
test['uid']  = make_uid(test)

# --- E) Numeric transforms ---
for df in [train, test]:
    if 'TransactionAmt' in df.columns:
        df['TransactionAmt_log1p'] = np.log1p(df['TransactionAmt'])

# --- F) Cast common string columns to 'category' (AFTER cleaning) ---
maybe_cats = [
    'ProductCD','P_emaildomain','R_emaildomain','DeviceType','DeviceInfo','DeviceBrand',
    'card4','card6','M1','M2','M3','M4','M5','M6','M7','M8','M9','uid'
]
present_cats_train = present_cols(train, maybe_cats) + ID_TRAIN
present_cats_test  = present_cols(test,  maybe_cats) + ID_TEST

for c in present_cats_train: train[c] = train[c].astype('category')
for c in present_cats_test:  test[c]  = test[c].astype('category')

# --- G) Frequency encodings (fit on train only; safe intersections) ---
freq_candidates = ['uid','card1','card2','addr1','addr2',
                   'P_emaildomain','R_emaildomain','DeviceBrand','DeviceType','ProductCD']
freq_cols = present_cols(train, freq_candidates)
train, test = frequency_encode(train, test, freq_cols)

# --- H) Build feature list & prune ultra-sparse columns ---
TARGET = 'isFraud'
ignore_cols = [TARGET]
feature_cols = [c for c in train.columns if c not in ignore_cols]

na_rate = train[feature_cols].isna().mean()
feature_cols = na_rate[na_rate < 0.98].index.tolist()


# --- I) Ensure LightGBM-compatible dtypes ---
# Drop any remaining object/string columns (keep their *_freq encodings instead)
obj_like = train[feature_cols].select_dtypes(include=['object','string']).columns.tolist()
feature_cols = [c for c in feature_cols if c not in obj_like]

# Track categoricals that are actually used
categorical_cols = [c for c in feature_cols if str(train[c].dtype).startswith('category')]

# --- J) Quick sanity prints ---
print("Dropped non-numeric columns:", obj_like)
print("#Features used:", len(feature_cols))
print("Sample features:", feature_cols[:20])
print("Categorical cols:", categorical_cols[:20])
print("Present id_* in TRAIN:", ID_TRAIN[:10], "…", len(ID_TRAIN))
print("Present id_* in TEST :", ID_TEST[:10],  "…", len(ID_TEST))


Feature engineering…
Dropped non-numeric columns: []
#Features used: 440
Sample features: ['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5']
Categorical cols: ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06']
Present id_* in TRAIN: ['id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10'] … 31
Present id_* in TEST : ['id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10'] … 31


In [6]:
# -------------------------------
# Time‑Aware Purged CV Splitter
# -------------------------------
print("Building time‑aware folds…")

if 'DT_day' not in train.columns:
    raise RuntimeError('DT_day not found – ensure add_time_features() ran.')

# We create folds by day quantiles so validation marches forward in time.
day_values = train['DT_day']
q = np.quantile(day_values, np.linspace(0, 1, N_FOLDS+1))
fold_bounds = [(int(q[i]), int(q[i+1]) if i < N_FOLDS-1 else int(q[i+1])+1) for i in range(N_FOLDS)]
# fold_bounds: list of (start_day, end_day_exclusive)

folds = []
for i, (d0, d1) in enumerate(fold_bounds):
    # Purge: exclude days immediately before/after validation from training
    train_mask = (train['DT_day'] < (d0 - PURGE_DAYS))
    valid_mask = (train['DT_day'] >= d0) & (train['DT_day'] < d1)
    # Also ensure no future leakage: training strictly before validation window
    train_mask &= (train['DT_day'] < d0)

    tr_idx = np.where(train_mask)[0]
    va_idx = np.where(valid_mask)[0]
    if len(va_idx) == 0 or len(tr_idx) == 0:
        continue
    folds.append((tr_idx, va_idx))

print(f"Constructed {len(folds)} folds with purge.")

# -------------------------------
# Train LightGBM with Early Stopping
# -------------------------------
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 256,
    'max_depth': -1,
    'min_data_in_leaf': 200,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,
    'min_gain_to_split': 0.0,
    'is_unbalance': True,  # class imbalance helper
    'verbosity': -1,
    'seed': SEED,
    'num_threads': max(1, os.cpu_count()-1),
}

print("Training…")

oof_pred = np.zeros(len(train), dtype=np.float32)
test_pred = np.zeros(len(test), dtype=np.float32)
feature_importance = pd.DataFrame({'feature': feature_cols, 'importance': 0})

for fold, (tr_idx, va_idx) in enumerate(folds, 1):
    print(f"\nFold {fold}/{len(folds)}: train={len(tr_idx)}, valid={len(va_idx)}")
    tr_data = lgb.Dataset(train.loc[tr_idx, feature_cols], label=train.loc[tr_idx, TARGET])
    va_data = lgb.Dataset(train.loc[va_idx, feature_cols], label=train.loc[va_idx, TARGET])

    model = lgb.train(
    params,
    tr_data,
    num_boost_round=10000,
    valid_sets=[tr_data, va_data],
    valid_names=['train','valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=300),
        lgb.log_evaluation(period=200)
    ],
)

    oof_pred[va_idx] = model.predict(train.loc[va_idx, feature_cols], num_iteration=model.best_iteration)
    test_pred += model.predict(test[feature_cols], num_iteration=model.best_iteration) / len(folds)

    # Importance
    fi = pd.DataFrame({  
        'feature': feature_cols,
        'importance': model.feature_importance(importance_type='gain')
    })
    feature_importance['importance'] += fi['importance'].values

    del tr_data, va_data, model
    gc.collect()

Building time‑aware folds…
Constructed 4 folds with purge.
Training…

Fold 1/4: train=110549, valid=117590
Training until validation scores don't improve for 300 rounds
[200]	train's auc: 1	valid's auc: 0.861782
[400]	train's auc: 1	valid's auc: 0.868853
[600]	train's auc: 1	valid's auc: 0.871612
[800]	train's auc: 1	valid's auc: 0.86979
Early stopping, best iteration is:
[565]	train's auc: 1	valid's auc: 0.871737

Fold 2/4: train=229906, valid=119835
Training until validation scores don't improve for 300 rounds
[200]	train's auc: 0.999985	valid's auc: 0.879576
[400]	train's auc: 1	valid's auc: 0.886842
[600]	train's auc: 1	valid's auc: 0.893049
[800]	train's auc: 1	valid's auc: 0.89677
[1000]	train's auc: 1	valid's auc: 0.898277
[1200]	train's auc: 1	valid's auc: 0.899389
[1400]	train's auc: 1	valid's auc: 0.900028
[1600]	train's auc: 1	valid's auc: 0.899613
Early stopping, best iteration is:
[1462]	train's auc: 1	valid's auc: 0.900195

Fold 3/4: train=350664, valid=118290
Training un

In [7]:
# -------------------------------
# Evaluation & Submission
# -------------------------------
oof_auc = roc_auc_score(train[TARGET], oof_pred)
print(f"\nOOF AUC: {oof_auc:.6f}")


# Save feature importance
feature_importance.sort_values('importance', ascending=False, inplace=True)
feature_importance.to_csv('feature_importance.csv', index=False)
print("Saved feature_importance.csv")


# Submission
sub = pd.read_csv(SAMPLE_SUB)
sub['isFraud'] = test_pred
sub.to_csv('submission.csv', index=False)
print("Saved submission.csv")


OOF AUC: 0.811578
Saved feature_importance.csv
Saved submission.csv
