In [1]:
import os
import gc
import pickle
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Silence every warning for the rest of the session. Note that this also hides
# deprecation / FutureWarning notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
# Colab-only step: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used by this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
class Config:
    """Static settings for the training run: seed, file locations, model parameters."""

    SEED = 42

    # Directory holding the input CSVs, and the locations the artifacts are written to.
    DATA_PATH = '/content/drive/MyDrive/--DM-Project--/data/'
    MODEL_PATH = '/content/drive/MyDrive/--DM-Project--/xgboost_model.json'
    ENCODER_PATH = '/content/drive/MyDrive/--DM-Project--/encoders.pkl'

    # Keyword arguments unpacked into xgb.XGBClassifier when the model is built.
    XGB_PARAMS = dict(
        n_estimators=2000,
        learning_rate=0.02,
        max_depth=12,
        subsample=0.8,
        colsample_bytree=0.6,
        objective='binary:logistic',
        eval_metric='aucpr',
        tree_method='hist',
        random_state=42,
    )

In [4]:
def clean_device(x):
    """Collapse a raw device string into a coarse vendor bucket.

    Missing values give 'unknown'. Otherwise the lower-cased text is tested
    against keyword groups in a fixed order; the first group with a matching
    substring decides the label, and 'other' is returned when nothing matches.
    """
    if pd.isna(x):
        return 'unknown'

    text = str(x).lower()

    # Order matters: earlier groups take precedence over later ones.
    keyword_groups = (
        (('ios', 'iphone'), 'apple'),
        (('samsung', 'sm-'), 'samsung'),
        (('huawei',), 'huawei'),
        (('moto',), 'motorola'),
        (('rv:', 'windows'), 'windows'),
    )
    for needles, label in keyword_groups:
        if any(needle in text for needle in needles):
            return label
    return 'other'

In [5]:
def main():
    """Train the final model on the full training set and persist its artifacts.

    Pipeline:
      1. Read ``train_transaction.csv`` and ``train_identity.csv`` from
         ``Config.DATA_PATH`` and left-join them on ``TransactionID``.
      2. Derive time, e-mail, device, frequency and per-``uid`` amount features.
      3. Label-encode every object-dtype column and pickle the encoders to
         ``Config.ENCODER_PATH``.
      4. Fit an ``xgb.XGBClassifier`` on all rows, with ``scale_pos_weight`` set
         to the negative/positive label ratio.
      5. Save the model to ``Config.MODEL_PATH`` and the ordered feature names
         to ``feature_names.pkl``.

    Returns:
        None. If a CSV is missing, prints "Dataset not found." and returns early.

    Raises:
        ValueError: if ``isFraud`` sums to zero, so the class ratio is undefined.
    """
    print("--- Loading Data ---")
    try:
        train_transaction = pd.read_csv(os.path.join(Config.DATA_PATH, 'train_transaction.csv'))
        train_identity = pd.read_csv(os.path.join(Config.DATA_PATH, 'train_identity.csv'))
        df = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
        # Release the two source frames; only the merged frame is used from here on.
        del train_transaction, train_identity
        gc.collect()
    except FileNotFoundError:
        print("Dataset not found.")
        return

    print("--- Feature Engineering ---")
    # assumes TransactionDT is an offset in seconds -- TODO confirm against the data description
    df['hour'] = (df['TransactionDT'] // 3600) % 24
    df['day'] = (df['TransactionDT'] // (3600 * 24)) % 7

    # Collapse individual e-mail domains into their provider.
    email_maps = {
        'gmail.com': 'google', 'gmail': 'google', 'googlemail.com': 'google',
        'hotmail.com': 'microsoft', 'outlook.com': 'microsoft', 'msn.com': 'microsoft', 'live.com': 'microsoft',
        'yahoo.com': 'yahoo', 'ymail.com': 'yahoo', 'rocketmail.com': 'yahoo',
        'icloud.com': 'apple', 'me.com': 'apple', 'mac.com': 'apple'
    }

    for c in ['P_emaildomain', 'R_emaildomain']:
        # Assign the filled Series back to the frame. Calling fillna(inplace=True)
        # on df[...] is chained assignment, which pandas deprecates and which does
        # not update the frame once copy-on-write is active.
        df[c + '_bin'] = df[c].map(email_maps).fillna('unknown')

    # 1 only when both domains are present and equal (NaN == NaN is False anyway;
    # the notnull() check makes that explicit).
    df['email_match'] = np.where((df['P_emaildomain'] == df['R_emaildomain']) & (df['P_emaildomain'].notnull()), 1, 0)
    df['device_name'] = df['DeviceInfo'].apply(clean_device)

    # Frequency encoding: replace each value by how often it occurs (NaN counted too).
    freq_cols = ['card1', 'card2', 'addr1', 'P_emaildomain']
    for col in freq_cols:
        df[f'{col}_count'] = df[col].map(df[col].value_counts(dropna=False))

    # Pseudo user id built from card, address and payer e-mail domain; used only
    # for the group statistics below and dropped before training.
    df['uid'] = df['card1'].astype(str) + '_' + df['addr1'].astype(str) + '_' + df['P_emaildomain'].astype(str)
    df['uid_mean_TransactionAmt'] = df.groupby('uid')['TransactionAmt'].transform('mean')
    df['uid_std_TransactionAmt'] = df.groupby('uid')['TransactionAmt'].transform('std')
    # The 1e-5 term avoids division by zero when a group's std is 0.
    df['TransactionAmt_normalized'] = (df['TransactionAmt'] - df['uid_mean_TransactionAmt']) / (df['uid_std_TransactionAmt'] + 1e-5)

    print("--- Preprocessing ---")
    drop_cols = ['TransactionID', 'TransactionDT', 'uid', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain']
    X = df.drop(drop_cols + ['isFraud'], axis=1, errors='ignore')
    y = df['isFraud']

    cat_cols = X.select_dtypes(include=['object']).columns
    encoders = {}

    for col in cat_cols:
        le = LabelEncoder()
        # NOTE(review): on object columns astype(str) appears to turn NaN into the
        # literal 'nan' before fillna runs, so missing values are likely encoded as
        # 'nan' rather than 'Unknown'. Left unchanged so the encoder classes stay
        # consistent with whatever code consumes them -- confirm before reordering.
        X[col] = X[col].astype(str).fillna('Unknown')
        X[col] = le.fit_transform(X[col])
        encoders[col] = le
        # Handle unseen labels in production: append a sentinel label to the
        # fitted classes after the codes above have been assigned.
        encoders[col].classes_ = np.append(encoders[col].classes_, 'Unknown_Unseen')

    with open(Config.ENCODER_PATH, 'wb') as f:
        pickle.dump(encoders, f)

    pos_count = y.sum()
    neg_count = len(y) - pos_count
    if pos_count == 0:
        raise ValueError("Cannot compute scale_pos_weight: isFraud has no positive labels.")

    # Build the parameters locally instead of writing into the class-level
    # Config.XGB_PARAMS dict, so the shared configuration is not mutated.
    xgb_params = {**Config.XGB_PARAMS, 'scale_pos_weight': neg_count / pos_count}

    print("--- Training Final Model ---")
    clf = xgb.XGBClassifier(**xgb_params)
    clf.fit(X, y)

    clf.save_model(Config.MODEL_PATH)

    # NOTE(review): this path is relative to the current working directory, unlike
    # the model and encoders which use the Config paths -- confirm this is intended.
    with open('feature_names.pkl', 'wb') as f:
        pickle.dump(list(X.columns), f)

    print(f"Artifacts saved: {Config.MODEL_PATH}, {Config.ENCODER_PATH}, feature_names.pkl")

In [7]:
# Entry point: run the whole training pipeline when this code executes as the
# main module.
if __name__ == "__main__":
    main()

--- Loading Data ---
--- Feature Engineering ---
--- Preprocessing ---
--- Training Final Model ---
Artifacts saved: /content/drive/MyDrive/--DM-Project--/xgboost_model.json, /content/drive/MyDrive/--DM-Project--/encoders.pkl, feature_names.pkl
