In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install mlflow dagshub

In [None]:
import dagshub
# Initialise the DagsHub client for the 'lkata22/IEEE-CIS-Fraud-Detection' repository.
# NOTE(review): mlflow=True presumably points MLflow tracking at the DagsHub-hosted
# server so the runs logged later in this notebook end up there — confirm in the dagshub docs.
dagshub.init(repo_owner='lkata22',
             repo_name='IEEE-CIS-Fraud-Detection',
             mlflow=True)

# Imported after dagshub.init(); used by the MLflow logging cell at the end of the notebook.
import mlflow

### Imports


In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import gc
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier






## Memory Reducer to reduce memory usage of DataFrame

In [None]:
class MemoryReducer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that hands the frame to ``reduce_mem_usage``.

    ``reduce_mem_usage`` reassigns columns on the frame it receives, so the
    object passed to ``transform`` is itself modified.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` after numeric downcasting by ``reduce_mem_usage``."""
        return reduce_mem_usage(X)

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    Only columns whose dtype is int16/int32/int64/float32/float64 are considered.
    Integer columns become the narrowest of int8/int16/int32/int64 whose limits
    (inclusive) contain the column's min and max. Float columns become float32
    when their range fits and float64 otherwise; the float32 cast can round
    values that need more precision than float32 offers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)  # narrowest first
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value sitting exactly on a dtype limit
                # (e.g. 127 for int8) is still representable in that dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f32 = np.finfo(np.float32)
            # NaN min/max (empty or all-NaN column) fails both comparisons and
            # therefore takes the float64 branch.
            if f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte starting footprint so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction))
    return df

## Cleaner Class

In [None]:
class MissingValueHandler(BaseEstimator, TransformerMixin):
    """Impute missing values with statistics learned in ``fit`` and flag what was missing.

    Parameters
    ----------
    num_strategy : {'median', 'mean', 'constant'}, default 'median'
        How non-object columns are filled. 'constant' fills with 0. Any other
        value leaves numeric columns untouched.
    cat_strategy : {'constant', 'missing', 'mode'}, default 'constant'
        How object columns are filled: ``fill_value``, the literal 'missing',
        or the column's most frequent value. Any other value leaves them untouched.
    fill_value : object, default 'missing'
        Replacement used by ``cat_strategy='constant'`` (and as the fallback
        when 'mode' meets a column with no observed values).

    Attributes
    ----------
    num_impute_values : dict
        Column name -> fill value for numeric columns, set by ``fit``.
    cat_impute_values : dict
        Column name -> fill value for object columns, set by ``fit``.
    """

    def __init__(self, num_strategy='median', cat_strategy='constant', fill_value='missing'):
        self.num_strategy = num_strategy
        self.cat_strategy = cat_strategy
        self.fill_value = fill_value
        # Learned state; initialised here too so transform() before fit() is a no-op.
        self.num_impute_values = {}
        self.cat_impute_values = {}

    def fit(self, X, y=None):
        """Learn one fill value per column from ``X`` and return ``self``."""
        # Start clean so a refit never keeps columns from an earlier frame.
        self.num_impute_values = {}
        self.cat_impute_values = {}

        # The target and the row identifier are never imputed.
        num_cols = [col for col in X.columns if X[col].dtype != 'object'
                    and col not in ['isFraud', 'TransactionID']]

        if self.num_strategy == 'median':
            self.num_impute_values = {col: X[col].median() for col in num_cols}
        elif self.num_strategy == 'mean':
            self.num_impute_values = {col: X[col].mean() for col in num_cols}
        elif self.num_strategy == 'constant':
            self.num_impute_values = {col: 0 for col in num_cols}

        cat_cols = [col for col in X.columns if X[col].dtype == 'object']
        if self.cat_strategy == 'missing':
            self.cat_impute_values = {col: 'missing' for col in cat_cols}
        elif self.cat_strategy == 'mode':
            for col in cat_cols:
                modes = X[col].mode()
                # A column with no observed values has no mode; fall back
                # to fill_value instead of raising.
                self.cat_impute_values[col] = modes.iloc[0] if len(modes) else self.fill_value
        elif self.cat_strategy == 'constant':
            self.cat_impute_values = {col: self.fill_value for col in cat_cols}

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with learned fills applied.

        For every numeric column seen in ``fit`` a ``<col>_missing`` 0/1 column
        is appended that marks the rows which were missing before filling.
        """
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()

        for col, val in self.num_impute_values.items():
            if col in X.columns:
                # The indicator must be taken BEFORE filling; afterwards isna()
                # would be all False and the flag would carry no information.
                X[f'{col}_missing'] = X[col].isna().astype(int)
                # Plain assignment instead of a chained in-place fillna, which
                # does not reliably write back to the frame.
                X[col] = X[col].fillna(val)

        for col, val in self.cat_impute_values.items():
            if col in X.columns:
                X[col] = X[col].fillna(val)

        return X

## Feature Engineer Class (creates new features and transforms existing ones)


In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Add amount-ratio, frequency and e-mail-match features.

    Everything data-dependent (value counts and per-group mean amounts) is
    learned in ``fit`` and reused in ``transform`` so that training and
    unseen data are encoded with the same statistics.

    Attributes
    ----------
    freq_encoders : dict
        Column name -> ``value_counts`` Series learned from the fit data.
    group_means : dict
        Grouping column -> mean ``TransactionAmt`` per group learned from the fit data.
    """

    _FREQ_COLS = ('card1', 'card2', 'card3', 'card5', 'addr1', 'addr2')
    _GROUP_COLS = ('card1', 'card4')

    def __init__(self):
        self.freq_encoders = {}
        self.group_means = {}

    def fit(self, X, y=None):
        """Learn value counts and per-group mean amounts from ``X``; return ``self``."""
        # Reset so a refit never keeps statistics from an earlier frame.
        self.freq_encoders = {}
        self.group_means = {}

        for col in self._FREQ_COLS:
            if col in X.columns:
                self.freq_encoders[col] = X[col].value_counts(dropna=False)

        if 'TransactionAmt' in X.columns:
            for group_col in self._GROUP_COLS:
                if group_col in X.columns:
                    self.group_means[group_col] = X.groupby(group_col)['TransactionAmt'].mean()

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the engineered columns appended.

        Groups or values that were not present during ``fit`` map to NaN.
        """
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()
        # getattr keeps instances created before group_means existed usable.
        learned_means = getattr(self, 'group_means', {})

        if 'TransactionAmt' in X.columns:
            for group_col in self._GROUP_COLS:
                if group_col in X.columns:
                    group_means = learned_means.get(group_col)
                    if group_means is None:
                        # Not fitted for this column: fall back to the means of
                        # the frame being transformed.
                        group_means = X.groupby(group_col)['TransactionAmt'].mean()
                    X[f'TransactionAmt_to_mean_{group_col}'] = (
                        X['TransactionAmt'] / X[group_col].map(group_means)
                    )

        for col, counts in self.freq_encoders.items():
            if col in X.columns:
                X[f'{col}_freq'] = X[col].map(counts)

        if 'P_emaildomain' in X.columns and 'R_emaildomain' in X.columns:
            X['P_emaildomain_match_R_emaildomain'] = (
                X['P_emaildomain'] == X['R_emaildomain']
            ).astype(int)

        return X

## Categorical Encoder Class (handles unseen categories and missing values safely)

In [None]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Label-encode object columns, with explicit handling of unseen and missing values.

    Parameters
    ----------
    handle_unseen : str, default 'impute'
        With 'impute', ``impute_value`` is added to every encoder's classes and
        any value not seen during ``fit`` is mapped to it. With any other
        setting unseen values are only accepted when ``impute_value`` happens
        to be one of the fitted categories; otherwise ``transform`` raises.
    impute_value : str, default 'missing'
        Category that stands in for missing and (in 'impute' mode) unseen values.

    Attributes
    ----------
    encoders : dict
        Column name -> fitted ``LabelEncoder``.
    known_categories : dict
        Column name -> set of category strings observed during ``fit``.
    """

    def __init__(self, handle_unseen='impute', impute_value='missing'):
        self.handle_unseen = handle_unseen
        self.impute_value = impute_value
        self.encoders = {}
        self.known_categories = {}

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per object column of ``X``; return ``self``."""
        # Reset so a refit never keeps encoders for columns of an earlier frame.
        self.encoders = {}
        self.known_categories = {}

        cat_cols = [col for col in X.columns if X[col].dtype == 'object']
        for col in cat_cols:
            # Categories are stored as strings because transform() compares
            # them against stringified values.
            unique_vals = X[col].dropna().astype(str).unique()
            self.known_categories[col] = set(unique_vals)

            le = LabelEncoder()
            if self.handle_unseen == 'impute':
                le.fit(np.append(unique_vals, self.impute_value))
            else:
                le.fit(unique_vals)
            self.encoders[col] = le
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose fitted columns are replaced by integer codes.

        Raises
        ------
        ValueError
            When ``handle_unseen`` is not 'impute', a column contains values
            that were not fitted, and ``impute_value`` is not a fitted category.
        """
        X = X.copy()
        for col, le in self.encoders.items():
            if col not in X.columns:
                continue

            # Detect real missing values before stringifying, so None/NaN are
            # not confused with literal category names.
            na_mask = X[col].isna()
            values = X[col].astype(str).mask(na_mask, self.impute_value)

            if self.handle_unseen == 'impute':
                unseen_mask = ~values.isin(self.known_categories[col])
                values = values.mask(unseen_mask, self.impute_value)
            else:
                fitted = set(le.classes_)
                unseen = set(values.unique()) - fitted
                if unseen:
                    if self.impute_value not in fitted:
                        # Substituting impute_value cannot work here because the
                        # encoder never saw it; fail with an actionable message.
                        raise ValueError(
                            f"Column '{col}' contains categories not seen during fit: "
                            f"{sorted(unseen)}"
                        )
                    values = values.mask(values.isin(unseen), self.impute_value)

            X[col] = le.transform(values)
        return X

## Feature Selector Class (selects features based on importance or other criteria)

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Keep a fixed column set decided at fit time.

    With ``strategy='default'`` the kept set is every column of the fit frame
    except ``TransactionID`` and a hard-coded list of V-features. Any other
    strategy leaves ``selected_features`` as it was, and ``transform`` passes
    the frame through unchanged while it is ``None``.
    """

    def __init__(self, strategy='default'):
        self.strategy = strategy
        self.selected_features = None

    def fit(self, X, y=None):
        """Record which columns of ``X`` survive the drop list; return ``self``."""
        if self.strategy != 'default':
            return self

        excluded = {
            'TransactionID',
            'V300', 'V309', 'V111', 'V124', 'V106', 'V125', 'V315', 'V134', 'V102', 'V123',
            'V136', 'V305', 'V110', 'V129', 'V114', 'V116', 'V298', 'V126', 'V113', 'V105',
            'V119', 'V104', 'V122', 'V320', 'V115', 'V317', 'V303', 'V112', 'V118', 'V108',
            'V127', 'V132', 'V109', 'V103', 'V120', 'V107', 'V131', 'V135', 'V308', 'V117',
            'V121', 'V133', 'V130', 'V318', 'V304', 'V128', 'V319', 'V307', 'V306', 'V302',
            'V311', 'V301', 'V310',
        }
        self.selected_features = [name for name in X.columns if name not in excluded]
        return self

    def transform(self, X, y=None):
        """Return the selected columns, creating absent ones filled with NaN."""
        if self.selected_features is None:
            return X

        out = X.copy()
        absent = [name for name in self.selected_features if name not in out.columns]
        for name in absent:
            out[name] = np.nan
        return out[self.selected_features]

## Start of training

In [None]:
# Run configuration shared by the split, the CV loop and the MLflow logging below.
RANDOM_STATE = 42
N_FOLDS = 5
TEST_SIZE = 0.2

# Load both raw tables from the competition input directory.
train_identity = pd.read_csv("/kaggle/input/ieee-fraud-detection/train_identity.csv")
train_transaction = pd.read_csv("/kaggle/input/ieee-fraud-detection/train_transaction.csv")

# Left join: every transaction is kept, identity columns are NaN where no match exists.
train_df = train_transaction.merge(train_identity, on='TransactionID', how='left')

print(f"Training data shape: {train_df.shape}")
print(f"Fraud rate: {train_df['isFraud'].mean():.4f}")

# Stratified holdout split so both parts keep the overall fraud rate.
fraud_labels = train_df['isFraud']
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(columns='isFraud'),
    fraud_labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=fraud_labels,
)


print("Building preprocessing pipeline...")
# Steps run in this order: downcast -> impute -> engineer -> encode -> select.
preprocessing_steps = [
    ('memory_reducer', MemoryReducer()),
    ('missing_handler', MissingValueHandler(num_strategy='median',
                                            cat_strategy='constant',
                                            fill_value='missing')),
    ('feature_engineer', FeatureEngineer()),
    ('categorical_encoder', CategoricalEncoder(handle_unseen='impute',
                                               impute_value='missing')),
    ('feature_selector', FeatureSelector(strategy='default')),
]
preprocessing_pipeline = Pipeline(preprocessing_steps)

print("Preprocessing training data...")
# Fit on the training split only; the holdout split is transformed with the fitted steps.
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train, y_train)
X_val_preprocessed = preprocessing_pipeline.transform(X_val)

print("Training XGBoost model...")

# Shuffled K-fold over the preprocessed training split; one model is kept per fold.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

cv_scores = []
models = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train_preprocessed, y_train)):
    print(f"\nFold {fold + 1}/{N_FOLDS}")

    X_tr, X_v = X_train_preprocessed.iloc[train_idx], X_train_preprocessed.iloc[valid_idx]
    y_tr, y_v = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # early_stopping_rounds is a constructor argument (like eval_metric):
    # XGBoost deprecated passing it to fit() in 1.6 and removed it in 2.0.
    # NOTE(review): nothing in the visible preprocessing writes -999, so
    # missing=-999 looks like a leftover — confirm it is still intended.
    model = XGBClassifier(
        n_estimators=1000,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        missing=-999,
        random_state=RANDOM_STATE,
        eval_metric='auc',
        tree_method='hist',
        early_stopping_rounds=100,
    )

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_v, y_v)],
        verbose=100
    )

    # The fold score is measured on the same fold that drove early stopping,
    # so it is an optimistic estimate; the holdout AUC below is the unbiased one.
    val_preds = model.predict_proba(X_v)[:, 1]
    score = roc_auc_score(y_v, val_preds)
    cv_scores.append(score)
    models.append(model)
    print(f"Fold {fold + 1} AUC: {score:.5f}")

print("\nCross-validation results:")
print(f"Mean AUC: {np.mean(cv_scores):.5f}")
print(f"Std AUC: {np.std(cv_scores):.5f}")

print("\nEvaluating on holdout validation set...")
# Average the fold models' probabilities for the holdout prediction.
val_preds = np.mean([model.predict_proba(X_val_preprocessed)[:, 1] for model in models], axis=0)
val_score = roc_auc_score(y_val, val_preds)
print(f"Validation AUC: {val_score:.5f}")






## Feature Importance


In [None]:

print("\nFeature importance analysis...")
# Average each feature's importance over the fold models, then rank descending.
mean_importance = np.mean([fold_model.feature_importances_ for fold_model in models], axis=0)
feature_importance = pd.DataFrame(
    {'feature': X_train_preprocessed.columns, 'importance': mean_importance}
).sort_values('importance', ascending=False)


print("\nTop 20 features:")
print(feature_importance.head(20))

## MLFlow logging

In [None]:
mlflow.set_experiment("XGBoost_Training")


def _log_stage_run(run_name, params, note, note_file):
    """Open one MLflow run, log ``params`` one by one and attach ``note`` as a text artifact."""
    with mlflow.start_run(run_name=run_name):
        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_text(note, note_file)
    mlflow.end_run()


# 1. Cleaning
_log_stage_run(
    "XGBoost_Cleaning",
    {
        "memory_reduction": True,
        "missing_value_strategy_num": "median",
        "missing_value_strategy_cat": "constant_missing",
    },
    "Used MemoryReducer and MissingValueHandler for missing values.",
    "cleaning_notes.txt",
)

# 2. Feature engineering
_log_stage_run(
    "XGBoost_Feature_Engineering",
    {
        "feature_engineering_applied": True,
        "freq_encoding_columns": ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2'],
    },
    "Created TransactionAmt_to_mean features and Email domain matching.",
    "feature_engineering_notes.txt",
)

# 3. Feature selection
_log_stage_run(
    "XGBoost_Feature_Selection",
    {
        "feature_selection_strategy": "manual_drop_low_importance_V-features",
        "final_selected_features": len(
            preprocessing_pipeline.named_steps['feature_selector'].selected_features
        ),
    },
    "Dropped around 50 V-features manually based on prior EDA.",
    "feature_selection_notes.txt",
)

# Final run: hyperparameters, per-fold and holdout metrics, models and summary artifacts.
# The `with` block closes the run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run(run_name="XGBoost_Final_Model") as run:
    # The fold model with the highest CV AUC is the one that gets logged.
    best_model_idx = int(np.argmax(cv_scores))
    best_model = models[best_model_idx]

    # Read hyperparameters from the fitted model so the logged values cannot
    # drift from what was actually trained.
    model_params = best_model.get_params()
    logged_hyperparams = ("n_estimators", "max_depth", "learning_rate",
                          "subsample", "colsample_bytree")
    mlflow.log_params({
        "model_type": "XGBoost",
        "n_folds": N_FOLDS,
        "random_state": RANDOM_STATE,
        **{name: model_params[name] for name in logged_hyperparams},
    })

    # One metric per fold, however many folds were run (keys: fold_1_auc, fold_2_auc, ...).
    fold_metrics = {f"fold_{i}_auc": score for i, score in enumerate(cv_scores, start=1)}
    mlflow.log_metrics({
        **fold_metrics,
        "mean_auc": np.mean(cv_scores),
        "std_auc": np.std(cv_scores),
        "validation_auc": val_score
    })

    final_pipeline = Pipeline([
        ('preprocessing', preprocessing_pipeline),
        ('model', best_model)
    ])

    mlflow.xgboost.log_model(best_model, "best_xgboost_model")
    mlflow.sklearn.log_model(final_pipeline, "full_pipeline")

    top_features = feature_importance.head(20).to_dict()
    mlflow.log_dict(top_features, "feature_importance/top_20_features.json")

    val_preds_frame = pd.DataFrame({
        'actual': y_val,
        'predicted': val_preds
    })
    # Seeded and capped at the frame length: reproducible across runs and
    # safe when the holdout has fewer than 1000 rows.
    val_preds_sample = val_preds_frame.sample(
        n=min(1000, len(val_preds_frame)),
        random_state=RANDOM_STATE,
    )
    mlflow.log_table(val_preds_sample, "validation_predictions_sample.json")

    mlflow.log_text(f"""
    - Train shape: {X_train_preprocessed.shape}
    - Fraud rate: {y_train.mean():.4f}
    - Best fold AUC: {cv_scores[best_model_idx]:.4f}
    - Validation AUC: {val_score:.4f}
    """, "training_summary.txt")

    print(f"Successfully logged to MLflow! Run ID: {run.info.run_id}")
