In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
!pip install dagshub mlflow --quiet
!pip install imbalanced-learn==0.11.0 --quiet

In [3]:
import dagshub
import mlflow

from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
import xgboost as xgb
from category_encoders import WOEEncoder
import warnings
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from imblearn.pipeline import Pipeline as ImbPipeline  # ✅ Corrected import
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# import shap
import mlflow.data
from mlflow.data.pandas_dataset import PandasDataset
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

In [4]:
import mlflow
from dagshub import dagshub_logger
import os

# Set tracking URI manually
mlflow.set_tracking_uri("https://dagshub.com/nkikn21/IEEE-CIS-Fraud-Detection.mlflow")

# Use your DagsHub credentials
os.environ["MLFLOW_TRACKING_USERNAME"] = "nkikn21"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "90ec7031365aea1b6ca271f4236c194e530973c8"

# Optional: set registry if you're using model registry
mlflow.set_registry_uri("https://dagshub.com/nkikn21/IEEE-CIS-Fraud-Detection.mlflow")



In [5]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.width', None)        
pd.set_option('display.expand_frame_repr', False)

In [6]:
train_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
# test_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')
train_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
# test_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')

In [7]:
train_joined = train_transaction.merge(train_identity, on="TransactionID", how="left")
# test_joined = test_transaction.merge(test_identity, on="TransactionID", how="left")

In [8]:
train_joined['TransactionDay'] = train_joined['TransactionDT'] // (24*60*60)
# test_joined['TransactionDay'] = test_joined['TransactionDT'] // (24*60*60)
cutoff_day = train_joined['TransactionDay'].max() - 30  # last 30 days for validation

train_data = train_joined[train_joined['TransactionDay'] <= cutoff_day]
val_data = train_joined[train_joined['TransactionDay'] > cutoff_day]

X_train = train_data.drop(columns=['isFraud', 'TransactionID'])
y_train = train_data['isFraud']

X_val = val_data.drop(columns=['isFraud', 'TransactionID'])
y_val = val_data['isFraud']

# X_test = test_joined.drop(columns=['TransactionID', 'isFraud'], errors='ignore')  # Ignore errors in case 'isFraud' isn't in test_joined


In [9]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

In [10]:
s = X_train[cat_cols].nunique()

threshold = 3

woe_columns = list(s[s > threshold].index)
one_hot_columns = list(s[s <= threshold].index)

# Cleaning

In [11]:
import mlflow
from sklearn.base import BaseEstimator, TransformerMixin

class Cleaning(BaseEstimator, TransformerMixin):
    
    def __init__(self, woe_columns, one_hot_columns, num_cols, log_mlflow=False, null_threshold=0.8):
        self.woe_columns = woe_columns
        self.one_hot_columns = one_hot_columns
        self.num_cols = num_cols
        self.log_mlflow = log_mlflow
        self.null_threshold = null_threshold

    def fit(self, X, y=None):
        # Drop columns with more than threshold% missing values
        null_frac = X.isnull().mean()
        self.to_drop = null_frac[null_frac > self.null_threshold].index.tolist()

        # Fill strategies for remaining columns
        self.woe_columns_fill_na = X[self.woe_columns].mode().T[0].to_dict()
        self.one_hot_columns_fill_na = X[self.one_hot_columns].mode().T[0].to_dict()
        self.num_cols_fill_na = X[self.num_cols].median().to_dict()

        # MLflow logging
        if self.log_mlflow:
            experiment_name = 'LGBM_Training'
            run_name = 'LGBM_Cleaning'
            
            mlflow.set_experiment(experiment_name)
            mlflow.start_run(run_name=run_name)

            mlflow.log_param("cat_cols_handling", "mode")
            mlflow.log_param("num_cols_handling", "median")
            mlflow.log_param("dropped_cols_threshold", self.null_threshold)
            mlflow.log_param("dropped_columns", self.to_drop)

            mlflow.end_run()
        
        return self

    def transform(self, X):
        X_transformed = X.copy()

        # Drop columns identified during fit
        X_transformed = X_transformed.drop(columns=self.to_drop, errors='ignore')

        # Fill WOE columns
        for col in self.woe_columns:
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].fillna(self.woe_columns_fill_na.get(col))

        # Fill one-hot columns
        for col in self.one_hot_columns:
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].fillna(self.one_hot_columns_fill_na.get(col))

        # Fill numeric columns
        for col in self.num_cols:
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].fillna(self.num_cols_fill_na.get(col))

        return X_transformed


# Feature Engineering

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import mlflow

class FeatureEngineering(BaseEstimator, TransformerMixin):
    
    def __init__(self, woe_columns=None, one_hot_columns=None, woe_mappings=None, woe_columns_fill_na=None, log_mlflow=False):
        self.woe_columns = woe_columns
        self.one_hot_columns = one_hot_columns
        self.woe_mappings = woe_mappings
        self.woe_columns_fill_na = woe_columns_fill_na
        self.log_mlflow = log_mlflow

    def fit(self, X, y=None):
        # Assign default dicts if None (safe to do here)
        self.woe_mappings_ = self.woe_mappings or {}
        self.woe_columns_fill_na_ = self.woe_columns_fill_na or {}

        # Optional: make sure columns exist
        self.woe_columns_ = [col for col in self.woe_columns if col in X.columns]
        self.one_hot_columns_ = [col for col in self.one_hot_columns if col in X.columns]

        # Log to MLflow
        if self.log_mlflow:
            experiment_name = "LGBM_Training"
            run_name = "LGBM_Feature_Engineering"
            mlflow.set_experiment(experiment_name)
            with mlflow.start_run(run_name=run_name):
                mlflow.log_param("woe_columns", self.woe_columns_)
                mlflow.log_param("one_hot_columns", self.one_hot_columns_)

        return self

    def transform(self, X):
        X_transformed = X.copy()

        # WOE encoding
        for col in self.woe_columns_:
            mapping = self.woe_mappings_.get(col, {})
            default_val = mapping.get(self.woe_columns_fill_na_.get(col), 0)
            new_col = f'{col}_woe'
            X_transformed[new_col] = X_transformed[col].map(mapping).fillna(default_val)
            X_transformed.drop(columns=col, inplace=True)

        # One-hot encoding
        X_transformed = pd.get_dummies(
            X_transformed, 
            columns=self.one_hot_columns_, 
            drop_first=True, 
            dummy_na=True,
            dtype=int
        )

        return X_transformed


# Feature Selection

In [13]:
class FeatureSelection(BaseEstimator, TransformerMixin):
    def __init__(self, model, correlation_threshold=0.8, n_features_to_select=30, log_mlflow=False):
        self.model = model
        self.correlation_threshold = correlation_threshold
        self.n_features_to_select = n_features_to_select
        self.log_mlflow = log_mlflow

    def fit(self, X, y=None):
        # Step 1: Remove highly correlated features
        corr_matrix = X.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.to_drop = [column for column in upper.columns if any(upper[column] > self.correlation_threshold)]
    
        # Step 2: Run RFE to select top N features
        self.rfe = RFE(self.model, n_features_to_select=self.n_features_to_select)
        self.rfe.fit(X, y)
    
        all_rfe_features = X.columns[self.rfe.support_].tolist()
    
        # Step 3: Final selected features = RFE features minus highly correlated
        self.selected_features = [col for col in all_rfe_features if col not in self.to_drop]
    
        # Step 4: Log to MLflow if enabled
        if self.log_mlflow:
            experiment_name = 'LGBM_Training'
            run_name = 'LGBM_Feature_Selection'
            
            mlflow.set_experiment(experiment_name)
            mlflow.start_run(run_name=run_name)
            
            mlflow.log_param("RFE_all_features", all_rfe_features)
            mlflow.log_param("Highly_correlated_dropped", self.to_drop)
            mlflow.log_param("Selected_features", self.selected_features)

            mlflow.end_run()
    
        return self


    def transform(self, X):
        return X[self.selected_features]


# Training

In [14]:
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
from imblearn.under_sampling import RandomUnderSampler

# Create the pipeline with LightGBM
imb_pipeline = ImbPipeline(steps=[
    ('undersampler', RandomUnderSampler(random_state=42, sampling_strategy=0.4)),
    
    ('cleaning', Cleaning(
        woe_columns=woe_columns, 
        one_hot_columns=one_hot_columns, 
        num_cols=num_cols, 
        log_mlflow=True
    )),
    
    ('feature_engineering', FeatureEngineering(
        woe_columns=woe_columns, 
        one_hot_columns=one_hot_columns, 
        log_mlflow=True
    )),
    
    ('feature_selection', FeatureSelection(
        model=LGBMClassifier(
            objective='binary',
            boosting_type='gbdt',
            n_jobs=-1,
            random_state=42
        ),
        n_features_to_select=30, 
        log_mlflow=True
    )),
    
    ('scaler', StandardScaler()),
    
    ('classifier', LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        n_jobs=-1,
        random_state=42
    ))
])


In [15]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Optional: convert object columns to category if using categorical features
for col in X_train.select_dtypes(include='object').columns:
    X_train[col] = X_train[col].astype('category')
    X_val[col] = X_val[col].astype('category')

# Define LGBM model
lgbm = LGBMClassifier(
    objective='binary',
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Define parameter grid (small for demo; expand as needed)
param_dist = {
    'n_estimators': np.random.randint(100, 501, size=5),  # 100 random ints between 100 and 500
    'max_depth': np.random.randint(3, 12, size=5),         # random ints from 3 to 11
    'learning_rate': np.random.uniform(0.01, 0.3, size=5), # float values between 0.01 and 0.3
}

# Randomized search CV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,  # Number of random combinations to try
    scoring='roc_auc',
    cv=2,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
# Fit
random_search.fit(X_train, y_train)

# Results
print("Best Parameters:", grid_srandom_searchearch.best_params_)
print("Best ROC AUC Score (CV):", grid_search.best_score_)

# Optionally use the best model
best_lgbm = grid_search.best_estimator_


TypeError: estimator should be an estimator implementing 'fit' method, <module 'xgboost' from '/usr/local/lib/python3.11/dist-packages/xgboost/__init__.py'> was passed

**Model_v1**

In [None]:
# imb_pipeline.fit(X_train, y_train)

In [None]:
# experiment_name = 'LGBM_Training'
# run_name = 'Model_v1'

# mlflow.set_experiment(experiment_name)
# mlflow.start_run(run_name=run_name)

# mlflow.sklearn.log_model(imb_pipeline, "LGBM_pipeline")

# # Train the model
# # imb_pipeline.fit(X_train, y_train)

# # Log metrics (e.g., AUC, F1-score, etc.)
# y_pred = imb_pipeline.predict(X_val)
# y_pred_proba = imb_pipeline.predict_proba(X_val)[:, 1]

# auc_score = roc_auc_score(y_val, y_pred_proba)
# f1_score_val = f1_score(y_val, y_pred)
# precision_score_val = precision_score(y_val, y_pred)
# recall_score_val = recall_score(y_val, y_pred)

# mlflow.log_metric("AUC", auc_score)
# mlflow.log_metric("F1_Score", f1_score_val)
# mlflow.log_metric("Precision", precision_score_val)
# mlflow.log_metric("Recall", recall_score_val)

# # Log model parameters
# mlflow.log_param("RandomUnderSampler_Sampling_Strategy", 0.4)
# mlflow.log_param("Classifier", "XGBClassifier")

# mlflow.end_run()