In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [11]:
# Load the two raw training tables of the IEEE fraud-detection competition.
# Both are read in full with pandas defaults (no dtype or column selection).
identity_train = pd.read_csv(   "/kaggle/input/ieee-fraud-detection/train_identity.csv")
transaction_train = pd.read_csv("/kaggle/input/ieee-fraud-detection/train_transaction.csv")

In [12]:
%%time
from sklearn.model_selection import train_test_split

# Split the transaction table into the target and its feature columns.
y = transaction_train['isFraud']
X_transaction = transaction_train.drop('isFraud', axis=1)

# Work on a copy so the raw identity frame is left untouched.
X_identity = identity_train.copy()

# Left join: every transaction is kept, identity columns are NaN when absent.
# NOTE(review): assumes TransactionID is unique in identity_train so that X
# stays row-aligned with y - TODO confirm.
X = X_transaction.merge(X_identity, how='left', on='TransactionID')

# Hold out 20% of the rows for validation with a fixed seed.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


CPU times: user 5.72 s, sys: 4.95 s, total: 10.7 s
Wall time: 10.8 s


In [13]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Clean and encode a mixed-type DataFrame for binary classification.

    ``fit`` learns, from the frame it is given:

    * the columns whose null fraction exceeds ``null_threshold`` (dropped);
    * one fill value per remaining column (median for numeric columns, mode
      for object/category columns);
    * which categorical columns are one-hot encoded (at most
      ``encoding_threshold`` distinct values) and which are weight-of-evidence
      (WOE) encoded (all other categorical columns);
    * the category list of every one-hot column, so ``transform`` emits the
      same dummy columns for any frame;
    * optionally, the encoded features that receive a non-zero coefficient
      from an L1-penalised logistic regression.

    Parameters
    ----------
    null_threshold : float, default 0.6
        Columns with a null fraction strictly greater than this are dropped.
    encoding_threshold : int, default 7
        Categorical columns with at most this many distinct values are
        one-hot encoded; the rest are WOE encoded.
    sampling_strategy : str, default 'none'
        ``'undersample'`` or ``'oversample'`` enable class rebalancing in
        ``fit_resample``; any other value disables it.
    target_ratio : float, default 0.5
        Desired share of the positive class after resampling.
    l1_regularization : bool, default False
        If True (and ``y`` is given to ``fit``), keep only the features
        selected by the L1 logistic regression.
    l1_C : float, default 0.01
        Inverse regularisation strength of that logistic regression.
    """

    def __init__(self,
                 null_threshold=0.6,
                 encoding_threshold=7,
                 sampling_strategy='none',
                 target_ratio=0.5,
                 l1_regularization=False,
                 l1_C=0.01):
        self.null_threshold = null_threshold
        self.encoding_threshold = encoding_threshold
        self.sampling_strategy = sampling_strategy
        self.target_ratio = target_ratio
        self.l1_regularization = l1_regularization
        self.l1_C = l1_C

    def fit(self, X, y=None):
        """Learn drop list, fill values, encodings and optional L1 selection.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : array-like or None
            Binary target. Without it no WOE mapping is learned (WOE columns
            then encode to 0) and L1 selection is skipped.

        Returns
        -------
        self
        """
        X = X.copy()

        # 1. Identify columns to drop
        null_frac = X.isnull().mean()
        self.cols_to_drop_ = null_frac[null_frac > self.null_threshold].index.tolist()
        X = X.drop(columns=self.cols_to_drop_, errors='ignore')

        # 2. Split the surviving columns into categorical and numeric
        self.cat_cols_ = X.select_dtypes(include=['object', 'category']).columns.tolist()
        self.num_cols_ = [col for col in X.columns if col not in self.cat_cols_]

        # 3. Save fill values
        self.fill_values_ = {}
        for col in self.num_cols_:
            self.fill_values_[col] = X[col].median()
        for col in self.cat_cols_:
            modes = X[col].mode(dropna=True)
            # An all-null column has no mode; leave it unfilled instead of
            # failing on modes[0].
            if not modes.empty:
                self.fill_values_[col] = modes.iloc[0]

        # 4. Decide the encoding of every categorical column
        self.onehot_cols_ = []
        self.woe_cols_ = []
        for col in self.cat_cols_:
            if X[col].nunique() <= self.encoding_threshold:
                self.onehot_cols_.append(col)
            else:
                self.woe_cols_.append(col)

        # 4b. Freeze the categories of the one-hot columns as seen after
        # filling, so every later transform produces the same dummy columns
        # and drops the same baseline category.
        self.onehot_categories_ = {}
        for col in self.onehot_cols_:
            filled = X[col]
            if col in self.fill_values_:
                filled = filled.fillna(self.fill_values_[col])
            self.onehot_categories_[col] = pd.Categorical(filled).categories

        # 5. Compute WOE mappings for WOE columns
        self.woe_maps_ = {}
        if y is not None:
            for col in self.woe_cols_:
                self.woe_maps_[col] = self._compute_woe(X[col], y)

        # 6. L1 feature selection, fitted on the same representation that
        # transform() produces so the selected names exist at transform time.
        if self.l1_regularization and y is not None:
            X_encoded = self._encode(X)
            model = LogisticRegression(penalty='l1', solver='liblinear', C=self.l1_C, max_iter=1000)
            model.fit(X_encoded, y)
            non_zero_coef = model.coef_[0] != 0
            self.selected_features_ = X_encoded.columns[non_zero_coef].tolist()
        else:
            self.selected_features_ = None

        return self

    def transform(self, X):
        """Apply the fitted cleaning and encoding to ``X``.

        Returns a new DataFrame; the input frame is not modified. When L1
        selection was fitted, only the selected features are returned, in the
        fitted order, with any absent feature added as a column of zeros.
        """
        X = self._encode(X.copy())

        if self.selected_features_ is not None:
            X = X.reindex(columns=self.selected_features_, fill_value=0)

        return X

    def fit_resample(self, X, y):
        """Fit on ``(X, y)``, transform ``X`` and optionally rebalance classes.

        Returns
        -------
        (X_resampled, y_resampled)
            With ``'undersample'`` or ``'oversample'`` the positive rows come
            first and ``y_resampled`` is a NumPy array of 1s then 0s. With any
            other strategy the transformed frame and the original ``y`` are
            returned unchanged.
        """
        X_clean = self.fit(X, y).transform(X)

        if self.sampling_strategy == 'undersample':
            fraud = X_clean[y == 1]
            legit = X_clean[y == 0]
            # Number of legitimate rows that makes fraud `target_ratio` of the
            # result, capped because sampling is done without replacement.
            n_legit = int(len(fraud) / self.target_ratio - len(fraud))
            n_legit = min(n_legit, len(legit))
            legit_downsampled = resample(legit, replace=False,
                                         n_samples=n_legit,
                                         random_state=42)
            X_resampled = pd.concat([fraud, legit_downsampled])
            y_resampled = np.array([1]*len(fraud) + [0]*len(legit_downsampled))

        elif self.sampling_strategy == 'oversample':
            fraud = X_clean[y == 1]
            legit = X_clean[y == 0]
            # Number of fraud rows (drawn with replacement) that makes fraud
            # `target_ratio` of the result.
            n_fraud = int(len(legit) * self.target_ratio / (1 - self.target_ratio))
            fraud_upsampled = resample(fraud, replace=True,
                                       n_samples=n_fraud,
                                       random_state=42)
            X_resampled = pd.concat([fraud_upsampled, legit])
            y_resampled = np.array([1]*len(fraud_upsampled) + [0]*len(legit))

        else:
            X_resampled = X_clean
            y_resampled = y

        return X_resampled, y_resampled

    def _encode(self, X):
        """Drop, fill, WOE-encode and one-hot encode ``X`` (no feature selection)."""
        # 1. Drop bad columns (drop returns a new frame)
        X = X.drop(columns=self.cols_to_drop_, errors='ignore')

        # 2. Fill missing values
        for col, fill_value in self.fill_values_.items():
            if col in X.columns:
                X[col] = X[col].fillna(fill_value)

        # 3. WOE encoding; values without a learned WOE map to 0
        for col in self.woe_cols_:
            if col in X.columns:
                X[col] = X[col].map(self.woe_maps_.get(col, {})).fillna(0)

        # 4. One-hot encoding against the categories frozen during fit.
        # getattr keeps objects fitted before onehot_categories_ existed usable.
        known_categories = getattr(self, 'onehot_categories_', {})
        for col in self.onehot_cols_:
            if col in X.columns and col in known_categories:
                X[col] = pd.Categorical(X[col], categories=known_categories[col])
        return pd.get_dummies(X, columns=self.onehot_cols_, drop_first=True)

    def _basic_clean(self, X):
        """Drop, fill and one-hot encode *all* categorical columns.

        No longer used by ``fit``; retained so existing callers keep working.
        """
        X = X.drop(columns=self.cols_to_drop_, errors='ignore')
        for col, fill_value in self.fill_values_.items():
            if col in X.columns:
                X[col] = X[col].fillna(fill_value)
        X = pd.get_dummies(X, columns=self.cat_cols_, drop_first=True)
        return X

    def _compute_woe(self, series, y):
        """Return ``{category: WOE}`` for one categorical column.

        For each category, WOE is ``log(event_rate / non_event_rate)`` where
        each rate is the category's count plus 0.5 divided by the total count
        of events (``y == 1``) or non-events respectively. Rows whose feature
        value is null are excluded by the groupby.
        """
        df = pd.DataFrame({'feature': series, 'target': y})
        grouped = df.groupby('feature')['target']
        event = grouped.sum()
        non_event = grouped.count() - event
        event_rate = (event + 0.5) / event.sum()
        non_event_rate = (non_event + 0.5) / non_event.sum()
        woe = np.log(event_rate / non_event_rate)
        return woe.to_dict()


In [14]:
# Two preprocessors that differ only in how aggressively sparse columns are
# dropped: columns are removed when their null fraction exceeds the threshold.
pre_high_null = CustomPreprocessor(null_threshold=0.8)
pre_low_null = CustomPreprocessor(null_threshold=0.2)

# Two preprocessors that undersample in fit_resample: the first uses the
# default target_ratio (0.5), the second a target_ratio of 0.3.
pre_undersampled = CustomPreprocessor(sampling_strategy='undersample')
pre_undersampled_low = CustomPreprocessor(sampling_strategy='undersample', target_ratio=0.3)

In [15]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def correlation_filter(X, threshold=0.9):
    """Drop one column of every pair whose absolute correlation is too high.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so a column is dropped when it correlates above ``threshold``
    with any column that appears before it in ``X``.

    Returns
    -------
    (X_filtered, to_drop)
        The frame without the dropped columns, and the list of dropped names.
    """
    abs_corr = X.corr().abs()

    # True strictly above the diagonal, False on and below it.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    exceeds = (upper > threshold).any(axis=0).to_numpy()
    to_drop = upper.columns[exceeds].tolist()
    return X.drop(columns=to_drop), to_drop

def apply_rfe(X, y, n_features=20, estimator=LogisticRegression()):
    """Select ``n_features`` columns of ``X`` by recursive feature elimination.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : array-like
        Target passed to the RFE fit.
    n_features : int, default 20
        Number of features RFE should keep.
    estimator : estimator, default LogisticRegression()
        Model handed to RFE to rank features.

    Returns
    -------
    (X_selected, selected_columns)
        ``X`` restricted to the kept columns, and their names as a list.
    """
    rfe = RFE(estimator, n_features_to_select=n_features).fit(X, y)
    selected_columns = X.columns[rfe.support_].tolist()
    return X[selected_columns], selected_columns

def process_dataset(X, y, corr_threshold=0.7, n_features=15, estimator=LogisticRegression()):
    """Run the correlation filter followed by RFE, printing progress.

    Returns
    -------
    (X_final, dropped_corr, selected_cols)
        The frame with only the RFE-selected columns, the columns removed by
        the correlation filter, and the names of the selected columns.
    """
    # Stage 1: remove highly correlated columns.
    reduced, dropped_corr = correlation_filter(X, threshold=corr_threshold)
    print("Correlaction filter finished")

    # Stage 2: recursive feature elimination on what is left.
    rfe_result = apply_rfe(reduced, y, n_features=n_features, estimator=estimator)
    print("RFE finished")

    X_final, selected_cols = rfe_result
    return X_final, dropped_corr, selected_cols


In [16]:
%%time
print("transforming undersampled")
X_undersampled, y_undersampled = pre_undersampled.fit_resample(X_train, y_train)
print("transforming undersampled_low")
X_undersampled_low, y_undersampled_low = pre_undersampled_low.fit_resample(X_train, y_train)

print("transforming high_null")
X_high_null = pre_high_null.fit_transform(X_train, y_train)
print("transforming low_null")
X_low_null = pre_low_null.fit_transform(X_train, y_train)

transforming undersampled
transforming undersampled_low
transforming high_null
transforming low_null
CPU times: user 42.9 s, sys: 22.5 s, total: 1min 5s
Wall time: 1min 5s


In [18]:
from sklearn.ensemble import RandomForestClassifier

# Random forest handed to process_dataset as the RFE estimator in later cells.
# max_depth=None leaves tree depth unbounded; n_jobs=-1 uses all available cores.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

In [19]:
# Correlation filter + RFE on the undersampled training data, using the
# process_dataset defaults (corr_threshold=0.7, n_features=15) and the
# random forest `model` as the RFE estimator.
print("Processing X_undersampled")
X_undersampled_final, undersampled_dropped_corr, undersampled_selected = process_dataset(X_undersampled, y_undersampled, estimator=model)

Processing X_undersampled


  return op(a, b)


Correlaction filter finished
RFE finished


In [20]:
# Same feature-selection pipeline for the data undersampled with
# target_ratio=0.3, again with the process_dataset defaults and `model`.
print("Processing X_undersampled_low")
X_undersampled_low_final, undersampled_low_dropped_corr, undersampled_low_selected = process_dataset(X_undersampled_low, y_undersampled_low, estimator=model)

Processing X_undersampled_low


  return op(a, b)


Correlaction filter finished
RFE finished


In [23]:
!pip install mlflow dagshub

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.51.0-py3-none-any.whl.metadata (39 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1

In [24]:
import dagshub
import os
from getpass import getpass

# SECURITY: the DagsHub access token must never be stored in the notebook.
# The token that was previously hard-coded here has been published with the
# notebook and should be revoked and regenerated in the DagsHub settings.
os.environ['MLFLOW_TRACKING_USERNAME'] = 'lmamu21'

# Reuse a token already supplied through the environment; otherwise prompt
# for it without echoing it into the notebook output.
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('DagsHub access token: ')

os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/lmamu21/fraud-detection.mlflow'

# Point MLflow at the DagsHub-hosted tracking server of this repository.
dagshub.init(repo_owner='lmamu21', repo_name='fraud-detection', mlflow=True)




Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=8f029ad5-d4fd-4a01-9408-2c0152c86832&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=af4c25c1045f89fb2e6fa0d0fe39653979af23424431443309eb4cf7b381b02d




Output()

In [27]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

mlflow.set_experiment("RandomForest")

with mlflow.start_run(run_name="RandomForest_undersampled"):
    # Encode the validation split with the preprocessor fitted on the training
    # data, then align it to the columns kept by feature selection (columns
    # missing from the validation frame are filled with 0).
    X_validation_clean = pre_undersampled.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_undersampled_final.columns, fill_value=0)

    # Single source of truth for the hyper-parameters that are both used to
    # build the model and logged to MLflow.
    rf_params = {
        'n_estimators': 100,
        'max_depth': None,
        'random_state': 42,
    }
    rf_model = RandomForestClassifier(n_jobs=-1, **rf_params)
    rf_model.fit(X_undersampled_final, y_undersampled)

    y_val_pred = rf_model.predict(X_validation_clean)
    y_val_proba = rf_model.predict_proba(X_validation_clean)[:, 1]

    # Read the preprocessing settings from the fitted object so the logged
    # values cannot drift from what was actually run.
    preprocessor_params = {
        'null_threshold': pre_undersampled.null_threshold,
        'encoding_threshold': pre_undersampled.encoding_threshold,
        'sampling_strategy': pre_undersampled.sampling_strategy,
        'target_ratio': pre_undersampled.target_ratio,
        'l1_regularization': pre_undersampled.l1_regularization,
    }

    feat_selection_params = {
        # process_dataset was called with its default corr_threshold.
        'corr_threshold': 0.7,
        'n_features_to_select': X_undersampled_final.shape[1],
    }

    mlflow.log_params(rf_params)
    # These two dictionaries were previously built but never logged.
    mlflow.log_params({f"preprocessor_{key}": value for key, value in preprocessor_params.items()})
    mlflow.log_params({f"feature_selection_{key}": value for key, value in feat_selection_params.items()})

    mlflow.log_metrics({
        "accuracy": accuracy_score(y_validation, y_val_pred),
        "precision": precision_score(y_validation, y_val_pred),
        "recall": recall_score(y_validation, y_val_pred),
        "f1_score": f1_score(y_validation, y_val_pred),
        "roc_auc": roc_auc_score(y_validation, y_val_proba)
    })

    mlflow.sklearn.log_model(rf_model, artifact_path="random_forest_model_undersampled")

    # Register the model that was just logged under this run.
    model_uri = f"runs:/{mlflow.active_run().info.run_id}/random_forest_model_undersampled"
    mlflow.register_model(model_uri, "random_forest_model_undersampled")


Successfully registered model 'random_forest_model_undersampled'.
2025/04/30 16:44:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest_model_undersampled, version 1
Created version '1' of model 'random_forest_model_undersampled'.


🏃 View run RandomForest_undersampled at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/4/runs/2cd2803d39e14f678a48dc58c3580e46
🧪 View experiment at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/4


In [28]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

mlflow.set_experiment("RandomForest")

with mlflow.start_run(run_name="RandomForest_undersampled_low"):
    # Encode the validation split with the preprocessor that produced this
    # run's training data (pre_undersampled_low, not pre_undersampled), then
    # align it to the columns kept by feature selection (columns missing from
    # the validation frame are filled with 0).
    X_validation_clean = pre_undersampled_low.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_undersampled_low_final.columns, fill_value=0)

    # Single source of truth for the hyper-parameters that are both used to
    # build the model and logged to MLflow.
    rf_params = {
        'n_estimators': 100,
        'max_depth': None,
        'random_state': 42,
    }
    rf_model = RandomForestClassifier(n_jobs=-1, **rf_params)
    rf_model.fit(X_undersampled_low_final, y_undersampled_low)

    y_val_pred = rf_model.predict(X_validation_clean)
    y_val_proba = rf_model.predict_proba(X_validation_clean)[:, 1]

    # Read the preprocessing settings from the fitted object so the logged
    # values cannot drift from what was actually run.
    preprocessor_params = {
        'null_threshold': pre_undersampled_low.null_threshold,
        'encoding_threshold': pre_undersampled_low.encoding_threshold,
        'sampling_strategy': pre_undersampled_low.sampling_strategy,
        'target_ratio': pre_undersampled_low.target_ratio,
        'l1_regularization': pre_undersampled_low.l1_regularization,
    }

    feat_selection_params = {
        # process_dataset was called with its default corr_threshold.
        'corr_threshold': 0.7,
        'n_features_to_select': X_undersampled_low_final.shape[1],
    }

    mlflow.log_params(rf_params)
    # These two dictionaries were previously built but never logged.
    mlflow.log_params({f"preprocessor_{key}": value for key, value in preprocessor_params.items()})
    mlflow.log_params({f"feature_selection_{key}": value for key, value in feat_selection_params.items()})

    mlflow.log_metrics({
        "accuracy": accuracy_score(y_validation, y_val_pred),
        "precision": precision_score(y_validation, y_val_pred),
        "recall": recall_score(y_validation, y_val_pred),
        "f1_score": f1_score(y_validation, y_val_pred),
        "roc_auc": roc_auc_score(y_validation, y_val_proba)
    })

    mlflow.sklearn.log_model(rf_model, artifact_path="random_forest_model_undersampled_low")

    # Register the model that was just logged under this run.
    model_uri = f"runs:/{mlflow.active_run().info.run_id}/random_forest_model_undersampled_low"
    mlflow.register_model(model_uri, "random_forest_model_undersampled_low")

Successfully registered model 'random_forest_model_undersampled_low'.
2025/04/30 16:44:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest_model_undersampled_low, version 1
Created version '1' of model 'random_forest_model_undersampled_low'.


🏃 View run RandomForest_undersampled_low at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/4/runs/c180072902cb469fbcc97edc15eb0e75
🧪 View experiment at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/4
