In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
%%time
# Load the two raw training tables of the IEEE fraud-detection dataset.
# The reads are independent of each other, so their order does not matter.
transaction_train = pd.read_csv("/kaggle/input/ieee-fraud-detection/train_transaction.csv")
identity_train = pd.read_csv("/kaggle/input/ieee-fraud-detection/train_identity.csv")

CPU times: user 20.7 s, sys: 5.13 s, total: 25.9 s
Wall time: 31.4 s


In [3]:
%%time
from sklearn.model_selection import train_test_split

# Separate the target column from the transaction features.
y = transaction_train['isFraud']
X_transaction = transaction_train.drop(columns=['isFraud'])

# Work on a copy so the raw identity table is left untouched.
X_identity = identity_train.copy()

# Left join: every transaction is kept; identity columns are attached where a
# matching TransactionID exists.
X = X_transaction.merge(X_identity, how='left', on='TransactionID')

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


CPU times: user 4.86 s, sys: 1.56 s, total: 6.43 s
Wall time: 6.2 s


In [4]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Column cleaning, categorical encoding and optional class rebalancing.

    ``fit`` learns, from the training frame only:

    * which columns to drop (null fraction above ``null_threshold``),
    * fill values (median for numeric columns, mode for categorical columns),
    * which categorical columns are one-hot encoded (at most
      ``encoding_threshold`` distinct values) and which are replaced by their
      weight of evidence (WOE),
    * optionally, the subset of encoded features kept by an L1-penalised
      logistic regression.

    ``transform`` applies those learned steps to any frame. Because one-hot
    columns are generated from the categories present in the frame being
    transformed, the output columns can differ between frames; callers that
    need a fixed layout should reindex the result.

    Parameters
    ----------
    null_threshold : float
        Columns whose fraction of missing values is strictly greater than this
        are dropped.
    encoding_threshold : int
        Categorical columns with at most this many distinct values are one-hot
        encoded; the others are WOE encoded.
    sampling_strategy : str
        ``'undersample'``, ``'oversample'`` or anything else for no resampling.
        Only used by ``fit_resample``.
    target_ratio : float
        Desired share of positive (``y == 1``) rows after resampling.
    l1_regularization : bool
        When true (and ``y`` is given to ``fit``), keep only the encoded
        features with a non-zero coefficient in an L1 logistic regression.
    l1_C : float
        Inverse regularisation strength passed to that logistic regression.
    """

    def __init__(self,
                 null_threshold=0.6,
                 encoding_threshold=7,
                 sampling_strategy='none',
                 target_ratio=0.5,
                 l1_regularization=False,
                 l1_C=0.01):
        self.null_threshold = null_threshold
        self.encoding_threshold = encoding_threshold
        self.sampling_strategy = sampling_strategy
        self.target_ratio = target_ratio
        self.l1_regularization = l1_regularization
        self.l1_C = l1_C

    def fit(self, X, y=None):
        """Learn drop list, fill values, encodings and (optionally) the L1 feature subset.

        ``y`` is optional; without it no WOE maps are computed (WOE columns
        then encode to 0 in ``transform``) and no L1 selection is performed.
        Returns ``self``.
        """
        X = X.copy()

        # 1. Identify columns to drop
        null_frac = X.isnull().mean()
        self.cols_to_drop_ = null_frac[null_frac > self.null_threshold].index.tolist()
        X = X.drop(columns=self.cols_to_drop_, errors='ignore')

        # 2. Split the remaining columns into categorical and numeric
        self.cat_cols_ = X.select_dtypes(include=['object', 'category']).columns.tolist()
        self.num_cols_ = [col for col in X.columns if col not in self.cat_cols_]

        # 3. Save fill values
        self.fill_values_ = {}
        for col in self.num_cols_:
            self.fill_values_[col] = X[col].median()
        for col in self.cat_cols_:
            self.fill_values_[col] = X[col].mode(dropna=True)[0]

        # 4. Decide per categorical column between one-hot and WOE encoding
        self.onehot_cols_ = []
        self.woe_cols_ = []
        for col in self.cat_cols_:
            if X[col].nunique() <= self.encoding_threshold:
                self.onehot_cols_.append(col)
            else:
                self.woe_cols_.append(col)

        # 5. Compute WOE mappings for WOE columns
        self.woe_maps_ = {}
        if y is not None:
            for col in self.woe_cols_:
                self.woe_maps_[col] = self._compute_woe(X[col], y)

        # 6. L1 feature selection.
        # The model is fitted on exactly the encoding that transform() produces,
        # so every selected name is a column transform() can actually emit.
        # (Fitting on an all-one-hot encoding instead would select dummy names
        # for WOE columns, which transform() never creates.)
        if self.l1_regularization and y is not None:
            X_encoded = self._encode(X)
            model = LogisticRegression(penalty='l1', solver='liblinear', C=self.l1_C, max_iter=1000)
            model.fit(X_encoded, y)
            non_zero_coef = model.coef_[0] != 0
            self.selected_features_ = X_encoded.columns[non_zero_coef].tolist()
        else:
            self.selected_features_ = None

        return self

    def transform(self, X):
        """Apply the fitted cleaning/encoding steps to ``X`` and return a new frame.

        If an L1 feature subset was learned, only those columns are returned,
        in the learned order; a selected column missing from ``X`` after
        encoding is added and filled with 0.
        """
        # Copy first so the caller's frame is never modified.
        X = self._encode(X.copy())

        if self.selected_features_ is not None:
            for feature in self.selected_features_:
                if feature not in X.columns:
                    X[feature] = 0
            X = X[self.selected_features_]

        return X

    def fit_resample(self, X, y):
        """Fit on ``(X, y)``, transform ``X`` and optionally rebalance the classes.

        Returns ``(X_resampled, y_resampled)``. For ``'undersample'`` and
        ``'oversample'`` the positive rows come first and ``y_resampled`` is a
        NumPy array; for any other strategy the cleaned frame and the original
        ``y`` are returned unchanged.
        """
        X_clean = self.fit(X, y).transform(X)

        if self.sampling_strategy == 'undersample':
            fraud = X_clean[y == 1]
            legit = X_clean[y == 0]
            # Number of negatives that makes positives a `target_ratio` share.
            # Sampling is without replacement, so it cannot exceed the
            # negatives that exist; clamp instead of letting resample() raise.
            n_legit = int(len(fraud) / self.target_ratio - len(fraud))
            n_legit = min(n_legit, len(legit))
            legit_downsampled = resample(legit, replace=False,
                                         n_samples=n_legit,
                                         random_state=42)
            X_resampled = pd.concat([fraud, legit_downsampled])
            y_resampled = np.array([1]*len(fraud) + [0]*len(legit_downsampled))

        elif self.sampling_strategy == 'oversample':
            # This branch previously tested 'undersample' a second time and was
            # therefore unreachable.
            fraud = X_clean[y == 1]
            legit = X_clean[y == 0]
            # Draw positives with replacement until they form a `target_ratio`
            # share next to all negatives.
            fraud_upsampled = resample(fraud, replace=True,
                                       n_samples=int(len(legit) * self.target_ratio / (1 - self.target_ratio)),
                                       random_state=42)
            X_resampled = pd.concat([fraud_upsampled, legit])
            y_resampled = np.array([1]*len(fraud_upsampled) + [0]*len(legit))

        else:
            X_resampled = X_clean
            y_resampled = y

        return X_resampled, y_resampled

    def _encode(self, X):
        """Drop, impute, WOE-map and one-hot encode ``X`` using the fitted state.

        Shared by ``fit`` (L1 selection) and ``transform`` so both always see
        the same feature space. Callers pass a frame they own.
        """
        X = X.drop(columns=self.cols_to_drop_, errors='ignore')

        for col, fill_value in self.fill_values_.items():
            if col in X.columns:
                X[col] = X[col].fillna(fill_value)

        for col in self.woe_cols_:
            if col in X.columns:
                # Categories without a learned WOE value are encoded as 0.
                X[col] = X[col].map(self.woe_maps_.get(col, {})).fillna(0)

        return pd.get_dummies(X, columns=self.onehot_cols_, drop_first=True)

    def _basic_clean(self, X):
        """Drop, impute and one-hot encode *every* categorical column.

        Not used by ``fit`` or ``transform``; kept so existing callers of this
        helper continue to work.
        """
        X = X.drop(columns=self.cols_to_drop_, errors='ignore')
        for col, fill_value in self.fill_values_.items():
            if col in X.columns:
                X[col] = X[col].fillna(fill_value)
        X = pd.get_dummies(X, columns=self.cat_cols_, drop_first=True)
        return X

    def _compute_woe(self, series, y):
        """Return ``{category: WOE}`` for one categorical column.

        WOE is ``log(event_rate / non_event_rate)`` where each rate is the
        category's count plus 0.5, divided by the total count of that class
        over all categories. Missing values are not given an entry because the
        group-by excludes them.
        """
        df = pd.DataFrame({'feature': series, 'target': y})
        grouped = df.groupby('feature')['target']
        event = grouped.sum()
        non_event = grouped.count() - event
        event_rate = (event + 0.5) / event.sum()
        non_event_rate = (non_event + 0.5) / non_event.sum()
        woe = np.log(event_rate / non_event_rate)
        return woe.to_dict()


In [6]:
# Variants that differ only in how sparse a column may be before it is dropped.
pre_low_null = CustomPreprocessor(null_threshold=0.2)
pre_high_null = CustomPreprocessor(null_threshold=0.8)

# Variants that undersample in fit_resample; the "_low" one requests a 0.3
# target ratio instead of the constructor default.
pre_undersampled_low = CustomPreprocessor(target_ratio=0.3, sampling_strategy='undersample')
pre_undersampled = CustomPreprocessor(sampling_strategy='undersample')

In [15]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def correlation_filter(X, threshold=0.9):
    """Drop columns that are strongly correlated with an earlier column.

    For every column, the absolute correlation with each column that precedes
    it in ``X`` is inspected; the column is dropped when any of those values is
    strictly greater than ``threshold``.

    Returns a tuple ``(X_filtered, to_drop)``: the frame without the dropped
    columns and the list of dropped column names in column order.
    """
    abs_corr = X.corr().abs()

    # Keep only the strict upper triangle so each pair is looked at once and a
    # column is never compared with itself.
    upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(upper_mask)

    to_drop = []
    for column in upper.columns:
        if any(upper[column] > threshold):
            to_drop.append(column)

    return X.drop(columns=to_drop), to_drop

def apply_rfe(X, y, n_features=20, estimator=LogisticRegression()):
    """Pick ``n_features`` columns of ``X`` with recursive feature elimination.

    ``estimator`` is handed to ``RFE`` as the model used to rank features.

    Returns a tuple ``(X_selected, selected_columns)``: ``X`` restricted to the
    columns RFE marked as supported, and the list of those column names.
    """
    rfe = RFE(estimator, n_features_to_select=n_features).fit(X, y)
    selected_columns = X.columns[rfe.support_].tolist()
    return X[selected_columns], selected_columns

def process_dataset(X, y, corr_threshold=0.7, n_features=15, estimator=LogisticRegression()):
    """Run the correlation filter followed by RFE and report progress.

    Returns a tuple ``(X_final, dropped_corr, selected_cols)``: the frame with
    the RFE-selected columns, the columns removed by the correlation filter,
    and the names of the columns RFE kept.
    """
    X_decorrelated, dropped_corr = correlation_filter(X, threshold=corr_threshold)
    # Progress message text is kept verbatim, spelling included.
    print("Correlaction filter finished")

    rfe_kwargs = {'n_features': n_features, 'estimator': estimator}
    X_final, selected_cols = apply_rfe(X_decorrelated, y, **rfe_kwargs)
    print("RFE finished")

    return X_final, dropped_corr, selected_cols


In [16]:
%%time
# Fit every preprocessor variant on the training split only.
# fit_resample returns the cleaned features together with the matching labels,
# because these two variants were created with sampling_strategy='undersample'.
print("transforming undersampled")
X_undersampled, y_undersampled = pre_undersampled.fit_resample(X_train, y_train)
print("transforming undersampled_low")
X_undersampled_low, y_undersampled_low = pre_undersampled_low.fit_resample(X_train, y_train)

# The null-threshold variants do not resample, so y_train remains their label vector.
print("transforming high_null")
X_high_null = pre_high_null.fit_transform(X_train, y_train)
print("transforming low_null")
X_low_null = pre_low_null.fit_transform(X_train, y_train)

transforming undersampled
transforming undersampled_low
transforming high_null
transforming low_null
CPU times: user 35.3 s, sys: 8.4 s, total: 43.7 s
Wall time: 43.8 s


In [17]:
%%time
from sklearn.tree import DecisionTreeClassifier
    
# Estimator handed to process_dataset, which passes it on to RFE for ranking features.
estimator = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=10,
    random_state=42
)

# Each call runs the correlation filter and then RFE with process_dataset's
# default corr_threshold and n_features, returning the reduced frame, the
# columns dropped by the correlation filter and the columns RFE kept.
print("Processing X_undersampled")
X_undersampled_final, undersampled_dropped_corr, undersampled_selected = process_dataset(X_undersampled, y_undersampled, estimator=estimator)

print("Processing X_undersampled_low")
X_undersampled_low_final, undersampled_low_dropped_corr, undersampled_low_selected = process_dataset(X_undersampled_low, y_undersampled_low, estimator=estimator)

# The null-threshold variants were not resampled, so they pair with y_train.
print("Processing X_high_null")
X_high_null_final, high_null_dropped_corr, high_null_selected = process_dataset(X_high_null, y_train, estimator=estimator)

print("Processing X_low_null")
X_low_null_final, low_null_dropped_corr, low_null_selected = process_dataset(X_low_null, y_train, estimator=estimator)


Processing X_undersampled


  return op(a, b)


Correlaction filter finished
RFE finished
Processing X_undersampled_low


  return op(a, b)


Correlaction filter finished
RFE finished
Processing X_high_null


  return op(a, b)


Correlaction filter finished
RFE finished
Processing X_low_null


  return op(a, b)


Correlaction filter finished
RFE finished
CPU times: user 14min 3s, sys: 334 ms, total: 14min 3s
Wall time: 14min 6s


In [None]:
!pip install dagshub mlflow

Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub)
  Downloading gql-3.5.2-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting treelib>=1.6.4 (from dagshub)
  Downloading treelib-1.7.1-py3-none-any.whl.metadata (1.4 kB)
Collecting pathvalidate>=3.0.0 (from dagshub)
  Downloading pathvalidate-3.2.3-py3-none-any.whl.metadata (12 kB)
Collecting dagshub-annotation-converter>=0.1.5 (from dagshub)
  Downloading dagshub_annotation_converter-0.1.9-py3-none-any.whl.metadata (2.5 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloadin

In [20]:
import dagshub
import os
import getpass

# SECURITY: never store the access token in the notebook. It is taken from the
# MLFLOW_TRACKING_PASSWORD environment variable when already set (e.g. injected
# by the platform's secret store) and otherwise requested interactively without
# echoing. Any token that has ever been saved in a shared notebook should be
# revoked and regenerated.
os.environ['MLFLOW_TRACKING_USERNAME'] = 'lmamu21'
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass.getpass('DagsHub access token: ')
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/lmamu21/fraud-detection.mlflow'

dagshub.init(repo_owner='lmamu21', repo_name='fraud-detection', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=349a58f6-1045-4829-9e2a-fc1e189c0729&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=b388cb9ad2626166d1d64f61b46ba01b722fb77034e410a9339811144a7d6d51




Output()

In [21]:
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score



mlflow.set_experiment("DecisionTree")

# Single source of truth for the tree hyperparameters, so the fitted model and
# the logged parameters cannot drift apart.
tree_params = {
    'max_depth': 5,
    'min_samples_split': 10,
    'random_state': 42,
}

with mlflow.start_run(run_name="DecisionTree_low_null"):
    # Apply the preprocessor fitted on the training split, then align to the
    # columns kept by feature selection (columns absent from validation become 0).
    X_validation_clean = pre_low_null.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_low_null_final.columns, fill_value=0)

    model = DecisionTreeClassifier(**tree_params)
    model.fit(X_low_null_final, y_train)

    y_pred = model.predict(X_validation_clean)
    y_pred_proba = model.predict_proba(X_validation_clean)[:, 1]  # Get probabilities for the positive class

    conf_matrix = confusion_matrix(y_validation, y_pred)
    class_report = classification_report(y_validation, y_pred)

    # Log the configuration the preprocessor was actually built with instead of
    # a hand-copied dict, which can silently disagree with the real object.
    preprocessor_params = pre_low_null.get_params()

    feat_selection_params = {
        'corr_threshold': 0.7,
        'n_features_to_select': 15,
    }

    mlflow.log_params(preprocessor_params)
    mlflow.log_params(feat_selection_params)
    mlflow.log_params(tree_params)

    roc_auc = roc_auc_score(y_validation, y_pred_proba)
    acc = accuracy_score(y_validation, y_pred)
    prec = precision_score(y_validation, y_pred)
    rec = recall_score(y_validation, y_pred)
    f1 = f1_score(y_validation, y_pred)

    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(model, artifact_path="decision_tree_model_low_null")
    # No explicit mlflow.end_run(): leaving the `with` block ends the run.




🏃 View run DecisionTree_low_null at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/2/runs/da57d50efa4f42b3a5daff3f83dec385
🧪 View experiment at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/2


In [22]:
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score



mlflow.set_experiment("DecisionTree")

# Single source of truth for the tree hyperparameters, so the fitted model and
# the logged parameters cannot drift apart.
tree_params = {
    'max_depth': 5,
    'min_samples_split': 10,
    'random_state': 42,
}

with mlflow.start_run(run_name="DecisionTree_high_null"):
    # Apply the preprocessor fitted on the training split, then align to the
    # columns kept by feature selection (columns absent from validation become 0).
    X_validation_clean = pre_high_null.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_high_null_final.columns, fill_value=0)

    model = DecisionTreeClassifier(**tree_params)
    model.fit(X_high_null_final, y_train)

    y_pred = model.predict(X_validation_clean)
    y_pred_proba = model.predict_proba(X_validation_clean)[:, 1]  # Get probabilities for the positive class

    conf_matrix = confusion_matrix(y_validation, y_pred)
    class_report = classification_report(y_validation, y_pred)

    # Log the configuration the preprocessor was actually built with instead of
    # a hand-copied dict, which can silently disagree with the real object.
    preprocessor_params = pre_high_null.get_params()

    feat_selection_params = {
        'corr_threshold': 0.7,
        'n_features_to_select': 15,
    }

    mlflow.log_params(preprocessor_params)
    mlflow.log_params(feat_selection_params)
    mlflow.log_params(tree_params)

    roc_auc = roc_auc_score(y_validation, y_pred_proba)
    acc = accuracy_score(y_validation, y_pred)
    prec = precision_score(y_validation, y_pred)
    rec = recall_score(y_validation, y_pred)
    f1 = f1_score(y_validation, y_pred)

    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(model, artifact_path="decision_tree_model_high_null")
    # No explicit mlflow.end_run(): leaving the `with` block ends the run.




🏃 View run DecisionTree_high_null at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/2/runs/bb4c29c2d9ec4935a88868a95079f748
🧪 View experiment at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/2


In [23]:
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score



mlflow.set_experiment("DecisionTree")

# Single source of truth for the tree hyperparameters, so the fitted model and
# the logged parameters cannot drift apart.
tree_params = {
    'max_depth': 5,
    'min_samples_split': 10,
    'random_state': 42,
}

with mlflow.start_run(run_name="DecisionTree_undersampled"):
    # Apply the preprocessor fitted on the training split, then align to the
    # columns kept by feature selection (columns absent from validation become 0).
    X_validation_clean = pre_undersampled.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_undersampled_final.columns, fill_value=0)

    model = DecisionTreeClassifier(**tree_params)
    # Train on the resampled rows; validation keeps its original class balance.
    model.fit(X_undersampled_final, y_undersampled)

    y_pred = model.predict(X_validation_clean)
    y_pred_proba = model.predict_proba(X_validation_clean)[:, 1]  # Get probabilities for the positive class

    conf_matrix = confusion_matrix(y_validation, y_pred)
    class_report = classification_report(y_validation, y_pred)

    # Log the configuration the preprocessor was actually built with instead of
    # a hand-copied dict, which can silently disagree with the real object
    # (e.g. a different spelling of the sampling strategy, or a missing
    # target_ratio).
    preprocessor_params = pre_undersampled.get_params()

    feat_selection_params = {
        'corr_threshold': 0.7,
        'n_features_to_select': 15,
    }

    mlflow.log_params(preprocessor_params)
    mlflow.log_params(feat_selection_params)
    mlflow.log_params(tree_params)

    roc_auc = roc_auc_score(y_validation, y_pred_proba)
    acc = accuracy_score(y_validation, y_pred)
    prec = precision_score(y_validation, y_pred)
    rec = recall_score(y_validation, y_pred)
    f1 = f1_score(y_validation, y_pred)

    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(model, artifact_path="decision_tree_model_undersampled")
    # No explicit mlflow.end_run(): leaving the `with` block ends the run.




🏃 View run DecisionTree_undersampled at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/2/runs/10028cb1986143e1ab6f60dc44aea7f4
🧪 View experiment at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/2


In [25]:
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score



mlflow.set_experiment("DecisionTree")

# Single source of truth for the tree hyperparameters, so the fitted model and
# the logged parameters cannot drift apart.
tree_params = {
    'max_depth': 5,
    'min_samples_split': 10,
    'random_state': 42,
}

with mlflow.start_run(run_name="DecisionTree_undersampled_low"):
    # Apply the preprocessor fitted on the training split, then align to the
    # columns kept by feature selection (columns absent from validation become 0).
    X_validation_clean = pre_undersampled_low.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_undersampled_low_final.columns, fill_value=0)

    model = DecisionTreeClassifier(**tree_params)
    # Train on the resampled rows; validation keeps its original class balance.
    model.fit(X_undersampled_low_final, y_undersampled_low)

    y_pred = model.predict(X_validation_clean)
    y_pred_proba = model.predict_proba(X_validation_clean)[:, 1]  # Get probabilities for the positive class

    conf_matrix = confusion_matrix(y_validation, y_pred)
    class_report = classification_report(y_validation, y_pred)

    # Log the configuration the preprocessor was actually built with instead of
    # a hand-copied dict, which can silently disagree with the real object
    # (e.g. a different spelling of the sampling strategy).
    preprocessor_params = pre_undersampled_low.get_params()

    feat_selection_params = {
        'corr_threshold': 0.7,
        'n_features_to_select': 15,
    }

    mlflow.log_params(preprocessor_params)
    mlflow.log_params(feat_selection_params)
    mlflow.log_params(tree_params)

    roc_auc = roc_auc_score(y_validation, y_pred_proba)
    acc = accuracy_score(y_validation, y_pred)
    prec = precision_score(y_validation, y_pred)
    rec = recall_score(y_validation, y_pred)
    f1 = f1_score(y_validation, y_pred)

    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(model, artifact_path="decision_tree_model_undersampled_low")
    # No explicit mlflow.end_run(): leaving the `with` block ends the run.




🏃 View run DecisionTree_undersampled_low at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/2/runs/49aef25cb0304a3cb826ac50469b9188
🧪 View experiment at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/2
