In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
%%time
# Load the two raw training tables (transactions carry the isFraud label,
# identity rows are joined to them on TransactionID further down).
transaction_train = pd.read_csv("data/train_transaction.csv")
identity_train = pd.read_csv("data/train_identity.csv")

CPU times: user 8.13 s, sys: 1.65 s, total: 9.78 s
Wall time: 10.3 s


In [4]:
%%time
from sklearn.model_selection import train_test_split

# Separate the label from the transaction features.
X_transaction = transaction_train.drop(columns=['isFraud'])
y = transaction_train['isFraud']

X_identity = identity_train.copy()

# Left join keeps every transaction, with or without an identity record.
# `validate` makes the merge fail loudly if a TransactionID appears more than
# once in the identity table: duplicated rows would silently shift X relative
# to y, which is only matched to X by position.
X = pd.merge(X_transaction, X_identity, on='TransactionID', how='left', validate='many_to_one')

# Stratify on the label so the train and validation splits keep the same fraud
# rate; the classes are imbalanced (the notebook resamples them later).
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


CPU times: user 3 s, sys: 625 ms, total: 3.63 s
Wall time: 3.35 s


In [5]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Column pruning, imputation and categorical encoding for a mixed-type feature table.

    Everything is learned from the frame passed to ``fit``:

    * columns whose missing fraction exceeds ``null_threshold`` are dropped;
    * remaining columns get a fill value (median for non-categorical columns,
      most frequent value for ``object``/``category`` columns);
    * categorical columns with at most ``encoding_threshold`` distinct values
      are one-hot encoded, the others are replaced by their weight of
      evidence (WOE) against the binary target;
    * optionally, only the encoded features that keep a non-zero coefficient
      in an L1-penalised logistic regression are retained.

    Parameters
    ----------
    null_threshold : float
        Columns with a larger fraction of missing values are dropped.
    encoding_threshold : int
        Maximum number of distinct values for a categorical column to be
        one-hot encoded; columns above it are WOE encoded.
    sampling_strategy : {'none', 'undersample', 'oversample'}
        Resampling applied by ``fit_resample``; any other value means no
        resampling.
    target_ratio : float
        Desired share of positive (``y == 1``) rows after resampling.
    l1_regularization : bool
        Whether to run L1-based feature selection during ``fit``.
    l1_C : float
        Inverse regularisation strength of the L1 selection model.
    """

    def __init__(self,
                 null_threshold=0.6,
                 encoding_threshold=7,
                 sampling_strategy='none',
                 target_ratio=0.5,
                 l1_regularization=False,
                 l1_C=0.01):
        self.null_threshold = null_threshold
        self.encoding_threshold = encoding_threshold
        self.sampling_strategy = sampling_strategy
        self.target_ratio = target_ratio
        self.l1_regularization = l1_regularization
        self.l1_C = l1_C

    def fit(self, X, y=None):
        """Learn dropped columns, fill values, encodings and (optionally) selected features.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features. It is not modified.
        y : array-like or None
            Binary target. Without it no WOE mapping is learned (WOE columns
            are then encoded as 0) and L1 selection is skipped.

        Returns
        -------
        self
        """
        # 1. Identify columns to drop. `drop` returns a new frame and nothing
        #    below writes into X, so the caller's frame stays untouched.
        null_frac = X.isnull().mean()
        self.cols_to_drop_ = null_frac[null_frac > self.null_threshold].index.tolist()
        X = X.drop(columns=self.cols_to_drop_, errors='ignore')

        # 2. Split the surviving columns into categorical and everything else.
        self.cat_cols_ = X.select_dtypes(include=['object', 'category']).columns.tolist()
        cat_lookup = set(self.cat_cols_)
        self.num_cols_ = [col for col in X.columns if col not in cat_lookup]

        # 3. Save fill values.
        self.fill_values_ = {col: X[col].median() for col in self.num_cols_}
        for col in self.cat_cols_:
            modes = X[col].mode(dropna=True)
            # An all-NaN column has no mode; it is simply left unfilled.
            if len(modes):
                self.fill_values_[col] = modes.iloc[0]

        # 4. Decide the encoding of every categorical column.
        self.onehot_cols_ = []
        self.woe_cols_ = []
        for col in self.cat_cols_:
            if X[col].nunique() <= self.encoding_threshold:
                self.onehot_cols_.append(col)
            else:
                self.woe_cols_.append(col)

        # Remember the categories seen (after imputation) in each one-hot
        # column so that `transform` emits the same dummy columns, and drops
        # the same reference category, for any frame it is given.
        self.onehot_categories_ = {}
        for col in self.onehot_cols_:
            values = X[col]
            if col in self.fill_values_:
                values = values.fillna(self.fill_values_[col])
            self.onehot_categories_[col] = pd.Categorical(values).categories

        # 5. Compute WOE mappings for WOE columns.
        self.woe_maps_ = {}
        if y is not None:
            for col in self.woe_cols_:
                self.woe_maps_[col] = self._compute_woe(X[col], y)

        # 6. L1 feature selection, run on exactly the encoding `transform`
        #    produces so that every selected name exists at transform time.
        self.selected_features_ = None
        if self.l1_regularization and y is not None:
            X_encoded = self._encode(X)
            model = LogisticRegression(penalty='l1', solver='liblinear', C=self.l1_C,
                                       max_iter=1000, random_state=42)
            model.fit(X_encoded, y)
            non_zero_coef = model.coef_[0] != 0
            self.selected_features_ = X_encoded.columns[non_zero_coef].tolist()

        return self

    def transform(self, X):
        """Apply the fitted cleaning and encoding to ``X`` and return a new frame.

        When L1 selection was fitted, only the selected features are returned
        (in the fitted order); a selected feature missing from the encoded
        frame is added as a column of zeros.
        """
        X = self._encode(X)

        if self.selected_features_ is not None:
            X = X.reindex(columns=self.selected_features_, fill_value=0)

        return X

    def fit_resample(self, X, y):
        """Fit on ``(X, y)``, transform ``X`` and optionally rebalance the classes.

        Returns
        -------
        (X_resampled, y_resampled)
            With no resampling, the transformed frame and ``y`` unchanged.
            Otherwise the positive rows followed by the negative rows, and a
            NumPy array of matching 1/0 labels.

        Raises
        ------
        ValueError
            If ``target_ratio`` cannot be reached by the chosen strategy.
        """
        X_clean = self.fit(X, y).transform(X)

        if self.sampling_strategy not in ('undersample', 'oversample'):
            return X_clean, y

        labels = np.asarray(y)
        fraud = X_clean[labels == 1]
        legit = X_clean[labels == 0]

        if self.sampling_strategy == 'undersample':
            if not 0 < self.target_ratio <= 1:
                raise ValueError("target_ratio must be in (0, 1] for undersampling")
            wanted = int(len(fraud) / self.target_ratio - len(fraud))
            # Sampling without replacement cannot return more rows than exist.
            wanted = min(wanted, len(legit))
            legit_downsampled = resample(legit, replace=False,
                                         n_samples=wanted,
                                         random_state=42)
            X_resampled = pd.concat([fraud, legit_downsampled])
            y_resampled = np.array([1] * len(fraud) + [0] * len(legit_downsampled))
        else:
            if not 0 < self.target_ratio < 1:
                raise ValueError("target_ratio must be in (0, 1) for oversampling")
            wanted = int(len(legit) * self.target_ratio / (1 - self.target_ratio))
            fraud_upsampled = resample(fraud, replace=True,
                                       n_samples=wanted,
                                       random_state=42)
            X_resampled = pd.concat([fraud_upsampled, legit])
            y_resampled = np.array([1] * len(fraud_upsampled) + [0] * len(legit))

        return X_resampled, y_resampled

    def _encode(self, X):
        """Drop, impute and encode ``X`` with the fitted state (no feature selection)."""
        X = X.copy()

        # 1. Drop bad columns.
        X = X.drop(columns=self.cols_to_drop_, errors='ignore')

        # 2. Fill missing values.
        for col, fill_value in self.fill_values_.items():
            if col in X.columns:
                X[col] = X[col].fillna(fill_value)

        # 3. WOE encoding. Going through object dtype keeps `map` from
        #    returning a categorical (which could not be filled with 0);
        #    values without a learned WOE become 0.
        for col in self.woe_cols_:
            if col in X.columns:
                woe_map = self.woe_maps_.get(col, {})
                X[col] = X[col].astype(object).map(woe_map).astype(float).fillna(0.0)

        # 4. One-hot encoding against the categories learned in `fit`;
        #    values never seen during fitting get all-zero dummies.
        for col in self.onehot_cols_:
            if col in X.columns:
                X[col] = pd.Categorical(X[col], categories=self.onehot_categories_[col])
        return pd.get_dummies(X, columns=self.onehot_cols_, drop_first=True)

    def _basic_clean(self, X):
        """Drop, impute and one-hot encode *all* categorical columns.

        Kept for callers that may rely on it; ``fit`` uses ``_encode`` instead.
        """
        X = X.drop(columns=self.cols_to_drop_, errors='ignore')
        for col, fill_value in self.fill_values_.items():
            if col in X.columns:
                X[col] = X[col].fillna(fill_value)
        X = pd.get_dummies(X, columns=self.cat_cols_, drop_first=True)
        return X

    def _compute_woe(self, series, y):
        """Return ``{category: WOE}`` for one categorical column.

        WOE is ``ln(event_rate / non_event_rate)`` where each rate is the
        category's count (plus 0.5 smoothing) divided by the class total.
        If either class is absent the ratio is undefined, so every category
        gets 0.0.
        """
        df = pd.DataFrame({'feature': series, 'target': y})
        grouped = df.groupby('feature')['target']
        event = grouped.sum()
        non_event = grouped.count() - event

        total_event = event.sum()
        total_non_event = non_event.sum()
        if total_event == 0 or total_non_event == 0:
            return {category: 0.0 for category in event.index}

        event_rate = (event + 0.5) / total_event
        non_event_rate = (non_event + 0.5) / total_non_event
        woe = np.log(event_rate / non_event_rate)
        return woe.to_dict()


In [6]:
# Variants that differ only in how many missing values a column may have
# before it is dropped.
pre_high_null, pre_low_null = (
    CustomPreprocessor(null_threshold=threshold) for threshold in (0.8, 0.5)
)

# Variants that rebalance the classes in fit_resample.
pre_undersampled = CustomPreprocessor(sampling_strategy='undersample')
pre_undersampled_low = CustomPreprocessor(target_ratio=0.3, sampling_strategy='undersample')
pre_oversampled = CustomPreprocessor(sampling_strategy='oversample')


In [6]:
%%time
# Fit on the training split and encode that same split.
X_high_null = pre_high_null.fit_transform(X=X_train, y=y_train)

CPU times: user 7.06 s, sys: 8.36 s, total: 15.4 s
Wall time: 16 s


In [6]:
%%time
# Fit on the training split and encode that same split.
X_low_null = pre_low_null.fit_transform(X=X_train, y=y_train)

CPU times: user 4.57 s, sys: 5.3 s, total: 9.86 s
Wall time: 10.5 s


In [9]:
%%time
# Fit, encode and undersample the training split.
X_undersampled, y_undersampled = pre_undersampled.fit_resample(X=X_train, y=y_train)

CPU times: user 5.3 s, sys: 6.34 s, total: 11.6 s
Wall time: 13.2 s


In [7]:
%%time
# Fit, encode and undersample the training split (target_ratio=0.3 variant).
X_undersampled_low, y_undersampled_low = pre_undersampled_low.fit_resample(X=X_train, y=y_train)

CPU times: user 5.37 s, sys: 5.81 s, total: 11.2 s
Wall time: 12.6 s


In [7]:
%%time
# Fit, encode and oversample the training split.
X_oversampled, y_oversampled = pre_oversampled.fit_resample(X=X_train, y=y_train)

CPU times: user 5.28 s, sys: 6.09 s, total: 11.4 s
Wall time: 11.7 s


In [8]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def correlation_filter(X, threshold=0.9):
    """Drop every column that is highly correlated with a column to its left.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; it is not modified.
    threshold : float
        A column is dropped when its absolute correlation with any earlier
        column is strictly greater than this value.

    Returns
    -------
    (X_filtered, to_drop)
        The frame without the dropped columns, and the dropped column names
        in their original order.
    """
    abs_corr = X.corr().abs()

    # Keep only the strictly-upper triangle so each pair is looked at once
    # and the diagonal (self-correlation) is ignored.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    pairwise = abs_corr.where(above_diagonal)

    exceeds = (pairwise > threshold).any(axis=0)
    to_drop = exceeds[exceeds].index.tolist()
    return X.drop(columns=to_drop), to_drop

def apply_rfe(X, y, n_features=20, step=1):
    """Select features by recursive feature elimination with logistic regression.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table.
    y : array-like
        Target aligned with the rows of ``X``.
    n_features : int
        Number of features to keep.
    step : int or float
        Passed to ``RFE``: features removed per iteration (an integer >= 1) or
        the fraction removed per iteration (a float in (0, 1)). The default of
        1 refits the model once per removed feature, which is slow on wide
        frames; a larger step trades ranking granularity for speed.

    Returns
    -------
    (X_selected, selected_columns)
        ``X`` restricted to the kept columns, and their names.

    Notes
    -----
    NOTE(review): RFE ranks features by coefficient size and the inputs are
    not scaled here, so the ranking presumably depends on feature units;
    consider scaling before selection.
    """
    model = LogisticRegression(solver='liblinear', penalty='l2', max_iter=500)
    selector = RFE(model, n_features_to_select=n_features, step=step)
    selector.fit(X, y)

    selected_columns = X.columns[selector.support_].tolist()
    return X[selected_columns], selected_columns

def process_dataset(X, y, corr_threshold=0.7, n_features=15, step=1):
    """Run the correlation filter, then RFE, on an encoded feature table.

    Parameters
    ----------
    X, y
        Encoded features and the target.
    corr_threshold : float
        Threshold handed to ``correlation_filter``.
    n_features : int
        Number of features RFE keeps.
    step : int or float
        RFE elimination step, see ``apply_rfe``.

    Returns
    -------
    (X_final, dropped_corr, selected_cols)
        The reduced frame, the columns removed by the correlation filter and
        the columns kept by RFE.
    """
    X_corr_filtered, dropped_corr = correlation_filter(X, threshold=corr_threshold)
    print("Correlaction filter finished")
    X_final, selected_cols = apply_rfe(X_corr_filtered, y, n_features=n_features, step=step)
    print("RFE finished")
    return X_final, dropped_corr, selected_cols


In [None]:
%%time
# Correlation filter + RFE on the high-null-threshold encoding.
X_high_null_final, high_null_dropped_corr, high_null_selected = process_dataset(X=X_high_null, y=y_train)


Correlaction filter finished
RFE finished
CPU times: user 7min 2s, sys: 1.3 s, total: 7min 3s
Wall time: 7min 4s


In [8]:
%%time
# Correlation filter + RFE on the low-null-threshold encoding.
X_low_null_final, low_null_dropped_corr, low_null_selected = process_dataset(X=X_low_null, y=y_train)

Correlaction filter finished
RFE finished
CPU times: user 2min 39s, sys: 3.22 s, total: 2min 42s
Wall time: 2min 43s


In [None]:
%%time
# Correlation filter + RFE on the undersampled training data.
X_undersampled_final, undersampled_dropped_corr, undersampled_selected = process_dataset(X=X_undersampled, y=y_undersampled)

Correlaction filter finished


In [9]:
%%time
# Correlation filter + RFE on the undersampled (target_ratio=0.3) training data.
X_undersampled_low_final, undersampled_low_dropped_corr, undersampled_low_selected = process_dataset(X=X_undersampled_low, y=y_undersampled_low)

Correlaction filter finished
RFE finished
CPU times: user 3min 21s, sys: 53.9 ms, total: 3min 21s
Wall time: 3min 22s


In [None]:
%%time
# NOTE: this attempt took too long and the kernel crashed before RFE finished.
X_oversampled_final, oversampled_dropped_corr, oversampled_selected = process_dataset(X=X_oversampled, y=y_oversampled)

Correlaction filter finished


: 

In [9]:
%%time
# Retry on the oversampled data with a smaller RFE target (10 features instead of the default 15).
X_oversampled_final, oversampled_dropped_corr, oversampled_selected = process_dataset(X=X_oversampled, y=y_oversampled, n_features=10)

Correlaction filter finished


: 

In [None]:
import dagshub
import os
# SECURITY: the DagsHub access token must never be stored in the notebook.
# The token that used to be hard-coded here has to be treated as leaked:
# revoke it on DagsHub and issue a new one.
# The token is taken from the environment when present (set
# MLFLOW_TRACKING_PASSWORD before starting the kernel); otherwise it is asked
# for interactively, so it never ends up in the saved file or its outputs.
os.environ['MLFLOW_TRACKING_USERNAME'] = 'lmamu21'
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    from getpass import getpass
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('DagsHub access token: ')
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/lmamu21/fraud-detection.mlflow'

dagshub.init(repo_owner='lmamu21', repo_name='fraud-detection', mlflow=True)


In [None]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Select the tracking server before the experiment and the run are created.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/fraud-detection.mlflow")
mlflow.set_experiment("LogisticRegression")

# Shared by the logged artifact and the Model Registry entry.
model_name = "logistic_regression_model_high_null_threshold"

# The context manager ends the run on exit, so no explicit end_run() is needed.
with mlflow.start_run(run_name='LogisticRegression_high_null_threshold') as run:
    # Encode the validation split with the preprocessor fitted on the training
    # split, then keep exactly the columns the model is trained on.
    X_validation_clean = pre_high_null.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_high_null_final.columns, fill_value=0)

    # The default max_iter (100) stopped at the iteration limit on this data.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_high_null_final, y_train)

    y_pred = model.predict(X_validation_clean)
    y_pred_proba = model.predict_proba(X_validation_clean)[:, 1]

    conf_matrix = confusion_matrix(y_validation, y_pred)
    class_report = classification_report(y_validation, y_pred)
    roc_auc = roc_auc_score(y_validation, y_pred_proba)

    mlflow.log_metric("roc_auc", roc_auc)

    # Read the settings from the fitted preprocessor so the logged values
    # cannot drift from the ones actually used.
    preprocessor_params = {
        'null_threshold': pre_high_null.null_threshold,
        'encoding_threshold': pre_high_null.encoding_threshold,
        'sampling_strategy': pre_high_null.sampling_strategy,
        'l1_regularization': pre_high_null.l1_regularization
    }

    # Number of columns that survived RFE.
    rfe_params = {
        'n_features_to_select': X_high_null_final.shape[1]
    }

    # Must match the corr_threshold used by process_dataset (its default).
    correlation_filter_params = {
        'corr_threshold': 0.7
    }

    mlflow.log_params(preprocessor_params)
    mlflow.log_params(rfe_params)
    mlflow.log_params(correlation_filter_params)

    mlflow.log_param("solver", model.solver)
    mlflow.log_param("C", model.C)
    mlflow.log_param("max_iter", model.max_iter)

    with open("confusion_matrix.txt", "w") as f:
        f.write(str(conf_matrix))
    mlflow.log_artifact("confusion_matrix.txt")

    with open("classification_report.txt", "w") as f:
        f.write(class_report)
    mlflow.log_artifact("classification_report.txt")

    # Log the model once, then register that artifact in the Model Registry.
    mlflow.sklearn.log_model(model, model_name)
    model_uri = f"runs:/{run.info.run_id}/{model_name}"
    mlflow.register_model(model_uri, model_name)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Successfully registered model 'LogisticRegression_Model'.
2025/04/28 01:07:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression_Model, version 1
Created version '1' of model 'LogisticRegression_Model'.


🏃 View run LogisticRegression_high_null_threshold at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/1/runs/0e9092fdd02c475e920113e6719264d0
🧪 View experiment at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/1


In [11]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Select the tracking server before the experiment and the run are created.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/fraud-detection.mlflow")
mlflow.set_experiment("LogisticRegression")

# Shared by the logged artifact and the Model Registry entry.
model_name = "logistic_regression_model_low_null_threshold"

# The context manager ends the run on exit, so no explicit end_run() is needed.
with mlflow.start_run(run_name='LogisticRegression_low_null_threshold') as run:
    # Encode the validation split with the preprocessor fitted on the training
    # split, then keep exactly the columns the model is trained on.
    X_validation_clean = pre_low_null.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_low_null_final.columns, fill_value=0)

    # The default max_iter (100) stopped at the iteration limit on this data.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_low_null_final, y_train)

    y_pred = model.predict(X_validation_clean)
    y_pred_proba = model.predict_proba(X_validation_clean)[:, 1]

    conf_matrix = confusion_matrix(y_validation, y_pred)
    class_report = classification_report(y_validation, y_pred)
    roc_auc = roc_auc_score(y_validation, y_pred_proba)

    mlflow.log_metric("roc_auc", roc_auc)

    # Read the settings from the fitted preprocessor so the logged values
    # cannot drift from the ones actually used.
    preprocessor_params = {
        'null_threshold': pre_low_null.null_threshold,
        'encoding_threshold': pre_low_null.encoding_threshold,
        'sampling_strategy': pre_low_null.sampling_strategy,
        'l1_regularization': pre_low_null.l1_regularization
    }

    # Number of columns that survived RFE.
    rfe_params = {
        'n_features_to_select': X_low_null_final.shape[1]
    }

    # Must match the corr_threshold used by process_dataset (its default).
    correlation_filter_params = {
        'corr_threshold': 0.7
    }

    mlflow.log_params(preprocessor_params)
    mlflow.log_params(rfe_params)
    mlflow.log_params(correlation_filter_params)

    mlflow.log_param("solver", model.solver)
    mlflow.log_param("C", model.C)
    mlflow.log_param("max_iter", model.max_iter)

    with open("confusion_matrix.txt", "w") as f:
        f.write(str(conf_matrix))
    mlflow.log_artifact("confusion_matrix.txt")

    with open("classification_report.txt", "w") as f:
        f.write(class_report)
    mlflow.log_artifact("classification_report.txt")

    # Log the model once, then register that artifact in the Model Registry.
    mlflow.sklearn.log_model(model, model_name)
    model_uri = f"runs:/{run.info.run_id}/{model_name}"
    mlflow.register_model(model_uri, model_name)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Successfully registered model 'logistic_regression_model_low_null_threshold'.
2025/04/28 01:26:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression_model_low_null_threshold, version 1
Created version '1' of model 'logistic_regression_model_low_null_threshold'.


🏃 View run LogisticRegression_low_null_threshold at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/1/runs/580590c3e4e44eb79815ad46bcfdb4a9
🧪 View experiment at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/1


In [10]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Select the tracking server before the experiment and the run are created.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/fraud-detection.mlflow")
mlflow.set_experiment("LogisticRegression")

# Shared by the logged artifact and the Model Registry entry.
model_name = "logistic_regression_model_undersampled"

# The context manager ends the run on exit, so no explicit end_run() is needed.
with mlflow.start_run(run_name='LogisticRegression_undersampled') as run:
    # Encode the validation split with the preprocessor fitted on the training
    # split, then keep exactly the columns the model is trained on.
    X_validation_clean = pre_undersampled.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_undersampled_final.columns, fill_value=0)

    # Same iteration cap as the other runs so the solver is not cut off early
    # and the runs stay comparable.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_undersampled_final, y_undersampled)

    y_pred = model.predict(X_validation_clean)
    y_pred_proba = model.predict_proba(X_validation_clean)[:, 1]

    conf_matrix = confusion_matrix(y_validation, y_pred)
    class_report = classification_report(y_validation, y_pred)
    roc_auc = roc_auc_score(y_validation, y_pred_proba)

    mlflow.log_metric("roc_auc", roc_auc)

    # Read the settings from the fitted preprocessor so the logged values
    # cannot drift from the ones actually used (e.g. the strategy name).
    preprocessor_params = {
        'null_threshold': pre_undersampled.null_threshold,
        'encoding_threshold': pre_undersampled.encoding_threshold,
        'sampling_strategy': pre_undersampled.sampling_strategy,
        'target_ratio': pre_undersampled.target_ratio,
        'l1_regularization': pre_undersampled.l1_regularization
    }

    # Number of columns that survived RFE.
    rfe_params = {
        'n_features_to_select': X_undersampled_final.shape[1]
    }

    # Must match the corr_threshold used by process_dataset (its default).
    correlation_filter_params = {
        'corr_threshold': 0.7
    }

    mlflow.log_params(preprocessor_params)
    mlflow.log_params(rfe_params)
    mlflow.log_params(correlation_filter_params)

    mlflow.log_param("solver", model.solver)
    mlflow.log_param("C", model.C)
    mlflow.log_param("max_iter", model.max_iter)

    with open("confusion_matrix.txt", "w") as f:
        f.write(str(conf_matrix))
    mlflow.log_artifact("confusion_matrix.txt")

    with open("classification_report.txt", "w") as f:
        f.write(class_report)
    mlflow.log_artifact("classification_report.txt")

    # Log the model once, then register that artifact in the Model Registry.
    mlflow.sklearn.log_model(model, model_name)
    model_uri = f"runs:/{run.info.run_id}/{model_name}"
    mlflow.register_model(model_uri, model_name)


Registered model 'logistic_regression_model_undersampled' already exists. Creating a new version of this model...
2025/04/28 01:50:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression_model_undersampled, version 2
Created version '2' of model 'logistic_regression_model_undersampled'.


🏃 View run LogisticRegression_undersampled at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/1/runs/cc8e72b37e5d431cbdf91fdfebb783fe
🧪 View experiment at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/1


In [12]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Select the tracking server before the experiment and the run are created.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/fraud-detection.mlflow")
mlflow.set_experiment("LogisticRegression")

# Shared by the logged artifact and the Model Registry entry.
model_name = "logistic_regression_model_undersampled_low"

# The run gets its own name so it can be told apart from the
# target_ratio=0.5 undersampled run in the tracking UI.
# The context manager ends the run on exit, so no explicit end_run() is needed.
with mlflow.start_run(run_name='LogisticRegression_undersampled_low') as run:
    # Encode the validation split with the preprocessor fitted on the training
    # split, then keep exactly the columns the model is trained on.
    X_validation_clean = pre_undersampled_low.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_undersampled_low_final.columns, fill_value=0)

    # Same iteration cap as the other runs so the solver is not cut off early
    # and the runs stay comparable.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_undersampled_low_final, y_undersampled_low)

    y_pred = model.predict(X_validation_clean)
    y_pred_proba = model.predict_proba(X_validation_clean)[:, 1]

    conf_matrix = confusion_matrix(y_validation, y_pred)
    class_report = classification_report(y_validation, y_pred)
    roc_auc = roc_auc_score(y_validation, y_pred_proba)

    mlflow.log_metric("roc_auc", roc_auc)

    # Read the settings from the fitted preprocessor so the logged values
    # cannot drift from the ones actually used (e.g. the strategy name).
    preprocessor_params = {
        'null_threshold': pre_undersampled_low.null_threshold,
        'encoding_threshold': pre_undersampled_low.encoding_threshold,
        'sampling_strategy': pre_undersampled_low.sampling_strategy,
        'target_ratio': pre_undersampled_low.target_ratio,
        'l1_regularization': pre_undersampled_low.l1_regularization
    }

    # Number of columns that survived RFE.
    rfe_params = {
        'n_features_to_select': X_undersampled_low_final.shape[1]
    }

    # Must match the corr_threshold used by process_dataset (its default).
    correlation_filter_params = {
        'corr_threshold': 0.7
    }

    mlflow.log_params(preprocessor_params)
    mlflow.log_params(rfe_params)
    mlflow.log_params(correlation_filter_params)

    mlflow.log_param("solver", model.solver)
    mlflow.log_param("C", model.C)
    mlflow.log_param("max_iter", model.max_iter)

    with open("confusion_matrix.txt", "w") as f:
        f.write(str(conf_matrix))
    mlflow.log_artifact("confusion_matrix.txt")

    with open("classification_report.txt", "w") as f:
        f.write(class_report)
    mlflow.log_artifact("classification_report.txt")

    # Log the model once, then register that artifact in the Model Registry.
    mlflow.sklearn.log_model(model, model_name)
    model_uri = f"runs:/{run.info.run_id}/{model_name}"
    mlflow.register_model(model_uri, model_name)


Successfully registered model 'logistic_regression_model_undersampled_low'.
2025/04/28 02:04:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: logistic_regression_model_undersampled_low, version 1
Created version '1' of model 'logistic_regression_model_undersampled_low'.


🏃 View run LogisticRegression_undersampled at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/1/runs/ba6c76cdeb114b0298b26f7adfd26823
🧪 View experiment at: https://dagshub.com/lmamu21/fraud-detection.mlflow/#/experiments/1


In [None]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Select the tracking server before the experiment and the run are created.
mlflow.set_tracking_uri("https://dagshub.com/lmamu21/fraud-detection.mlflow")
mlflow.set_experiment("LogisticRegression")

# not actually run, because oversampled data took too much time to process

# Own artifact/registry name: reusing the undersampled_low name would register
# this model as a new version of a different model.
model_name = "logistic_regression_model_oversampled"

# The context manager ends the run on exit, so no explicit end_run() is needed.
with mlflow.start_run(run_name='LogisticRegression_oversampled') as run:
    # Encode the validation split with the preprocessor fitted on the training
    # split, then keep exactly the columns the model is trained on.
    X_validation_clean = pre_oversampled.transform(X_validation)
    X_validation_clean = X_validation_clean.reindex(columns=X_oversampled.columns, fill_value=0)

    # Same iteration cap as the other runs so the solver is not cut off early
    # and the runs stay comparable.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_oversampled, y_oversampled)

    y_pred = model.predict(X_validation_clean)
    y_pred_proba = model.predict_proba(X_validation_clean)[:, 1]

    conf_matrix = confusion_matrix(y_validation, y_pred)
    class_report = classification_report(y_validation, y_pred)
    roc_auc = roc_auc_score(y_validation, y_pred_proba)

    mlflow.log_metric("roc_auc", roc_auc)

    # Read the settings from the fitted preprocessor so the logged values
    # describe the oversampling that was actually applied.
    preprocessor_params = {
        'null_threshold': pre_oversampled.null_threshold,
        'encoding_threshold': pre_oversampled.encoding_threshold,
        'sampling_strategy': pre_oversampled.sampling_strategy,
        'target_ratio': pre_oversampled.target_ratio,
        'l1_regularization': pre_oversampled.l1_regularization
    }
    mlflow.log_params(preprocessor_params)

    # No RFE / correlation-filter parameters are logged: this model is trained
    # on X_oversampled directly, not on the output of process_dataset.

    mlflow.log_param("solver", model.solver)
    mlflow.log_param("C", model.C)
    mlflow.log_param("max_iter", model.max_iter)

    with open("confusion_matrix.txt", "w") as f:
        f.write(str(conf_matrix))
    mlflow.log_artifact("confusion_matrix.txt")

    with open("classification_report.txt", "w") as f:
        f.write(class_report)
    mlflow.log_artifact("classification_report.txt")

    # Log the model once, then register that artifact in the Model Registry.
    mlflow.sklearn.log_model(model, model_name)
    model_uri = f"runs:/{run.info.run_id}/{model_name}"
    mlflow.register_model(model_uri, model_name)
