In [28]:
!pip install dagshub mlflow



In [29]:
import dagshub
# Initialise DagsHub for this repository with its MLflow integration enabled.
dagshub.init(
    repo_owner='lkhok22',
    repo_name='ML-hw2',
    mlflow=True,
)

In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one path per line.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [31]:
# Display settings: no cap on the number of columns shown, no fixed display
# width, and no wrapping of wide frames across several blocks of lines.
for option_name, option_value in {
    'display.max_columns': None,
    'display.width': None,
    'display.expand_frame_repr': False,
}.items():
    pd.set_option(option_name, option_value)

# Read, merge, and reduce memory usage

In [33]:
# Load the four competition tables (identity + transaction, for train and test).
df_train_id, df_train_tr, df_test_id, df_test_tr = map(pd.read_csv, (
    '/kaggle/input/ieee-fraud-detection/train_identity.csv',
    '/kaggle/input/ieee-fraud-detection/train_transaction.csv',
    '/kaggle/input/ieee-fraud-detection/test_identity.csv',
    '/kaggle/input/ieee-fraud-detection/test_transaction.csv',
))

In [34]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Signed and unsigned integer columns are cast to the narrowest integer
    type of the same signedness whose range (limits included) holds every
    value. Float columns are cast to float16 or float32 when their min/max
    lie strictly inside that type's finite range, otherwise to float64;
    values are rounded to the precision of the chosen float type.
    All other columns (object, bool, datetime, category and other extension
    dtypes) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Side effects
    ------------
    Prints the memory usage before and after, and the percentage saved.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable integers, strings, ...) are not
        # plain numpy dtypes and cannot be safely range-checked/downcast here.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            if df[col].empty:
                # min()/max() of an empty column is NaN; nothing to downcast.
                continue
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the limits themselves are representable.
                if limits.min <= c_min and c_max <= limits.max:
                    if col_type != candidate:
                        df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN/inf extremes fail both range checks and fall through to float64.
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        # Any other kind (bool, datetime, timedelta, complex, object) is skipped.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [35]:
# Left-join identity data onto the transactions so every transaction row is kept.
df_train = df_train_tr.merge(df_train_id, on='TransactionID', how='left')
df_test = df_test_tr.merge(df_test_id, on='TransactionID', how='left')

In [36]:
# Drop the references to the four raw per-file frames to free their memory.
del df_train_id, df_train_tr, df_test_id, df_test_tr

In [37]:
# Downcast numeric columns of both merged frames (train first, then test).
df_train, df_test = (reduce_mem_usage(frame) for frame in (df_train, df_test))

Memory usage of dataframe is 1955.37 MB
Memory usage after optimization is: 645.97 MB
Decreased by 67.0%
Memory usage of dataframe is 1673.87 MB
Memory usage after optimization is: 561.50 MB
Decreased by 66.5%


In [38]:
# Separate the label column from the feature matrix.
y = df_train['isFraud']
X = df_train.drop(columns='isFraud')


In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (80% train); the fixed seed keeps
# the split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [40]:
# Detach the row identifiers so TransactionID is removed from both feature frames.
train_ids, test_ids = (features.pop('TransactionID') for features in (X_train, X_test))

# Cleaning

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin
class DropNullColumns(BaseEstimator, TransformerMixin):
    """Remove columns whose share of missing values exceeds ``threshold``.

    Parameters
    ----------
    threshold : float, default=0.8
        A column is dropped when its fraction of null entries, measured on
        the data passed to ``fit``, is strictly greater than this value.
    """

    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.columns_to_drop_ = []

    def fit(self, X, y=None):
        """Record which columns of ``X`` are too sparse; ``y`` is ignored."""
        missing_share = X.isna().mean()
        too_sparse = (missing_share > self.threshold).to_numpy()
        self.columns_to_drop_ = missing_share.index[too_sparse].tolist()
        return self

    def transform(self, X):
        """Return ``X`` without the recorded columns (absent ones are skipped)."""
        return X.drop(columns=self.columns_to_drop_, errors='ignore')


In [42]:
class ReplaceNulls(BaseEstimator, TransformerMixin):
    """Fill missing values with sentinels: -999 for numeric columns and the
    string "NAN" for object/category columns. Stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """No-op; present only to satisfy the transformer interface."""
        return self

    def transform(self, X):
        """Return a filled copy of ``X``; the input frame is not modified."""
        filled = X.copy()
        # Both column selections are taken before any filling happens.
        sentinel_by_columns = [
            (filled.select_dtypes(include=['number']).columns, -999),
            (filled.select_dtypes(include=['object', 'category']).columns, "NAN"),
        ]
        for columns, sentinel in sentinel_by_columns:
            filled[columns] = filled[columns].fillna(sentinel)
        return filled

# Feature Engineering

In [43]:
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Encode categorical columns with Weight of Evidence (WoE) or one-hot dummies.

    Object/category columns with more than ``threshold_for_woe`` distinct
    values are replaced by a numeric ``<col>_woe`` column. The remaining
    object/category columns are one-hot encoded with the first level dropped.

    Parameters
    ----------
    threshold_for_woe : int, default=3
        Cardinality above which a categorical column is WoE-encoded instead
        of one-hot encoded.

    Attributes (set by ``fit``)
    ---------------------------
    woe_columns_ : list
        Columns that are WoE-encoded.
    one_hot_columns_ : list
        Columns that are one-hot encoded.
    woe_mappings_ : dict
        ``{column: {category: woe}}`` learned from the training data.
    one_hot_categories_ : dict
        ``{column: categories}`` seen during ``fit``; used so that
        ``transform`` always emits the same dummy columns.
    """

    def __init__(self, threshold_for_woe=3):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() / sklearn.base.clone() can read it back.
        self.threshold_for_woe = threshold_for_woe

    @property
    def threshold(self):
        """Backward-compatible alias for ``threshold_for_woe``."""
        return self.threshold_for_woe

    @threshold.setter
    def threshold(self, value):
        self.threshold_for_woe = value

    def fit(self, X, y):
        """Learn the column split, the WoE tables and the one-hot levels.

        ``y`` is the binary target; a Series is aligned to ``X`` by index,
        any other array-like is matched to ``X`` row by row.
        """
        cardinality = X.select_dtypes(include=['object', 'category']).nunique()
        is_high_cardinality = cardinality > self.threshold_for_woe
        self.woe_columns_ = cardinality[is_high_cardinality].index.tolist()
        self.one_hot_columns_ = cardinality[~is_high_cardinality].index.tolist()

        # Group the target directly instead of copying the whole frame just
        # to attach a temporary 'target' column.
        if isinstance(y, pd.Series):
            target = y.reindex(X.index)
        else:
            target = pd.Series(np.asarray(y), index=X.index)

        eps = 1e-6  # small number to avoid division by zero and log(0)
        self.woe_mappings_ = {}
        for col in self.woe_columns_:
            grouped = target.groupby(X[col]).agg(['count', 'sum'])
            grouped.columns = ['n_obs', 'n_pos']
            grouped['n_neg'] = grouped['n_obs'] - grouped['n_pos']
            grouped['prop_pos'] = grouped['n_pos'] / max(grouped['n_pos'].sum(), eps)
            grouped['prop_neg'] = grouped['n_neg'] / max(grouped['n_neg'].sum(), eps)
            grouped['woe'] = np.log((grouped['prop_pos'] + eps) / (grouped['prop_neg'] + eps))
            self.woe_mappings_[col] = grouped['woe'].fillna(0).to_dict()

        # Remember the levels of every one-hot column so transform() produces
        # an identical set of dummy columns for any later data.
        self.one_hot_categories_ = {
            col: X[col].astype('category').cat.categories
            for col in self.one_hot_columns_
        }
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``.

        Categories of a WoE column that were not seen in ``fit`` map to NaN.
        Values of a one-hot column that were not seen in ``fit`` produce a row
        with all of that column's dummies set to zero/False.
        """
        df = X.copy()

        for col in self.woe_columns_:
            df[f"{col}_woe"] = df[col].map(self.woe_mappings_[col])
        df = df.drop(columns=self.woe_columns_)

        # Pin each one-hot column to the levels learned in fit() so that the
        # resulting dummy columns do not depend on the data being transformed.
        for col in self.one_hot_columns_:
            df[col] = pd.Categorical(df[col], categories=self.one_hot_categories_[col])

        return pd.get_dummies(df, columns=self.one_hot_columns_, drop_first=True)



# Feature Selection

In [16]:
class CorrelationFilter(BaseEstimator, TransformerMixin):
    """Drop one feature from every pair of highly correlated features.

    For each pair of features whose absolute pairwise correlation exceeds
    ``threshold``, the feature with the smaller absolute correlation to the
    target is marked for removal (the second feature of the pair is removed
    when the comparison is not strictly smaller, including when a target
    correlation is NaN).

    Parameters
    ----------
    threshold : float, default=0.8
        Absolute correlation above which two features count as redundant.

    Attributes (set by ``fit``)
    ---------------------------
    to_drop_ : list
        Names of the columns removed by ``transform``.
    """

    def __init__(self, threshold=0.8):
        self.threshold = threshold

    def fit(self, X, y):
        """Determine which columns of ``X`` to drop, using target ``y``."""
        if isinstance(y, pd.Series):
            target = y
        else:
            target = pd.Series(np.asarray(y), index=X.index)

        # Only feature-vs-feature correlations are needed here; the target is
        # kept out of this matrix so it can never appear as a "feature" of a pair.
        corr_matrix = X.corr().abs()
        names = corr_matrix.columns.to_numpy()
        target_corr = (
            X.corrwith(target).abs().reindex(corr_matrix.columns).to_numpy()
        )

        # Strict upper triangle: each unordered pair once, no self-pairs.
        # NaN correlations compare as False and are therefore ignored.
        redundant = np.triu(corr_matrix.to_numpy() > self.threshold, k=1)
        first_idx, second_idx = np.nonzero(redundant)

        first_is_weaker = target_corr[first_idx] < target_corr[second_idx]
        losers = np.where(first_is_weaker, names[first_idx], names[second_idx])

        # De-duplicate while keeping a deterministic order.
        self.to_drop_ = list(dict.fromkeys(losers.tolist()))
        return self

    def transform(self, X):
        """Return ``X`` without the columns selected in ``fit``."""
        present = [col for col in self.to_drop_ if col in X.columns]
        return X.drop(columns=present, errors='ignore')


# Training and Evaluation

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import numpy as np

# Select (or create) the MLflow experiment that groups these runs.
mlflow.set_experiment("LogisticRegression")

# Estimator settings, defined once so the fitted model and the logged
# parameters cannot drift apart.
logreg_settings = {"max_iter": 1000, "random_state": 42}

# Everything below is recorded under a single MLflow run.
with mlflow.start_run(run_name="LogReg2"):

    # Cleaning -> encoding -> second null fill (WoE mapping can yield NaN)
    # -> correlation filter -> scaling -> classifier.
    pipeline_steps = [
        ('drop_nulls', DropNullColumns(threshold=0.8)),
        ('fill_na', ReplaceNulls()),
        ('encode', CustomPreprocessor(threshold_for_woe=3)),
        ('fill_na2', ReplaceNulls()),
        ('corr_filter', CorrelationFilter()),
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(**logreg_settings)),
    ]
    pipeline = Pipeline(pipeline_steps)
    pipeline.fit(X_train, y_train)

    # Probability of the positive class on the training and held-out sets.
    train_preds = pipeline.predict_proba(X_train)[:, 1]
    test_preds = pipeline.predict_proba(X_test)[:, 1]

    # ROC AUC on both sets.
    train_auc = roc_auc_score(y_train, train_preds)
    test_auc = roc_auc_score(y_test, test_preds)

    # Parameters
    mlflow.log_param("model", "LogisticRegression")
    for param_name, param_value in logreg_settings.items():
        mlflow.log_param(param_name, param_value)

    # Metrics
    for metric_name, metric_value in (("train_auc", train_auc), ("test_auc", test_auc)):
        mlflow.log_metric(metric_name, metric_value)

    # Persist the whole fitted pipeline as a run artifact.
    mlflow.sklearn.log_model(pipeline, "logreg_full_pipeline")

    # Report the scores in the notebook output as well.
    print(f"Train AUC: {train_auc:.4f}")
    print(f"Test AUC: {test_auc:.4f}")
