# Cleaning and Feature Selection

In [11]:
# Read the raw data: transaction and identity tables for both the train and test files.
import pandas as pd
import numpy as np
train_transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
train_identity = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')

# NOTE(review): the test_* frames are not referenced by any cell visible in this part of the notebook — confirm they are used later.
test_transaction = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
test_identity = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [3]:
# Display the (rows, columns) shape of the transaction table.
train_transaction.shape

(590540, 394)

In [4]:
# Display the (rows, columns) shape of the identity table.
train_identity.shape

(144233, 41)

In [17]:
# Left-join identity information onto every transaction; transactions without
# an identity record keep missing values in the identity columns.
full_data = pd.merge(
    train_transaction,
    train_identity,
    how='left',
    on='TransactionID',
)

In [None]:
# Show the relative frequency of each value of the isFraud target.
full_data['isFraud'].value_counts(normalize=True)

მონაცემებს ვყოფთ სატრენინგო და სატესტო ნაწილებად, რათა ავირიდოთ მიკერძოება, რომელსაც გამოიწვევს მთელი სიმრავლიდან მიღებული გადაწყვეტილებები. მაგალითად, იმისათვის, რომ სწორად შევარჩიოთ, რომელი სვეტი წაიშალოს, გადაწყვეტილებას მივიღებთ მხოლოდ სატრენინგო სიმრავლიდან.

In [20]:
# Hold out 20% of the rows as a test split, shuffled and stratified on the
# target so both splits keep the same fraud rate; the seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    full_data.drop(columns='isFraud'),
    full_data['isFraud'],
    test_size=0.2,
    shuffle=True,
    stratify=full_data['isFraud'],
    random_state=1,
)

In [21]:
# Drop columns in which at least 80% of the training rows are missing.
null_pcts = X_train.isnull().mean()  # fraction of missing values per column
null_columns = null_pcts.index[null_pcts >= 0.8].tolist()
X_train = X_train.drop(columns=null_columns)

In [24]:
# Drop numeric variables whose absolute correlation with an earlier column exceeds 0.9.
numeric_train = X_train.drop(columns=['TransactionID']).select_dtypes(exclude='object')
corr_matrix = numeric_train.corr().abs()

# Keep only the strict upper triangle so every pair is inspected exactly once
# and a column is never compared with itself.
upper_triangle_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_triangle_mask)

# A column is dropped when any column preceding it is correlated above the threshold.
correlated_columns_to_drop = upper.columns[(upper > 0.9).any(axis=0)].tolist()

X_train = X_train.drop(correlated_columns_to_drop, axis=1)

  return op(a, b)


In [None]:
# Univariate screening: score each numeric feature by the ROC AUC obtained when
# its raw values are used directly as the fraud score (no model involved), then
# drop the weakest 15% of features (i.e. keep the top 85%).
#
# The AUC is folded around 0.5 (max(auc, 1 - auc)) before ranking: a feature with
# AUC 0.30 separates the classes exactly as well as one with AUC 0.70, just in the
# opposite direction, so ranking on the raw AUC would discard informative features.
numerical_cols = X_train.drop(['TransactionID'], axis=1).select_dtypes(exclude='object').columns


def _univariate_auc_strength(col):
    """Return the direction-agnostic ROC AUC of column ``col`` on its non-missing rows."""
    present = X_train[col].notna()  # computed once instead of calling dropna twice
    labels = y_train[present]
    if labels.nunique() < 2:
        # AUC is undefined when only one class remains; treat the feature as uninformative.
        return 0.5
    auc = roc_auc_score(labels, X_train.loc[present, col])
    return max(auc, 1.0 - auc)


roc_auc_features = [_univariate_auc_strength(col) for col in numerical_cols]
columns_to_keep = (
    pd.DataFrame(roc_auc_features, index=numerical_cols, columns=['ROC_AUC'])
    .sort_values(by='ROC_AUC', ascending=False)
    .iloc[:int(len(roc_auc_features) * 0.85)]
    .index
)
# Built as an ordered list (not a set difference) so the result is deterministic.
_kept = set(columns_to_keep)
low_roc_auc_columns_to_drop = [col for col in numerical_cols if col not in _kept]
X_train = X_train.drop(low_roc_auc_columns_to_drop, axis=1)

# Feature Engineering

In [None]:
# Show the number of distinct values in every object-typed column of the training split.
X_train.select_dtypes(include='object').nunique()

მაღალი უნიკალური რაოდენობის მქონე სვეტები frequency encoding-ით გარდაიქმნება, ხოლო დაბალი უნიკალური რაოდენობის მქონე სვეტები one-hot encoding-ით. შევქმნათ ჩვენი FrequencyEncoder კლასი, OneHotEncoder-ს კი scikit-learn-დან გამოვიყენებთ. მთლიანი გარდაქმნის ეტაპი ქვევითაა მოცემული pipeline-ში.

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Encode each category as its relative frequency observed during ``fit``.

    ``fit`` expects a DataFrame and stores, per column, a mapping from value to
    its share of that column's counted rows. ``transform`` replaces every value
    by that share; values without a mapping (including missing values) become 0.

    Attributes
    ----------
    freq_dicts : dict
        ``{column name: {value: relative frequency}}`` learned by the last ``fit``.
    """

    def __init__(self):
        # Kept so that an unfitted encoder still transforms (everything maps to 0).
        self.freq_dicts = {}

    def fit(self, X, y=None):
        """Learn the per-column value frequencies from ``X``; ``y`` is ignored."""
        # Rebuild the mapping from scratch so that re-fitting on a different set
        # of columns never keeps stale entries from a previous fit.
        self.freq_dicts = {
            col: X[col].value_counts(normalize=True).to_dict()
            for col in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every value replaced by its learned frequency."""
        X_transformed = X.copy()
        for col in X.columns:
            freq_map = self.freq_dicts.get(col, {})
            # astype(float) keeps the output numeric even when nothing maps
            # (e.g. a column never seen in fit), before unmapped values become 0.
            X_transformed[col] = X[col].map(freq_map).astype(float).fillna(0.0)
        return X_transformed

In [26]:
# Hard-coded list of every column the pipeline removes before encoding.
# NOTE(review): this is a frozen snapshot; it is not recomputed from the selection cells above — confirm it is still in sync with them.
all_columns_to_drop = ['TransactionID','dist2','D6','D7','D8','D9','D12','D13','D14','V138','V139','V140','V141','V142','V143','V144','V145','V146','V147','V148','V149','V150',
 'V151','V152','V153','V154','V155','V156','V157','V158','V159','V160','V161','V162','V163','V164','V165','V166','V322','V323',
 'V324','V325','V326','V327','V328','V329','V330','V331','V332','V333','V334','V335','V336','V337','V338','V339','id_03','id_04',
 'id_07','id_08','id_09','id_10','id_14','id_18','id_21','id_22','id_23','id_24','id_25','id_26','id_27','id_30','id_32','id_33','id_34',
'C2','C4','C6','C7','C8','C9','C10','C11','C12','C14','D2','V5','V11','V13','V16','V18','V20','V21','V22','V28','V30','V31','V32',
 'V33','V34','V36','V40','V42','V43','V45','V48','V49','V50','V51','V52','V54','V57','V58','V59','V60','V63','V64','V65','V68','V69',
 'V70','V71','V72','V73','V74','V76','V79','V80','V81','V84','V85','V88','V89','V90','V91','V92','V93','V94','V96','V97','V101','V102','V103',
 'V105','V106','V113','V126','V127','V128','V132','V133','V134','V137','V167','V168','V177','V178','V179','V182','V183','V190','V192',
 'V193','V196','V197','V198','V199','V201','V202','V203','V204','V206','V211','V212','V213','V216','V217','V218','V219','V222','V225',
 'V231','V232','V233','V235','V236','V237','V239','V243','V244','V245','V249','V251','V253','V254','V256','V257','V259','V263','V265',
 'V266','V269','V271','V272','V273','V274','V275','V276','V277','V278','V279','V280','V292','V293','V294','V295','V296','V297','V298',
 'V299','V301','V302','V304','V306','V307','V308','V309','V311','V312','V315','V316','V317','V318','V319','V321',
 'V173','D15','D3','V75','card1','id_19','V130','D11','V285','V35','V53','id_01','V29','card5','D4','V99','C13','V310','id_06','V10','C5','D1',
 'D10','V12','id_13','D5']  #union of the null_columns, correlated_columns_to_drop and low_roc_auc_columns_to_drop lists

# Object columns that are replaced by the relative frequency of each value.
frequency_enc_columns = ['P_emaildomain', 'R_emaildomain', 'id_31', 'DeviceInfo']
# Object columns that are one-hot encoded. Each name must appear only once:
# a repeated name ('id_15' was listed twice) makes the ColumnTransformer select
# and encode the same column twice, producing duplicated features.
one_hot_columns = ['ProductCD', 'card4', 'card6', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16',
                    'id_28', 'id_29', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType']

# Training

In [27]:
import xgboost as xgb

In [28]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql

In [29]:
!pip install dagshub

Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub)
  Downloading gql-3.5.2-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting treelib>=1.6.4 (from dagshub)
  Downloading treelib-1.7.1-py3-none-any.whl.metadata (1.4 kB)
Collecting pathvalidate>=3.0.0 (from dagshub)
  Downloading pathvalidate-3.2.3-py3-none-any.whl.metadata (12 kB)
Collecting dagshub-annotation-converter>=0.1.5 (from dagshub)
  Downloading dagshub_annotation_converter-0.1.9-py3-none-any.whl.metadata (2.5 kB)
Collecting graphql-core<3.2.5,>=3.2 (from gql[requests]->dagshub)
  Downloading graphql_core-3.2.4-py3-none-any.whl.metadata (10 kB)
Collecting backoff<3.0,>=1.11.1 (from gql[requests]->dagshub)
  Downloading backoff-2.2.1-py3-none-any.whl.meta

In [41]:
import mlflow
import dagshub
import mlflow.lightgbm
import mlflow.sklearn

# Point MLflow at the DagsHub tracking server BEFORE selecting the experiment.
# set_experiment looks the experiment up (or creates it) on whatever tracking
# store is active at call time, so calling it first on a fresh kernel would
# resolve it against the default local store instead of the remote one.
dagshub.init(repo_owner='nipkha21', repo_name='IEEE-CIS-Fraud-Detection-', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow')
mlflow.set_experiment("XGBoost_Training")

2025/04/27 21:58:56 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_Training' does not exist. Creating a new experiment.


In [32]:
# import pandas as pd
# from sklearn.model_selection import train_test_split

# train_transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
# train_identity = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')


# full_data = train_transaction.merge(train_identity, on='TransactionID', how='left')
# X_train, X_test, y_train, y_test = train_test_split(full_data.drop('isFraud',axis=1), full_data['isFraud'], test_size=0.2, random_state=1, stratify=full_data['isFraud'], shuffle=True)

In [35]:
# from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [36]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a fixed list of columns from a DataFrame.

    Parameters
    ----------
    columns_to_drop : list
        Names of the columns to remove in ``transform``.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        Columns that are already absent are skipped rather than raising
        ``KeyError``: earlier notebook cells drop some of these columns from
        ``X_train`` in place, so the input may or may not still contain them
        depending on which cells have run.
        """
        return X.drop(columns=self.columns_to_drop, errors='ignore')

In [37]:
# Encode the categorical columns and pass every other column through unchanged.
# handle_unknown='ignore' makes a category that was not seen during fit encode
# as all zeros instead of raising at predict time on new data.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), one_hot_columns),
        ('freqenc', FrequencyEncoder(), frequency_enc_columns)
    ],
    remainder='passthrough'
)

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

In [44]:
# Baseline classifier with a fixed seed.
xgboost_model = xgb.XGBClassifier(
    max_depth=5,
    scale_pos_weight=3.6,
    random_state=4,
)

# End-to-end pipeline: drop the unwanted columns, encode the categoricals, fit the model.
pipeline_steps = [
    ('drop_columns', ColumnDropper(columns_to_drop=all_columns_to_drop)),
    ('preprocessing', preprocessor),
    ('model', xgboost_model),
]
full_pipeline = Pipeline(steps=pipeline_steps)

In [45]:
# Fit the baseline pipeline inside an MLflow run and record the model, its
# train/test ROC AUC and the column configuration that produced it.
with mlflow.start_run(run_name="xgboost_initial"):
    full_pipeline.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive (fraud) class.
    y_pred_prob_train = full_pipeline.predict_proba(X_train)[:, 1]
    train_roc_auc = roc_auc_score(y_train, y_pred_prob_train)

    y_pred_prob_test = full_pipeline.predict_proba(X_test)[:, 1]
    test_roc_auc = roc_auc_score(y_test, y_pred_prob_test)

    mlflow.sklearn.log_model(full_pipeline, artifact_path="model")
    mlflow.log_metrics({
        "train_roc_auc": train_roc_auc,
        "test_roc_auc": test_roc_auc,
    })
    mlflow.log_params({
        "columns_to_drop": all_columns_to_drop,
        "one_hot_columns": one_hot_columns,
        "frequency_enc_columns": frequency_enc_columns,
    })



🏃 View run xgboost_initial at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/aa82c212ae7c40c388ffff00d2337b08
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1


In [46]:
# Hand-picked XGBoost configurations; each one is trained and tracked as its own MLflow run.
hyperparameter_combinations = [
    {"learning_rate": 0.01, "n_estimators": 200, "max_depth": 3, "subsample": 0.8, "colsample_bytree": 0.8, "scale_pos_weight":3.4},
    {"learning_rate": 0.1, "n_estimators": 150, "max_depth": 4, "subsample": 0.85, "colsample_bytree": 0.9, "scale_pos_weight":4.2},
    {"learning_rate": 0.05, "n_estimators": 300, "max_depth": 5, "subsample": 0.9, "colsample_bytree": 0.8, "scale_pos_weight":3.9},
    {"learning_rate": 0.1, "n_estimators": 200, "max_depth": 6, "subsample": 0.7, "colsample_bytree": 0.8, "scale_pos_weight":2.5},
    {"learning_rate": 0.1, "n_estimators": 100, "max_depth": 4, "subsample": 0.8, "colsample_bytree": 0.7, "scale_pos_weight":1.4},
    {"learning_rate": 0.05, "n_estimators": 500, "max_depth": 5, "subsample": 0.75, "colsample_bytree": 0.9, "scale_pos_weight":3},
    {"learning_rate": 0.02, "n_estimators": 250, "max_depth": 3, "subsample": 0.85, "colsample_bytree": 0.7, "scale_pos_weight":9.6},
    {"learning_rate": 0.1, "n_estimators": 350, "max_depth": 5, "subsample": 0.9, "colsample_bytree": 0.85, "scale_pos_weight":3.1},
    {"learning_rate": 0.03, "n_estimators": 450, "max_depth": 6, "subsample": 0.7, "colsample_bytree": 0.75, "scale_pos_weight":7.2},
    {"learning_rate": 0.04, "n_estimators": 200, "max_depth": 4, "subsample": 0.95, "colsample_bytree": 0.8, "scale_pos_weight":2.4},
]

for idx, hyperparameters in enumerate(hyperparameter_combinations):
    with mlflow.start_run(run_name=f"xgb_run_{idx+1}"):
        xgboost_model = xgb.XGBClassifier(**hyperparameters)
        full_pipeline = Pipeline(steps=[
                ('drop_columns', ColumnDropper(columns_to_drop=all_columns_to_drop)),
                ('preprocessing', preprocessor),
                ('model', xgboost_model)
            ])
        full_pipeline.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of the positive (fraud) class.
        y_pred_prob_train = full_pipeline.predict_proba(X_train)[:, 1]
        y_pred_prob_test = full_pipeline.predict_proba(X_test)[:, 1]

        train_roc_auc = roc_auc_score(y_train, y_pred_prob_train)

        test_roc_auc = roc_auc_score(y_test, y_pred_prob_test)

        mlflow.sklearn.log_model(full_pipeline, artifact_path="model")
        mlflow.log_metric("train_roc_auc", train_roc_auc)
        mlflow.log_metric("test_roc_auc", test_roc_auc)

        # The hyperparameters are the only thing that differs between these runs;
        # log them as params so runs can be compared and filtered in the MLflow UI.
        mlflow.log_params(hyperparameters)
        mlflow.log_param("columns_to_drop", all_columns_to_drop)
        mlflow.log_param("one_hot_columns", one_hot_columns)
        mlflow.log_param("frequency_enc_columns", frequency_enc_columns)



🏃 View run xgb_run_1 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/f86b68c263a94ce5985e993e1b2bf3f5
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1




🏃 View run xgb_run_2 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/6cc7306c06c7432e9af56baf920247c6
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1




🏃 View run xgb_run_3 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/5b993cc46ab24330a094d6abd77fe70e
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1




🏃 View run xgb_run_4 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/21604132652d403d8afdb2f5fbe336fe
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1




🏃 View run xgb_run_5 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/353ead5bb5ad449ea283e0d3ab7a06ab
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1




🏃 View run xgb_run_6 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/e89b8564e9a547fbac87e2d6c9f95f56
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1




🏃 View run xgb_run_7 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/31ec44a8a93949b9aa90c1fb82d478c8
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1




🏃 View run xgb_run_8 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/96fc2d255da04bd9ae3f739755f34633
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1




🏃 View run xgb_run_9 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/a457fafc457143d489b80dd8f26cb1ff
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1




🏃 View run xgb_run_10 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1/runs/56a32457e05448329e876f66c34b5679
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/1
