# Cleaning and Feature Selection

In [23]:
# Read the competition data: transaction and identity tables for the train and test sets.
import pandas as pd
import numpy as np

# The four CSV files are read in the order listed and unpacked positionally.
train_transaction, train_identity, test_transaction, test_identity = map(
    pd.read_csv,
    (
        '../input/ieee-fraud-detection/train_transaction.csv',
        '../input/ieee-fraud-detection/train_identity.csv',
        '../input/ieee-fraud-detection/test_transaction.csv',
        '../input/ieee-fraud-detection/test_identity.csv',
    ),
)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [None]:
# Display (rows, columns) of the raw training transaction table.
train_transaction.shape

In [None]:
# Display (rows, columns) of the raw training identity table.
train_identity.shape

In [None]:
# Left join on TransactionID: every transaction row is kept, and transactions
# without a matching identity row get NaN in the identity columns.
full_data = pd.merge(
    train_transaction,
    train_identity,
    how='left',
    on='TransactionID',
)

In [None]:
# Display the relative frequency of each isFraud label (fractions summing to 1).
full_data['isFraud'].value_counts(normalize=True)

მონაცემებს ვყოფთ სატრენინგო და სატესტო ნაწილებად, რათა ავირიდოთ მიკერძოება, რომელსაც გამოიწვევს მთელი სიმრავლიდან მიღებული გადაწყვეტილებები. იმისათვის, რომ, მაგალითად, სწორად შევარჩიოთ, რომელი სვეტი წაიშალოს, გადაწყვეტილებას მივიღებთ მხოლოდ სატრენინგო სიმრავლიდან.

In [None]:
# Stratified 80/20 hold-out split with a fixed seed. The target column is
# removed from the features and passed separately as the label.
X_train, X_test, y_train, y_test = train_test_split(
    full_data.drop(columns='isFraud'),
    full_data['isFraud'],
    test_size=0.2,
    shuffle=True,
    stratify=full_data['isFraud'],
    random_state=1,
)

In [None]:
# Drop columns in which at least 80% of the training rows are missing.
# The mean of the boolean null mask is the fraction of missing values per column.
null_pcts = X_train.isnull().mean()
null_columns = null_pcts[null_pcts >= 0.8].index.tolist()
X_train = X_train.drop(columns=null_columns)

In [None]:
# Drop features whose absolute correlation with another feature exceeds 0.9.
# Only non-object columns take part, and TransactionID is excluded.
corr_matrix = (
    X_train
    .drop(columns=['TransactionID'])
    .select_dtypes(exclude='object')
    .corr()
    .abs()
)

# Keep only the strict upper triangle (k=1): each pair is inspected once and
# the diagonal is masked out.
upper_triangle_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_triangle_mask)

# A column is dropped when it correlates above 0.9 with any column that
# precedes it in the matrix.
exceeds_threshold = (upper > 0.9).any(axis=0).to_numpy()
correlated_columns_to_drop = upper.columns[exceeds_threshold].tolist()

X_train = X_train.drop(correlated_columns_to_drop, axis=1)

In [None]:
# Univariate filter: each numeric feature is scored by the ROC AUC obtained when
# its raw value is used directly as the fraud score (the feature itself, not a
# model). The top 85% of features by that score are kept; the bottom 15% are dropped.
#
# NOTE(review): the ranking uses the raw AUC, so a feature with AUC well below
# 0.5 lands at the bottom even though it separates the classes as well as its
# mirror image above 0.5. Ranking by abs(auc - 0.5) may be what was intended;
# confirm before changing, because the hard-coded all_columns_to_drop list
# further down was derived from the current ranking.
numerical_cols = X_train.drop(['TransactionID'], axis=1).select_dtypes(exclude='object').columns


def _non_null_roc_auc(feature, target):
    """Return the ROC AUC of `feature` against `target` on rows where `feature` is not null.

    Only the single column is filtered, instead of copying the whole frame
    with DataFrame.dropna twice per column.
    """
    observed = feature.dropna()
    return roc_auc_score(target.loc[observed.index], observed)


roc_auc_features = [_non_null_roc_auc(X_train[col], y_train) for col in numerical_cols]

n_features_to_keep = int(len(roc_auc_features) * 0.85)
columns_to_keep = (
    pd.DataFrame(roc_auc_features, index=numerical_cols, columns=['ROC_AUC'])
    .sort_values(by='ROC_AUC', ascending=False)
    .iloc[:n_features_to_keep]
    .index
)

# Built in column order (not via set difference) so the list is identical on every run.
_kept = set(columns_to_keep)
low_roc_auc_columns_to_drop = [col for col in numerical_cols if col not in _kept]
X_train = X_train.drop(low_roc_auc_columns_to_drop, axis=1)

# Feature Engineering

In [None]:
# Display the number of distinct non-null values in each object-dtype column of X_train.
X_train.select_dtypes(include='object').nunique()

მაღალი უნიკალური რაოდენობის მქონე სვეტები frequency encoding-ით გარდაიქმნება, ხოლო დაბალი უნიკალური რაოდენობის მქონე სვეტები one-hot encoding-ით. შევქმნათ ჩვენი FrequencyEncoder კლასი. OneHotEncoder-ს კი scikit-learn-დან გამოვიყენებთ. მთლიანი გარდაქმნის ეტაპი ქვევითაა მოცემული pipeline-ში.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace each value of every column with its relative frequency seen in `fit`.

    `fit` stores, per column, a mapping from value to its share of the non-null
    rows (``value_counts(normalize=True)``). `transform` applies that mapping;
    values with no entry in the mapping (unseen values and missing values)
    become 0.

    Attributes
    ----------
    freq_dicts : dict
        ``{column name: {value: relative frequency}}`` learned by the most
        recent call to `fit`.
    """

    def __init__(self):
        # Starts empty so `transform` before `fit` maps every value to 0
        # (kept from the original behaviour).
        self.freq_dicts = {}

    def fit(self, X, y=None):
        """Learn the value -> relative-frequency mapping for every column of DataFrame `X`.

        The mapping is rebuilt from scratch on every call, so refitting on
        different columns does not leave stale entries from an earlier fit.
        `y` is ignored. Returns `self`.
        """
        self.freq_dicts = {
            col: X[col].value_counts(normalize=True).to_dict()
            for col in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of `X` with every column replaced by its learned frequencies.

        Columns that were not present during `fit` have no mapping, so all of
        their values become 0.
        """
        X_transformed = X.copy()
        for col in X.columns:
            freq_map = self.freq_dicts.get(col, {})
            # map() yields NaN for values missing from the mapping; those become 0.
            X_transformed[col] = X[col].map(freq_map).fillna(0)
        return X_transformed

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split

# train_transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
# train_identity = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')


# full_data = train_transaction.merge(train_identity, on='TransactionID', how='left')
# X_train, X_test, y_train, y_test = train_test_split(full_data.drop('isFraud',axis=1), full_data['isFraud'], test_size=0.2, random_state=1, stratify=full_data['isFraud'], shuffle=True)

In [None]:
# Hard-coded list of columns removed before modelling (it also contains the
# TransactionID identifier). Per the author's note at the end of the list, it
# is the union of the three selection lists computed in the feature-selection
# cells above.
all_columns_to_drop = ['TransactionID','dist2','D6','D7','D8','D9','D12','D13','D14','V138','V139','V140','V141','V142','V143','V144','V145','V146','V147','V148','V149','V150',
 'V151','V152','V153','V154','V155','V156','V157','V158','V159','V160','V161','V162','V163','V164','V165','V166','V322','V323',
 'V324','V325','V326','V327','V328','V329','V330','V331','V332','V333','V334','V335','V336','V337','V338','V339','id_03','id_04',
 'id_07','id_08','id_09','id_10','id_14','id_18','id_21','id_22','id_23','id_24','id_25','id_26','id_27','id_30','id_32','id_33','id_34',
'C2','C4','C6','C7','C8','C9','C10','C11','C12','C14','D2','V5','V11','V13','V16','V18','V20','V21','V22','V28','V30','V31','V32',
 'V33','V34','V36','V40','V42','V43','V45','V48','V49','V50','V51','V52','V54','V57','V58','V59','V60','V63','V64','V65','V68','V69',
 'V70','V71','V72','V73','V74','V76','V79','V80','V81','V84','V85','V88','V89','V90','V91','V92','V93','V94','V96','V97','V101','V102','V103',
 'V105','V106','V113','V126','V127','V128','V132','V133','V134','V137','V167','V168','V177','V178','V179','V182','V183','V190','V192',
 'V193','V196','V197','V198','V199','V201','V202','V203','V204','V206','V211','V212','V213','V216','V217','V218','V219','V222','V225',
 'V231','V232','V233','V235','V236','V237','V239','V243','V244','V245','V249','V251','V253','V254','V256','V257','V259','V263','V265',
 'V266','V269','V271','V272','V273','V274','V275','V276','V277','V278','V279','V280','V292','V293','V294','V295','V296','V297','V298',
 'V299','V301','V302','V304','V306','V307','V308','V309','V311','V312','V315','V316','V317','V318','V319','V321',
 'V173','D15','D3','V75','card1','id_19','V130','D11','V285','V35','V53','id_01','V29','card5','D4','V99','C13','V310','id_06','V10','C5','D1',
 'D10','V12','id_13','D5']  # union of the null_columns, correlated_columns_to_drop and low_roc_auc_columns_to_drop lists

# Categorical columns with many distinct values: encoded by relative frequency.
frequency_enc_columns = ['P_emaildomain', 'R_emaildomain', 'id_31', 'DeviceInfo']

# Categorical columns with few distinct values: one-hot encoded.
# Each name must appear exactly once. A repeated name makes the column
# transformer select that column twice and emit duplicate one-hot features
# ('id_15' was previously listed twice).
one_hot_columns = ['ProductCD', 'card4', 'card6', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16',
                    'id_28', 'id_29', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType']

# Training

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install mlflow

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install dagshub

In [None]:
import xgboost as xgb
import mlflow
import dagshub
import mlflow.lightgbm
import mlflow.sklearn

# Configure the tracking server BEFORE selecting the experiment.
# set_experiment() looks up / creates the experiment in whichever tracking
# store is active when it is called. Calling it first (as before) resolves the
# experiment against the default local store on a fresh kernel, not DagsHub.
dagshub.init(repo_owner='nipkha21', repo_name='IEEE-CIS-Fraud-Detection-', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow')
mlflow.set_experiment("Random_Forest_Training")

In [None]:
# from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

In [None]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that removes a fixed list of columns from a DataFrame.

    Parameters
    ----------
    columns_to_drop : list of str
        Names of the columns to remove in `transform`.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned; returns `self` so the step can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new DataFrame without the configured columns.

        Listed columns that are not present in `X` are skipped instead of
        raising KeyError. The step is applied both to the training frame (which
        the feature-selection cells may already have pruned) and to the
        untouched hold-out frame, and must yield the same columns for both.
        """
        return X.drop(columns=self.columns_to_drop, errors='ignore')

# First version of the preprocessing step: one-hot encode the low-cardinality
# columns, frequency-encode the high-cardinality ones, and pass every other
# column through unchanged. `preprocessor` is reassigned in a later cell that
# adds an imputer for the numeric columns.
preprocessor = ColumnTransformer(
    [
        ('onehot', OneHotEncoder(sparse_output=False, drop='first'), one_hot_columns),
        ('freqenc', FrequencyEncoder(), frequency_enc_columns),
    ],
    remainder='passthrough',
)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer


In [None]:
# Columns that survive the drop list and are neither one-hot nor frequency
# encoded; they are the ones handed to the median imputer in the preprocessing step.
#  - errors='ignore': X_train may already have lost some of the listed columns
#    in the feature-selection cells, and a plain drop() would raise KeyError.
#  - The list is built in DataFrame column order instead of via set arithmetic,
#    so the feature order seen by the model is the same in every kernel session.
_encoded_columns = set(one_hot_columns) | set(frequency_enc_columns)
numeric_columns = [
    col
    for col in X_train.drop(columns=all_columns_to_drop, errors='ignore').columns
    if col not in _encoded_columns
]

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Preprocessing applied after the column drop:
#   - one-hot encoding (first level dropped) for the low-cardinality categoricals,
#   - frequency encoding for the high-cardinality categoricals,
#   - median imputation for the remaining numeric columns.
# Any column not named in a transformer is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), one_hot_columns),
        ('freqenc', FrequencyEncoder(), frequency_enc_columns),
        ('imputer', SimpleImputer(strategy='median'), numeric_columns),
    ],
    remainder='passthrough',
)

# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and therefore the metrics logged to MLflow, are reproducible.
random_forest = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=-1,
    random_state=1,
)

# End-to-end pipeline: drop columns -> encode/impute -> random forest.
full_pipeline = Pipeline(steps=[
    ('drop_columns', ColumnDropper(columns_to_drop=all_columns_to_drop)),
    ('preprocessing', preprocessor),
    ('model', random_forest),
])

In [35]:
# Baseline run: fit the full pipeline once and record the model, both ROC AUC
# scores and the column configuration in a single MLflow run.
with mlflow.start_run(run_name="random_forest_initial"):
    full_pipeline.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class (classes_[1]).
    y_pred_prob_train = full_pipeline.predict_proba(X_train)[:, 1]
    train_roc_auc = roc_auc_score(y_train, y_pred_prob_train)

    y_pred_prob_test = full_pipeline.predict_proba(X_test)[:, 1]
    test_roc_auc = roc_auc_score(y_test, y_pred_prob_test)

    # Logging order is unchanged: model artifact, then metrics, then parameters.
    mlflow.sklearn.log_model(full_pipeline, artifact_path="model")

    mlflow.log_metric("train_roc_auc", train_roc_auc)
    mlflow.log_metric("test_roc_auc", test_roc_auc)

    mlflow.log_param("columns_to_drop", all_columns_to_drop)
    mlflow.log_param("one_hot_columns", one_hot_columns)
    mlflow.log_param("frequency_enc_columns", frequency_enc_columns)



🏃 View run random_forest_initial at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/4/runs/30add64ed5a04b589e81d10e4db19802
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/4


In [43]:
import mlflow
import mlflow.sklearn

# Hand-picked hyper-parameter combinations; each one is trained and logged as
# its own MLflow run.
param_grid = [
    {"n_estimators": 100, "max_depth": 5, "min_samples_split": 2, "min_samples_leaf": 1},
    {"n_estimators": 200, "max_depth": 10, "min_samples_split": 2, "min_samples_leaf": 1},
    {"n_estimators": 200, "max_depth": 15, "min_samples_split": 2, "min_samples_leaf": 2},
    {"n_estimators": 100, "max_depth": None, "min_samples_split": 2, "min_samples_leaf": 2},
    {"n_estimators": 100, "max_depth": None, "min_samples_split": 5, "min_samples_leaf": 7},
    {"n_estimators": 100, "max_depth": 14, "min_samples_split": 5, "min_samples_leaf": 4},
]

for idx, params in enumerate(param_grid):
    # n_jobs=-1 matches the baseline forest (without it every forest in the
    # grid is trained on a single core). random_state makes each run
    # reproducible, so differences between runs come from the parameters only.
    random_forest = RandomForestClassifier(
        class_weight='balanced',
        n_jobs=-1,
        random_state=1,
        **params
    )

    # Swap only the final estimator; the drop/preprocessing steps are reused as-is.
    full_pipeline.set_params(model=random_forest)

    with mlflow.start_run(run_name=f"random_forest_run_{idx+1}"):
        full_pipeline.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of the second class (classes_[1]).
        y_pred_prob_train = full_pipeline.predict_proba(X_train)[:, 1]
        y_pred_prob_test = full_pipeline.predict_proba(X_test)[:, 1]

        train_roc_auc = roc_auc_score(y_train, y_pred_prob_train)
        test_roc_auc = roc_auc_score(y_test, y_pred_prob_test)

        mlflow.sklearn.log_model(full_pipeline, artifact_path="model")

        mlflow.log_metric("train_roc_auc", train_roc_auc)
        mlflow.log_metric("test_roc_auc", test_roc_auc)

        mlflow.log_params(params)
        mlflow.log_param("columns_to_drop", all_columns_to_drop)
        mlflow.log_param("one_hot_columns", one_hot_columns)
        mlflow.log_param("frequency_enc_columns", frequency_enc_columns)



🏃 View run random_forest_run_7 at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/4/runs/bf9e157ebc3043bd8f9b5fb76578184b
🧪 View experiment at: https://dagshub.com/nipkha21/IEEE-CIS-Fraud-Detection-.mlflow/#/experiments/4
