<a href="https://colab.research.google.com/github/mkj0331/MoonKyoungJin/blob/main/XGBoost_24_%EC%B5%9C%EC%A2%85.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Anomalous Financial Transaction Detection

본 대회의 과제는 금융 거래 데이터에서 **이상 거래를 탐지하는 기능**을 개선하고 활용도를 높이는 분류 AI 모델을 개발하는 것입니다.

특히, 클래스 불균형 문제를 해결하기 위해 오픈소스 생성형 AI 모델을 활용하여 부족한 클래스의 데이터를 보완하고, 이를 통해 분류 모델의 성능을 향상시키는 것이 핵심 목표입니다.

이러한 접근을 통해 금융보안에 특화된 데이터 분석 및 활용 역량을 강화하여 전문 인력을 양성하고, 금융권의 AI 활용 어려움에 따른 해결 방안을 함께 모색하며 금융 산업의 AI 활용 활성화를 지원하는 것을 목표로 합니다.

# Import Library

In [None]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.16.1-py3-none-any.whl.metadata (13 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.35.24-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.35.24-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.11.0 (from sdv)
  Downloading copulas-0.11.1-py3-none-any.whl.metadata (9.1 kB)
Collecting ctgan>=0.10.0 (from sdv)
  Downloading ctgan-0.10.1-py3-none-any.whl.metadata (11 kB)
Collecting deepecho>=0.6.0 (from sdv)
  Downloading deepecho-0.6.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.12.3 (from sdv)
  Downloading rdt-1.12.4-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.14.0 (from sdv)
  Downloading sdmetrics-0.15.1-py3-none-any.whl.metadata (8.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

In [None]:
# 제출 파일 생성 관련
import os
import zipfile

# 데이터 처리 및 분석
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

# 머신러닝 전처리
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# 머신러닝 모델
from xgboost import XGBClassifier

# 합성 데이터 생성
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 생성 🏭

# Load Data

In [None]:
# Read the raw train / test tables from the mounted Drive folder
train_all, test_all = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/fsi 공모전/train.csv",
        "/content/drive/MyDrive/fsi 공모전/test.csv",
    )
)

In [None]:
# Working copy of the training table without the row identifier
train = train_all.drop("ID", axis=1)

In [None]:
N_CLS_PER_GEN = 2000  # synthetic rows generated per Fraud_Type
N_SAMPLE = 100        # real rows per Fraud_Type used to fit each synthesizer

# Convert the Time_difference column to total seconds
train['Time_difference'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()

# Floor at 1 second (anything below 1 becomes 1) so the division below
# never divides by zero or by a negative duration
train['Time_difference'] = train['Time_difference'].clip(lower=1)

# velocity = Distance / Time_difference; the synthesizer learns velocity and
# Distance is rebuilt from it after sampling
train['velocity'] = train['Distance'] / train['Time_difference']

# Every Fraud_Type present in the training data
fraud_types = train['Fraud_Type'].unique()

# Explicit sdtypes overriding what metadata auto-detection guesses
# (identical for every class, so defined once outside the loop)
column_sdtypes = {
    'Account_initial_balance': 'numerical',
    'Account_balance': 'numerical',
    'Customer_identification_number': 'categorical',
    'Customer_personal_identifier': 'categorical',
    'Account_account_number': 'categorical',
    'IP_Address': 'ipv4_address',
    'Location': 'categorical',
    'Recipient_Account_Number': 'categorical',
    'Fraud_Type': 'categorical',
    'Time_difference': 'numerical',
    'Customer_Birthyear': 'numerical'
}

# One finished synthetic batch per Fraud_Type; concatenated once after the loop
synthetic_batches = []

# Fit one CTGAN per Fraud_Type and sample from it
for fraud_type in tqdm(fraud_types):

    # Rows of this Fraud_Type only
    subset = train[train["Fraud_Type"] == fraud_type]

    # Use N_SAMPLE rows per class (or all of them when the class is smaller)
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    # Build the metadata for this subset
    metadata = SingleTableMetadata()

    metadata.detect_from_dataframe(subset)
    metadata.set_primary_key(None)

    for column, sdtype in column_sdtypes.items():
        metadata.update_column(
            column_name=column,
            sdtype=sdtype
        )

    synthesizer = CTGANSynthesizer(
                            metadata,
                            epochs=100
                        )
    synthesizer.fit(subset)

    synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN)

    # Restore Distance = velocity * Time_difference on THIS batch only.
    # Doing it on the accumulated frame would overwrite the Distance of
    # earlier batches with NaN, because their velocity column was already
    # dropped.
    synthetic_subset['Distance'] = synthetic_subset['velocity'] * synthetic_subset['Time_difference']

    # velocity was only a helper column for generation
    synthetic_batches.append(synthetic_subset.drop(columns='velocity'))

# Combine all batches in a single concat
all_synthetic_data = (
    pd.concat(synthetic_batches, ignore_index=True)
    if synthetic_batches
    else pd.DataFrame()
)

# Final result check
print("\nFinal All Synthetic Data Shape:", all_synthetic_data.shape)

100%|██████████| 13/13 [02:15<00:00, 10.41s/it]


Final All Synthetic Data Shape: (26000, 63)





In [None]:
# Number of generated rows per Fraud_Type
all_synthetic_data['Fraud_Type'].value_counts()

Unnamed: 0_level_0,count
Fraud_Type,Unnamed: 1_level_1
m,2000
a,2000
j,2000
h,2000
k,2000
c,2000
g,2000
i,2000
b,2000
f,2000


In [None]:
# Shape of the training frame (it now carries the extra velocity column)
train.shape

(120000, 64)

In [None]:
# Work on a copy so the raw generator output stays available
df = all_synthetic_data.copy()

In [None]:
# Remove synthetic rows whose channel / roaming / OS combination is inconsistent.

# Roaming flag is only kept on the mobile channel
keep_roaming = (df['Channel'] == 'mobile') | (df['Customer_mobile_roaming_indicator'] != 1)
df = df[keep_roaming]

# Operating systems rejected on the 'Others' and 'ATM' channels
rejected_fixed_channel_os = ['iOS','Android','Linux','macOS']

# 'Others' channel
exclude_condition1 = df['Operating_System'].isin(rejected_fixed_channel_os) & (df['Channel'] == 'Others')
df = df[~exclude_condition1]

# 'ATM' channel
exclude_condition2 = df['Operating_System'].isin(rejected_fixed_channel_os) & (df['Channel'] == 'ATM')
df = df[~exclude_condition2]

# 'mobile' channel: reject Windows, Linux and macOS
exclude_condition3 = df['Operating_System'].isin(['Windows','Linux','macOS']) & (df['Channel'] == 'mobile')
df = df[~exclude_condition3]

# 'internet' channel: reject iOS and Android
exclude_condition4 = df['Operating_System'].isin(['iOS','Android']) & (df['Channel'] == 'internet')
df = df[~exclude_condition4]

In [None]:
# Cap each Fraud_Type at 1000 rows (classes with fewer rows are kept whole).
# Groups are iterated explicitly instead of using groupby.apply: apply acting
# on the grouping column is deprecated in recent pandas, and once the grouping
# column is excluded, reset_index(drop=True) would discard Fraud_Type entirely.
sampled_groups = [
    group.sample(n=1000, random_state=42) if len(group) >= 1000 else group
    for _, group in df.groupby('Fraud_Type')
]
# An empty input yields an empty frame with the same columns
synthetic_data = (
    pd.concat(sampled_groups, ignore_index=True)
    if sampled_groups
    else df.iloc[0:0].reset_index(drop=True)
)

In [None]:
# Author's note: about 17 rows per 1000, similar to the ratio in the original data.
# NOTE(review): the recorded output of this cell lists only class c with a count of 4 — confirm the figure.
# The comparison is a string comparison against '2013-01-01'.
synthetic_data[synthetic_data['Customer_registration_datetime'] > '2013-01-01']['Fraud_Type'].value_counts()

Unnamed: 0_level_0,count
Fraud_Type,Unnamed: 1_level_1
c,4


## 원본 데이터와 concat

In [None]:
origin_train = train_all.drop(columns="ID")

# Time_difference -> seconds, floored at 1 (same treatment the generator input received)
elapsed_seconds = pd.to_timedelta(origin_train['Time_difference']).dt.total_seconds()
origin_train['Time_difference'] = elapsed_seconds.clip(lower=1)

# Stack the original rows on top of the synthetic rows
frames_to_merge = [origin_train, synthetic_data]
train_total = pd.concat(frames_to_merge)
train_total.shape

(133000, 63)

In [None]:
# Independent working copies for feature engineering
train_df, test_data = train_total.copy(), test_all.copy()

In [None]:
# Rows whose Fraud_Type is 'm'
type_m = train_df[train_df['Fraud_Type'] == 'm']

# Down-sample class 'm' to 1100 rows
m_sample = type_m.sample(n=1100, random_state=42)

# All remaining Fraud_Types
other_types = train_df[train_df['Fraud_Type'] != 'm']

# Merge: every non-'m' class is included three times (oversampling by replication).
# NOTE(review): these exact duplicates must not be split across train/validation
# folds, otherwise cross-validation scores are inflated.
train_data = pd.concat([m_sample, other_types, other_types, other_types])

# Shuffle the rows; seeded like every other random step in this notebook so
# fold assignment and the fitted model are reproducible between runs
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Class distribution after rebalancing
train_data['Fraud_Type'].value_counts()

Unnamed: 0_level_0,count
Fraud_Type,Unnamed: 1_level_1
e,3300
c,3300
i,3300
j,3300
l,3300
k,3300
f,3300
a,3300
h,3300
d,3300


In [None]:
# Shape of the test frame before feature engineering
test_data.shape

(120000, 63)

In [None]:
# Remove the raw datetime columns from both train and test
datetime_columns = [
    'Account_creation_datetime',
    'Transaction_Datetime',
    'Last_atm_transaction_datetime',
    'Last_bank_branch_transaction_datetime',
    'Transaction_resumed_date',
]

train_data.drop(columns=datetime_columns, inplace=True)
test_data.drop(columns=datetime_columns, inplace=True)

In [None]:
# Turn the registration datetime into a 0/1 flag: 1 when the value compares
# greater than '2013-01-01', else 0
for frame in (train_data, test_data):
    registered_after = frame['Customer_registration_datetime'] > '2013-01-01'
    frame['Customer_registration_datetime'] = registered_after.astype(int)

In [None]:
# Convert the test Time_difference to total seconds
test_data['Time_difference'] = pd.to_timedelta(test_data['Time_difference']).dt.total_seconds()
# Floor at 1 second: the training data is clipped with lower=1, so the test
# data must use the same floor for the feature to be on the same scale
test_data['Time_difference'] = test_data['Time_difference'].clip(lower=1)

In [None]:
# Keep only the first space-separated token of Location (the province-level part)
train_data['Location'] = train_data['Location'].map(lambda place: place.split(' ', 1)[0])
test_data['Location'] = test_data['Location'].map(lambda place: place.split(' ', 1)[0])

In [None]:
# Express the daily transfer limit in whole millions so it behaves like an ordinal category
train_data['Account_amount_daily_limit'] = train_data['Account_amount_daily_limit'].div(1000000).astype(int)
test_data['Account_amount_daily_limit'] = test_data['Account_amount_daily_limit'].div(1000000).astype(int)

In [None]:
# Keep only the first two dot-separated parts of the IP address (could be reduced to one)
train_data['IP_Address'] = train_data['IP_Address'].map(lambda ip: '.'.join(ip.split('.', 2)[:2]))
test_data['IP_Address'] = test_data['IP_Address'].map(lambda ip: '.'.join(ip.split('.', 2)[:2]))

In [None]:
# Remove columns described by the author as having one (or almost one) unique value
near_constant_columns = [
    'Another_Person_Account',
    'Account_indicator_Openbanking',
    'First_time_iOS_by_vulnerable_user',
]

train_data.drop(columns=near_constant_columns, inplace=True)
test_data.drop(columns=near_constant_columns, inplace=True)

In [None]:
# Column judged safe to remove by the author: the personal identifier (a name)
train_data.drop(columns='Customer_personal_identifier', inplace=True)
test_data.drop(columns='Customer_personal_identifier', inplace=True)

# Data Preprocessing 1 : Select x, y

In [None]:
# Target column, then the remaining columns as features
train_y = train_data['Fraud_Type']
train_x = train_data.drop('Fraud_Type', axis=1)

# Test features: everything except the row identifier
test_x = test_data.drop('ID', axis=1)

# Data Preprocessing 2 : 범주형 변수 인코딩

In [None]:
# Fit the label encoder on the target, then encode the target with it
le_subclass = LabelEncoder().fit(train_y)
train_y_encoded = le_subclass.transform(train_y)

# Show which integer each original label was mapped to
for i, label in enumerate(le_subclass.classes_):
    mapping_line = f"원래 레이블: {label}, 변환된 숫자: {i}"
    print(mapping_line)

원래 레이블: a, 변환된 숫자: 0
원래 레이블: b, 변환된 숫자: 1
원래 레이블: c, 변환된 숫자: 2
원래 레이블: d, 변환된 숫자: 3
원래 레이블: e, 변환된 숫자: 4
원래 레이블: f, 변환된 숫자: 5
원래 레이블: g, 변환된 숫자: 6
원래 레이블: h, 변환된 숫자: 7
원래 레이블: i, 변환된 숫자: 8
원래 레이블: j, 변환된 숫자: 9
원래 레이블: k, 변환된 숫자: 10
원래 레이블: l, 변환된 숫자: 11
원래 레이블: m, 변환된 숫자: 12


In [None]:
# Columns that get one-hot encoding
one_hot_columns = ['Customer_loan_type']

# Encode on a copy of the training features
train_x_encoded = train_x.copy()

# Every other object/category column gets ordinal encoding instead
object_like_columns = train_x_encoded.select_dtypes(include=['object', 'category']).columns
categorical_columns = object_like_columns.difference(one_hot_columns)

# Ordinal encoding; categories unseen at fit time are mapped to -1 at transform time
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_codes = ordinal_encoder.fit_transform(train_x_encoded[categorical_columns])
train_x_encoded[categorical_columns] = ordinal_codes

# One-hot encoding, dropping the first level of each encoded column
train_x_encoded = pd.get_dummies(train_x_encoded, columns=one_hot_columns, drop_first=True)

# Remember the feature order
feature_order = list(train_x_encoded.columns)

In [None]:
# Shape of the encoded training feature matrix
train_x_encoded.shape

(40700, 56)

In [None]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

In [None]:
X = train_x_encoded[feature_order]
y = train_y_encoded


# Hyperparameter tuning objective for Optuna
def objective(trial):
    """Return the mean macro-F1 of an XGBoost classifier over 5-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyperparameters.

    Returns
    -------
    float
        Average macro-F1 across the validation folds (higher is better).
    """
    # Local import so this cell does not depend on an edit to the import cell
    from sklearn.model_selection import StratifiedGroupKFold

    # XGBoost hyperparameter search space
    xgb_params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
        'max_depth': trial.suggest_int('max_depth', 4, 30),
        'gamma': trial.suggest_float('gamma', 0.0, 0.9),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 30),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1.0),  # L2 regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1.0),  # L1 regularization
        # NOTE(review): scale_pos_weight is documented for binary problems;
        # confirm it has any effect on this multi-class task.
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0),
        'max_delta_step': trial.suggest_float('max_delta_step', 0, 10),
    }

    # The training set contains exact copies of rows (the non-'m' classes are
    # replicated upstream). Rows with identical feature values get the same
    # group id so that copies never land on both sides of a split; otherwise
    # the validation score mostly measures memorisation.
    row_groups = pd.util.hash_pandas_object(X, index=False).to_numpy()

    # Stratified, group-aware 5-fold cross-validation
    kf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []

    for train_index, val_index in kf.split(X, y, groups=row_groups):
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        # A fresh classifier per fold. `class_weight` is intentionally not
        # passed: it is not an XGBoost parameter and is ignored with a
        # warning. Class balance is handled by the resampling done upstream
        # (use fit(sample_weight=...) if weighting is wanted).
        xgb = XGBClassifier(**xgb_params,
                            device='cuda',
                            random_state=42)
        xgb.fit(X_train_fold, y_train_fold, verbose=False)

        # Score the fold with macro-averaged F1
        y_pred = xgb.predict(X_val_fold)
        f1 = f1_score(y_val_fold, y_pred, average='macro')
        f1_scores.append(f1)

    # Mean cross-validation score
    mean_score = sum(f1_scores) / len(f1_scores)

    return mean_score

# Fixed-seed TPE sampler so the search sequence is repeatable
sampler = optuna.samplers.TPESampler(seed=42)

# Run the search: 200 trials, maximising the objective
study2 = optuna.create_study(sampler=sampler, direction='maximize')
study2.optimize(objective, n_trials=200)

# Report the best configuration found
best_params = study2.best_params
print("Best hyperparameters:", best_params)

[I 2024-08-28 14:25:13,388] A new study created in memory with name: no-name-13073d8a-373c-4342-aec5-a8e4b4f12cce
[I 2024-08-28 14:27:41,343] Trial 0 finished with value: 0.9923317916700498 and parameters: {'learning_rate': 0.04370861069626263, 'n_estimators': 1912, 'max_depth': 23, 'gamma': 0.5387926357773329, 'min_child_weight': 5.524540572830659, 'subsample': 0.32479561626896214, 'colsample_bytree': 0.5290418060840998, 'reg_lambda': 0.8663099696291603, 'reg_alpha': 0.6015138967314656, 'scale_pos_weight': 7.372653200164409, 'max_delta_step': 0.20584494295802447}. Best is trial 0 with value: 0.9923317916700498.
[I 2024-08-28 14:29:26,816] Trial 1 finished with value: 0.9950603226493719 and parameters: {'learning_rate': 0.0972918866945795, 'n_estimators': 1699, 'max_depth': 9, 'gamma': 0.16364247048639055, 'min_child_weight': 6.3187307857495805, 'subsample': 0.4433937943676302, 'colsample_bytree': 0.762378215816119, 'reg_lambda': 0.43251307362347363, 'reg_alpha': 0.2919379110578439, 's

[I 2024-08-28 14:51:01,682] Trial 16 finished with value: 0.9956638751447582 and parameters: {'learning_rate': 0.08107630501300286, 'n_estimators': 1516, 'max_depth': 28, 'gamma': 0.38232476113715913, 'min_child_weight': 10.480180004574486, 'subsample': 0.8949934195833377, 'colsample_bytree': 0.5766561179278353, 'reg_lambda': 0.14521569599048656, 'reg_alpha': 0.04081575960516835, 'scale_pos_weight': 1.003521452598798, 'max_delta_step': 9.776871518860196}. Best is trial 12 with value: 0.9969971241251224.
[I 2024-08-28 14:52:40,635] Trial 17 finished with value: 0.9920778175384921 and parameters: {'learning_rate': 0.055131131388013004, 'n_estimators': 1215, 'max_depth': 20, 'gamma': 0.18766589802984435, 'min_child_weight': 8.928657722615005, 'subsample': 0.20980473423481372, 'colsample_bytree': 0.8366733523272174, 'reg_lambda': 0.9912859189474561, 'reg_alpha': 0.17959932103282444, 'scale_pos_weight': 2.2890041635998633, 'max_delta_step': 2.9683677471242937}. Best is trial 12 with value: 

[I 2024-08-28 15:21:35,890] Trial 33 finished with value: 0.9966705890153594 and parameters: {'learning_rate': 0.07151978930757344, 'n_estimators': 1575, 'max_depth': 27, 'gamma': 0.10207271549457508, 'min_child_weight': 2.752705964569417, 'subsample': 0.8705700577903196, 'colsample_bytree': 0.5956891284569581, 'reg_lambda': 0.27696381861769426, 'reg_alpha': 0.16683242585105904, 'scale_pos_weight': 1.5269875408377835, 'max_delta_step': 7.586814576991795}. Best is trial 12 with value: 0.9969971241251224.
[I 2024-08-28 15:23:19,253] Trial 34 finished with value: 0.9958302335048052 and parameters: {'learning_rate': 0.040097213920907725, 'n_estimators': 1375, 'max_depth': 29, 'gamma': 0.3353518851359859, 'min_child_weight': 5.65398020881627, 'subsample': 0.7232642027693708, 'colsample_bytree': 0.7859671959510122, 'reg_lambda': 0.41093601413556197, 'reg_alpha': 0.001863459406494739, 'scale_pos_weight': 2.1776798284073546, 'max_delta_step': 8.561191732278907}. Best is trial 12 with value: 0.

[I 2024-08-28 15:54:41,271] Trial 50 finished with value: 0.9957242164781732 and parameters: {'learning_rate': 0.08381853195407346, 'n_estimators': 1496, 'max_depth': 4, 'gamma': 0.26410410561859526, 'min_child_weight': 1.1195713921396255, 'subsample': 0.7735302472214554, 'colsample_bytree': 0.5174176337591558, 'reg_lambda': 0.2918042986519578, 'reg_alpha': 0.14466719203091102, 'scale_pos_weight': 2.233751127623563, 'max_delta_step': 1.6591631045550792}. Best is trial 12 with value: 0.9969971241251224.
[I 2024-08-28 15:55:52,941] Trial 51 finished with value: 0.9950176518826837 and parameters: {'learning_rate': 0.06886434940381944, 'n_estimators': 1110, 'max_depth': 30, 'gamma': 0.8986828190179268, 'min_child_weight': 4.072072017383184, 'subsample': 0.8656714775370659, 'colsample_bytree': 0.7025558566030652, 'reg_lambda': 0.11156491076945235, 'reg_alpha': 0.6751104391614813, 'scale_pos_weight': 1.2308931620363737, 'max_delta_step': 9.981483196709153}. Best is trial 12 with value: 0.996

[I 2024-08-28 16:25:52,241] Trial 67 finished with value: 0.9955148743283064 and parameters: {'learning_rate': 0.07478995298902463, 'n_estimators': 1839, 'max_depth': 25, 'gamma': 0.2613552055756396, 'min_child_weight': 12.692486366625788, 'subsample': 0.8016389872887526, 'colsample_bytree': 0.6029484604546204, 'reg_lambda': 0.06793890024923045, 'reg_alpha': 0.24285395287094835, 'scale_pos_weight': 1.012494125633359, 'max_delta_step': 9.21506182233648}. Best is trial 54 with value: 0.9970081595993433.
[I 2024-08-28 16:26:41,333] Trial 68 finished with value: 0.9960722111018768 and parameters: {'learning_rate': 0.08646095716124884, 'n_estimators': 507, 'max_depth': 28, 'gamma': 0.10851720786706873, 'min_child_weight': 4.557900045753485, 'subsample': 0.7543007461865636, 'colsample_bytree': 0.7041335010779479, 'reg_lambda': 0.14355622662469014, 'reg_alpha': 0.052808545363822644, 'scale_pos_weight': 3.3393590057180473, 'max_delta_step': 7.309068135142363}. Best is trial 54 with value: 0.99

[I 2024-08-28 16:59:03,812] Trial 84 finished with value: 0.9968665225291866 and parameters: {'learning_rate': 0.042009572013753146, 'n_estimators': 1906, 'max_depth': 26, 'gamma': 0.28432857507849635, 'min_child_weight': 2.8071209124640184, 'subsample': 0.9504376950663768, 'colsample_bytree': 0.5248519889258145, 'reg_lambda': 0.34932201049574624, 'reg_alpha': 0.025319229856391557, 'scale_pos_weight': 9.649885929541455, 'max_delta_step': 7.436792516016767}. Best is trial 54 with value: 0.9970081595993433.
[I 2024-08-28 17:01:58,803] Trial 85 finished with value: 0.9966039117595337 and parameters: {'learning_rate': 0.04331669523820592, 'n_estimators': 1889, 'max_depth': 25, 'gamma': 0.29159526093268895, 'min_child_weight': 2.704756980859953, 'subsample': 0.9475472656876068, 'colsample_bytree': 0.5180457259147133, 'reg_lambda': 0.33439774283777945, 'reg_alpha': 0.019970074031703183, 'scale_pos_weight': 9.852517022145703, 'max_delta_step': 8.5489077418906}. Best is trial 54 with value: 0.

[I 2024-08-28 17:47:09,865] Trial 101 finished with value: 0.996763397166348 and parameters: {'learning_rate': 0.04342483181664214, 'n_estimators': 1638, 'max_depth': 26, 'gamma': 0.21596258547481156, 'min_child_weight': 1.6741193186617667, 'subsample': 0.951505284975448, 'colsample_bytree': 0.5691090994835536, 'reg_lambda': 0.4714502586343364, 'reg_alpha': 0.04265271542654893, 'scale_pos_weight': 8.101700528773605, 'max_delta_step': 2.5578283201543535}. Best is trial 54 with value: 0.9970081595993433.
[I 2024-08-28 17:49:03,673] Trial 102 finished with value: 0.9965884374564655 and parameters: {'learning_rate': 0.051363605349680444, 'n_estimators': 1600, 'max_depth': 9, 'gamma': 0.07724190675053089, 'min_child_weight': 1.5320373576672561, 'subsample': 0.916131693209753, 'colsample_bytree': 0.5247989018909525, 'reg_lambda': 0.40102724601370343, 'reg_alpha': 0.0013394381774896602, 'scale_pos_weight': 7.41093473607063, 'max_delta_step': 3.334476530127872}. Best is trial 54 with value: 0.

[I 2024-08-28 18:39:29,529] Trial 118 finished with value: 0.9970456548552994 and parameters: {'learning_rate': 0.01633581105409466, 'n_estimators': 1298, 'max_depth': 19, 'gamma': 0.19549597718675066, 'min_child_weight': 1.0577292574867116, 'subsample': 0.950660085491383, 'colsample_bytree': 0.5434842150999273, 'reg_lambda': 0.036726471285321506, 'reg_alpha': 0.22333351358069525, 'scale_pos_weight': 5.332726898195509, 'max_delta_step': 9.418970431556302}. Best is trial 118 with value: 0.9970456548552994.
[I 2024-08-28 18:42:38,013] Trial 119 finished with value: 0.996976805427051 and parameters: {'learning_rate': 0.01967275172197127, 'n_estimators': 1188, 'max_depth': 19, 'gamma': 0.11900610578436988, 'min_child_weight': 1.0236332257201974, 'subsample': 0.9441298123533292, 'colsample_bytree': 0.5468569348124183, 'reg_lambda': 0.08719016226432823, 'reg_alpha': 0.23378074431535056, 'scale_pos_weight': 5.3585568832538595, 'max_delta_step': 9.327129223870129}. Best is trial 118 with value

[I 2024-08-28 19:33:59,223] Trial 135 finished with value: 0.9966885783665111 and parameters: {'learning_rate': 0.023359410648122497, 'n_estimators': 1281, 'max_depth': 21, 'gamma': 0.11961256094481923, 'min_child_weight': 1.4842730431328606, 'subsample': 0.889526635244446, 'colsample_bytree': 0.8532333553209301, 'reg_lambda': 0.07493550193863922, 'reg_alpha': 0.23931285450842898, 'scale_pos_weight': 5.712901044191189, 'max_delta_step': 9.286812119201832}. Best is trial 120 with value: 0.9971188808096141.
[I 2024-08-28 19:37:24,623] Trial 136 finished with value: 0.9967713107730539 and parameters: {'learning_rate': 0.01456105756466665, 'n_estimators': 957, 'max_depth': 18, 'gamma': 0.08621014720474013, 'min_child_weight': 2.354368488652388, 'subsample': 0.9379860159270491, 'colsample_bytree': 0.5417035263325867, 'reg_lambda': 0.10060335230386641, 'reg_alpha': 0.20380408351912532, 'scale_pos_weight': 4.429955418057472, 'max_delta_step': 8.782960720156712}. Best is trial 120 with value: 

[I 2024-08-28 20:30:29,852] Trial 152 finished with value: 0.9970488557036286 and parameters: {'learning_rate': 0.014592255419837673, 'n_estimators': 1037, 'max_depth': 20, 'gamma': 0.1109174317391455, 'min_child_weight': 1.0296715679668385, 'subsample': 0.9082014453755004, 'colsample_bytree': 0.5374785326290292, 'reg_lambda': 0.09698896849630513, 'reg_alpha': 0.1848485939122324, 'scale_pos_weight': 5.060201819706927, 'max_delta_step': 9.981519003324683}. Best is trial 120 with value: 0.9971188808096141.
[I 2024-08-28 20:33:52,756] Trial 153 finished with value: 0.9970489356019943 and parameters: {'learning_rate': 0.014451649122265802, 'n_estimators': 991, 'max_depth': 19, 'gamma': 0.1491913984137076, 'min_child_weight': 1.0093529812957693, 'subsample': 0.9173109180248195, 'colsample_bytree': 0.5322889582176196, 'reg_lambda': 0.10786675147284201, 'reg_alpha': 0.1841965563818565, 'scale_pos_weight': 5.422866781668762, 'max_delta_step': 9.904666708531868}. Best is trial 120 with value: 0

[I 2024-08-28 21:31:00,685] Trial 169 finished with value: 0.9967824084971019 and parameters: {'learning_rate': 0.017442154373798282, 'n_estimators': 919, 'max_depth': 19, 'gamma': 0.04127256351262673, 'min_child_weight': 1.000852338133089, 'subsample': 0.6423300355505134, 'colsample_bytree': 0.5491400175701502, 'reg_lambda': 0.12937997269784973, 'reg_alpha': 0.2523283221824343, 'scale_pos_weight': 6.655394504232802, 'max_delta_step': 9.739319976321003}. Best is trial 120 with value: 0.9971188808096141.
[I 2024-08-28 21:34:46,757] Trial 170 finished with value: 0.9966828862286101 and parameters: {'learning_rate': 0.012641423364948435, 'n_estimators': 1087, 'max_depth': 20, 'gamma': 0.022294010555361843, 'min_child_weight': 3.3601779463639305, 'subsample': 0.9276155359782565, 'colsample_bytree': 0.5260614598062591, 'reg_lambda': 0.2672926917602372, 'reg_alpha': 0.28450734745183215, 'scale_pos_weight': 4.671909900029988, 'max_delta_step': 9.325605116448658}. Best is trial 120 with value:

[I 2024-08-28 22:25:04,332] Trial 186 finished with value: 0.9969786257364552 and parameters: {'learning_rate': 0.012262851367147557, 'n_estimators': 1168, 'max_depth': 19, 'gamma': 0.02225852522896373, 'min_child_weight': 1.0218232184919365, 'subsample': 0.9540671159003549, 'colsample_bytree': 0.789072782954296, 'reg_lambda': 0.06617669701703367, 'reg_alpha': 0.21192178746044313, 'scale_pos_weight': 4.729472962582659, 'max_delta_step': 9.872841595042084}. Best is trial 181 with value: 0.9971647378349546.
[I 2024-08-28 22:27:37,205] Trial 187 finished with value: 0.9964699972355255 and parameters: {'learning_rate': 0.01651561214943655, 'n_estimators': 1221, 'max_depth': 7, 'gamma': 0.07657649004466055, 'min_child_weight': 1.6200562891352743, 'subsample': 0.8986692819231222, 'colsample_bytree': 0.538967031959779, 'reg_lambda': 0.04466217185925567, 'reg_alpha': 0.4305573768838299, 'scale_pos_weight': 5.251085139461419, 'max_delta_step': 9.98575911084385}. Best is trial 181 with value: 0.

Best hyperparameters: {'learning_rate': 0.019935063753921546, 'n_estimators': 1209, 'max_depth': 21, 'gamma': 0.13021237049313528, 'min_child_weight': 1.0037029366354757, 'subsample': 0.9397452843795323, 'colsample_bytree': 0.5483858560356617, 'reg_lambda': 0.09062574910114087, 'reg_alpha': 0.23614577547483448, 'scale_pos_weight': 5.421487621314032, 'max_delta_step': 9.323657555341919}


# Model Define

In [None]:
# Best hyperparameters reported by the Optuna search, pinned here so the
# model can be rebuilt without re-running the search
best_params = {'learning_rate': 0.019935063753921546, 'n_estimators': 1209, 'max_depth': 21, 'gamma': 0.13021237049313528, 'min_child_weight': 1.0037029366354757, 'subsample': 0.9397452843795323, 'colsample_bytree': 0.5483858560356617, 'reg_lambda': 0.09062574910114087, 'reg_alpha': 0.23614577547483448, 'scale_pos_weight': 5.421487621314032, 'max_delta_step': 9.323657555341919}

# `class_weight` is not an XGBoost parameter; passing it is ignored with a
# warning, so it is dropped. Class balance comes from the resampling done
# earlier (pass sample_weight to fit() if explicit weighting is wanted).
model = XGBClassifier(**best_params,
                      device='cuda',
                      random_state=42)

# Fit on the full encoded training set, columns in the saved feature order
model.fit(train_x_encoded[feature_order], train_y_encoded)

In [None]:
test_x_encoded = test_x.copy()

# Ordinal encoding with the encoder fitted on the training data
test_x_encoded[categorical_columns] = ordinal_encoder.transform(test_x[categorical_columns])

# One-hot encode WITHOUT drop_first. Which level drop_first removes depends on
# the categories present in the frame being encoded, so applying it to the
# test set independently can drop a different level than training did.
# Aligning to feature_order below selects exactly the training columns.
test_x_encoded = pd.get_dummies(test_x_encoded, columns=one_hot_columns)

# Only one-hot columns may legitimately be absent (a level unseen in test);
# any other missing feature is a real pipeline error and must not be hidden.
one_hot_prefixes = tuple(f"{col}_" for col in one_hot_columns)
missing_features = [col for col in feature_order if col not in test_x_encoded.columns]
unexpected_missing = [col for col in missing_features if not col.startswith(one_hot_prefixes)]
if unexpected_missing:
    raise KeyError(f"Test data is missing training features: {unexpected_missing}")

# Match the training feature order (absent one-hot levels become 0) and dtypes
test_x_encoded = test_x_encoded.reindex(columns=feature_order, fill_value=0)
for col in feature_order:
    test_x_encoded[col] = test_x_encoded[col].astype(train_x_encoded[col].dtype)

In [None]:
# Shape of the encoded test feature matrix
test_x_encoded.shape

(120000, 56)

In [None]:
# Predict encoded class ids, then map them back to the original Fraud_Type labels
predictions = model.predict(test_x_encoded)
predictions_label = le_subclass.inverse_transform(predictions)

In [None]:
# Distribution of predicted labels on the test set
pd.DataFrame(predictions_label).value_counts()

m    60667
l     7869
e     6753
f     5829
b     5820
k     5384
j     5332
a     4916
i     4794
h     3594
d     3445
g     3261
c     2336
Name: count, dtype: int64

# Submission

In [None]:
# Classification submission DataFrame.
# The file holding this DataFrame must be named clf_submission.csv.
# Built from the test IDs instead of reading a sample file from an
# environment-specific path: predictions_label is in test-row order, so
# taking the IDs from test_all guarantees each ID is paired with its own
# prediction and removes the dependency on an external file.
clf_submission = pd.DataFrame({
    "ID": test_all["ID"].to_numpy(),
    "Fraud_Type": predictions_label,
})
clf_submission.head()

Unnamed: 0,ID,Fraud_Type
0,TEST_000000,l
1,TEST_000001,m
2,TEST_000002,m
3,TEST_000003,m
4,TEST_000004,m


In [None]:
# Synthetic-data submission DataFrame.
# The file holding this DataFrame must be named syn_submission.csv.
synthetic_data.head()

Unnamed: 0,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,Customer_flag_change_of_authentication_4,...,Last_atm_transaction_datetime,Last_bank_branch_transaction_datetime,Flag_deposit_more_than_tenMillion,Unused_account_status,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Fraud_Type,Transaction_resumed_date
0,1950,male,윤현숙,lJBEHU-BykuOFI,2007-09-28 14:16:02,C,1,1,1,1,...,2021-11-02 05:47:59,2012-06-20 19:05:27,0,1,0,2,0,0,a,2032-12-06 03:32:52
1,1957,female,박민수,tukfWe-XEGsGtB,2008-01-27 19:40:22,B,0,1,1,1,...,2035-09-01 05:23:45,2021-03-26 22:27:38,0,0,0,0,0,0,a,2021-02-14 09:03:59
2,1950,male,민성현,mCCycV-YyOCKxe,2003-02-22 06:08:59,A,0,1,0,1,...,2040-12-29 11:12:55,2006-05-15 06:29:58,0,0,0,0,0,0,a,2035-11-05 17:25:07
3,1960,female,송영철,pytIEg-LMwfKEl,2007-07-21 05:57:37,B,0,1,1,1,...,2027-11-13 12:10:03,2026-04-10 02:47:08,0,1,1,0,0,0,a,2043-02-08 08:36:07
4,1966,male,허현숙,tukfWe-XEGsGtB,2004-11-04 09:09:54,C,1,0,0,1,...,2034-05-07 08:24:39,2003-05-27 02:57:06,1,0,1,2,2,0,a,2043-02-08 08:36:07


In [None]:
'''
(*) 저장 시 각 파일명을 반드시 확인해주세요.
    1. 분류 예측 결과 데이터프레임 파일명 = clf_submission.csv
    2. 합성 데이터 생성 결과 데이터프레임 파일명 = syn_submission.csv

(*) 제출 파일(zip) 내에 두 개의 데이터프레임이 각각 위의 파일명으로 반드시 존재해야합니다.
(*) 파일명을 일치시키지 않으면 채점이 불가능합니다.
'''
# Summary of the note above: the zip must contain exactly clf_submission.csv
# and syn_submission.csv; other file names cannot be scored.

# Create the output folder. The working directory is NOT changed, so
# re-running this cell writes to the same place every time instead of nesting
# submission/submission.
submission_dir = './submission'
os.makedirs(submission_dir, exist_ok=True)

clf_path = os.path.join(submission_dir, 'clf_submission.csv')
syn_path = os.path.join(submission_dir, 'syn_submission.csv')

# Save as CSV
clf_submission.to_csv(clf_path, encoding='UTF-8-sig', index=False)
synthetic_data.to_csv(syn_path, encoding='UTF-8-sig', index=False)

# Build the zip next to the submission folder; arcname keeps the required
# bare file names inside the archive
with zipfile.ZipFile("./XGBoost_24.zip", 'w') as submission:
    submission.write(clf_path, arcname='clf_submission.csv')
    submission.write(syn_path, arcname='syn_submission.csv')

print('Done.')

Done.
