# Library

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import os
import zipfile
from scipy import stats
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import xgboost as xgb

#합성 데이터 생성 라이브러리
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer


warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'sdv'

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.1


# Load Data

In [6]:
# Project root relative to this notebook, and the data directory beneath it
base_path = '../'
data_path = f"{base_path}data"

In [7]:
# Read the train and test CSV files from the data directory
train_all, test_all = (
    pd.read_csv(f"{data_path}/{split}.csv") for split in ("train", "test")
)

In [8]:
# Preview the first rows of the train frame, then the test frame
display(*(frame.head() for frame in (train_all, test_all)))

Unnamed: 0,ID,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,...,Last_atm_transaction_datetime,Last_bank_branch_transaction_datetime,Flag_deposit_more_than_tenMillion,Unused_account_status,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Fraud_Type,Transaction_resumed_date
0,TRAIN_000000,1980,male,이상호,BJWQxd-WBASPLJ,2003-01-06 18:38:01,B,0,1,0,...,2003-01-22 23:38:48,2003-01-22 23:38:48,1,1,1,0,0,0,m,2003-01-22 23:38:48
1,TRAIN_000001,1964,male,박상철,kurCwX-odPUXEt,2003-01-07 16:40:44,C,0,1,0,...,2003-01-21 21:29:08,2003-01-31 00:19:46,0,1,0,0,0,0,m,2003-01-19 21:29:08
2,TRAIN_000002,1982,female,조옥자,OiERQa-CTXBoaX,2003-01-11 14:08:36,B,1,1,0,...,2003-01-31 07:13:28,2003-01-31 07:13:28,0,0,1,1,1,0,m,2003-01-31 07:13:28
3,TRAIN_000003,1982,female,조옥자,OiERQa-CTXBoaX,2003-01-11 14:08:36,B,1,1,1,...,2003-01-31 11:49:56,2003-01-31 07:13:28,1,1,0,0,0,0,m,2003-01-31 07:13:28
4,TRAIN_000004,1982,female,조옥자,OiERQa-CTXBoaX,2003-01-11 14:08:36,B,1,1,1,...,2003-01-31 11:49:56,2003-01-31 07:13:28,1,0,0,1,1,0,m,2003-01-31 07:13:28


Unnamed: 0,ID,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,...,Unused_terminal_status,Last_atm_transaction_datetime,Last_bank_branch_transaction_datetime,Flag_deposit_more_than_tenMillion,Unused_account_status,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Transaction_resumed_date
0,TEST_000000,1960,female,주지아,DOMcBN-kRMFflJ,2003-01-07 10:59:08,E,1,0,0,...,1,2003-01-10 05:27:56,2003-01-08 05:27:56,0,1,1,0,0,0,2003-01-08 05:27:56
1,TEST_000001,1960,female,주지아,DOMcBN-kRMFflJ,2003-01-07 10:59:08,E,1,1,1,...,0,2003-01-11 21:29:50,2003-01-08 05:27:56,0,1,0,0,0,0,2003-01-08 05:27:56
2,TEST_000002,1951,male,김정수,pZrAvI-mhxfVyw,2003-01-06 18:10:55,B,1,1,1,...,0,2003-01-13 01:08:19,2003-01-13 01:08:19,1,0,0,2,2,0,2003-01-13 01:08:19
3,TEST_000003,1999,female,김현지,fVlbzX-wvugTpH,2003-01-08 05:28:53,B,0,1,1,...,1,2003-01-21 10:03:32,2003-01-26 13:49:24,0,1,1,0,0,0,2003-01-20 10:03:32
4,TEST_000004,1996,female,박은정,chYftA-AjVuXMW,2003-01-17 03:37:22,A,0,1,0,...,1,2003-01-28 19:04:19,2003-01-28 19:04:19,0,1,1,0,0,0,2003-01-28 19:04:19


# 데이터 전처리

In [10]:
# Drop the ID column to form the working training frame
train = train_all.drop("ID", axis=1)
# Show how many rows each Fraud_Type class has
train.loc[:, "Fraud_Type"].value_counts()

Fraud_Type
m    118800
a       100
j       100
h       100
k       100
c       100
g       100
i       100
b       100
f       100
d       100
e       100
l       100
Name: count, dtype: int64

In [11]:
# Number of synthetic rows to generate for each class
N_CLS_PER_GEN: int = 1_000

In [12]:
# Outlier handling helper
def handle_outliers(series, n_std=3):
    """Replace values far from the mean with the mean.

    A value is treated as an outlier when its absolute z-score exceeds
    ``n_std``. Z-scores use the population standard deviation (``ddof=0``),
    the same default as ``scipy.stats.zscore``, but are computed NaN-aware:
    a missing value no longer turns every z-score into NaN, which would
    silently disable the replacement for the whole series.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to clean. The input is not modified.
    n_std : float, default 3
        Z-score threshold above which a value is replaced.

    Returns
    -------
    pandas.Series
        A new series in which outliers are replaced by the mean of the
        non-missing input values. NaN entries stay NaN.
    """
    mean = series.mean()
    std = series.std(ddof=0)

    # Empty, all-NaN, or constant input has no usable spread, so there is
    # nothing that can be classified as an outlier.
    if not np.isfinite(std) or std == 0:
        return series.copy()

    z_scores = (series - mean).abs() / std
    # Comparisons against NaN are False, so missing values are left untouched.
    return series.mask(z_scores > n_std, mean)

# Convert the Time_difference column to total seconds and run the result
# through the outlier handler before storing it as a new column
train['Time_difference_seconds'] = handle_outliers(
    pd.to_timedelta(train['Time_difference']).dt.total_seconds()
)

In [13]:
# Inspect the converted column (pandas truncates the repr for long series)
train.loc[:, 'Time_difference_seconds']

0            10430.0
1             4053.0
2             3179.0
3             5045.0
4             6209.0
             ...    
119995    31946659.0
119996        9038.0
119997      366939.0
119998        4806.0
119999      214554.0
Name: Time_difference_seconds, Length: 120000, dtype: float64

In [16]:
# Collect every Fraud_Type value present in the training data (including 'm')
fraud_types = pd.unique(train['Fraud_Type'])
fraud_types

array(['m', 'a', 'j', 'h', 'k', 'c', 'g', 'i', 'b', 'f', 'd', 'e', 'l'],
      dtype=object)

In [17]:
# Start with an empty DataFrame that will hold all generated synthetic rows
all_synthetic_data = pd.DataFrame(data=None)

In [2]:
# Number of real rows drawn from each class when fitting a synthesizer
N_SAMPLE: int = 100

In [None]:
# Generate synthetic data for every Fraud_Type and collect the results.

# Explicit sdtype overrides applied on top of the auto-detected metadata.
# The mapping does not depend on the class, so it is built once.
column_sdtypes = {
    'Account_initial_balance': 'numerical',
    'Account_balance': 'numerical',
    'Customer_identification_number': 'categorical',
    'Customer_personal_identifier': 'categorical',
    'Account_account_number': 'categorical',
    'IP_Address': 'ipv4_address',
    'Location': 'categorical',
    'Recipient_Account_Number': 'categorical',
    'Fraud_Type': 'categorical',
    'Time_difference_seconds': 'numerical',
    'Customer_Birthyear': 'numerical'
}

# Per-class frames are gathered in a list and concatenated once at the end.
# This avoids repeatedly copying a growing DataFrame inside the loop, and it
# makes the cell idempotent: re-running it rebuilds the result from scratch
# rather than appending duplicates to a previous run's output.
synthetic_parts = []

for fraud_type in tqdm(fraud_types):

    # Rows belonging to the current Fraud_Type
    subset = train[train["Fraud_Type"] == fraud_type]

    # Draw up to N_SAMPLE rows per class; capping at the class size keeps
    # sample() from raising when a class has fewer than N_SAMPLE rows.
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    # Drop Time_difference so only the seconds-valued column is modeled
    subset = subset.drop('Time_difference', axis=1)

    # Detect metadata from the subset, then clear the primary key
    # (presumably to undo any key that auto-detection picked — confirm)
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(subset)
    metadata.set_primary_key(None)

    # Apply the explicit sdtype for each listed column
    for column, sdtype in column_sdtypes.items():
        metadata.update_column(
            column_name=column,
            sdtype=sdtype
        )

    # Fit a CTGAN synthesizer on this class and sample new rows from it
    synthesizer = CTGANSynthesizer(
        metadata,
        epochs=100
    )
    synthesizer.fit(subset)
    synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN)

    # Replace outliers in the generated Time_difference_seconds values
    synthetic_subset['Time_difference_seconds'] = handle_outliers(
        synthetic_subset['Time_difference_seconds']
    )

    # Convert the seconds back to a timedelta column
    synthetic_subset['Time_difference'] = pd.to_timedelta(
        synthetic_subset['Time_difference_seconds'], unit='s'
    )

    # The helper seconds column is no longer needed
    synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)

    synthetic_parts.append(synthetic_subset)

# pd.concat raises on an empty list, so fall back to an empty frame
all_synthetic_data = (
    pd.concat(synthetic_parts, ignore_index=True)
    if synthetic_parts
    else pd.DataFrame()
)