#1. 필수 라이브러리 설치&실행

In [None]:
!pip install scikit-fuzzy
!pip install optuna
!pip install sdv
!pip install xgboost
!pip install tensorflow
!pip install imbalanced-learn

In [None]:
import os
import zipfile

import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm
import pickle

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

import xgboost as xgb

from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import skfuzzy as fuzz

#2. Train 데이터 축소(FCM, PCA) 및 이상치 처리

In [None]:
# Load the raw training table. Only numeric columns other than 'ID' are used
# for scaling, clustering and PCA.
df = pd.read_csv('/data/train.csv')
features = df.select_dtypes(include=['number']).columns.difference(['ID'])

# Split the majority label 'm' from every other fraud type.
is_m = df['Fraud_Type'] == 'm'
m_data = df[is_m]
other_data = df[~is_m]

# Undersample 'm' to the combined size of the other classes. sample() without
# replacement raises ValueError when n exceeds the available rows, so the
# request is capped at the number of 'm' rows that actually exist.
n_m_samples = min(len(m_data), len(other_data))
sampled_m_data = m_data.sample(n=n_m_samples, random_state=42)

balanced_data = pd.concat([sampled_m_data, other_data])

# Standardize the numeric features; `scaler` is kept for later cells.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(balanced_data[features])

# Fuzzy C-Means with 3 clusters; skfuzzy takes the data as (features, samples),
# hence the transpose.
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
    scaled_data.T, c=3, m=2.5, error=0.005, maxiter=1000, init=None)

# Hard label per row = cluster with the highest membership. The labels are kept
# as a plain variable only and are deliberately not stored on balanced_data, so
# they never become a model feature.
cluster_labels = np.argmax(u, axis=0)

# Two principal components of the standardized features; they stay on
# balanced_data and are therefore part of the training features downstream.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)
balanced_data['pca_one'] = pca_result[:, 0]
balanced_data['pca_two'] = pca_result[:, 1]

# Scatter plot of the PCA plane: 'm' rows in red, everything else in blue.
point_colors = np.where(balanced_data['Fraud_Type'] == 'm', 'red', 'blue')
plt.figure(figsize=(12, 8))
plt.scatter(balanced_data['pca_one'], balanced_data['pca_two'], c=point_colors, alpha=0.5)
plt.title('PCA of Dataset with Fuzzy C-Means')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

print(balanced_data['Fraud_Type'].value_counts())

In [None]:
# Number of synthetic rows drawn per fraud type by the generation loop.
N_CLS_PER_GEN = 1000

# Work on a copy so balanced_data itself stays unchanged for later cells.
train = balanced_data.copy()

# Show how many rows each fraud type contributes.
class_counts = train["Fraud_Type"].value_counts()
print(class_counts)

def handle_outliers(series, n_std=3):
    """Replace outliers in a numeric Series with the Series mean.

    A value is an outlier when its absolute z-score (population standard
    deviation, ddof=0) is strictly greater than ``n_std``.

    Missing values are skipped when computing the mean and spread and are left
    as NaN in the result, so a single NaN no longer disables outlier detection
    for the whole Series.

    Args:
        series: Numeric pandas Series to clean. It is not modified.
        n_std: Z-score threshold above which a value is replaced.

    Returns:
        A new Series with the same index, with outliers replaced by the mean.
    """
    mean = series.mean()
    std = series.std(ddof=0)

    # Constant, empty or all-NaN input has no usable spread: nothing can be
    # classified as an outlier, so return an unchanged copy.
    if not std or np.isnan(std):
        return series.copy()

    z_scores = (series - mean).abs() / std
    # NaN z-scores compare False, so missing values pass through untouched.
    return series.mask(z_scores > n_std, mean)

# Build a timedelta column from pca_one (scaled by 1000 and read as seconds).
# NOTE(review): the source here is the first PCA component, not a raw time
# column — confirm this is intentional.
time_delta = pd.to_timedelta(train['pca_one'] * 1000, unit='s')
train['Time_difference'] = time_delta

# Same quantity as float seconds, with extreme values replaced by the mean.
train['Time_difference_seconds'] = handle_outliers(time_delta.dt.total_seconds())

# Distinct fraud labels; the generation loop iterates over these.
fraud_types = train['Fraud_Type'].unique()


#3. 데이터 생성(CTGAN)

In [None]:
# Maximum number of real rows per fraud type used to fit each synthesizer.
N_SAMPLE = 100

# Explicit sdtypes that override SDV's auto-detection. They are identical for
# every fraud type, so the mapping is defined once outside the loop.
column_sdtypes = {
    'Account_initial_balance': 'numerical',
    'Account_balance': 'numerical',
    'Customer_Gender': 'categorical',
    'Customer_identification_number': 'categorical',
    'Customer_personal_identifier': 'categorical',
    'Account_account_number': 'categorical',
    'IP_Address': 'ipv4_address',
    'Location': 'categorical',
    'Recipient_Account_Number': 'categorical',
    'Fraud_Type': 'categorical',
    'Time_difference_seconds': 'numerical',
    'Customer_Birthyear': 'numerical'
}

# Per-class synthetic frames are collected here and concatenated once at the
# end, instead of growing a DataFrame inside the loop.
synthetic_parts = []

for fraud_type in tqdm(fraud_types):

    subset = train[train["Fraud_Type"] == fraud_type]

    # sample() raises ValueError when n exceeds the rows available, so cap the
    # request at the class size.
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    # The timedelta column is dropped; its float-seconds twin is modelled instead.
    subset = subset.drop('Time_difference', axis=1)

    # Auto-detect metadata, clear the primary key, then apply the overrides.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(subset)
    metadata.set_primary_key(None)
    for column, sdtype in column_sdtypes.items():
        metadata.update_column(column_name=column, sdtype=sdtype)

    synthesizer = CTGANSynthesizer(metadata, epochs=100)
    synthesizer.fit(subset)

    synthetic_subset = synthesizer.sample(num_rows=N_CLS_PER_GEN)

    # Clean the generated seconds, then rebuild the timedelta column from them
    # and drop the helper column.
    synthetic_subset['Time_difference_seconds'] = handle_outliers(synthetic_subset['Time_difference_seconds'])
    synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)

    synthetic_parts.append(synthetic_subset)

# pd.concat rejects an empty list, so fall back to an empty frame when there
# were no fraud types to generate.
if synthetic_parts:
    all_synthetic_data = pd.concat(synthetic_parts, ignore_index=True)
else:
    all_synthetic_data = pd.DataFrame()


#4. Train 데이터 전처리

In [None]:
# Start again from the balanced frame (this discards the columns added to
# `train` in the earlier cells).
train = balanced_data.copy()

# Separate the target from the predictors.
train_y = train['Fraud_Type']
train_x = train.drop(columns=['Fraud_Type'])

# Parse the datetime columns that are present.
datetime_columns = ['Transaction_resumed_date', 'Last_atm_transaction_datetime', 'Last_bank_branch_transaction_datetime']
present_datetime_columns = [col for col in datetime_columns if col in train_x.columns]
for col in present_datetime_columns:
    train_x[col] = pd.to_datetime(train_x[col])

# Convert each datetime to seconds (integer representation divided by 1e9).
# The mapping is ordered so the new columns are appended in a fixed order.
timestamp_sources = {
    'Last_atm_transaction_timestamp': 'Last_atm_transaction_datetime',
    'Last_bank_branch_transaction_timestamp': 'Last_bank_branch_transaction_datetime',
    'Transaction_resumed_timestamp': 'Transaction_resumed_date',
}
for timestamp_column, source_column in timestamp_sources.items():
    train_x[timestamp_column] = train_x[source_column].astype(int) / 10**9

train_x = train_x.drop(columns=datetime_columns)

# Seconds between the last ATM transaction and the resumed transaction.
resumed_ts = train_x['Transaction_resumed_timestamp']
last_atm_ts = train_x['Last_atm_transaction_timestamp']
train_x['Time_difference'] = resumed_ts - last_atm_ts

# Encode the target labels as integers and print the mapping.
le_subclass = LabelEncoder().fit(train_y)
train_y_encoded = le_subclass.transform(train_y)

for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

# Ordinal-encode every object/category column; categories unseen at fit time
# are mapped to -1 when transforming other data.
categorical_columns = train_x.select_dtypes(include=['object', 'category']).columns
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

train_x_encoded = train_x.copy()
train_x_encoded[categorical_columns] = ordinal_encoder.fit_transform(train_x[categorical_columns])

# Model input columns: everything except the 'ID' column.
feature_order = [col for col in train_x_encoded.columns if col != 'ID']

#5. XGBoost 모델 1

In [None]:
def objective(trial):
    """Optuna objective: mean macro-F1 of an XGBoost classifier over 5 folds.

    Hyperparameters are sampled from ``trial``. The classifier is then fitted
    and scored on each split of a shuffled, stratified 5-fold partition of the
    module-level training data (``train_x_encoded[feature_order]`` and
    ``train_y_encoded``).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The arithmetic mean of the per-fold macro-averaged F1 scores.
    """
    # Imported here because no earlier cell brings these two names into scope;
    # without them the function fails with NameError on its first call.
    from sklearn.metrics import f1_score
    from sklearn.model_selection import StratifiedKFold

    xgb_params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 5),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1.0),
        # NOTE(review): scale_pos_weight is documented for binary class
        # imbalance; confirm it has any effect on this multi-class target.
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 5.0),
        'max_delta_step': trial.suggest_float('max_delta_step', 0, 10),
        'device': 'cpu',
    }

    model = xgb.XGBClassifier(**xgb_params)

    # Slice the feature matrix once instead of once per fold.
    X = train_x_encoded[feature_order]
    y = train_y_encoded

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    for train_index, val_index in kf.split(X, y):
        model.fit(X.iloc[train_index], y[train_index], verbose=False)

        y_pred = model.predict(X.iloc[val_index])
        f1_scores.append(f1_score(y[val_index], y_pred, average='macro'))

    return sum(f1_scores) / len(f1_scores)

# optuna is installed by the first cell but never imported by an earlier one,
# so bring it into scope here.
import optuna

# Seeded TPE sampler so the search sequence is repeatable.
sampler = optuna.samplers.TPESampler(seed=42)

# Maximize the mean macro-F1 returned by objective() over 100 trials.
study2 = optuna.create_study(direction='maximize', sampler=sampler)
study2.optimize(objective, n_trials=100)

best_params = study2.best_params
print("Best hyperparameters:", best_params)

Best hyperparameters: {'learning_rate': 0.03500842897354163, 'n_estimators': 224, 'max_depth': 4, 'min_child_weight': 2.1605378743201147, 'gamma': 0.22804306127508525, 'subsample': 0.955950194857886, 'colsample_bytree': 0.8227035006806621, 'reg_lambda': 0.3490933288294228, 'reg_alpha': 0.21227891390474513, 'scale_pos_weight': 4.547887415904099, 'max_delta_step': 1.588355137064108}

In [None]:
# Model 1: the tuned hyperparameters plus a fixed seed and the 'mlogloss'
# evaluation metric, fitted on the training features without 'ID'.
model1 = xgb.XGBClassifier(
    **best_params,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
)
model1_inputs = train_x_encoded[feature_order]
model1.fit(model1_inputs, train_y_encoded)

데이터 분류 성능 확인 -> i 데이터는 예측 잘 안 됨

In [None]:
# Load the previously generated synthetic data from disk.
csv_file_path = '/data/synthetic_data.csv'

all_synthetic_data = pd.read_csv(csv_file_path)

# Apply the same datetime handling as the training data: parse, convert to
# seconds, drop the raw datetime columns.
datetime_columns = ['Transaction_resumed_date', 'Last_atm_transaction_datetime', 'Last_bank_branch_transaction_datetime']
for col in datetime_columns:
    if col in all_synthetic_data.columns:
        all_synthetic_data[col] = pd.to_datetime(all_synthetic_data[col])

all_synthetic_data['Last_atm_transaction_timestamp'] = all_synthetic_data['Last_atm_transaction_datetime'].astype(int) / 10**9
all_synthetic_data['Last_bank_branch_transaction_timestamp'] = all_synthetic_data['Last_bank_branch_transaction_datetime'].astype(int) / 10**9
all_synthetic_data['Transaction_resumed_timestamp'] = all_synthetic_data['Transaction_resumed_date'].astype(int) / 10**9

all_synthetic_data = all_synthetic_data.drop(columns=datetime_columns)

all_synthetic_data['Time_difference'] = all_synthetic_data['Transaction_resumed_timestamp'] - all_synthetic_data['Last_atm_transaction_timestamp']

# Encode categoricals with the encoder fitted on the training data.
all_synthetic_data_encoded = all_synthetic_data.copy()
all_synthetic_data_encoded[categorical_columns] = ordinal_encoder.transform(all_synthetic_data[categorical_columns])

# Project with the scaler and PCA that were fitted on the training data, so
# pca_one/pca_two mean the same thing here as in the training features.
# Fitting a new PCA on the unscaled synthetic rows would give components on a
# different basis and would also rebind the global `pca` used by later cells.
synthetic_scaled = scaler.transform(all_synthetic_data_encoded[features])
pca_result = pca.transform(synthetic_scaled)

all_synthetic_data_encoded['pca_one'] = pca_result[:, 0]
all_synthetic_data_encoded['pca_two'] = pca_result[:, 1]

In [None]:
# Use exactly the columns model1 was fitted on: every encoded training column
# except 'ID'. Rebuilding the list from train_x.columns would re-add 'ID' and
# no longer match the fitted model.
feature_order = [col for col in train_x_encoded.columns if col != 'ID']

# Select the model columns and align their dtypes with the training frame in a
# single astype call (avoids column-by-column assignment on a slice).
training_dtypes = train_x_encoded[feature_order].dtypes.to_dict()
all_synthetic_data_encoded = all_synthetic_data_encoded[feature_order].astype(training_dtypes)

# `model` only exists as a local inside objective(); the fitted classifier at
# module level is model1.
predictions = model1.predict(all_synthetic_data_encoded)

# Map the integer predictions back to the original fraud-type labels.
predictions_label = le_subclass.inverse_transform(predictions)

In [None]:
# Class probabilities from the fitted classifier. `model` is not defined at
# module level; model1 is the fitted estimator.
proba_predictions = model1.predict_proba(all_synthetic_data_encoded)

# Highest probability per row and the class index it belongs to.
max_proba = np.max(proba_predictions, axis=1)
predicted_labels = np.argmax(proba_predictions, axis=1)

predictions_label = le_subclass.inverse_transform(predicted_labels)

# Attach the prediction and its confidence to the un-encoded synthetic rows.
comparison_df = all_synthetic_data.copy()
comparison_df['Predicted_Fraud_Type'] = predictions_label
comparison_df['Max_Proba'] = max_proba

# Keep synthetic rows that are predicted as the label they were generated for,
# are not of type 'm', and have a top probability of at least 0.8.
keep_mask = (
    (comparison_df['Fraud_Type'] == comparison_df['Predicted_Fraud_Type'])
    & (comparison_df['Fraud_Type'] != 'm')
    & (comparison_df['Max_Proba'] >= 0.8)
)
filtered_data = comparison_df[keep_mask]

# The helper columns are only needed for the filter itself.
filtered_data = filtered_data.drop(columns=['Predicted_Fraud_Type', 'Max_Proba'])

print(filtered_data['Fraud_Type'].value_counts())

#6. XGBoost 모델 2

In [None]:
helper_columns = ['Predicted_Fraud_Type', 'Max_Proba']

# Every synthetic row of type 'i', regardless of prediction confidence.
fraud_type_i_data = comparison_df[comparison_df['Fraud_Type'] == 'i']
fraud_type_i_data = fraud_type_i_data.drop(columns=helper_columns)

# filtered_data has normally had the helper columns removed already, and a
# plain drop would raise KeyError. errors='ignore' keeps this cell valid
# whether or not the columns are still present.
filtered_data = filtered_data.drop(columns=helper_columns, errors='ignore')

# NOTE(review): confidently predicted 'i' rows are in both frames and so
# appear twice after the concat — confirm whether that is intended.
combined_data = pd.concat([filtered_data, fraud_type_i_data])

# Encode categoricals with the encoder fitted on the training data.
combined_data_encoded = combined_data.copy()
combined_data_encoded[categorical_columns] = ordinal_encoder.transform(combined_data[categorical_columns])

In [None]:
# Hyperparameters copied from the Optuna result printed above, plus fixed
# run settings.
# NOTE(review): 'device' is 'cuda' here while the search used 'cpu'; this
# presumably requires a CUDA-enabled XGBoost build — confirm.
params = {
    'learning_rate': 0.03500842897354163,
    'n_estimators': 224,
    'max_depth': 4,
    'min_child_weight': 2.1605378743201147,
    'gamma': 0.22804306127508525,
    'subsample': 0.955950194857886,
    'colsample_bytree': 0.8227035006806621,
    'reg_lambda': 0.3490933288294228,
    'reg_alpha': 0.21227891390474513,
    'scale_pos_weight': 4.547887415904099,
    'max_delta_step': 1.588355137064108,
    'device': 'cuda',
    'use_label_encoder': False,
    'eval_metric': 'mlogloss',
    'random_state': 42
}

model2 = xgb.XGBClassifier(**params)

# Fit on the same predictors as model1: every encoded column except 'ID'.
# Passing the whole frame would let the model train on the ID column.
# NOTE(review): combined_data_encoded from the previous cell is not used for
# training here — confirm that is intended.
model2_features = [col for col in train_x_encoded.columns if col != 'ID']
model2.fit(train_x_encoded[model2_features], train_y_encoded)

#7. 모델 앙상블

In [None]:
import pandas as pd
import pickle
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Pair each estimator with the name it carries inside the ensemble.
estimator_names = ('xgb1', 'xgb2')
models = list(zip(estimator_names, (model1, model2)))

# Soft voting: the ensemble combines the members' predicted class probabilities.
ensemble_model = VotingClassifier(voting='soft', estimators=models)



#8. Test 데이터 전처리 및 예측

In [None]:
test_x = pd.read_csv('/data/test.csv')

# Apply the same datetime handling as the training data: parse, convert to
# seconds, drop the raw datetime columns.
datetime_columns = ['Transaction_resumed_date', 'Last_atm_transaction_datetime', 'Last_bank_branch_transaction_datetime']
for col in datetime_columns:
    if col in test_x.columns:
        test_x[col] = pd.to_datetime(test_x[col])

test_x['Last_atm_transaction_timestamp'] = test_x['Last_atm_transaction_datetime'].astype(int) / 10**9
test_x['Last_bank_branch_transaction_timestamp'] = test_x['Last_bank_branch_transaction_datetime'].astype(int) / 10**9
test_x['Transaction_resumed_timestamp'] = test_x['Transaction_resumed_date'].astype(int) / 10**9

test_x = test_x.drop(columns=datetime_columns)

test_x['Time_difference'] = test_x['Transaction_resumed_timestamp'] - test_x['Last_atm_transaction_timestamp']

# Encode categoricals with the encoder fitted on the training data.
test_x_encoded = test_x.copy()
test_x_encoded[categorical_columns] = ordinal_encoder.transform(test_x[categorical_columns])

# The training pca_one/pca_two came from a PCA fitted on the standardized
# training features. The global `pca` may have been refitted on other data by
# an intermediate cell, so the training projection is rebuilt here from
# balanced_data and the training scaler, and the test rows go through the same
# standardize-then-project path.
train_scaled = scaler.transform(balanced_data[features])
train_pca = PCA(n_components=2).fit(train_scaled)
pca_test_result = train_pca.transform(scaler.transform(test_x_encoded[features]))
test_x_encoded['pca_one'] = pca_test_result[:, 0]
test_x_encoded['pca_two'] = pca_test_result[:, 1]

# Never feed the row identifier to the model, even if feature_order carries it.
model_features = [col for col in feature_order if col != 'ID']

# Select the model columns and align dtypes with the training frame at once.
training_dtypes = train_x_encoded[model_features].dtypes.to_dict()
test_x_encoded = test_x_encoded[model_features].astype(training_dtypes)

ensemble_model.fit(train_x_encoded[model_features], train_y_encoded)

# Persist the fitted ensemble.
with open('/data/앙상블1.pkl', 'wb') as f:
    pickle.dump(ensemble_model, f)

predictions = ensemble_model.predict(test_x_encoded)

# Map the integer predictions back to the original fraud-type labels.
predictions_label = le_subclass.inverse_transform(predictions)

In [None]:
# Fill the sample submission with the predicted labels.
clf_submission = pd.read_csv("/data/sample_submission.csv")
clf_submission["Fraud_Type"] = predictions_label

# Write into ./submission using explicit paths instead of os.chdir, so the
# working directory is left alone and re-running the cell does not create
# nested submission/submission folders.
submission_dir = './submission'
os.makedirs(submission_dir, exist_ok=True)
clf_submission_path = os.path.join(submission_dir, 'clf_submission.csv')
syn_submission_path = os.path.join(submission_dir, 'syn_submission.csv')

clf_submission.to_csv(clf_submission_path, encoding='UTF-8-sig', index=False)

# Drop the identifier and PCA helper columns (when present) before writing,
# rather than writing the file, reading it back and writing it again.
columns_to_remove = ['ID', 'pca_one', 'pca_two']
syn_submission_cleaned = all_synthetic_data.drop(columns=columns_to_remove, errors='ignore')
syn_submission_cleaned.to_csv(syn_submission_path, encoding='UTF-8-sig', index=False)

# arcname keeps both files at the root of the archive under their bare names.
with zipfile.ZipFile("/data/submission_ensemble.zip", 'w') as submission:
    submission.write(clf_submission_path, arcname='clf_submission.csv')
    submission.write(syn_submission_path, arcname='syn_submission.csv')

print('Done.')