In [None]:
import os
import pandas as pd
import numpy as np
import json
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the three raw CSV tables, in the same order as before:
# cards, then users, then transactions.
cards, users, transactions = map(
    pd.read_csv,
    (
        "./data/cards_data.csv",
        "./data/users_data.csv",
        "./data/transactions_data.csv",
    ),
)

In [3]:
def money_to_float(s):
    """Convert currency-formatted text such as "$1,234.50" to numbers.

    Parameters
    ----------
    s : array-like or pandas.Series
        Values to convert. Each value is stringified, stripped of
        surrounding whitespace, and has every "$" and "," removed before
        it is parsed.

    Returns
    -------
    pandas.Series
        Numeric Series (dtype chosen by ``pd.to_numeric``) carrying the
        index of ``pd.Series(s)``. Missing inputs (None / NaN / pd.NA) come
        back as NaN.

    Raises
    ------
    ValueError
        If a non-missing value is still not numeric after cleaning.
    """
    raw = pd.Series(s)
    # Record missing entries before stringifying: astype(str) would turn
    # None into the literal text "None", which pd.to_numeric rejects.
    missing = raw.isna()
    cleaned = raw.astype(str).str.strip().str.replace(r'[$,]', '', regex=True)
    return pd.to_numeric(cleaned.mask(missing))

def parse_mmYYYY(s):
    """Parse "MM/YYYY" values into pandas datetimes.

    The input is wrapped in a Series and stringified, then parsed with the
    explicit format "%m/%Y"; the result is returned as a datetime Series.
    """
    as_text = pd.Series(s).astype(str)
    return pd.to_datetime(as_text, format="%m/%Y")

In [4]:
# users: convert the currency-formatted columns to numbers, skipping any
# column that is not present.
for col in ("per_capita_income", "yearly_income", "total_debt"):
    if col in users.columns:
        users[col] = money_to_float(users[col])

# Drop the address column (PII minimisation). errors="ignore" gives the drop
# the same tolerance as the loop above: if the column is already gone (for
# example when this cell is run a second time) it is a no-op, not a KeyError.
users = users.drop(columns=["address"], errors="ignore")

# cards: numeric credit limit, and "MM/YYYY" text converted to datetimes.
cards["credit_limit"] = money_to_float(cards["credit_limit"])
cards["acct_open_date"] = parse_mmYYYY(cards["acct_open_date"])
cards["expires"] = parse_mmYYYY(cards["expires"])

# Binarize has_chip: YES -> 1, NO -> 0. Case and surrounding whitespace are
# normalized first so spellings such as "Yes", "yes" or " NO " all map;
# anything else (including missing values) becomes <NA> in the nullable
# Int8 column.
cards["has_chip"] = (
    cards["has_chip"]
    .astype(str)
    .str.strip()
    .str.upper()
    .map({"YES": 1, "NO": 0})
    .astype("Int8")
)

# Remove sensitive card fields. card_on_dark_web is dropped here, so it is
# deliberately not binarized above.
cards = cards.drop(columns=["card_number", "cvv", "card_on_dark_web"])

# transactions: parse the timestamp and derive hour-of-day and day-of-week.
txn_time = pd.to_datetime(transactions["date"])
transactions["date"] = txn_time
transactions["hour"] = txn_time.dt.hour
transactions["dow"] = txn_time.dt.dayofweek

# Amounts: convert the currency text to numbers, then split by sign so that
# negative amounts (refunds etc.) can be analysed separately.
txn_amount = money_to_float(transactions["amount"])
txn_amount_nonneg = txn_amount.clip(lower=0)  # negatives floored to 0

transactions["amount"] = txn_amount
# NOTE(review): amount_pos_ holds the same values as amount_pos below; both
# are kept so the resulting column set is unchanged.
transactions["amount_pos_"] = txn_amount_nonneg
transactions["amount_log1p"] = np.log1p(txn_amount_nonneg)

transactions["is_refund"] = (txn_amount < 0).astype('Int8')  # 1 when amount < 0
transactions["amount_pos"] = txn_amount_nonneg
transactions["amount_neg"] = txn_amount.clip(upper=0)  # positives capped at 0

# Zip code: round to a nullable integer, then build a string version that is
# left-padded with zeros to 5 characters.
zip_as_int = transactions["zip"].round().astype('Int64')
transactions["zip"] = zip_as_int
transactions["zip_str"] = zip_as_int.astype("string").str.zfill(5)

# Drop the errors column; per the original note it is almost entirely NaN
# (not verified in this cell).
transactions = transactions.drop(columns=["errors"])

In [None]:
# Build the working table: left-join users onto transactions by client_id,
# then cards by card_id. Column names that collide keep the transaction-side
# name and get a '__USER' / '__CARD' suffix on the joined side; the suffixed
# join keys are dropped afterwards (ignored if absent).
# NOTE(review): any other suffixed duplicate columns are kept as-is — confirm
# that is intended.
work = (
    transactions
    .merge(users, how='left', left_on='client_id', right_on='id', suffixes=('', '__USER'))
    .merge(cards, how='left', left_on='card_id', right_on='id', suffixes=('', '__CARD'))
    .drop(columns=['id__USER', 'id__CARD'], errors='ignore')
)

In [7]:
# Load the fraud-label JSON file.
fraud_label_file_path = './data/train_fraud_labels.json'

with open(fraud_label_file_path, 'r') as label_file:
    data = json.load(label_file)

# The labels live under the top-level 'target' key.
target_dict = data['target']

In [8]:
# Attach the fraud label: look up each transaction id (as a string key) in
# target_dict, then encode 'Yes' -> 1 and 'No' -> 0. Ids with no entry, or
# with any other label value, end up as <NA> in the nullable Int8 column.
work['is_fraud'] = (
    work['id'].astype(str)
    .map(target_dict)
    .map({'No': 0, 'Yes': 1})
    .astype('Int8')
)

# Keep only the labelled rows. The subset limits the missing-value check to
# is_fraud; a bare dropna() would also throw away every labelled row that has
# a NaN in any other column.
work_with_target = work.dropna(subset=['is_fraud'])

In [9]:
# Write the labelled table to CSV once; an existing file is never overwritten.
# NOTE(review): if the file already exists it is left untouched, so it may be
# stale relative to work_with_target — delete it to force a fresh export.
filepath = './data/transactions_wtih_target.csv'

if os.path.exists(filepath):
    print(f"⚠️ 파일 이미 존재: {filepath}, 저장 스킵")
else:
    work_with_target.to_csv(filepath, index=False)