In [29]:
import pandas as pd
import numpy as np
import random
from faker import Faker


In [30]:
# Seed configuration: one shared seed value for every random source below.

seed_value = 738

# Seed Python's `random` module and NumPy's global RNG with the same value.
random.seed(seed_value)
np.random.seed(seed_value)
# `fake` is the module-level Faker instance that later cells call.
fake = Faker()
# NOTE(review): Faker.seed is called on the class after the instance is created;
# presumably this seeds the generator shared by `fake` — confirm against Faker docs.
Faker.seed(seed_value)

In [31]:
# Synthetic transaction data generator
def create_sample_data(num_samples=1000, num_frauds=100):
    """Build a DataFrame of randomly generated transactions with fraud labels.

    Every row is produced from the module-level ``fake`` object (uuid4, ipv4,
    time, country_code) and the ``random`` module, so the output is only
    repeatable when both have been seeded beforehand.

    Args:
        num_samples: Number of rows to generate.
        num_frauds: Number of rows whose ``label`` is set to 1. The rows are
            drawn with ``random.sample``, i.e. independently of every other
            column value.

    Returns:
        pandas.DataFrame with one row per transaction and 19 columns; the
        last column, ``label``, is 1 for the sampled fraud rows and 0
        otherwise.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``num_frauds`` is
            negative or larger than the number of generated rows.
    """
    column_names = [
        "transaction_number", "transaction_amount", "is_domestic_ip", "transaction_ip",
        "recent_7d_amount", "recent_7d_payment_methods", "account_age_days", "transaction_time",
        "country_code", "customer_age", "account_balance", "num_past_transactions",
        "device_id", "payment_method", "is_new_account",
        "has_promo_code", "shipping_address_change", "num_recent_login_failures", "label",
    ]
    yes_no = [True, False]
    payment_options = ["credit_card", "debit_card", "paypal", "bank_transfer"]

    def build_row():
        # List elements are evaluated left to right, so the sequence of calls
        # into `fake` and `random` follows the column order above exactly.
        return [
            fake.uuid4(),                              # transaction_number
            round(random.uniform(5.0, 10000.0), 2),    # transaction_amount
            random.choice(yes_no),                     # is_domestic_ip
            fake.ipv4(),                               # transaction_ip
            round(random.uniform(5.0, 70000.0), 2),    # recent_7d_amount
            random.randint(1, 5),                      # recent_7d_payment_methods
            random.randint(1, 3650),                   # account_age_days
            fake.time(),                               # transaction_time
            fake.country_code(),                       # country_code
            random.randint(18, 80),                    # customer_age
            round(random.uniform(0.0, 50000.0), 2),    # account_balance
            random.randint(0, 100),                    # num_past_transactions
            fake.uuid4(),                              # device_id
            random.choice(payment_options),            # payment_method
            random.choice(yes_no),                     # is_new_account
            random.choice(yes_no),                     # has_promo_code
            random.choice(yes_no),                     # shipping_address_change
            random.randint(0, 10),                     # num_recent_login_failures
            0,                                         # label (normal by default)
        ]

    rows = [build_row() for _ in range(num_samples)]

    # Mark a random subset of the generated rows as fraud; the sampled items
    # are the same list objects held in `rows`, so they are updated in place.
    for fraud_row in random.sample(rows, num_frauds):
        fraud_row[-1] = 1

    return pd.DataFrame(rows, columns=column_names)


In [32]:
# Generate the sample data with the function defaults and preview the first rows
sample_data = create_sample_data()
sample_data.head()

Unnamed: 0,transaction_number,transaction_amount,is_domestic_ip,transaction_ip,recent_7d_amount,recent_7d_payment_methods,account_age_days,transaction_time,country_code,customer_age,account_balance,num_past_transactions,device_id,payment_method,is_new_account,has_promo_code,shipping_address_change,num_recent_login_failures,label
0,4134ac65-17af-4295-a9ee-2572da24bf01,8521.97,True,180.113.37.42,17833.43,2,569,00:04:24,JM,73,31870.68,40,11d628e1-9250-450d-9010-fc19ee244682,credit_card,False,True,True,0,0
1,2d6d6470-be56-4d98-a39d-6aa867d8c0ab,7994.64,False,85.49.26.125,60694.16,3,1713,20:27:05,KG,70,48163.08,38,1b9e16a0-f698-4086-9091-397d6b0eb77c,credit_card,False,True,True,3,1
2,fff7ef6a-631d-4f69-9eee-67b54dc5d23c,6186.2,True,5.107.18.36,48230.44,4,1894,06:51:54,FJ,64,22340.42,25,3558cdc4-b0fe-490d-aeaa-54319e5158f0,paypal,True,False,True,6,0
3,658789a9-ec29-40a2-ac55-d79ab06141a3,3828.94,False,30.76.65.76,13009.76,4,2860,09:45:46,FR,61,39208.54,68,df0f0b54-8baa-46ba-963e-97809b79ef1e,credit_card,False,True,False,0,0
4,a95763a6-5b8f-47da-a504-1bf098821a29,4278.94,False,219.0.192.70,5958.43,4,1471,15:39:09,GE,28,26655.01,54,82a4a7a4-cc6f-46fb-a1f1-21196545a909,paypal,True,False,False,9,0


In [33]:
# Count how many rows carry each label value (0 = normal, 1 = fraud)
sample_data['label'].value_counts()

label
0    900
1    100
Name: count, dtype: int64

샘플 데이터를 위한 가정된 규칙에 따른 분류 및 탐지율 계산 함수

In [34]:
def calculate_detection_rate(df):
    """Summarise, per hard-coded rule, the transactions that the rule flags.

    Five boolean rules are evaluated against ``df``. For each rule the rows
    matching it are selected and their ``label`` values are counted.

    Args:
        df: DataFrame providing the columns ``transaction_amount``,
            ``recent_7d_payment_methods``, ``num_recent_login_failures``,
            ``account_age_days``, ``is_domestic_ip`` and ``label``.

    Returns:
        pandas.DataFrame with one row per rule and the columns
        '규칙명' (rule name), '정상 거래 수' (flagged rows with label 0),
        '사기 거래 수' (flagged rows with label 1) and '탐지율'.
        '탐지율' is the label-1 count divided by the sum of both counts,
        i.e. the fraud share among the flagged rows (not the share of all
        fraud rows that the rule catches); it is 0 when a rule flags nothing.
    """
    amount = df['transaction_amount']

    # Rule name -> boolean mask over the rows of df.
    rule_masks = {
        '고액 거래' : amount > 5000,
        '다수 결제 수단 사용' : df['recent_7d_payment_methods'] >= 3,
        '빈번한 로그인 실패' : df['num_recent_login_failures'] >= 3,
        '신규 계정 대량 거래' : (df['account_age_days'] <= 30) & (amount > 3000),
        '국제 거래 및 고액' : (~df['is_domestic_ip']) & (amount > 2000),
    }

    def summarise(rule_name, mask):
        # Labels of the transactions that satisfy this rule.
        flagged_labels = df[mask]['label']
        normal_count = (flagged_labels == 0).sum()
        fraud_count = (flagged_labels == 1).sum()
        flagged_total = fraud_count + normal_count

        # Guard against division by zero when the rule matches no rows.
        if flagged_total > 0:
            fraud_share = fraud_count / flagged_total
        else:
            fraud_share = 0

        return {
            '규칙명': rule_name,
            '정상 거래 수' : normal_count,
            '사기 거래 수' : fraud_count,
            '탐지율' : fraud_share
        }

    summaries = [summarise(name, mask) for name, mask in rule_masks.items()]
    return pd.DataFrame(summaries)


In [35]:
# Apply the rules to the sample data and display the per-rule summary
rule_results = calculate_detection_rate(sample_data)
rule_results

Unnamed: 0,규칙명,정상 거래 수,사기 거래 수,탐지율
0,고액 거래,437,50,0.102669
1,다수 결제 수단 사용,548,55,0.091211
2,빈번한 로그인 실패,652,75,0.103164
3,신규 계정 대량 거래,6,1,0.142857
4,국제 거래 및 고액,365,32,0.080605
