In [66]:
import matplotlib.pyplot as plt
import matplotlib as mpl

# Plot styling: use the AppleGothic font family (presumably chosen so Korean
# labels render; verify the font exists on the target machine) and disable the
# Unicode minus glyph for negative tick labels.
mpl.rcParams.update({
    'font.family': 'AppleGothic',
    'axes.unicode_minus': False,
})

In [67]:
import pandas as pd

# Read the two CSV inputs from the current working directory.
train_csv, test_csv = "train.csv", "test.csv"
file_1 = pd.read_csv(train_csv)
file_2 = pd.read_csv(test_csv)

In [119]:
# Number of missing values in each column of file_1.
file_1.isna().sum()

id                                       0
bank_months_count                        0
branch_application_count_8weeks          0
credit_risk_score                        0
current_address_months_count             0
customer_age                             0
date_of_birth_distinct_emails_4w         0
days_since_request                       0
device_distinct_emails_8w                0
device_os                                0
device_previously_fraudulent             0
email_is_free                            0
employment_status                        0
foreign_request                          0
has_other_cards                          0
housing_status                           0
income                                   0
keep_alive_session                       0
month                                    0
name_email_similarity                    0
payment_type                             0
phone_home_valid                         0
phone_mobile_valid                       0
prev_addres

In [69]:
# Number of missing values in each column of file_2.
file_2.isna().sum()

id                                       0
bank_months_count                    76034
branch_application_count_8weeks          0
credit_risk_score                        0
current_address_months_count          1278
customer_age                             0
date_of_birth_distinct_emails_4w         0
days_since_request                       0
device_distinct_emails_8w              109
device_os                                0
device_previously_fraudulent             0
email_is_free                            0
employment_status                        0
foreign_request                          0
has_other_cards                          0
housing_status                           0
income                                   0
initial_transfer_amount             222873
keep_alive_session                       0
month                                    0
name_email_similarity                    0
payment_type                             0
phone_home_valid                         0
phone_mobil

In [70]:
# 결측치 처리

In [71]:
# device_distinct_emails_8w
# 크게 영향을 끼치지 않을 정도로 결측치 값이 너무 적음. 
# 주로 1과 2 중에서 최빈값인 1로 처리함.

In [72]:
# Correlation matrix (DataFrame.corr defaults) between the feature and `fraud`.
print(file_1.loc[:, ['device_distinct_emails_8w', 'fraud']].corr())

                           device_distinct_emails_8w     fraud
device_distinct_emails_8w                   1.000000  0.037704
fraud                                       0.037704  1.000000


In [73]:
# Fill missing values with 1 (noted by the author as the most frequent value).
emails_col = 'device_distinct_emails_8w'
file_1[emails_col] = file_1[emails_col].fillna(1)

In [74]:
# session_length_in_minutes
# 평균값으로 대체 

In [75]:
# Correlation matrix (DataFrame.corr defaults) between the feature and `fraud`.
print(file_1.loc[:, ['session_length_in_minutes', 'fraud']].corr())

                           session_length_in_minutes     fraud
session_length_in_minutes                   1.000000  0.009253
fraud                                       0.009253  1.000000


In [76]:
# Replace missing session lengths with the column mean (NaN is skipped by mean()).
session_col = file_1['session_length_in_minutes']
mean_val = session_col.mean()
file_1['session_length_in_minutes'] = session_col.fillna(mean_val)

In [77]:
# current_address_months_count  
# 중앙값으로 대체

In [78]:
# Correlation matrix (DataFrame.corr defaults) between the feature and `fraud`.
print(file_1.loc[:, ['current_address_months_count', 'fraud']].corr())

                              current_address_months_count     fraud
current_address_months_count                      1.000000  0.033186
fraud                                             0.033186  1.000000


In [79]:
# Impute with the median, which is what the plan for this column states.
# The median is computed over the observed values only (NaN is skipped).
median_val = file_1['current_address_months_count'].median()
file_1['current_address_months_count'] = file_1['current_address_months_count'].fillna(median_val)

In [124]:
# bank_months_count
# 사기인 경우와 사기가 아닌 경우를 나누어, 평균, 표준편차를 구하여 조건부 랜덤 샘플링 기반 결측 대체 방법을 사용함

In [81]:
# Correlation matrix (DataFrame.corr defaults) between the feature and `fraud`.
print(file_1.loc[:, ['bank_months_count', 'fraud']].corr())

                   bank_months_count     fraud
bank_months_count           1.000000  0.020829
fraud                       0.020829  1.000000


In [82]:
# Mean of bank_months_count for rows with fraud == 1 and for rows with fraud == 0.
fraud_mean = file_1.loc[file_1['fraud'] == 1, 'bank_months_count'].mean()
nonfraud_mean = file_1.loc[file_1['fraud'] == 0, 'bank_months_count'].mean()

print(f"사기인 경우 평균: {fraud_mean:.2f}")
print(f"사기가 아닌 경우 평균: {nonfraud_mean:.2f}")

사기인 경우 평균: 17.33
사기가 아닌 경우 평균: 14.84


In [83]:
# Mean and standard deviation of bank_months_count, split by the fraud flag.
stat_names = ['mean', 'std']
fraud_stats = file_1.loc[file_1['fraud'] == 1, 'bank_months_count'].agg(stat_names)
nonfraud_stats = file_1.loc[file_1['fraud'] == 0, 'bank_months_count'].agg(stat_names)

print(f"사기인 경우 → 평균: {fraud_stats['mean']:.2f}, 표준편차: {fraud_stats['std']:.2f}")
print(f"사기 아닌 경우 → 평균: {nonfraud_stats['mean']:.2f}, 표준편차: {nonfraud_stats['std']:.2f}")

사기인 경우 → 평균: 17.33, 표준편차: 11.84
사기 아닌 경우 → 평균: 14.84, 표준편차: 11.52


In [111]:
import numpy as np

# Rows whose bank_months_count is missing and therefore needs imputing.
cond_missing = file_1['bank_months_count'].isna()

# Per-class mean / standard deviation, computed from the data instead of being
# pasted from rounded printed output, so they stay in sync with file_1.
# mean()/std() skip NaN, so only observed values contribute. If this cell is
# re-run after imputation, cond_missing is empty and the stats are unused.

# fraud == 1
fraud_observed = file_1.loc[file_1['fraud'] == 1, 'bank_months_count']
fraud_mean = fraud_observed.mean()
fraud_std = fraud_observed.std()

# fraud == 0
nonfraud_observed = file_1.loc[file_1['fraud'] == 0, 'bank_months_count']
nonfraud_mean = nonfraud_observed.mean()
nonfraud_std = nonfraud_observed.std()

# Random sampling helper
def sample_from_normal(mean, std, size, rng=None):
    """Draw samples from a normal distribution and clip negatives to zero.

    Parameters
    ----------
    mean : float
        Centre of the normal distribution.
    std : float
        Standard deviation of the normal distribution.
    size : int
        Number of samples to draw; 0 yields an empty array.
    rng : numpy.random.Generator, optional
        Generator to draw from. When omitted, NumPy's global random state
        (``np.random.normal``) is used, as before.

    Returns
    -------
    numpy.ndarray
        Array of ``size`` samples with every value below 0 replaced by 0.
    """
    draw = np.random.normal if rng is None else rng.normal
    # Clip at zero: values below 0 are not kept by this helper.
    return np.clip(draw(mean, std, size), 0, None)

# Seed NumPy's global random state so the imputed values are identical on every
# run of this cell. Note this resets the global state for any later code that
# also draws from np.random.
np.random.seed(42)

# NOTE(review): the imputation is conditioned on the `fraud` column. If `fraud`
# is the prediction target, the same procedure cannot be applied to data
# without labels and may leak label information - confirm this is intended.

# fraud == 1: fill missing values with draws from that class's distribution
fraud_mask = cond_missing & (file_1['fraud'] == 1)
file_1.loc[fraud_mask, 'bank_months_count'] = sample_from_normal(fraud_mean, fraud_std, fraud_mask.sum())

# fraud == 0: same, using the non-fraud class's distribution
nonfraud_mask = cond_missing & (file_1['fraud'] == 0)
file_1.loc[nonfraud_mask, 'bank_months_count'] = sample_from_normal(nonfraud_mean, nonfraud_std, nonfraud_mask.sum())

In [84]:
# initial_transfer_amount
# 결측치 너무 많아서 그냥 제거

In [85]:
# Correlation matrix (DataFrame.corr defaults) between the feature and `fraud`.
print(file_1.loc[:, ['initial_transfer_amount', 'fraud']].corr())

                         initial_transfer_amount     fraud
initial_transfer_amount                 1.000000  0.014471
fraud                                   0.014471  1.000000


In [117]:
# Drop the column entirely (the author notes it has too many missing values).
# errors='ignore' makes the cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
file_1 = file_1.drop(columns=['initial_transfer_amount'], errors='ignore')

In [122]:
# prev_address_months_count

In [126]:
# Correlation matrix (DataFrame.corr defaults) between the feature and `fraud`.
print(file_1.loc[:, ['prev_address_months_count', 'fraud']].corr())

                           prev_address_months_count     fraud
prev_address_months_count                   1.000000  0.021867
fraud                                       0.021867  1.000000
