In [1]:
# 기본 라이브러리
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import platform
import warnings

# Apply seaborn's default theme (set_theme is the preferred name; sns.set is only an alias for it)
sns.set_theme()

# Global plot defaults
# Choose a Korean-capable font for the OS the kernel runs on, instead of
# hard-coding the Windows font and keeping the macOS one as commented-out code.
# On any other OS 'NanumGothic' is used; it must be installed separately.
_KOREAN_FONT_BY_OS = {
    'Windows': 'Malgun Gothic',
    'Darwin': 'AppleGothic',
}
plt.rcParams['font.family'] = _KOREAN_FONT_BY_OS.get(platform.system(), 'NanumGothic')
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
# Render minus signs with the ASCII hyphen; presumably needed because the
# Korean fonts above do not all ship the Unicode minus glyph — TODO confirm.
plt.rcParams['axes.unicode_minus'] = False

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and dtype warnings from pandas.
warnings.filterwarnings('ignore')

In [2]:
# Load the parquet file into the working frame
member_info_path = 'open/train/1.회원정보/2018_train_회원정보.parquet'
df = pd.read_parquet(member_info_path)
# Bare last expression: the notebook renders a preview of the frame
df

Unnamed: 0,기준년월,ID,남녀구분코드,연령,Segment,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,...,할인금액_제휴연회비_B0M,청구금액_기본연회비_B0M,청구금액_제휴연회비_B0M,상품관련면제카드수_B0M,임직원면제카드수_B0M,우수회원면제카드수_B0M,기타면제카드수_B0M,카드신청건수,Life_Stage,최종카드발급경과월
0,201807,TRAIN_000000,2,40대,D,1,1,0,1,1,...,0,0,0,0개,0개,0개,0개,0,자녀성장(2),22
1,201807,TRAIN_000001,1,30대,E,1,1,1,1,1,...,0,0,0,0개,0개,0개,0개,0,자녀성장(1),18
2,201807,TRAIN_000002,1,30대,C,1,1,0,1,1,...,0,0,0,0개,0개,0개,0개,0,자녀출산기,20
3,201807,TRAIN_000003,2,40대,D,1,1,0,1,2,...,0,0,0,0개,0개,0개,0개,1,자녀성장(2),17
4,201807,TRAIN_000004,2,40대,E,1,1,1,1,1,...,0,0,0,0개,0개,0개,0개,1,자녀성장(1),15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,2,70대이상,E,1,1,1,1,1,...,0,0,0,0개,0개,0개,0개,0,노년생활,39
2399996,201812,TRAIN_399996,2,50대,D,1,1,1,1,1,...,0,0,0,0개,0개,0개,0개,0,자녀성장(2),24
2399997,201812,TRAIN_399997,1,30대,C,1,1,0,1,1,...,0,0,0,0개,0개,0개,0개,0,자녀출산기,18
2399998,201812,TRAIN_399998,1,40대,E,1,1,1,1,1,...,0,0,0,0개,0개,0개,0개,0,자녀성장(1),27


In [3]:
# 1. Date column handling
date_cols = [
    '기준년월', '입회일자_신용', '최종유효년월_신용_이용가능',
    '최종유효년월_신용_이용', '최종카드발급일자'
]

# Placeholder codes that stand for "no date" rather than a real year/month
_DATE_SENTINELS = ['0', '999999', '99999999']


def _parse_date_codes(values):
    """Convert a column of numeric date codes into datetime64 values.

    Six-character codes are parsed as YYYYMM and eight-character codes as
    YYYYMMDD. Sentinel codes, missing values and anything that matches
    neither layout become NaT.

    NOTE(review): the original cell assumed YYYYMM for every column, which
    coerces any eight-digit code to NaT. Whether the '일자' columns actually
    hold day-level codes is not visible here — TODO confirm against the data.

    Parameters
    ----------
    values : pandas.Series
        Raw column; integer, float, string or already-parsed datetimes.

    Returns
    -------
    pandas.Series
        Datetime series sharing the index of ``values``.
    """
    # The cell was already run: parsing the datetimes again would stringify
    # them as 'YYYY-MM-DD', fail the format check and wipe the column to NaT.
    if pd.api.types.is_datetime64_any_dtype(values):
        return values

    # Work on text; strip the trailing ".0" that a float-stored code carries
    text = values.astype('string').str.strip().str.replace(r'\.0+$', '', regex=True)
    text = text.mask(text.isin(_DATE_SENTINELS))

    # Choose the layout by length so that a six-digit code can never be
    # misread under the day-level format (and vice versa)
    n_chars = text.str.len()
    is_month = n_chars.eq(6).fillna(False).astype(bool)
    is_day = n_chars.eq(8).fillna(False).astype(bool)

    as_month = pd.to_datetime(text.where(is_month), format='%Y%m', errors='coerce')
    as_day = pd.to_datetime(text.where(is_day), format='%Y%m%d', errors='coerce')
    return as_month.fillna(as_day)


for col in date_cols:
    df[col] = _parse_date_codes(df[col])

In [4]:
# 2. Split the columns into numeric and categorical groups
all_numeric = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Integer-typed columns with at most 10 distinct non-null values are treated
# as categorical even though their dtype is numeric
numeric_to_categorical = [
    name
    for name in all_numeric
    if pd.api.types.is_integer_dtype(df[name]) and df[name].nunique() <= 10
]

# Move those columns from the numeric list to the categorical list
numeric_cols = [name for name in all_numeric if name not in numeric_to_categorical]
categorical_cols.extend(numeric_to_categorical)

In [5]:
# 3. Build independent numeric and categorical frames from the column lists
df_numeric = df.loc[:, numeric_cols].copy()
df_categorical = df.loc[:, categorical_cols].copy()

In [6]:
# 4. Summarise the distinct values of each categorical column
print("🎯 범주형 변수 고유값 분포:")
for name, values in df_categorical.items():
    n_unique = values.nunique()
    print(f"\n🔸 {name} (고유값 {n_unique}개):")
    # Show only the 10 most frequent values, counting missing entries too
    top_counts = values.value_counts(dropna=False).head(10)
    print(top_counts)

🎯 범주형 변수 고유값 분포:

🔸 ID (고유값 400000개):
ID
TRAIN_000000    6
TRAIN_266650    6
TRAIN_266672    6
TRAIN_266671    6
TRAIN_266670    6
TRAIN_266669    6
TRAIN_266668    6
TRAIN_266667    6
TRAIN_266666    6
TRAIN_266665    6
Name: count, dtype: int64

🔸 연령 (고유값 6개):
연령
40대      733146
30대      592146
50대      546342
60대      246990
20대      191880
70대이상     89496
Name: count, dtype: int64

🔸 Segment (고유값 5개):
Segment
E    1922052
D     349242
C     127590
A        972
B        144
Name: count, dtype: int64

🔸 가입통신회사코드 (고유값 3개):
가입통신회사코드
S사      995637
K사      565208
L사      451585
None    387570
Name: count, dtype: int64

🔸 거주시도명 (고유값 17개):
거주시도명
서울    692656
경기    652987
인천    162815
부산    149007
대전    142800
충북    110601
경남     96244
경북     91237
대구     80687
울산     70491
Name: count, dtype: int64

🔸 직장시도명 (고유값 17개):
직장시도명
경기      598244
서울      576618
None    244969
인천      146944
부산      131285
대전      120697
충북      103951
경남       94527
경북       82975
대구       76605
Name: count, dtyp

In [8]:
# 5. Summary statistics for the numeric columns
print("\n📊 수치형 변수 요약 통계:")
# Transposed so that each column of df_numeric becomes one row of the table
numeric_summary = df_numeric.describe().T
display(numeric_summary)


📊 수치형 변수 요약 통계:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
입회경과개월수_신용,2400000.0,73.602731,72.410422,2.0,17.0,51.0,104.0,337.0
최종탈회후경과월,2400000.0,27.259906,38.302494,0.0,0.0,0.0,48.0,117.0
이용금액_R3M_신용체크,2400000.0,16937.448358,23247.585945,-8749.0,432.0,8879.0,21793.0,235195.0
이용금액_R3M_신용,2400000.0,15286.827581,21898.353968,-8749.0,0.0,7681.0,19568.0,163706.0
이용금액_R3M_신용_가족,2400000.0,280.848698,1764.875769,0.0,0.0,0.0,0.0,53199.0
이용금액_R3M_체크,2400000.0,1650.561734,7528.215703,0.0,0.0,0.0,0.0,163250.0
_1순위카드이용금액,2400000.0,11020.846708,14090.018772,-3925.0,0.0,6255.0,15979.0,118507.0
_1순위카드이용건수,2400000.0,39.345748,51.000916,-2.0,1.0,17.0,61.0,224.0
_2순위카드이용금액,2400000.0,3372.411952,7707.607849,-4042.0,0.0,0.0,3201.0,84966.0
_2순위카드이용건수,2400000.0,14.321176,29.710147,-1.0,0.0,0.0,16.0,209.0
