In [1]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import platform

# Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')

# Apply seaborn's default theme first; the rcParams overrides below are set
# afterwards so that they take effect on top of the theme.
sns.set()

# Plot defaults.
# Pick a Korean-capable font by platform instead of toggling a commented-out
# line by hand: AppleGothic on macOS, Malgun Gothic everywhere else
# (the previous hard-coded value).
KOREAN_FONT = 'AppleGothic' if platform.system() == 'Darwin' else 'Malgun Gothic'
plt.rcParams['font.family'] = KOREAN_FONT
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
# Draw negative numbers with the ASCII hyphen; the Unicode minus glyph may be
# missing from the Korean font selected above.
plt.rcParams['axes.unicode_minus'] = False

# Library for visualising missing values
import missingno

from functools import reduce

In [2]:
# Input CSV files, one per source table.
# Order is significant: downstream code treats the first frame differently
# from the remaining ones.
paths = [
    '회원정보_preprocessing_fin.csv',
    '신용정보_preprocessing_fin.csv',
    '승인매출정보_12월.csv',
    '청구입금정보_12월.csv',
    '잔액정보_12월.csv',
    '채널정보_전처리.csv',
    '마케팅정보_전처리.csv',
    '성과정보_preprocessing_fin.csv',
]

# Load every file into its own DataFrame, in the same order as `paths`
dfs = list(map(pd.read_csv, paths))

In [3]:
# Keep 'Segment' only in the first DataFrame; drop it from every other frame
# (frames that do not have the column are left untouched).
for df in dfs[1:]:
    df.drop(columns=['Segment'], errors='ignore', inplace=True)

# Merge all tables on the shared key 'ID'.
# validate='one_to_one' makes pandas raise MergeError when 'ID' is duplicated
# on either side, instead of silently multiplying rows in the outer join.
merged_df = reduce(
    lambda left, right: pd.merge(
        left, right, on='ID', how='outer', validate='one_to_one'
    ),
    dfs
)

# Split into train / test by the prefix of 'ID'.
# Rows whose ID starts with neither prefix end up in neither frame.
is_train = merged_df['ID'].str.startswith('TRAIN')
is_test = merged_df['ID'].str.startswith('TEST')
train_df = merged_df[is_train].copy()
test_df  = merged_df[is_test].copy()

# Report the resulting row counts
print(f"Train 데이터 수: {len(train_df)}")
print(f"Test 데이터 수:  {len(test_df)}")

Train 데이터 수: 400000
Test 데이터 수:  100000


In [4]:
# For each split, print how many columns there are of each dtype
splits = {'train_df': train_df, 'test_df': test_df}
for name, df_part in splits.items():
    print(f"\n{name} 컬럼 데이터 타입별 개수:")
    dtype_counts = df_part.dtypes.value_counts()
    print(dtype_counts)


train_df 컬럼 데이터 타입별 개수:
int64      613
bool        99
float64     54
object       2
Name: count, dtype: int64

test_df 컬럼 데이터 타입별 개수:
int64      613
bool        99
float64     54
object       2
Name: count, dtype: int64


In [5]:
# Columns of train_df with a text-like dtype: object, pandas string, or category
text_like_dtypes = ['object', 'string', 'category']
text_cols = list(train_df.select_dtypes(include=text_like_dtypes).columns)
print("문자열 계열 컬럼:", text_cols)

문자열 계열 컬럼: ['ID', 'Segment']


In [6]:
# Names of the object-dtype columns in train_df, with their count
obj_cols_train = list(train_df.select_dtypes(include='object').columns)
print(f"train_df object 컬럼 ({len(obj_cols_train)}개):")
print(obj_cols_train)

# Same report for test_df
obj_cols_test = list(test_df.select_dtypes(include='object').columns)
print(f"\ntest_df object 컬럼 ({len(obj_cols_test)}개):")
print(obj_cols_test)

train_df object 컬럼 (2개):
['ID', 'Segment']

test_df object 컬럼 (2개):
['ID', 'Segment']


In [7]:
# Per-column missing-value counts for train_df, keeping only columns that
# have at least one missing value
na_counts = train_df.isna().sum().loc[lambda counts: counts > 0]

# to_string() prints every entry without truncation
print(na_counts.to_string())

Series([], )


In [8]:
# Count missing values per column of test_df
missing_per_col = test_df.isna().sum()

# Keep only the columns that have at least one missing value
na_counts_test = missing_per_col[missing_per_col.gt(0)]

# Print the result
print("결측치 있는 컬럼 및 개수 (test_df):")
print(na_counts_test)

결측치 있는 컬럼 및 개수 (test_df):
Segment    100000
dtype: int64


In [9]:
# Save both splits as CSV files, without writing the index column
exports = (
    (train_df, 'merged_2018_train.csv'),
    (test_df, 'merged_2018_test.csv'),
)
for frame, out_path in exports:
    frame.to_csv(out_path, index=False)

In [10]:
# Read the exported training CSV back in and show it as the cell output
train_csv_path = 'merged_2018_train.csv'
df2 = pd.read_csv(train_csv_path)
df2

Unnamed: 0,ID,남녀구분코드,연령,Segment,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,소지카드수_이용가능_신용,...,변동률_RV일시불평잔,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M
0,TRAIN_000000,2,2,D,1,1,0,1,1,1,...,0.999998,0.591302,1.001020,0.999998,0.999998,-0.057150,0.061048,0.000000,0.878859,1.398627
1,TRAIN_000001,1,1,E,1,1,1,1,1,1,...,0.965251,0.901252,0.999998,0.999998,0.999998,-0.033906,-0.020131,0.000000,0.000000,0.000000
2,TRAIN_000002,1,1,C,1,1,0,1,1,1,...,1.005795,0.585823,0.997353,0.000000,0.999998,-0.097278,-0.076351,-0.115879,0.187467,-1.198788
3,TRAIN_000003,2,2,D,1,1,0,1,2,2,...,0.999998,0.774731,1.003519,0.999998,0.999998,0.142766,0.090599,0.000000,0.781401,1.282494
4,TRAIN_000004,2,2,E,1,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,0.762016,0.986860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,TRAIN_399995,2,5,E,1,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,0.762016,0.986860
399996,TRAIN_399996,2,3,D,1,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.921733,-0.203251,-0.159143,0.000000,1.377071,2.533815
399997,TRAIN_399997,1,1,C,1,1,0,1,1,1,...,0.999998,0.345027,0.999998,0.999998,0.999998,0.027319,0.126581,0.000000,0.000000,0.000000
399998,TRAIN_399998,1,2,E,1,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,0.762016,0.986860
