In [1]:
import pandas as pd
import os

In [2]:
import pandas as pd
import os
from functools import reduce

# Root folder of the training data; each table lives in its own sub-folder.
base_path = r"C:\Users\user\Desktop\빅데이터 분석 부트캠프\00_workspace\11_파이널프로젝트관련\open\train"
# Two-digit month strings "07" .. "12" (combined with the "2018" prefix below).
months = [f"{month_number:02d}" for month_number in range(7, 13)]

# (sub-folder name, table-name part used in the parquet file names)
targets = [
    ("4.청구입금정보", "청구정보"),
    ("5.잔액정보", "잔액정보"),
    ("6.채널정보", "채널정보"),
]

# Concatenated result per table, keyed by the table-name part
merged_tables = {}

for folder_name, category in targets:
    monthly_frames = []

    for month in months:
        parquet_name = f"2018{month}_train_{category}.parquet"
        file_path = os.path.join(base_path, folder_name, parquet_name)

        try:
            month_df = pd.read_parquet(file_path, engine="pyarrow")
        except FileNotFoundError:
            # A missing month is reported and skipped; the remaining months are still used.
            print(f"⚠️ 파일 없음: {file_path}")
            continue

        monthly_frames.append(month_df)
        print(f"✅ 불러옴: {parquet_name} (Shape: {month_df.shape})")

    # Nothing could be read for this table: report it and move on to the next one.
    if not monthly_frames:
        print(f"❌ 병합 실패: {category}\n")
        continue

    # Stack the monthly frames row-wise with a fresh 0..n-1 index.
    merged_df = pd.concat(monthly_frames, ignore_index=True)
    merged_tables[category] = merged_df
    print(f"📦 병합 완료: {category} (총 {merged_df.shape})\n")

# Segment labels are taken from the 201807 member-info file only.
member_file = os.path.join(base_path, "1.회원정보", "201807_train_회원정보.parquet")
member_df = pd.read_parquet(member_file, engine="pyarrow")
# Independent copy holding just the join key and the label.
segment_df = member_df.loc[:, ["ID", "Segment"]].copy()
print(f"✅ Segment 추출 완료 (Shape: {segment_df.shape})\n")

✅ 불러옴: 201807_train_청구정보.parquet (Shape: (400000, 46))
✅ 불러옴: 201808_train_청구정보.parquet (Shape: (400000, 46))
✅ 불러옴: 201809_train_청구정보.parquet (Shape: (400000, 46))
✅ 불러옴: 201810_train_청구정보.parquet (Shape: (400000, 46))
✅ 불러옴: 201811_train_청구정보.parquet (Shape: (400000, 46))
✅ 불러옴: 201812_train_청구정보.parquet (Shape: (400000, 46))
📦 병합 완료: 청구정보 (총 (2400000, 46))

✅ 불러옴: 201807_train_잔액정보.parquet (Shape: (400000, 82))
✅ 불러옴: 201808_train_잔액정보.parquet (Shape: (400000, 82))
✅ 불러옴: 201809_train_잔액정보.parquet (Shape: (400000, 82))
✅ 불러옴: 201810_train_잔액정보.parquet (Shape: (400000, 82))
✅ 불러옴: 201811_train_잔액정보.parquet (Shape: (400000, 82))
✅ 불러옴: 201812_train_잔액정보.parquet (Shape: (400000, 82))
📦 병합 완료: 잔액정보 (총 (2400000, 82))

✅ 불러옴: 201807_train_채널정보.parquet (Shape: (400000, 105))
✅ 불러옴: 201808_train_채널정보.parquet (Shape: (400000, 105))
✅ 불러옴: 201809_train_채널정보.parquet (Shape: (400000, 105))
✅ 불러옴: 201810_train_채널정보.parquet (Shape: (400000, 105))
✅ 불러옴: 201811_train_채널정보.parquet (Shape: (400000, 

In [7]:
# Output directory for the merged tables
save_path = r"C:\Users\user\Desktop\빅데이터 분석 부트캠프\00_workspace\11_파이널프로젝트관련\open\train"

# Create the output directory if it does not exist yet
os.makedirs(save_path, exist_ok=True)

# Write every merged table to its own parquet file named "merged_<table>.parquet"
for category in merged_tables:
    df = merged_tables[category]

    # File name without the extension
    base_filename = f"merged_{category}"

    # Save as Parquet; the row index is not written
    parquet_path = os.path.join(save_path, base_filename + ".parquet")
    df.to_parquet(parquet_path, engine="pyarrow", index=False)
    print(f"📦 Parquet 저장 완료: {parquet_path}\n")



📦 Parquet 저장 완료: C:\Users\user\Desktop\빅데이터 분석 부트캠프\00_workspace\11_파이널프로젝트관련\open\train\merged_청구정보.parquet

📦 Parquet 저장 완료: C:\Users\user\Desktop\빅데이터 분석 부트캠프\00_workspace\11_파이널프로젝트관련\open\train\merged_잔액정보.parquet

📦 Parquet 저장 완료: C:\Users\user\Desktop\빅데이터 분석 부트캠프\00_workspace\11_파이널프로젝트관련\open\train\merged_채널정보.parquet



In [8]:
# Persist the ID/Segment lookup table as well, into the current save_path directory.
segment_parquet = os.path.join(save_path, "segment_info.parquet")
segment_df.to_parquet(path=segment_parquet, engine="pyarrow", index=False)
print(f"✅ Segment 저장 완료\n📦 Parquet: {segment_parquet}")


✅ Segment 저장 완료
📦 Parquet: C:\Users\user\Desktop\빅데이터 분석 부트캠프\00_workspace\11_파이널프로젝트관련\open\train\segment_info.parquet


In [29]:
import pandas as pd
import os

# Attach the Segment label to the merged balance table and save the result.
# Inputs created by earlier cells:
#   merged_tables: dict of concatenated monthly tables, keyed by table name
#   segment_df:    DataFrame holding only the ID and Segment columns

# Table to enrich. The dictionary key and the output file name are both derived
# from this single value so they cannot drift apart.
table_name = "잔액정보"
balance_df = merged_tables[table_name]

# Left join on ID keeps every row of the monthly table.
# validate="many_to_one" raises pandas.errors.MergeError if segment_df ever holds
# a duplicated ID, instead of silently multiplying rows in the result.
merged_with_segment = pd.merge(
    balance_df,
    segment_df,
    on="ID",
    how="left",
    validate="many_to_one",
)

# Quick look at the result
print(f"✅ 병합 완료: {merged_with_segment.shape}")
print(merged_with_segment[["ID", "Segment"]].head())

# Output directory
save_path = r"C:\Users\user\Desktop\병합결과"
os.makedirs(save_path, exist_ok=True)

# Output file name (without extension)
file_name = f"{table_name}_with_segment"

# Save as Parquet; the row index is not written
parquet_path = os.path.join(save_path, file_name + ".parquet")
merged_with_segment.to_parquet(parquet_path, index=False, engine="pyarrow")
print(f"📦 Parquet 저장 완료: {parquet_path}")

✅ 병합 완료: (2400000, 83)
             ID Segment
0  TRAIN_000000       D
1  TRAIN_000001       E
2  TRAIN_000002       C
3  TRAIN_000003       D
4  TRAIN_000004       E
📁 CSV 저장 완료: C:\Users\user\Desktop\병합결과\잔액정보_with_segment.csv
📦 Parquet 저장 완료: C:\Users\user\Desktop\병합결과\잔액정보_with_segment.parquet


In [35]:
import pandas as pd
import os

# Attach the Segment label to the merged billing table and save the result.
# Inputs created by earlier cells:
#   merged_tables: dict of concatenated monthly tables, keyed by table name
#   segment_df:    DataFrame holding only the ID and Segment columns

# Table to enrich. The dictionary key and the output file name are both derived
# from this single value so they cannot drift apart.
table_name = "청구정보"
billing_df = merged_tables[table_name]

# Left join on ID keeps every row of the monthly table.
# validate="many_to_one" raises pandas.errors.MergeError if segment_df ever holds
# a duplicated ID, instead of silently multiplying rows in the result.
merged_with_segment = pd.merge(
    billing_df,
    segment_df,
    on="ID",
    how="left",
    validate="many_to_one",
)

# Quick look at the result
print(f"✅ 병합 완료: {merged_with_segment.shape}")
print(merged_with_segment[["ID", "Segment"]].head())

# Output directory
save_path = r"C:\Users\user\Desktop\병합결과"
os.makedirs(save_path, exist_ok=True)

# Output file name (without extension)
file_name = f"{table_name}_with_segment"

# Save as Parquet; the row index is not written
parquet_path = os.path.join(save_path, file_name + ".parquet")
merged_with_segment.to_parquet(parquet_path, index=False, engine="pyarrow")
print(f"📦 Parquet 저장 완료: {parquet_path}")

✅ 병합 완료: (2400000, 47)
             ID Segment
0  TRAIN_000000       D
1  TRAIN_000001       E
2  TRAIN_000002       C
3  TRAIN_000003       D
4  TRAIN_000004       E
📦 Parquet 저장 완료: C:\Users\user\Desktop\병합결과\청구정보_with_segment.parquet


In [37]:
import pandas as pd
import os

# Attach the Segment label to the merged channel table and save the result.
# Inputs created by earlier cells:
#   merged_tables: dict of concatenated monthly tables, keyed by table name
#   segment_df:    DataFrame holding only the ID and Segment columns

# Table to enrich. The dictionary key and the output file name are both derived
# from this single value so they cannot drift apart.
table_name = "채널정보"
channel_df = merged_tables[table_name]

# Left join on ID keeps every row of the monthly table.
# validate="many_to_one" raises pandas.errors.MergeError if segment_df ever holds
# a duplicated ID, instead of silently multiplying rows in the result.
merged_with_segment = pd.merge(
    channel_df,
    segment_df,
    on="ID",
    how="left",
    validate="many_to_one",
)

# Quick look at the result
print(f"✅ 병합 완료: {merged_with_segment.shape}")
print(merged_with_segment[["ID", "Segment"]].head())

# Output directory
save_path = r"C:\Users\user\Desktop\병합결과"
os.makedirs(save_path, exist_ok=True)

# Output file name (without extension)
file_name = f"{table_name}_with_segment"

# Save as Parquet; the row index is not written
parquet_path = os.path.join(save_path, file_name + ".parquet")
merged_with_segment.to_parquet(parquet_path, index=False, engine="pyarrow")
print(f"📦 Parquet 저장 완료: {parquet_path}")

✅ 병합 완료: (2400000, 106)
             ID Segment
0  TRAIN_000000       D
1  TRAIN_000001       E
2  TRAIN_000002       C
3  TRAIN_000003       D
4  TRAIN_000004       E
📦 Parquet 저장 완료: C:\Users\user\Desktop\병합결과\채널정보_with_segment.parquet


In [33]:
# Reload the saved balance + Segment table for inspection.
# The file is read from the folder this notebook writes it to
# (C:\Users\user\Desktop\병합결과) instead of a path relative to the current
# working directory, so the cell does not depend on where the kernel was started.
df1 = pd.read_parquet(os.path.join(r"C:\Users\user\Desktop\병합결과", "잔액정보_with_segment.parquet"))
df1

Unnamed: 0,기준년월,ID,잔액_일시불_B0M,잔액_할부_B0M,잔액_현금서비스_B0M,잔액_리볼빙일시불이월_B0M,잔액_리볼빙CA이월_B0M,잔액_카드론_B0M,월중평잔_일시불_B0M,월중평잔_할부_B0M,...,평잔_일시불_6M,평잔_일시불_해외_6M,평잔_RV일시불_6M,평잔_RV일시불_해외_6M,평잔_할부_6M,평잔_할부_해외_6M,평잔_CA_6M,평잔_CA_해외_6M,평잔_카드론_6M,Segment
0,201807,TRAIN_000000,998,962,22971,0,0,0,1084,547,...,2440,0,0,0,572,0,17008,0,0,D
1,201807,TRAIN_000001,2565,2390,0,0,0,0,4090,2553,...,2677,0,2830,0,2736,0,0,0,0,E
2,201807,TRAIN_000002,5312,5113,21531,6795,0,0,5006,8778,...,9118,0,8870,0,4429,0,43351,0,0,C
3,201807,TRAIN_000003,730,5025,26284,0,0,0,487,5607,...,884,0,0,0,5097,0,30697,0,0,D
4,201807,TRAIN_000004,0,0,0,0,0,0,0,0,...,21,0,0,0,0,0,0,0,0,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,E
2399996,201812,TRAIN_399996,3351,0,0,0,0,27337,4412,0,...,12524,0,0,0,0,0,0,0,23031,D
2399997,201812,TRAIN_399997,2524,2960,0,0,0,0,2694,3374,...,3241,0,0,0,3995,0,0,0,0,C
2399998,201812,TRAIN_399998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,E


In [39]:
# Reload the saved channel + Segment table for inspection.
# The file is read from the folder this notebook writes it to
# (C:\Users\user\Desktop\병합결과) instead of a path relative to the current
# working directory, so the cell does not depend on where the kernel was started.
df1 = pd.read_parquet(os.path.join(r"C:\Users\user\Desktop\병합결과", "채널정보_with_segment.parquet"))
df1

Unnamed: 0,기준년월,ID,인입횟수_ARS_R6M,이용메뉴건수_ARS_R6M,인입일수_ARS_R6M,인입월수_ARS_R6M,인입후경과월_ARS,인입횟수_ARS_B0M,이용메뉴건수_ARS_B0M,인입일수_ARS_B0M,...,당사PAY_방문월수_R6M,당사멤버쉽_방문횟수_B0M,당사멤버쉽_방문횟수_R6M,당사멤버쉽_방문월수_R6M,OS구분코드,홈페이지_금융건수_R6M,홈페이지_선결제건수_R6M,홈페이지_금융건수_R3M,홈페이지_선결제건수_R3M,Segment
0,201807,TRAIN_000000,10회 이상,10회 이상,8,6,0,2,6,2,...,0,22,221,6,Android,0,0,0,0,D
1,201807,TRAIN_000001,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,,0,0,0,0,E
2,201807,TRAIN_000002,1회 이상,1회 이상,1,1,0,2,5,1,...,0,0,0,0,Android,11,6,5,5,C
3,201807,TRAIN_000003,10회 이상,10회 이상,10,6,0,2,6,2,...,0,23,219,6,Android,0,0,0,0,D
4,201807,TRAIN_000004,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,Android,0,0,0,0,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,,0,0,0,0,E
2399996,201812,TRAIN_399996,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,1,,0,0,0,0,D
2399997,201812,TRAIN_399997,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,,0,0,0,0,C
2399998,201812,TRAIN_399998,1회 이상,1회 이상,0,0,0,0,0,0,...,0,0,0,0,,0,0,0,0,E


In [41]:
# Reload the saved billing + Segment table for inspection.
# The file is read from the folder this notebook writes it to
# (C:\Users\user\Desktop\병합결과) instead of a path relative to the current
# working directory, so the cell does not depend on where the kernel was started.
df1 = pd.read_parquet(os.path.join(r"C:\Users\user\Desktop\병합결과", "청구정보_with_segment.parquet"))
df1

Unnamed: 0,기준년월,ID,대표결제일,대표결제방법코드,대표청구지고객주소구분코드,대표청구서수령지구분코드,청구서수령방법,청구서발송여부_B0,청구서발송여부_R3M,청구서발송여부_R6M,...,상환개월수_결제일_R6M,상환개월수_결제일_R3M,선결제건수_R6M,선결제건수_R3M,연체건수_R6M,연체건수_R3M,혜택수혜금액_R3M,포인트_마일리지_환산_B0M,혜택수혜금액,Segment
0,201807,TRAIN_000000,27,자동이체,미확인,당사페이앱+이메일,문자메세지,1,1,1,...,5,3,0,0,1,0,3,0,0,D
1,201807,TRAIN_000001,13,자동이체,주거지,우편,우편,1,1,1,...,6,3,0,0,0,0,0,0,0,E
2,201807,TRAIN_000002,1,자동이체,미확인,이메일,이메일,1,1,1,...,6,3,0,0,0,0,121,0,50,C
3,201807,TRAIN_000003,5,자동이체,주거지,우편,우편,1,1,1,...,5,3,2,0,0,0,3,0,2,D
4,201807,TRAIN_000004,13,자동이체,주거지,우편,우편,0,1,1,...,1,1,0,0,0,0,0,0,0,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,25,자동이체,주거지,우편,우편,0,0,0,...,0,0,0,0,0,0,0,0,0,E
2399996,201812,TRAIN_399996,20,자동이체,미확인,이메일,이메일,1,1,1,...,6,3,0,0,0,0,164,0,53,D
2399997,201812,TRAIN_399997,20,자동이체,회사,우편,우편,1,1,1,...,6,3,0,0,0,0,0,0,0,C
2399998,201812,TRAIN_399998,20,자동이체,주거지,우편,우편,0,0,0,...,0,0,0,0,0,0,0,0,0,E


In [27]:
# Reload the merged balance table from disk and display it.
# NOTE(review): the path is relative to the kernel's working directory.
merged_balance_file = 'open/train/merged_잔액정보.parquet'
df1 = pd.read_parquet(merged_balance_file)
df1

Unnamed: 0,기준년월,ID,잔액_일시불_B0M,잔액_할부_B0M,잔액_현금서비스_B0M,잔액_리볼빙일시불이월_B0M,잔액_리볼빙CA이월_B0M,잔액_카드론_B0M,월중평잔_일시불_B0M,월중평잔_할부_B0M,...,평잔_6M,평잔_일시불_6M,평잔_일시불_해외_6M,평잔_RV일시불_6M,평잔_RV일시불_해외_6M,평잔_할부_6M,평잔_할부_해외_6M,평잔_CA_6M,평잔_CA_해외_6M,평잔_카드론_6M
0,201807,TRAIN_000000,998,962,22971,0,0,0,1084,547,...,15988,2440,0,0,0,572,0,17008,0,0
1,201807,TRAIN_000001,2565,2390,0,0,0,0,4090,2553,...,7045,2677,0,2830,0,2736,0,0,0,0
2,201807,TRAIN_000002,5312,5113,21531,6795,0,0,5006,8778,...,66549,9118,0,8870,0,4429,0,43351,0,0
3,201807,TRAIN_000003,730,5025,26284,0,0,0,487,5607,...,30139,884,0,0,0,5097,0,30697,0,0
4,201807,TRAIN_000004,0,0,0,0,0,0,0,0,...,28,21,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2399996,201812,TRAIN_399996,3351,0,0,0,0,27337,4412,0,...,67826,12524,0,0,0,0,0,0,0,23031
2399997,201812,TRAIN_399997,2524,2960,0,0,0,0,2694,3374,...,8627,3241,0,0,0,3995,0,0,0,0
2399998,201812,TRAIN_399998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Reload the saved ID/Segment lookup table from disk and display it.
# NOTE(review): the path is relative to the kernel's working directory.
segment_info_file = 'open/train/segment_info.parquet'
df1 = pd.read_parquet(segment_info_file)
df1

Unnamed: 0,ID,Segment
0,TRAIN_000000,D
1,TRAIN_000001,E
2,TRAIN_000002,C
3,TRAIN_000003,D
4,TRAIN_000004,E
...,...,...
399995,TRAIN_399995,E
399996,TRAIN_399996,D
399997,TRAIN_399997,C
399998,TRAIN_399998,E


In [None]:
# Merge every monthly table into one wide frame, then attach the Segment label.
#
# Each table in merged_tables holds one row per (기준년월, ID), i.e. six rows per
# customer. Joining those tables on "ID" alone is a many-to-many join: every
# customer would expand to 6 x 6 x 6 rows and 기준년월 would be duplicated with
# _x/_y suffixes. The monthly tables are therefore joined on both key columns.
monthly_keys = ["기준년월", "ID"]
monthly_tables = list(merged_tables.values())

if monthly_tables:
    # validate="one_to_one" raises pandas.errors.MergeError if a table contains
    # a duplicated (기준년월, ID) pair instead of silently multiplying rows.
    merged_monthly = reduce(
        lambda left, right: pd.merge(
            left, right, on=monthly_keys, how="left", validate="one_to_one"
        ),
        monthly_tables,
    )
    # segment_df has only ID and Segment, so the label is attached per customer
    # and repeated on each of that customer's monthly rows.
    merged_result = pd.merge(
        merged_monthly, segment_df, on="ID", how="left", validate="many_to_one"
    )
else:
    # No monthly table was loaded; as before, the result is the Segment table alone.
    merged_result = segment_df

print(f"\n🎯 최종 병합 완료! Shape: {merged_result.shape}")