In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import torch

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')

In [3]:
#ID 컬럼 분리
train = train.drop(columns=['ID'], axis = 1)
test = test.drop(columns=['ID'], axis = 1)

In [4]:
import re

def clean_valuation(val):
    if pd.isna(val):
        return None
    if '이상' in val:
        # '6000이상' → 6000
        return int(re.sub('[^0-9]', '', val))
    elif '-' in val:
        # '2500-3500' → 평균값 계산
        low, high = map(int, val.split('-'))
        return (low + high) / 2
    else:
        # 숫자로 변환 가능한 경우
        try:
            return float(val)
        except:
            return None


In [5]:
# 설립연도 -> 연차로 변경
current_year = 2025

train['연차'] = current_year - train['설립연도']
test['연차'] = current_year - test['설립연도']

# 설립연도 제거
train.drop(columns = ['설립연도'], inplace = True)
test.drop(columns = ['설립연도'], inplace = True)

category_features = ['국가','분야']
numeric_features = ['연차', '투자단계', '직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)', '기업가치(백억원)']
bool_features = ['인수여부','상장여부']

# 투자단계 순서를 숫자로 매핑
investment_stage_map = {
    'Seed': 0,
    'Series A': 1,
    'Series B': 2,
    'Series C': 3,
    'IPO': 4,
    'Missing': -1
}

# 결측치 먼저 처리 후 매핑
train['투자단계'] = train['투자단계'].fillna('Missing').map(investment_stage_map)
test['투자단계'] = test['투자단계'].fillna('Missing').map(investment_stage_map)

# 기업가치 변환
train['기업가치(백억원)'] = train['기업가치(백억원)'].apply(clean_valuation)
test['기업가치(백억원)'] = test['기업가치(백억원)'].apply(clean_valuation)

# LabelEncoder 객체를 각 범주형 feature별로 따로 저장하여 사용
encoders = {}

# 범주형 데이터를 encoding
for feature in category_features:
    encoders[feature] = LabelEncoder()
    train[feature] = train[feature].fillna('Missing')
    test[feature] = test[feature].fillna('Missing')
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# 불리언 값을 0과 1로 변환 ('Yes' → 1, 'No' → 0 으로 변환)
bool_map = {'Yes': 1, 'No': 0}

for feature in bool_features:
    train[feature] = train[feature].map(bool_map)
    test[feature] = test[feature].map(bool_map)

# 수치형 변수 결측치를 중간값으로 대체
for feature in numeric_features:
    median_value = train[feature].median()
    train[feature] = train[feature].fillna(median_value)
    test[feature] = test[feature].fillna(median_value)

# TabNet용 범주형 변수 인덱스(cat_idxs) 및 차원(cat_dims) 설정
features = [col for col in train.columns if col != '성공확률']
cat_idxs = [features.index(col) for col in category_features]
cat_dims = [train[col].max() + 1 for col in category_features]

In [5]:
# 설립연도 -> 연차로 변경
current_year = 2025

train['연차'] = current_year - train['설립연도']
test['연차'] = current_year - test['설립연도']

# 설립연도 제거
train.drop(columns = ['설립연도'], inplace = True)
test.drop(columns = ['설립연도'], inplace = True)

category_features = ['국가','분야']
numeric_features = ['연차', '투자단계', '직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)', '기업가치(백억원)']
bool_features = ['인수여부','상장여부']

# 투자단계 순서를 숫자로 매핑
investment_stage_map = {
    'Seed': 0,
    'Series A': 1,
    'Series B': 2,
    'Series C': 3,
    'IPO': 4,
    'Missing': -1
}

# 결측치 먼저 처리 후 매핑
train['투자단계'] = train['투자단계'].map(investment_stage_map)
test['투자단계'] = test['투자단계'].map(investment_stage_map)

# 기업가치 변환
train['기업가치(백억원)'] = train['기업가치(백억원)'].apply(clean_valuation)
test['기업가치(백억원)'] = test['기업가치(백억원)'].apply(clean_valuation)

# LabelEncoder 객체를 각 범주형 feature별로 따로 저장하여 사용
encoders = {}

# 범주형 데이터를 encoding
for feature in category_features:
    encoders[feature] = LabelEncoder()
    train[feature] = train[feature].fillna('Missing')
    test[feature] = test[feature].fillna('Missing')
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# 불리언 값을 0과 1로 변환 ('Yes' → 1, 'No' → 0 으로 변환)
bool_map = {'Yes': 1, 'No': 0}

for feature in bool_features:
    train[feature] = train[feature].map(bool_map)
    test[feature] = test[feature].map(bool_map)

# # 수치형 변수 결측치를 중간값으로 대체
# for feature in numeric_features:
#     median_value = train[feature].median()
#     train[feature] = train[feature].fillna(median_value)
#     test[feature] = test[feature].fillna(median_value)

# TabNet용 범주형 변수 인덱스(cat_idxs) 및 차원(cat_dims) 설정
features = [col for col in train.columns if col != '성공확률']
cat_idxs = [features.index(col) for col in category_features]
cat_dims = [train[col].max() + 1 for col in category_features]

In [5]:
from sklearn.preprocessing import LabelEncoder

# 📌 설립연도 → 연차
current_year = 2025
train['연차'] = current_year - train['설립연도']
test['연차'] = current_year - test['설립연도']
train.drop(columns=['설립연도'], inplace=True)
test.drop(columns=['설립연도'], inplace=True)

# ✅ 전처리 대상 컬럼 분류
category_features = ['국가', '분야']
numeric_features = ['연차', '투자단계', '직원 수', '고객수(백만명)', '총 투자금(억원)', '연매출(억원)', 'SNS 팔로워 수(백만명)', '기업가치(백억원)']
bool_features = ['인수여부', '상장여부']

# ✅ 투자단계 문자열 → 숫자 맵핑
investment_stage_map = {
    'Seed': 0.546171,
    'Series A':  0.538014,
    'Series B': 0.527797,
    'Series C': 0.538425,
    'IPO': 0.536141
}

train['투자단계'] = train['투자단계'].map(investment_stage_map)
test['투자단계'] = test['투자단계'].map(investment_stage_map)

# ✅ 기업가치 문자열 처리 함수 적용
train['기업가치(백억원)'] = train['기업가치(백억원)'].apply(clean_valuation)
test['기업가치(백억원)'] = test['기업가치(백억원)'].apply(clean_valuation)

# ✅ 타겟 인코딩 평균값 딕셔너리
avg_success_dict = {
    '국가': {
        'CT009': 0.556098,
        'CT008': 0.549775,
        'CT006': 0.548276,
        'CT007': 0.540094,
        'CT005': 0.537220,
        'CT003': 0.535022,
        'CT010': 0.533333,
        'CT001': 0.531303,
        'CT004': 0.529274,
        'CT002': 0.515367
    },
    '분야': {
        '핀테크': 0.567151,
        '기술': 0.540103,
        '물류': 0.539939,
        '에듀테크': 0.539011,
        'AI': 0.537079,
        '푸드테크': 0.533731,
        '게임': 0.532869,
        '에너지': 0.529545,
        '이커머스': 0.520482,
        '헬스케어': 0.493671
    },
    '인수여부': {
        'No': 0.540349,
        'Yes': 0.534515
    },
    '상장여부': {
        'Yes': 0.545196,
        'No': 0.530004
    }
}

# ✅ 해당 컬럼 타겟 인코딩 적용
for col in ['국가', '분야', '인수여부', '상장여부']:
    train[col] = train[col].map(avg_success_dict[col])
    test[col] = test[col].map(avg_success_dict[col])

# ✅ 불리언 형태가 아직 문자열이면 변환 (보완용, 이미 숫자면 영향 없음)
# bool_map = {'Yes': 1, 'No': 0}
# for feature in bool_features:
#     train[feature] = train[feature].map(bool_map).fillna(train[feature])
#     test[feature] = test[feature].map(bool_map).fillna(test[feature])

# ✅ (선택) 수치형 변수 결측치는 중간값으로 대체
# for feature in numeric_features:
#     median_value = train[feature].median()
#     train[feature] = train[feature].fillna(median_value)
#     test[feature] = test[feature].fillna(median_value)

# ✅ feature 리스트 생성 (성공확률 제외)
features = [col for col in train.columns if col != '성공확률']

# ✅ TabNet용 cat_idxs, cat_dims (이 경우엔 타겟 인코딩으로 모두 수치화 → cat_idxs 비울 수 있음)
cat_idxs = []
cat_dims = []


In [6]:
for df in [train, test]:
    # 1인당 지표
    df['투자_직원수'] = df['총 투자금(억원)'] / df['직원 수']
    df['매출_직원수'] = df['연매출(억원)'] / df['직원 수']

    # 비율 지표
    df['매출_투자'] = df['연매출(억원)'] / df['총 투자금(억원)']
    df['투자_기업가치'] = df['총 투자금(억원)'] / df['기업가치(백억원)']
    df['매출_고객'] = df['연매출(억원)'] / df['고객수(백만명)']

    # 스케일 변환
    df['연차_루트'] = np.sqrt(df['연차'])

    # 추가 파생 피처
    df['기업가치_투자'] = df['기업가치(백억원)'] / df['총 투자금(억원)']
    df['팔로워_고객'] = df['SNS 팔로워 수(백만명)'] / df['고객수(백만명)']
    df['팔로워_기업가치'] = df['SNS 팔로워 수(백만명)'] / df['기업가치(백억원)']
    df['고객_직원수'] = df['고객수(백만명)'] / df['직원 수']
    df['매출_기업가치'] = df['연매출(억원)'] / df['기업가치(백억원)']
    df['매출_연차'] = df['연매출(억원)'] / df['연차']
    df['매출_팔로워'] = df['연매출(억원)'] / (df['SNS 팔로워 수(백만명)'] + 1e-6)


In [7]:
def compute_combined_stat_score(df, avg_dict):
    score = pd.Series(1.0, index=df.index)  # 초기값: 곱셈 단위 1.0
    for col in avg_dict:
        mapped = df[col].map(avg_dict[col])
        
        # 매핑 안 되는 값 있을 경우 확인용 로그 출력 (또는 raise 에러 처리 가능)
        if mapped.isna().any():
            missing_vals = df.loc[mapped.isna(), col].unique()
            raise ValueError(f"[{col}]에 매핑되지 않은 값 존재: {missing_vals}")
        
        score += mapped
    return score


In [8]:
# avg_dict는 이미 준비된 상태라고 가정
avg_dict = {
    '상장여부': {1: 0.545196, 0: 0.530004},
    '인수여부': {0: 0.540349, 1: 0.534515},
    '국가': {
        8: 0.556098, 7: 0.549775, 5: 0.548276, 6: 0.540094, 4: 0.537220,
        2: 0.535022, 9: 0.533333, 0: 0.531303, 3: 0.529274, 1: 0.515367
    },
    '분야': {
        9: 0.567151, 1: 0.552042, 3: 0.540103, 4: 0.539939, 6: 0.539011,
        0: 0.537079, 8: 0.533731, 2: 0.532869, 5: 0.529545, 7: 0.520482, 10: 0.493671
    },
    '투자단계': {
        0: 0.546171, 3: 0.538425, 1: 0.538014, 4: 0.536141, 2: 0.527797
    }
}

train['통합성공스코어'] = compute_combined_stat_score(train, avg_dict)
test['통합성공스코어'] = compute_combined_stat_score(test, avg_dict)


In [71]:
features += [
    '연차'
]


In [7]:
features += [
    '투자_직원수', '매출_직원수', '매출_투자',
    '투자_기업가치', '매출_고객', '연차_루트',
    '기업가치_투자', '팔로워_고객', '팔로워_기업가치',
    '고객_직원수', '매출_기업가치', '매출_연차',
    '매출_팔로워'
]


In [26]:
# features 리스트에서 제거
features = [f for f in features if f not in ['국가', '분야', '투자단계', '인수여부', '상장여부', '직원 수', 'SNS 팔로워 수(백만명)']]


In [17]:
# features 리스트에서 제거
features = [f for f in features if f not in ['통합성공스코어']]


In [None]:
group_cols = ['상장여부', '국가', '분야', '투자단계', '인수여부']
success_rate_mean = train.groupby(group_cols)['성공확률'].mean().reset_index()

# 보기 쉽게 정렬
success_rate_mean = success_rate_mean.sort_values(by='성공확률', ascending=False)

# 결과 확인
import pandas as pd
from IPython.display import display
display(success_rate_mean)


Unnamed: 0,상장여부,국가,분야,투자단계,인수여부,성공확률
793,0,8,7,4,0,0.9
495,0,5,3,4,0,0.9
1687,1,8,6,4,1,0.9
1628,1,7,10,4,0,0.9
1627,1,7,10,3,0,0.9
...,...,...,...,...,...,...
1388,1,5,2,1,0,0.1
1600,1,7,7,1,0,0.1
1387,1,5,2,0,1,0.1
207,0,2,2,4,1,0.1


In [5]:
print("✅ 상장여부별 성공확률 평균")
print(train.groupby('상장여부')['성공확률'].mean().sort_values(ascending=False))


✅ 상장여부별 성공확률 평균
상장여부
Yes    0.545196
No     0.530004
Name: 성공확률, dtype: float64


In [6]:
print("\n✅ 국가별 성공확률 평균")
print(train.groupby('국가')['성공확률'].mean().sort_values(ascending=False))



✅ 국가별 성공확률 평균
국가
CT009    0.556098
CT008    0.549775
CT006    0.548276
CT007    0.540094
CT005    0.537220
CT003    0.535022
CT010    0.533333
CT001    0.531303
CT004    0.529274
CT002    0.515367
Name: 성공확률, dtype: float64


In [7]:
print("\n✅ 분야별 성공확률 평균")
print(train.groupby('분야')['성공확률'].mean().sort_values(ascending=False))



✅ 분야별 성공확률 평균
분야
핀테크     0.567151
기술      0.540103
물류      0.539939
에듀테크    0.539011
AI      0.537079
푸드테크    0.533731
게임      0.532869
에너지     0.529545
이커머스    0.520482
헬스케어    0.493671
Name: 성공확률, dtype: float64


In [8]:
print("\n✅ 인수여부별 성공확률 평균")
print(train.groupby('인수여부')['성공확률'].mean().sort_values(ascending=False))



✅ 인수여부별 성공확률 평균
인수여부
No     0.540349
Yes    0.534515
Name: 성공확률, dtype: float64


In [15]:
print("\n✅ 투자단계별 성공확률 평균")
print(train.groupby('투자단계')['성공확률'].mean().sort_values(ascending=False))



✅ 투자단계별 성공확률 평균
투자단계
0    0.546171
3    0.538425
1    0.538014
4    0.536141
2    0.527797
Name: 성공확률, dtype: float64


In [9]:
# 평균값을 딕셔너리로 저장
국가별_연매출_평균_dict = train.groupby('국가')['연매출(억원)'].mean().to_dict()
분야별_고객수_평균_dict = train.groupby('분야')['고객수(백만명)'].mean().to_dict()

for df in [train, test]:
    국가_평균 = df['국가'].map(국가별_연매출_평균_dict)
    분야_평균 = df['분야'].map(분야별_고객수_평균_dict)

    # 분모에 1e6 더함
    df['연매출_국가평균_비율'] = df['연매출(억원)'] / (국가_평균 + 1e6)
    df['고객수_분야평균_비율'] = df['고객수(백만명)'] / (분야_평균 + 1e6)



In [10]:
features += [
    '연매출_국가평균_비율', '고객수_분야평균_비율'
]

In [81]:
for df in [train, test]:
    df['매출_총투자비율'] = df['연매출(억원)'] / (df['총 투자금(억원)'] + 1)
    df['기업가치_직원수'] = df['기업가치(백억원)'] / (df['직원 수'] + 1)
    df['투자_per_연차'] = df['총 투자금(억원)'] / (df['연차'] + 1)
    df['고객수_per_연차'] = df['고객수(백만명)'] / (df['연차'] + 1)
    df['매출_per_고객_per_직원'] = df['연매출(억원)'] / (df['고객수(백만명)'] * df['직원 수'] + 1)

In [82]:
features += [
    '매출_총투자비율', '기업가치_직원수', '투자_per_연차', '고객수_per_연차', '매출_per_고객_per_직원'
]

In [27]:
features

['고객수(백만명)',
 '총 투자금(억원)',
 '기업가치(백억원)',
 '연차',
 '투자_직원수',
 '매출_직원수',
 '매출_투자',
 '투자_기업가치',
 '매출_고객',
 '연차_루트',
 '기업가치_투자',
 '팔로워_고객',
 '팔로워_기업가치',
 '고객_직원수',
 '매출_기업가치',
 '매출_연차',
 '매출_팔로워',
 '연매출_국가평균_비율',
 '고객수_분야평균_비율']

## 연매출 제거 효과 있음 /
## 연차_루트, 기업가치, 투자금, 매출_기업가치, 매출_투자 제거 효과 없음
## // 중간 : 고객수(백만명)
## // 매출_기업가치, 연차 그대로임

In [12]:
features = [f for f in features if f not in ['연매출(억원)']]

In [94]:
features = [f for f in features if f not in ['매출_총투자비율', '기업가치_직원수', '투자_per_연차', '고객수_per_연차', '매출_per_고객_per_직원']]

In [76]:
features += [
    '투자_직원수'
]

In [45]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# 데이터 분리 (optuna는 fold보다 빠른 실험에 적합)
X_train, X_valid, y_train, y_valid = train_test_split(
    train[features], train['성공확률'], test_size=0.2, random_state=42
)

# DMatrix 변환
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

# 1. objective 함수 정의
def objective(trial):
    param = {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'learning_rate': 0.01,
        'max_depth': 19,
        'subsample': trial.suggest_float('subsample', 0.8, 1, step=0.02),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, step=0.02),
        'lambda': 0,
        'alpha': 0,
        'tree_method': 'hist',
        'seed': 42
    }

    model = xgb.train(
        params=param,
        dtrain=dtrain,
        num_boost_round=2000,
        evals=[(dvalid, 'valid')],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    preds = model.predict(dvalid)
    score = mean_absolute_error(y_valid, preds)
    return score  # MAE가 낮을수록 좋음!

# 2. study 실행
study = optuna.create_study(direction='minimize')  # MAE 작을수록 좋음
study.optimize(objective, n_trials=300)  # 원하는 만큼 시도

# 3. 결과 확인
print("✅ Best trial:")
print(study.best_trial)

[I 2025-04-16 23:10:31,816] A new study created in memory with name: no-name-2d182a0f-0ad0-4e6c-b587-6e0d5e8ef573
[I 2025-04-16 23:10:35,847] Trial 0 finished with value: 0.19551063411312017 and parameters: {'subsample': 0.8200000000000001, 'colsample_bytree': 0.74}. Best is trial 0 with value: 0.19551063411312017.
[I 2025-04-16 23:10:40,622] Trial 1 finished with value: 0.19454891444545358 and parameters: {'subsample': 0.9, 'colsample_bytree': 0.78}. Best is trial 1 with value: 0.19454891444545358.
[I 2025-04-16 23:10:44,653] Trial 2 finished with value: 0.19468286883885458 and parameters: {'subsample': 0.98, 'colsample_bytree': 0.72}. Best is trial 1 with value: 0.19454891444545358.
[I 2025-04-16 23:10:47,178] Trial 3 finished with value: 0.19480198572079344 and parameters: {'subsample': 1.0, 'colsample_bytree': 0.66}. Best is trial 1 with value: 0.19454891444545358.
[I 2025-04-16 23:10:51,670] Trial 4 finished with value: 0.19553580454903652 and parameters: {'subsample': 0.940000000

KeyboardInterrupt: 

## 베스트2

In [None]:
    'learning_rate': 0.01999442928330417,
    'max_depth': 19,
    'subsample': 0.9480568751181326,
    'colsample_bytree': 0.754082682926498,
    'lambda': 0.00160708810021216,
    'alpha': 0.003025787373246697,

## GOAT

In [None]:
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'learning_rate': 0.015,
    'max_depth': 18,
    'subsample': 0.9480568751181326,
    'colsample_bytree': 0.754082682926498,
    'lambda': 0.00160708810021216,
    'alpha': 0,
    'tree_method': 'hist',
    'seed': 42
}

In [None]:
    'learning_rate': 0.015,
    'max_depth': 18,
    'subsample': 0.65,
    'colsample_bytree': 0.57,
    'lambda': 0,
    'alpha': 0,

In [None]:
    'learning_rate': 0.015,
    'max_depth': 18,
    'subsample': 0.80,
    'colsample_bytree': 0.79,
    'lambda': 0,
    'alpha': 0,

In [None]:
    'learning_rate': 0.015,
    'max_depth': 18,
    'subsample': 0.95,
    'colsample_bytree': 0.75,
    'lambda': 0,
    'alpha': 0,

In [None]:
    'learning_rate': 0.015,
    'max_depth': 18,
    'subsample': 0.77,
    'colsample_bytree': 0.94,
    'lambda': 0,
    'alpha': 0,

In [None]:
    'learning_rate': 0.01,
    'max_depth': 19,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'lambda': 0,
    'alpha': 0,

In [52]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

X = train[features]
y = train['성공확률']

kf = KFold(n_splits=10, shuffle=True, random_state=42)
models = []
cv_scores = []

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'learning_rate': 0.01,
    'max_depth': 27,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'lambda': 0,
    'alpha': 0,
    'tree_method': 'hist',
    'seed': 42
}


for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/10")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=2000,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        early_stopping_rounds=1000,
        verbose_eval=100
    )

    preds = model.predict(dvalid)
    score = mean_absolute_error(y_valid, preds)
    print(f"  🔍 Fold {fold+1} MAE: {score:.4f}")

    models.append(model)
    cv_scores.append(score)

print("\n✅ 모든 fold 모델 학습 완료!")
print(f"📉 평균 MAE: {sum(cv_scores)/len(cv_scores):.4f}")



🔁 Fold 1/10
[0]	train-mae:0.20365	valid-mae:0.20036
[100]	train-mae:0.08588	valid-mae:0.19258
[200]	train-mae:0.03620	valid-mae:0.19109
[300]	train-mae:0.01540	valid-mae:0.19044
[400]	train-mae:0.00677	valid-mae:0.19024
[500]	train-mae:0.00299	valid-mae:0.19023
[600]	train-mae:0.00136	valid-mae:0.19019
[700]	train-mae:0.00066	valid-mae:0.19018
[800]	train-mae:0.00052	valid-mae:0.19019
[900]	train-mae:0.00049	valid-mae:0.19019
[1000]	train-mae:0.00049	valid-mae:0.19019
[1100]	train-mae:0.00049	valid-mae:0.19019
[1200]	train-mae:0.00048	valid-mae:0.19019
[1300]	train-mae:0.00048	valid-mae:0.19019
[1400]	train-mae:0.00048	valid-mae:0.19019
[1500]	train-mae:0.00048	valid-mae:0.19019
[1600]	train-mae:0.00048	valid-mae:0.19019
[1693]	train-mae:0.00047	valid-mae:0.19019
  🔍 Fold 1 MAE: 0.1902

🔁 Fold 2/10
[0]	train-mae:0.20300	valid-mae:0.20997
[100]	train-mae:0.08604	valid-mae:0.19997
[200]	train-mae:0.03696	valid-mae:0.19687
[300]	train-mae:0.01588	valid-mae:0.19606
[400]	train-mae:0.00702

In [25]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import numpy as np

X = train[features]
y = train['성공확률']

# ✅ 여러 개 seed 설정
seeds = [13, 42, 77, 123, 999]
n_splits = 20

models = []
cv_scores = []

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'learning_rate': 0.01,
    'max_depth': 19,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'lambda': 0,
    'alpha': 0,
    'tree_method': 'hist'
}

# ✅ seed마다 KFold 10회씩 실행
for seed in seeds:
    print(f"\n🧪 Seed {seed} 시작")
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
        print(f"\n🔁 Seed {seed} - Fold {fold+1}/{n_splits}")

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)

        model = xgb.train(
            params={**params, 'seed': seed},
            dtrain=dtrain,
            num_boost_round=2000,
            evals=[(dtrain, 'train'), (dvalid, 'valid')],
            early_stopping_rounds=1000,
            verbose_eval=100
        )

        preds = model.predict(dvalid)
        score = mean_absolute_error(y_valid, preds)
        print(f"  🔍 Fold MAE: {score:.4f}")

        models.append(model)
        cv_scores.append(score)

# ✅ 전체 결과 출력
print("\n✅ 모든 Seed-Fold 모델 학습 완료!")
print(f"📉 평균 MAE (Across All Seeds): {np.mean(cv_scores):.4f}")



🧪 Seed 13 시작

🔁 Seed 13 - Fold 1/20
[0]	train-mae:0.20386	valid-mae:0.20343
[100]	train-mae:0.10247	valid-mae:0.19651
[200]	train-mae:0.05309	valid-mae:0.19672
[300]	train-mae:0.02756	valid-mae:0.19703
[400]	train-mae:0.01499	valid-mae:0.19750
[500]	train-mae:0.00842	valid-mae:0.19770
[600]	train-mae:0.00486	valid-mae:0.19773
[700]	train-mae:0.00279	valid-mae:0.19774
[800]	train-mae:0.00165	valid-mae:0.19777
[900]	train-mae:0.00101	valid-mae:0.19777
[1000]	train-mae:0.00068	valid-mae:0.19778
[1100]	train-mae:0.00060	valid-mae:0.19778
[1125]	train-mae:0.00059	valid-mae:0.19778
  🔍 Fold MAE: 0.1978

🔁 Seed 13 - Fold 2/20
[0]	train-mae:0.20347	valid-mae:0.20905
[100]	train-mae:0.10424	valid-mae:0.19978
[200]	train-mae:0.05496	valid-mae:0.19779
[300]	train-mae:0.02918	valid-mae:0.19694
[400]	train-mae:0.01580	valid-mae:0.19635
[500]	train-mae:0.00899	valid-mae:0.19618
[600]	train-mae:0.00510	valid-mae:0.19589
[700]	train-mae:0.00296	valid-mae:0.19588
[800]	train-mae:0.00177	valid-mae:0.19

In [None]:
import xgboost as xgb
import numpy as np

# 저장된 모델들로 예측
predictions_list = []

dtest = xgb.DMatrix(test[features])

for fold, model in enumerate(models):
    print(f"Predict with fold {fold+1}")
    preds = model.predict(dtest)  # DMatrix 입력
    predictions_list.append(preds)

# 평균 예측
final_predictions = np.mean(predictions_list, axis=0)

# 제출 파일 생성
sample_submission['성공확률'] = final_predictions
sample_submission.to_csv('./0416_14_submission.csv', index=False, encoding='utf-8-sig')


Predict with fold 1
Predict with fold 2
Predict with fold 3
Predict with fold 4
Predict with fold 5
Predict with fold 6
Predict with fold 7
Predict with fold 8
Predict with fold 9
Predict with fold 10


In [50]:
import xgboost as xgb
import numpy as np

# 저장된 모델들로 예측
predictions_list = []
dtest = xgb.DMatrix(test[features])  # 테스트셋 변환

for fold, model in enumerate(models):
    print(f"Predict with fold {fold+1}")
    preds = model.predict(dtest)
    predictions_list.append(preds)

# 🎯 MAE의 역수를 가중치로 사용 (MAE가 낮을수록 더 높은 가중치)
weights = 1 / np.array(cv_scores)
weights = weights / weights.sum()  # 정규화

# 🎯 가중 평균 계산
final_predictions = np.average(predictions_list, axis=0, weights=weights)

# 제출 파일 저장
sample_submission['성공확률'] = final_predictions
sample_submission.to_csv('./0414_19_submission.csv', index=False, encoding='utf-8-sig')


Predict with fold 1
Predict with fold 2
Predict with fold 3
Predict with fold 4
Predict with fold 5
Predict with fold 6
Predict with fold 7
Predict with fold 8
Predict with fold 9
Predict with fold 10
