# 암환자 유전체 데이터 기반 암종 분류 AI 모델 개발


- '2024 생명연구자원 AI활용 경진대회'는 바이오 데이터를 기반으로 한 AI 기술의 문제 해결 능력을 탐구하는 것을 목표로 합니다. <br>이 대회는 바이오 분야에서 AI 활용의 저변을 확대하고, 복잡한 바이오 데이터를 효율적으로 분석 및 해석할 수 있는 AI 알고리즘 개발에 초점을 맞추고 있습니다. <br><br>
- 본 대회의 구체적인 과제는 암환자 유전체 데이터의 변이 정보를 활용하여 암종을 분류하는 AI 모델을 개발하는 것입니다. <br>참가자들은 제공된 학습 데이터셋(암환자 유전체 변이 정보)을 사용하여 특정 변이 정보를 바탕으로 암종을 정확하게 분류할 수 있는 AI 알고리즘을 개발해야 합니다. <br><br>
- 이 대회의 궁극적인 목적은 바이오 데이터의 활용도를 높이고, 바이오 분야에서 AI 기술의 적용 가능성을 극대화하며, 인공지능 기술이 실제 바이오 의료 문제 해결에 어떻게 기여할 수 있는지 탐구하는 것입니다.

# Import library

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import xgboost as xgb
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler

import lightgbm
from lightgbm import LGBMClassifier

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, f1_score

pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=np.inf, linewidth=np.inf)

# Load Data

In [1]:
df = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("test.csv") # 테스트 데이터
df['SUBCLASS'].value_counts()

NameError: name 'pd' is not defined

# Data Preprocessing - 변수생성

## 불필요 컬럼 삭제

In [3]:
# 각 컬럼이 "WT" 값으로만 구성된지 확인
wt_columns = [col for col in df.columns if (df[col] == "WT").all()]

# "WT" 값만 있는 컬럼들 삭제
df = df.drop(columns=wt_columns)

## 파생변수 생성 - 변이 정보 관련

In [4]:
#SUBCLASS별 "WT"가 아닌 행의 개수
genes = []
for disease in df["SUBCLASS"].unique():
    disease_df = df[df["SUBCLASS"] == disease]

    non_wt_counts = (disease_df != 'WT').sum()  # "WT"가 아닌 행 개수
    non_wt_counts_sorted = non_wt_counts.sort_values(ascending=False) #빈도 수가 높은 순서대로 정렬

    n = 10 # 추출한 상위 유전자 개수
    genes.extend(list(non_wt_counts_sorted[2:2+n].index))

genes = list(set(genes)) #중복 제거
print(len(genes))

108


In [5]:
# 변이 정보를 변환하는 함수들
def convert_to_amino_acid_change(mutation):
    if mutation == "WT":
        return mutation
    mutations = mutation.split()  # 공백으로 변이 분리
    converted_mutations = []

    for mut in mutations:
        match = re.search(r'([A-Z\-]+)(\d+)([A-Za-z*]+)', mut)
        if match:
            converted_mutations.append(f"{match.group(1)}_{match.group(3)}")
        else:
            converted_mutations.append(mut)

    return ",".join(converted_mutations)

def extract_position(mutation):
    if mutation == "WT":
        return mutation
    mutations = mutation.split()  # 공백으로 변이 분리
    positions = []

    for mut in mutations:
        match = re.search(r'([A-Z\-]+)(\d+)([A-Za-z*]+)', mut)
        if match:
            positions.append(match.group(2))
        else:
            positions.append(mut)

    return ",".join(positions)

def position_and_new_amino_acid(mutation):
    if mutation == "WT":
        return mutation
    mutations = mutation.split()  # 공백으로 변이 분리
    pos_amino_acids = []

    for mut in mutations:
        match = re.search(r'([A-Z\-]+)(\d+)([A-Za-z*]+)', mut)
        if match:
            pos_amino_acids.append(f"{match.group(2)}_{match.group(3)}")
        else:
            pos_amino_acids.append(mut)
    return ",".join(pos_amino_acids)
    
# 새로운 Column을 데이터프레임에 추가
for col in genes:
    df[f'{col}_amino_acid_change'] = df[col].apply(convert_to_amino_acid_change)
    df[f'{col}_position_only'] = df[col].apply(extract_position)
    df[f'{col}_position_amino_acid'] = df[col].apply(position_and_new_amino_acid)

  df[f'{col}_amino_acid_change'] = df[col].apply(convert_to_amino_acid_change)
  df[f'{col}_position_only'] = df[col].apply(extract_position)
  df[f'{col}_position_amino_acid'] = df[col].apply(position_and_new_amino_acid)
  df[f'{col}_amino_acid_change'] = df[col].apply(convert_to_amino_acid_change)
  df[f'{col}_position_only'] = df[col].apply(extract_position)
  df[f'{col}_position_amino_acid'] = df[col].apply(position_and_new_amino_acid)
  df[f'{col}_amino_acid_change'] = df[col].apply(convert_to_amino_acid_change)
  df[f'{col}_position_only'] = df[col].apply(extract_position)
  df[f'{col}_position_amino_acid'] = df[col].apply(position_and_new_amino_acid)
  df[f'{col}_amino_acid_change'] = df[col].apply(convert_to_amino_acid_change)
  df[f'{col}_position_only'] = df[col].apply(extract_position)
  df[f'{col}_position_amino_acid'] = df[col].apply(position_and_new_amino_acid)
  df[f'{col}_amino_acid_change'] = df[col].apply(convert_to_amino_acid_change)
  df[f'{col}_position_only'] = df

## 파생변수 생성 - 도메인 지식 기반, 중요 유전자 합

In [6]:
# 상위 5개의 유전자 컬럼 선택
top_genes = ['TP53', 'PIK3CA', 'BRCA1', 'BRCA2', 'IDH1']

# 각 유전자 값을 ','로 연결하여 새로운 파생변수 생성
df['gene_combined'] = df[top_genes].apply(lambda row: ','.join(row.values.astype(str)), axis=1)

  df['gene_combined'] = df[top_genes].apply(lambda row: ','.join(row.values.astype(str)), axis=1)


## 파생변수 생성 - 도메인 지식 기반, 중요 유전자 weight 추가

In [7]:
df['Weight'] = 0

gene_weights = ['SDHD', 'BRCA1', 'RET', 'PTEN', 'APC', 'ATRX', 'TP53', 'SDHB', 
                'RB1', 'NPM1', 'BCL2', 'AR', 'CDH1', 'MDM2', 'VHL', 'KIT', 
                'AXIN1', 'CDKN2A', 'EGFR', 'IDH2', 'PIK3CA', 'MTOR', 'IDH1', 
                'BRCA2', 'BRAF', 'FGFR3', 'IGF2', 'BCL6', 'CTNNB1', 'MYC']

# 모든 행에 대해 반복문을 실행하여 각 행의 'WT'가 아닌 값들의 개수를 'Weight'에 저장
for index, row in df.iterrows():
    non_wt_count = 0
    
    # 각 컬럼에 대해 'WT'가 아닌 값 있는지 확인하고 누적
    for column in gene_weights:
        if row[column] != 'WT':
            non_wt_count += 1
    
    # 해당 행의 'Weight' 값을 업데이트
    df.loc[index, 'Weight'] = non_wt_count

  df['Weight'] = 0


In [8]:
df.head()

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,ABAT,ABCA1,ABCA2,ABCA3,ABCA4,...,DMD_position_only,DMD_position_amino_acid,MYH4_amino_acid_change,MYH4_position_only,MYH4_position_amino_acid,ALMS1_amino_acid_change,ALMS1_position_only,ALMS1_position_amino_acid,gene_combined,Weight
0,TRAIN_0000,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,"WT,WT,WT,WT,WT",0
1,TRAIN_0001,SARC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,"M237I,WT,WT,WT,WT",3
2,TRAIN_0002,SKCM,R895R,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,"E_K,R_C",4121800,"412_K,1800_C",WT,WT,WT,"WT,WT,WT,P606S,R132C",3
3,TRAIN_0003,KIRC,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,"WT,WT,WT,WT,WT",1
4,TRAIN_0004,GBMLGG,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,"M246R,WT,WT,WT,WT",2


# test 데이터 전처리

In [9]:
df_test = df_test.drop(columns=wt_columns)

# 새로운 Column을 데이터프레임에 추가
for col in genes:
    df_test[f'{col}_amino_acid_change'] = df_test[col].apply(convert_to_amino_acid_change)
    df_test[f'{col}_position_only'] = df_test[col].apply(extract_position)
    df_test[f'{col}_position_amino_acid'] = df_test[col].apply(position_and_new_amino_acid)
    
df_test['gene_combined'] = df_test[top_genes].apply(lambda row: ','.join(row.values.astype(str)), axis=1)

df_test['Weight'] = 0

# 모든 행에 대해 반복문을 실행하여 각 행의 'WT'가 아닌 값들의 개수를 'Weight'에 저장
for index, row in df_test.iterrows():
    non_wt_count = 0
    
    # 각 컬럼에 대해 'WT'가 아닌 값 있는지 확인하고 누적
    for column in gene_weights:
        if row[column] != 'WT':
            non_wt_count += 1
    
    # 해당 행의 'Weight' 값을 업데이트
    df_test.loc[index, 'Weight'] = non_wt_count
    
df_test.head()

  df_test[f'{col}_amino_acid_change'] = df_test[col].apply(convert_to_amino_acid_change)
  df_test[f'{col}_position_only'] = df_test[col].apply(extract_position)
  df_test[f'{col}_position_amino_acid'] = df_test[col].apply(position_and_new_amino_acid)
  df_test[f'{col}_amino_acid_change'] = df_test[col].apply(convert_to_amino_acid_change)
  df_test[f'{col}_position_only'] = df_test[col].apply(extract_position)
  df_test[f'{col}_position_amino_acid'] = df_test[col].apply(position_and_new_amino_acid)
  df_test[f'{col}_amino_acid_change'] = df_test[col].apply(convert_to_amino_acid_change)
  df_test[f'{col}_position_only'] = df_test[col].apply(extract_position)
  df_test[f'{col}_position_amino_acid'] = df_test[col].apply(position_and_new_amino_acid)
  df_test[f'{col}_amino_acid_change'] = df_test[col].apply(convert_to_amino_acid_change)
  df_test[f'{col}_position_only'] = df_test[col].apply(extract_position)
  df_test[f'{col}_position_amino_acid'] = df_test[col].apply(position_and_new_amin

  df_test[f'{col}_amino_acid_change'] = df_test[col].apply(convert_to_amino_acid_change)
  df_test[f'{col}_position_only'] = df_test[col].apply(extract_position)
  df_test[f'{col}_position_amino_acid'] = df_test[col].apply(position_and_new_amino_acid)
  df_test[f'{col}_amino_acid_change'] = df_test[col].apply(convert_to_amino_acid_change)
  df_test[f'{col}_position_only'] = df_test[col].apply(extract_position)
  df_test[f'{col}_position_amino_acid'] = df_test[col].apply(position_and_new_amino_acid)
  df_test[f'{col}_amino_acid_change'] = df_test[col].apply(convert_to_amino_acid_change)
  df_test[f'{col}_position_only'] = df_test[col].apply(extract_position)
  df_test[f'{col}_position_amino_acid'] = df_test[col].apply(position_and_new_amino_acid)
  df_test[f'{col}_amino_acid_change'] = df_test[col].apply(convert_to_amino_acid_change)
  df_test[f'{col}_position_only'] = df_test[col].apply(extract_position)
  df_test[f'{col}_position_amino_acid'] = df_test[col].apply(position_and_new_amin

Unnamed: 0,ID,A2M,AAAS,AADAT,ABAT,ABCA1,ABCA2,ABCA3,ABCA4,ABCA5,...,DMD_position_only,DMD_position_amino_acid,MYH4_amino_acid_change,MYH4_position_only,MYH4_position_amino_acid,ALMS1_amino_acid_change,ALMS1_position_only,ALMS1_position_amino_acid,gene_combined,Weight
0,TEST_0000,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,Y_H,1289,1289_H,"R89Q R248Q R209Q R116Q,WT,WT,WT,WT",2
1,TEST_0001,WT,WT,WT,WT,R587Q,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,"WT,E545G,WT,I605Yfs,WT",6
2,TEST_0002,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,"WT,WT,WT,WT,WT",0
3,TEST_0003,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,"R306*,WT,WT,WT,R132H",2
4,TEST_0004,WT,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,L_L,785,785_L,L_F,3131,3131_F,"V157F,WT,WT,WT,WT",3


# Data Preprocessing - encoding

In [None]:
# SUBCLASS 가 범주형이기 때문에 LabelEncoder 사용
le_subclass = LabelEncoder()
df['SUBCLASS'] = le_subclass.fit_transform(df['SUBCLASS'])

# 변환된 레이블 확인
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

원래 레이블: ACC, 변환된 숫자: 0
원래 레이블: BLCA, 변환된 숫자: 1
원래 레이블: BRCA, 변환된 숫자: 2
원래 레이블: CESC, 변환된 숫자: 3
원래 레이블: COAD, 변환된 숫자: 4
원래 레이블: DLBC, 변환된 숫자: 5
원래 레이블: GBMLGG, 변환된 숫자: 6
원래 레이블: HNSC, 변환된 숫자: 7
원래 레이블: KIPAN, 변환된 숫자: 8
원래 레이블: KIRC, 변환된 숫자: 9
원래 레이블: LAML, 변환된 숫자: 10
원래 레이블: LGG, 변환된 숫자: 11
원래 레이블: LIHC, 변환된 숫자: 12
원래 레이블: LUAD, 변환된 숫자: 13
원래 레이블: LUSC, 변환된 숫자: 14
원래 레이블: OV, 변환된 숫자: 15
원래 레이블: PAAD, 변환된 숫자: 16
원래 레이블: PCPG, 변환된 숫자: 17
원래 레이블: PRAD, 변환된 숫자: 18
원래 레이블: SARC, 변환된 숫자: 19
원래 레이블: SKCM, 변환된 숫자: 20
원래 레이블: STES, 변환된 숫자: 21
원래 레이블: TGCT, 변환된 숫자: 22
원래 레이블: THCA, 변환된 숫자: 23
원래 레이블: THYM, 변환된 숫자: 24
원래 레이블: UCEC, 변환된 숫자: 25


In [11]:
## x 의 경우도 범주형으로 구성되어 있어, 알맞은 인코딩 필요
X = df.drop(columns=['SUBCLASS', 'ID'])
y_subclass = df['SUBCLASS']

categorical_columns = X.select_dtypes(include=['object', 'category']).columns
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_encoded = X.copy()
X_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_subclass, test_size=0.2 , random_state= 42)

# Model Define and Train

In [13]:
import optuna

def objective(trial):
    params = {
        'subsample': trial.suggest_float('subsample', 0.7, 0.8),
        'n_estimators': trial.suggest_int('n_estimators', 100, 150),
        'min_child_weight': trial.suggest_int('min_child_weight', 6, 8),
        'max_depth': trial.suggest_int('max_depth', 8, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.15),
        'gamma': trial.suggest_float('gamma', 0.1, 0.2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.5),
        'random_state': 42,
        'eval_metric':'mlogloss',
        'early_stopping_rounds':10
    }
    model = xgb.XGBClassifier(**params)
    
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=10)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)
    
# Optuna 스터디 생성 및 최적화 실행
# minimize(최소 값) 모델 평가 방법에 따라 변경 maximizer(최대 값)
xgb_study = optuna.create_study(direction='maximize')
# 시도 횟수 n_trials = 50번 
xgb_study.optimize(objective, n_trials=18)

print(f"모델 이름: 최적의 값 = {xgb_study.best_value}, 최적의 파라미터 = {xgb_study.best_params}")
# 최적의 파라미터와 RMSE 출력
print("Best trial:")
trial = xgb_study.best_trial

print("Value: ", trial.value)
print("Params: ")
for key, value in trial.params.items():
    print(f"{key}: {value}")

[I 2024-10-03 12:50:01,673] A new study created in memory with name: no-name-447d5d6f-1a8f-4e08-8f7c-7374aeb7a621


[0]	validation_0-mlogloss:3.09121
[10]	validation_0-mlogloss:2.51857
[20]	validation_0-mlogloss:2.33915
[30]	validation_0-mlogloss:2.25159
[40]	validation_0-mlogloss:2.20824
[50]	validation_0-mlogloss:2.18309
[60]	validation_0-mlogloss:2.16697
[70]	validation_0-mlogloss:2.15728
[80]	validation_0-mlogloss:2.15465
[84]	validation_0-mlogloss:2.15549


[I 2024-10-03 12:52:28,802] Trial 0 finished with value: 0.314262691377921 and parameters: {'subsample': 0.7345202584405346, 'n_estimators': 108, 'min_child_weight': 6, 'max_depth': 9, 'learning_rate': 0.10328918930324617, 'gamma': 0.18179197368969463, 'colsample_bytree': 0.4986247697007434}. Best is trial 0 with value: 0.314262691377921.


[0]	validation_0-mlogloss:3.08739
[10]	validation_0-mlogloss:2.50446
[20]	validation_0-mlogloss:2.32789
[30]	validation_0-mlogloss:2.24475
[40]	validation_0-mlogloss:2.20039
[50]	validation_0-mlogloss:2.17532
[60]	validation_0-mlogloss:2.16150
[70]	validation_0-mlogloss:2.14367
[80]	validation_0-mlogloss:2.13745
[90]	validation_0-mlogloss:2.13694
[100]	validation_0-mlogloss:2.13639
[104]	validation_0-mlogloss:2.13570


[I 2024-10-03 12:56:14,661] Trial 1 finished with value: 0.33279613215149073 and parameters: {'subsample': 0.7989800743722261, 'n_estimators': 150, 'min_child_weight': 6, 'max_depth': 10, 'learning_rate': 0.10716641628485234, 'gamma': 0.15879906581681646, 'colsample_bytree': 0.46362088096352305}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.07342
[10]	validation_0-mlogloss:2.50054
[20]	validation_0-mlogloss:2.33212
[30]	validation_0-mlogloss:2.25350
[40]	validation_0-mlogloss:2.21738
[50]	validation_0-mlogloss:2.19542
[60]	validation_0-mlogloss:2.18209
[70]	validation_0-mlogloss:2.17436
[80]	validation_0-mlogloss:2.17228
[88]	validation_0-mlogloss:2.17319


[I 2024-10-03 12:58:56,432] Trial 2 finished with value: 0.3166800966962127 and parameters: {'subsample': 0.7386033353928984, 'n_estimators': 133, 'min_child_weight': 8, 'max_depth': 9, 'learning_rate': 0.11944228264620783, 'gamma': 0.15205710388634294, 'colsample_bytree': 0.46458362408054543}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.09569
[10]	validation_0-mlogloss:2.53721
[20]	validation_0-mlogloss:2.36430
[30]	validation_0-mlogloss:2.27531
[40]	validation_0-mlogloss:2.22943
[50]	validation_0-mlogloss:2.20374
[60]	validation_0-mlogloss:2.18902
[70]	validation_0-mlogloss:2.17920
[80]	validation_0-mlogloss:2.17563
[90]	validation_0-mlogloss:2.17317
[100]	validation_0-mlogloss:2.17281
[110]	validation_0-mlogloss:2.17376
[111]	validation_0-mlogloss:2.17340


[I 2024-10-03 13:01:39,128] Trial 3 finished with value: 0.3166800966962127 and parameters: {'subsample': 0.7846189924463736, 'n_estimators': 134, 'min_child_weight': 8, 'max_depth': 8, 'learning_rate': 0.11138401018452673, 'gamma': 0.1299697505020445, 'colsample_bytree': 0.3390222313947445}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.10748
[10]	validation_0-mlogloss:2.56373
[20]	validation_0-mlogloss:2.38216
[30]	validation_0-mlogloss:2.28686
[40]	validation_0-mlogloss:2.23662
[50]	validation_0-mlogloss:2.20716
[60]	validation_0-mlogloss:2.18859
[70]	validation_0-mlogloss:2.17665
[80]	validation_0-mlogloss:2.17088
[90]	validation_0-mlogloss:2.16809
[100]	validation_0-mlogloss:2.16661
[108]	validation_0-mlogloss:2.16556


[I 2024-10-03 13:04:01,128] Trial 4 finished with value: 0.3150684931506849 and parameters: {'subsample': 0.7951878267658069, 'n_estimators': 109, 'min_child_weight': 8, 'max_depth': 9, 'learning_rate': 0.10344313369454272, 'gamma': 0.1554991416041803, 'colsample_bytree': 0.3118056670918576}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.03284
[10]	validation_0-mlogloss:2.42478
[20]	validation_0-mlogloss:2.27599
[30]	validation_0-mlogloss:2.21533
[40]	validation_0-mlogloss:2.18741
[50]	validation_0-mlogloss:2.17304
[60]	validation_0-mlogloss:2.16612
[70]	validation_0-mlogloss:2.16208
[80]	validation_0-mlogloss:2.16179
[84]	validation_0-mlogloss:2.16348


[I 2024-10-03 13:06:36,827] Trial 5 finished with value: 0.3150684931506849 and parameters: {'subsample': 0.7098593767317048, 'n_estimators': 138, 'min_child_weight': 6, 'max_depth': 9, 'learning_rate': 0.1459060997612946, 'gamma': 0.14386911763040566, 'colsample_bytree': 0.4312554884970746}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.10099
[10]	validation_0-mlogloss:2.53579
[20]	validation_0-mlogloss:2.35686
[30]	validation_0-mlogloss:2.26688
[40]	validation_0-mlogloss:2.22038
[50]	validation_0-mlogloss:2.19140
[60]	validation_0-mlogloss:2.17324
[70]	validation_0-mlogloss:2.16260
[80]	validation_0-mlogloss:2.15915
[90]	validation_0-mlogloss:2.15869
[100]	validation_0-mlogloss:2.15748
[110]	validation_0-mlogloss:2.15686
[116]	validation_0-mlogloss:2.15712


[I 2024-10-03 13:09:52,992] Trial 6 finished with value: 0.30942788074133765 and parameters: {'subsample': 0.7988747729287881, 'n_estimators': 149, 'min_child_weight': 7, 'max_depth': 8, 'learning_rate': 0.105005059376045, 'gamma': 0.13768090412314424, 'colsample_bytree': 0.398680368990782}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.04428
[10]	validation_0-mlogloss:2.45287
[20]	validation_0-mlogloss:2.29775
[30]	validation_0-mlogloss:2.22911
[40]	validation_0-mlogloss:2.19762
[50]	validation_0-mlogloss:2.18463
[60]	validation_0-mlogloss:2.17850
[70]	validation_0-mlogloss:2.17325
[80]	validation_0-mlogloss:2.17341
[84]	validation_0-mlogloss:2.17396


[I 2024-10-03 13:12:26,424] Trial 7 finished with value: 0.314262691377921 and parameters: {'subsample': 0.7354653316972382, 'n_estimators': 110, 'min_child_weight': 8, 'max_depth': 10, 'learning_rate': 0.14053490374171018, 'gamma': 0.18194863071478856, 'colsample_bytree': 0.4351072087266531}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.10220
[10]	validation_0-mlogloss:2.52980
[20]	validation_0-mlogloss:2.34989
[30]	validation_0-mlogloss:2.25999
[40]	validation_0-mlogloss:2.21174
[50]	validation_0-mlogloss:2.18244
[60]	validation_0-mlogloss:2.16193
[70]	validation_0-mlogloss:2.15117
[80]	validation_0-mlogloss:2.14359
[90]	validation_0-mlogloss:2.13612
[100]	validation_0-mlogloss:2.12793
[110]	validation_0-mlogloss:2.12743
[115]	validation_0-mlogloss:2.12591


[I 2024-10-03 13:15:56,779] Trial 8 finished with value: 0.3279613215149073 and parameters: {'subsample': 0.7768281080128001, 'n_estimators': 145, 'min_child_weight': 6, 'max_depth': 9, 'learning_rate': 0.10412358909816481, 'gamma': 0.19717148947704982, 'colsample_bytree': 0.37450960598703475}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.07887
[10]	validation_0-mlogloss:2.50618
[20]	validation_0-mlogloss:2.34007
[30]	validation_0-mlogloss:2.25918
[40]	validation_0-mlogloss:2.21786
[50]	validation_0-mlogloss:2.19701
[60]	validation_0-mlogloss:2.18358
[70]	validation_0-mlogloss:2.17592
[80]	validation_0-mlogloss:2.17266
[90]	validation_0-mlogloss:2.17258
[95]	validation_0-mlogloss:2.17400


[I 2024-10-03 13:18:04,065] Trial 9 finished with value: 0.30781627719580984 and parameters: {'subsample': 0.7362040134631644, 'n_estimators': 140, 'min_child_weight': 8, 'max_depth': 10, 'learning_rate': 0.1255775485746425, 'gamma': 0.14711342022181004, 'colsample_bytree': 0.3177666145147919}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.05424
[10]	validation_0-mlogloss:2.45846
[20]	validation_0-mlogloss:2.30016
[30]	validation_0-mlogloss:2.23020
[40]	validation_0-mlogloss:2.19509
[50]	validation_0-mlogloss:2.18031
[60]	validation_0-mlogloss:2.17432
[70]	validation_0-mlogloss:2.16945
[80]	validation_0-mlogloss:2.16945
[81]	validation_0-mlogloss:2.16979


[I 2024-10-03 13:21:07,905] Trial 10 finished with value: 0.3150684931506849 and parameters: {'subsample': 0.7622092299312849, 'n_estimators': 122, 'min_child_weight': 7, 'max_depth': 10, 'learning_rate': 0.12961245326929716, 'gamma': 0.10059111132440321, 'colsample_bytree': 0.4988519800403756}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.08930
[10]	validation_0-mlogloss:2.50299
[20]	validation_0-mlogloss:2.32824
[30]	validation_0-mlogloss:2.24689
[40]	validation_0-mlogloss:2.20237
[50]	validation_0-mlogloss:2.17619
[60]	validation_0-mlogloss:2.16256
[70]	validation_0-mlogloss:2.15301
[80]	validation_0-mlogloss:2.14598
[90]	validation_0-mlogloss:2.14114
[100]	validation_0-mlogloss:2.13558
[106]	validation_0-mlogloss:2.13467


[I 2024-10-03 13:24:25,640] Trial 11 finished with value: 0.32473811442385175 and parameters: {'subsample': 0.7744115454534476, 'n_estimators': 149, 'min_child_weight': 6, 'max_depth': 10, 'learning_rate': 0.11415087436485286, 'gamma': 0.19184178048629397, 'colsample_bytree': 0.3729384056581659}. Best is trial 1 with value: 0.33279613215149073.


[0]	validation_0-mlogloss:3.10887
[10]	validation_0-mlogloss:2.54519
[20]	validation_0-mlogloss:2.35949
[30]	validation_0-mlogloss:2.26748
[40]	validation_0-mlogloss:2.21813
[50]	validation_0-mlogloss:2.18601
[60]	validation_0-mlogloss:2.16500
[70]	validation_0-mlogloss:2.15337
[80]	validation_0-mlogloss:2.14523
[90]	validation_0-mlogloss:2.13834
[100]	validation_0-mlogloss:2.12801
[110]	validation_0-mlogloss:2.12608
[120]	validation_0-mlogloss:2.12467
[130]	validation_0-mlogloss:2.12454
[134]	validation_0-mlogloss:2.12547


[I 2024-10-03 13:28:02,052] Trial 12 finished with value: 0.33440773569701854 and parameters: {'subsample': 0.7719742930320259, 'n_estimators': 145, 'min_child_weight': 6, 'max_depth': 8, 'learning_rate': 0.10041640682365643, 'gamma': 0.16356069584519212, 'colsample_bytree': 0.36603633239533173}. Best is trial 12 with value: 0.33440773569701854.


[0]	validation_0-mlogloss:3.08356
[10]	validation_0-mlogloss:2.50768
[20]	validation_0-mlogloss:2.33356
[30]	validation_0-mlogloss:2.25105
[40]	validation_0-mlogloss:2.20941
[50]	validation_0-mlogloss:2.18485
[60]	validation_0-mlogloss:2.17139
[70]	validation_0-mlogloss:2.16237
[80]	validation_0-mlogloss:2.15929
[89]	validation_0-mlogloss:2.15999


[I 2024-10-03 13:30:48,735] Trial 13 finished with value: 0.314262691377921 and parameters: {'subsample': 0.7593573931217817, 'n_estimators': 125, 'min_child_weight': 7, 'max_depth': 8, 'learning_rate': 0.11227621044797333, 'gamma': 0.16639434694744712, 'colsample_bytree': 0.43736728228024063}. Best is trial 12 with value: 0.33440773569701854.


[0]	validation_0-mlogloss:3.10733
[10]	validation_0-mlogloss:2.54212
[20]	validation_0-mlogloss:2.35844
[30]	validation_0-mlogloss:2.26931
[40]	validation_0-mlogloss:2.22038
[50]	validation_0-mlogloss:2.18732
[60]	validation_0-mlogloss:2.16549
[70]	validation_0-mlogloss:2.15198
[80]	validation_0-mlogloss:2.14454
[90]	validation_0-mlogloss:2.13747
[100]	validation_0-mlogloss:2.12817
[110]	validation_0-mlogloss:2.12600
[117]	validation_0-mlogloss:2.12584


[I 2024-10-03 13:34:01,511] Trial 14 finished with value: 0.3271555197421434 and parameters: {'subsample': 0.7838817282623494, 'n_estimators': 118, 'min_child_weight': 6, 'max_depth': 8, 'learning_rate': 0.1009324158175127, 'gamma': 0.16604883212558566, 'colsample_bytree': 0.3612353567605962}. Best is trial 12 with value: 0.33440773569701854.


[0]	validation_0-mlogloss:3.06095
[10]	validation_0-mlogloss:2.44962
[20]	validation_0-mlogloss:2.28951
[30]	validation_0-mlogloss:2.21957
[40]	validation_0-mlogloss:2.18756
[50]	validation_0-mlogloss:2.16795
[60]	validation_0-mlogloss:2.15893
[70]	validation_0-mlogloss:2.15485
[80]	validation_0-mlogloss:2.14500
[90]	validation_0-mlogloss:2.14236
[100]	validation_0-mlogloss:2.14187
[109]	validation_0-mlogloss:2.14437


[I 2024-10-03 13:37:45,379] Trial 15 finished with value: 0.32473811442385175 and parameters: {'subsample': 0.7634136766442814, 'n_estimators': 142, 'min_child_weight': 6, 'max_depth': 10, 'learning_rate': 0.13299274957453083, 'gamma': 0.12397817529719785, 'colsample_bytree': 0.3950450592007103}. Best is trial 12 with value: 0.33440773569701854.


[0]	validation_0-mlogloss:3.07038
[10]	validation_0-mlogloss:2.48822
[20]	validation_0-mlogloss:2.32165
[30]	validation_0-mlogloss:2.24241
[40]	validation_0-mlogloss:2.20376
[50]	validation_0-mlogloss:2.18483
[60]	validation_0-mlogloss:2.17314
[70]	validation_0-mlogloss:2.16558
[80]	validation_0-mlogloss:2.16487
[86]	validation_0-mlogloss:2.16641


[I 2024-10-03 13:40:42,434] Trial 16 finished with value: 0.30781627719580984 and parameters: {'subsample': 0.7875183055016935, 'n_estimators': 132, 'min_child_weight': 7, 'max_depth': 8, 'learning_rate': 0.11924771318411209, 'gamma': 0.16854430143133495, 'colsample_bytree': 0.46095371238108995}. Best is trial 12 with value: 0.33440773569701854.


[0]	validation_0-mlogloss:3.09767
[10]	validation_0-mlogloss:2.52404
[20]	validation_0-mlogloss:2.34545
[30]	validation_0-mlogloss:2.25618
[40]	validation_0-mlogloss:2.21164
[50]	validation_0-mlogloss:2.18125
[60]	validation_0-mlogloss:2.16559
[70]	validation_0-mlogloss:2.15460
[80]	validation_0-mlogloss:2.14979
[90]	validation_0-mlogloss:2.14408
[100]	validation_0-mlogloss:2.13631
[101]	validation_0-mlogloss:2.13587


[I 2024-10-03 13:43:31,190] Trial 17 finished with value: 0.32554391619661566 and parameters: {'subsample': 0.7522918607127617, 'n_estimators': 102, 'min_child_weight': 6, 'max_depth': 9, 'learning_rate': 0.10955384507115101, 'gamma': 0.11635327004079105, 'colsample_bytree': 0.3425932235870229}. Best is trial 12 with value: 0.33440773569701854.


모델 이름: 최적의 값 = 0.33440773569701854, 최적의 파라미터 = {'subsample': 0.7719742930320259, 'n_estimators': 145, 'min_child_weight': 6, 'max_depth': 8, 'learning_rate': 0.10041640682365643, 'gamma': 0.16356069584519212, 'colsample_bytree': 0.36603633239533173}
Best trial:
Value:  0.33440773569701854
Params: 
subsample: 0.7719742930320259
n_estimators: 145
min_child_weight: 6
max_depth: 8
learning_rate: 0.10041640682365643
gamma: 0.16356069584519212
colsample_bytree: 0.36603633239533173


In [14]:
# Kaggle에서는 특정모형의 과대적합을 줄이기 위해 OOF(Out-Of-Fold) Prediction을 자주 사용한다.
xgb_models = cross_validate(xgb.XGBClassifier(**xgb_study.best_params), # 최적화된 hyperparameter 사용
                        X_train, y_train, cv=10, scoring='f1_weighted',
                        return_estimator=True)

xgb_oof_pred = np.array([m.predict_proba(X_test) for m in xgb_models['estimator']]).mean(axis=0)
xgb_test_pred = np.argmax(xgb_oof_pred, axis=1)

f1_test = f1_score(y_test, xgb_test_pred, average='weighted')
print(f'테스트 데이터 F1 Score: {f1_test:.4f}')

테스트 데이터 F1 Score: 0.3059


In [16]:
import optuna

def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 40),
        'n_estimators': trial.suggest_int('n_estimators', 100, 200),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
        'max_depth': trial.suggest_int('max_depth', 10, 30),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.8),
        'random_state': 42,
        'force_row_wise': True,
        'verbose': -1
    }
    model = LGBMClassifier(**params)
    
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)
    
# Optuna 스터디 생성 및 최적화 실행
# minimize(최소 값) 모델 평가 방법에 따라 변경 maximizer(최대 값)
lgbm_study = optuna.create_study(direction='maximize')
# 시도 횟수 n_trials = 50번 
lgbm_study.optimize(objective, n_trials=18)

print(f"모델 이름: 최적의 값 = {lgbm_study.best_value}, 최적의 파라미터 = {lgbm_study.best_params}")
# 최적의 파라미터와 RMSE 출력
print("Best trial:")
trial = lgbm_study.best_trial

print("Value: ", trial.value)
print("Params: ")
for key, value in trial.params.items():
    print(f"{key}: {value}")

[I 2024-10-03 14:22:10,856] A new study created in memory with name: no-name-4b9e215b-cb0a-4ea6-9315-e6ad2af6585b
[I 2024-10-03 14:22:18,309] Trial 0 finished with value: 0.257856567284448 and parameters: {'num_leaves': 29, 'n_estimators': 193, 'min_child_weight': 19, 'max_depth': 30, 'learning_rate': 0.15802020980843215, 'colsample_bytree': 0.7331129341468581}. Best is trial 0 with value: 0.257856567284448.
[I 2024-10-03 14:22:28,150] Trial 1 finished with value: 0.2876712328767123 and parameters: {'num_leaves': 28, 'n_estimators': 102, 'min_child_weight': 9, 'max_depth': 23, 'learning_rate': 0.02428062701952882, 'colsample_bytree': 0.5612184011230047}. Best is trial 1 with value: 0.2876712328767123.
[I 2024-10-03 14:22:34,255] Trial 2 finished with value: 0.2852538275584206 and parameters: {'num_leaves': 20, 'n_estimators': 124, 'min_child_weight': 10, 'max_depth': 10, 'learning_rate': 0.11098815178984976, 'colsample_bytree': 0.7135235969317334}. Best is trial 1 with value: 0.2876712

모델 이름: 최적의 값 = 0.30781627719580984, 최적의 파라미터 = {'num_leaves': 33, 'n_estimators': 170, 'min_child_weight': 6, 'max_depth': 13, 'learning_rate': 0.04851311147406713, 'colsample_bytree': 0.5642949900740744}
Best trial:
Value:  0.30781627719580984
Params: 
num_leaves: 33
n_estimators: 170
min_child_weight: 6
max_depth: 13
learning_rate: 0.04851311147406713
colsample_bytree: 0.5642949900740744


In [17]:
# Kaggle에서는 특정모형의 과대적합을 줄이기 위해 OOF(Out-Of-Fold) Prediction을 자주 사용한다.
lgbm_models = cross_validate(LGBMClassifier(**lgbm_study.best_params), # 최적화된 hyperparameter 사용
                        X_train, y_train, cv=10, scoring='f1_weighted',
                        return_estimator=True)

lgbm_oof_pred = np.array([m.predict_proba(X_test) for m in lgbm_models['estimator']]).mean(axis=0)
lgbm_test_pred = np.argmax(lgbm_oof_pred, axis=1)

f1_test = f1_score(y_test, xgb_test_pred, average='weighted')
print(f'테스트 데이터 F1 Score: {f1_test:.4f}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 67348
[LightGBM] [Info] Number of data points in the train set: 4464, number of used features: 2949
[LightGBM] [Info] Start training from score -4.452557
[LightGBM] [Info] Start training from score -4.169694
[LightGBM] [Info] Start training from score -2.051171
[LightGBM] [Info] Start training from score -3.685302
[LightGBM] [Info] Start training from score -3.244745
[LightGBM] [Info] Start training from score -5.184925
[LightGBM] [Info] Start training from score -2.667228
[LightGBM] [Info] Start training from score -3.360375
[LightGBM] [Info] Start training from score -2.447963
[LightGBM] [Info] Start training from score -2.874371
[LightGBM] [Info] Start training from score -3.730972
[LightGBM] [Info] Start training from score -3.3162































































[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65290
[LightGBM] [Info] Number of data points in the train set: 4464, number of used features: 2890
[LightGBM] [Info] Start training from score -4.452557
[LightGBM] [Info] Start training from score -4.169694
[LightGBM] [Info] Start training from score -2.051171
[LightGBM] [Info] Start training from score -3.685302
[LightGBM] [Info] Start training from score -3.244745
[LightGBM] [Info] Start training from score -5.184925
[LightGBM] [Info] Start training from score -2.664008
[LightGBM] [Info] Start training from score -3.366848
[LightGBM] [Info] Start training from score -2.447963
[LightGBM] [Info] Start training from score -2.874371
[LightGBM] [Info] Start training from score -3.730972
[LightGBM] [Info] Start training from score -3.316204
[LightGBM] [Info] Start training from score -3.694270
[LightG



































































[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027612 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67845
[LightGBM] [Info] Number of data points in the train set: 4464, number of used features: 2964
[LightGBM] [Info] Start training from score -4.452557
[LightGBM] [Info] Start training from score -4.169694
[LightGBM] [Info] Start training from score -2.051171
[LightGBM] [Info] Start training from score -3.685302
[LightGBM] [Info] Start training from score -3.244745
[LightGBM] [Info] Start training from score -5.184925
[LightGBM] [Info] Start training from score -2.664008
[LightGBM] [Info] Start training from score -3.366848
[LightGBM] [Info] Start training from score -2.447963
[LightGBM] [Info] Start training from score -2.874371
[LightGBM] [Info] Start training from score -3.730972
[LightGBM] [Info] Start training from score -3.316204
[LightGBM] [Info] Start training from score -3.694270
[LightG































































[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023919 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66145
[LightGBM] [Info] Number of data points in the train set: 4464, number of used features: 2926
[LightGBM] [Info] Start training from score -4.452557
[LightGBM] [Info] Start training from score -4.169694
[LightGBM] [Info] Start training from score -2.052915
[LightGBM] [Info] Start training from score -3.685302
[LightGBM] [Info] Start training from score -3.244745
[LightGBM] [Info] Start training from score -5.184925
[LightGBM] [Info] Start training from score -2.664008
[LightGBM] [Info] Start training from score -3.360375
[LightGBM] [Info] Start training from score -2.447963
[LightGBM] [Info] Start training from score -2.874371
[LightGBM] [Info] Start training from score -3.730972
[LightGBM] [Info] Start training from score -3.3162





































































[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025767 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66039
[LightGBM] [Info] Number of data points in the train set: 4464, number of used features: 2910
[LightGBM] [Info] Start training from score -4.452557
[LightGBM] [Info] Start training from score -4.169694
[LightGBM] [Info] Start training from score -2.052915
[LightGBM] [Info] Start training from score -3.685302
[LightGBM] [Info] Start training from score -3.244745
[LightGBM] [Info] Start training from score -5.184925
[LightGBM] [Info] Start training from score -2.664008
[LightGBM] [Info] Start training from score -3.360375
[LightGBM] [Info] Start training from score -2.447963
[LightGBM] [Info] Start training from score -2.874371
[LightGBM] [Info] Start training from score -3.730972
[LightGBM] [Info] Start training from score -3.316204
[LightGBM] [Info] Start training from score -3.685302
[LightG



































































[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023839 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64819
[LightGBM] [Info] Number of data points in the train set: 4464, number of used features: 2883
[LightGBM] [Info] Start training from score -4.452557
[LightGBM] [Info] Start training from score -4.169694
[LightGBM] [Info] Start training from score -2.052915
[LightGBM] [Info] Start training from score -3.685302
[LightGBM] [Info] Start training from score -3.244745
[LightGBM] [Info] Start training from score -5.184925
[LightGBM] [Info] Start training from score -2.664008
[LightGBM] [Info] Start training from score -3.360375
[LightGBM] [Info] Start training from score -2.447963
[LightGBM] [Info] Start training from score -2.874371
[LightGBM] [Info] Start training from score -3.730972
[LightGBM] [Info] Start training from score -3.316204
[LightGBM] [Info] Start training from score -3.685302
[LightG



































































[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025853 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66912
[LightGBM] [Info] Number of data points in the train set: 4464, number of used features: 2956
[LightGBM] [Info] Start training from score -4.433509
[LightGBM] [Info] Start training from score -4.155305
[LightGBM] [Info] Start training from score -2.052915
[LightGBM] [Info] Start training from score -3.694270
[LightGBM] [Info] Start training from score -3.250509
[LightGBM] [Info] Start training from score -5.145704
[LightGBM] [Info] Start training from score -2.667228
[LightGBM] [Info] Start training from score -3.360375
[LightGBM] [Info] Start training from score -2.447963
[LightGBM] [Info] Start training from score -2.874371
[LightGBM] [Info] Start training from score -3.730972
[LightGBM] [Info] Start training from score -3.3162





































































[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66871
[LightGBM] [Info] Number of data points in the train set: 4464, number of used features: 2930
[LightGBM] [Info] Start training from score -4.433509
[LightGBM] [Info] Start training from score -4.155305
[LightGBM] [Info] Start training from score -2.052915
[LightGBM] [Info] Start training from score -3.694270
[LightGBM] [Info] Start training from score -3.250509
[LightGBM] [Info] Start training from score -5.145704
[LightGBM] [Info] Start training from score -2.667228
[LightGBM] [Info] Start training from score -3.360375
[LightGBM] [Info] Start training from score -2.447963
[LightGBM] [Info] Start training from score -2.874371
[LightGBM] [Info] Start training from score -3.721669
[LightGBM] [Info] Start training from score -3.3162





































































[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65251
[LightGBM] [Info] Number of data points in the train set: 4464, number of used features: 2883
[LightGBM] [Info] Start training from score -4.452557
[LightGBM] [Info] Start training from score -4.155305
[LightGBM] [Info] Start training from score -2.052915
[LightGBM] [Info] Start training from score -3.694270
[LightGBM] [Info] Start training from score -3.250509
[LightGBM] [Info] Start training from score -5.184925
[LightGBM] [Info] Start training from score -2.667228
[LightGBM] [Info] Start training from score -3.360375
[LightGBM] [Info] Start training from score -2.447963
[LightGBM] [Info] Start training from score -2.874371
[LightGBM] [Info] Start training from score -3.730972
[LightGBM] [Info] Start training from score -3.316204
[LightGBM] [Info] Start training from score -3.685302
[LightG

































































[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66836
[LightGBM] [Info] Number of data points in the train set: 4464, number of used features: 2918
[LightGBM] [Info] Start training from score -4.452557
[LightGBM] [Info] Start training from score -4.169694
[LightGBM] [Info] Start training from score -2.052915
[LightGBM] [Info] Start training from score -3.694270
[LightGBM] [Info] Start training from score -3.244745
[LightGBM] [Info] Start training from score -5.184925
[LightGBM] [Info] Start training from score -2.667228
[LightGBM] [Info] Start training from score -3.360375
[LightGBM] [Info] Start training from score -2.445376
[LightGBM] [Info] Start training from score -2.874371
[LightGBM] [Info] Start training from score -3.730972
[LightGBM] [Info] Start training from score -3.3162







































































테스트 데이터 F1 Score: 0.3059


# Ensemble 

In [18]:
# 두 모델의 예측 확률을 가중 평균 (0.5씩 가중치)
ensemble_oof_pred = (xgb_oof_pred * 0.5) + (lgbm_oof_pred * 0.5)

# 가장 높은 확률을 가진 클래스를 선택
ensemble_test_pred = np.argmax(ensemble_oof_pred, axis=1)

In [19]:
f1_test = f1_score(y_test, ensemble_test_pred, average='weighted')
print(f'테스트 데이터 F1 Score: {f1_test:.4f}')

테스트 데이터 F1 Score: 0.3014


# Inference

In [20]:
test_X = df_test.drop(columns=['ID'])
X_encoded = test_X.copy()
X_encoded[categorical_columns] = ordinal_encoder.transform(test_X[categorical_columns])

In [21]:
xgb_oof = np.array([m.predict_proba(X_encoded) for m in xgb_models['estimator']]).mean(axis=0)
lgbm_oof = np.array([m.predict_proba(X_encoded) for m in lgbm_models['estimator']]).mean(axis=0)

ensemble_oof = (xgb_oof * 0.5) + (lgbm_oof * 0.5)

predictions = np.argmax(ensemble_oof, axis=1)

In [22]:
original_labels = le_subclass.inverse_transform(predictions)

# Submisson

In [23]:
submisson = pd.read_csv("./sample_submission.csv")

In [24]:
submisson["SUBCLASS"] = original_labels

In [25]:
submisson.to_csv('./v5_submission.csv', encoding='UTF-8-sig', index=False)

In [26]:
df_sub = pd.read_csv("v5_submission.csv") # 학습용 데이터
df_sub['SUBCLASS'].value_counts()

SUBCLASS
BRCA      585
COAD      497
KIPAN     308
STES      303
GBMLGG    162
HNSC      112
SKCM       83
UCEC       73
THCA       67
LUAD       58
OV         50
LGG        46
KIRC       37
LUSC       34
LIHC       27
ACC        25
LAML       20
PRAD       15
CESC       15
SARC       15
PAAD        9
BLCA        3
TGCT        2
Name: count, dtype: int64