In [1]:
import pandas as pd
import numpy as np

train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [2]:
train=train.drop('index',axis=1)
train=train.drop('occyp_type',axis=1)

test=test.drop('index',axis=1)
test=test.drop('occyp_type',axis=1)

In [3]:
duplicate_rows = train[train.duplicated(train.columns.difference(['credit']), keep=False)]

print("중복된 행의 개수:", len(duplicate_rows))

중복된 행의 개수: 4497


In [4]:
train = train.sort_values('credit', ascending=False).drop_duplicates(train.columns.difference(['credit']), keep='first')

In [5]:
duplicate_rows = train[train.duplicated(train.columns.difference(['credit']), keep=False)]

print("중복된 행의 개수:", len(duplicate_rows))

중복된 행의 개수: 0


In [8]:
train = train.drop('FLAG_MOBIL', axis=1)
train = train.drop('child_num', axis=1)

train['gender'] = train['gender'].replace(['F','M'],[0,1])
train['car'] = train['car'].replace(['N','Y'],[0,1])
train['reality'] = train['reality'].replace(['N','Y'],[0,1])

test = test.drop('FLAG_MOBIL', axis=1)
test = test.drop('child_num', axis=1)

test['gender'] = test['gender'].replace(['F','M'],[0,1])
test['car'] = test['car'].replace(['N','Y'],[0,1])
test['reality'] = test['reality'].replace(['N','Y'],[0,1])

In [10]:
def days_to_year(x):
    if x<0:
        return (x*(-1))/365
    return 0

def minus_to_plus(x):
    if x<0:
        return x*(-1)
    return 0

In [11]:
train['DAYS_BIRTH'] = train['DAYS_BIRTH'].apply(days_to_year)
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].apply(days_to_year)
train['begin_month'] = train['begin_month'].apply(minus_to_plus)

test['DAYS_BIRTH'] = test['DAYS_BIRTH'].apply(days_to_year)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].apply(days_to_year)
test['begin_month'] = test['begin_month'].apply(minus_to_plus)

In [13]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

train['income_type'] = label_encoder.fit_transform(train['income_type'])
test['income_type'] = label_encoder.fit_transform(test['income_type'])

#교육 수준 순서 고려
edu_type_mapping = {
    'Lower secondary': 0,
    'Secondary / secondary special': 1,
    'Incomplete higher': 2,
    'Higher education': 3,
    'Academic degree': 4
}
train['edu_type'] = train['edu_type'].map(edu_type_mapping)
test['edu_type'] = test['edu_type'].map(edu_type_mapping)

train['family_type'] = label_encoder.fit_transform(train['family_type'])
test['family_type'] = label_encoder.fit_transform(test['family_type'])

train['house_type'] = label_encoder.fit_transform(train['house_type'])
test['house_type'] = label_encoder.fit_transform(test['house_type'])

In [15]:
train.loc[train['family_size'] >= 4,'family_size']=4
test.loc[test['family_size']>=4, 'family_size']=4

In [17]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report, log_loss
from imblearn.over_sampling import SMOTE

# 데이터 준비
X = train.drop('credit', axis=1)
y = train['credit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

smote = SMOTE(random_state=42)
X_Sover, y_Sover = smote.fit_resample(X_train, y_train)
print(X_Sover.shape, y_Sover.shape)

(38043, 16) (38043,)


In [18]:
display(y_train.value_counts(),y_Sover.value_counts())

2.0    12681
1.0     4438
0.0     2164
Name: credit, dtype: int64

2.0    12681
1.0    12681
0.0    12681
Name: credit, dtype: int64

In [40]:
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split

# Objective 함수 정의
def objective(trial):
    # 하이퍼파라미터 탐색 공간 설정
    params = {
        'iterations': trial.suggest_int('iterations', 50, 200),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_seed': 42,
        'verbose': False
    }
    
    # CatBoost Pool 생성
    catboost_pool = Pool(X_Sover, y_Sover)
    
    # 모델 생성 및 학습
    model = CatBoostClassifier(**params, loss_function='MultiClass')
    model.fit(catboost_pool, verbose=False)
    
    # 예측
    preds_proba = model.predict_proba(X_test)
    
    # 평가
    logloss = log_loss(y_test, preds_proba)
    
    return logloss

# Study 객체 생성 및 최적화 실행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=12)

# 최적의 하이퍼파라미터 출력
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-05-09 10:35:16,384] A new study created in memory with name: no-name-62e20bc9-4a0c-4b1e-9d41-ae7e22f65417
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10),
[I 2024-05-09 10:35:17,306] Trial 0 finished with value: 0.8734468297758057 and parameters: {'iterations': 96, 'learning_rate': 0.10837269698685946, 'depth': 5, 'l2_leaf_reg': 1.7390120924335961, 'border_count': 70}. Best is trial 0 with value: 0.8734468297758057.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10),
[I 2024-05-09 10:35:19,410] Trial 1 finished with value: 0.9057521972467732 and parameters: {'iterations': 147, 'learning_rate': 0.01927235269020872, 'depth': 7, 'l2_leaf_reg': 4.43337517367455, 'border_count': 220}. Best is trial 0 with value: 0.8734468297758057.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  '

[I 2024-05-09 10:35:38,718] Trial 8 finished with value: 0.7899766683476285 and parameters: {'iterations': 180, 'learning_rate': 0.15767494364442305, 'depth': 10, 'l2_leaf_reg': 2.4687984943380297, 'border_count': 230}. Best is trial 8 with value: 0.7899766683476285.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10),
[I 2024-05-09 10:35:44,475] Trial 9 finished with value: 0.8114960699107742 and parameters: {'iterations': 185, 'learning_rate': 0.07333774403467648, 'depth': 9, 'l2_leaf_reg': 0.8151135435725437, 'border_count': 136}. Best is trial 8 with value: 0.7899766683476285.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10),
[I 2024-05-09 10:35:56,788] Trial 10 finished with value: 0.8027918833151056 and parameters: {'iterations': 164, 'learning_rate': 0.23375513969695308, 'depth': 10, 'l2_leaf_reg': 0.6890

Best trial:
  Value: 0.7899766683476285
  Params: 
    iterations: 180
    learning_rate: 0.15767494364442305
    depth: 10
    l2_leaf_reg: 2.4687984943380297
    border_count: 230


In [19]:
from sklearn.model_selection import StratifiedKFold

n_fold = 15
n_class = 3

# 하이퍼파라미터 설정
best_params = {
    'iterations': 180,
    'learning_rate': 0.15767494364442305,
    'depth': 10,
    'l2_leaf_reg': 2.4687984943380297,
    'border_count': 230,
    'random_seed': 19,
    'verbose': False
}

# StratifiedKFold를 사용하여 데이터를 15개의 폴드로 나눔
skf = StratifiedKFold(n_splits=15, random_state=42, shuffle=True)

# 교차 검증 수행
fold_accuracies = []
fold_log_losses = []
for train_index, test_index in skf.split(X_Sover, y_Sover):
    X_train_fold, X_test_fold = X_Sover.iloc[train_index], X_Sover.iloc[test_index]
    y_train_fold, y_test_fold = y_Sover.iloc[train_index], y_Sover.iloc[test_index]
    
    # CatBoost 모델 생성
    model = CatBoostClassifier(**best_params, loss_function='MultiClass')
    
    # 모델 학습
    model.fit(X_train_fold, y_train_fold, verbose=False)
    
    # 검증 데이터에 대한 예측
    preds_proba = model.predict_proba(X_test_fold)
    preds_class = model.predict(X_test_fold)
    
    # 평가 지표 계산
    accuracy = accuracy_score(y_test_fold, preds_class)
    logloss = log_loss(y_test_fold, preds_proba)
    
    fold_accuracies.append(accuracy)
    fold_log_losses.append(logloss)

# 결과 출력
print("Average Accuracy Score:", np.mean(fold_accuracies))
print("Average Log Loss:", np.mean(fold_log_losses))

Average Accuracy Score: 0.7655548564733012
Average Log Loss: 0.604622603901755
