In [2]:
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner

In [3]:
# 훈련 데이터 불러오기
df = pd.read_csv('../_data/dacon/wine/train.csv')

In [4]:
# 타입 변경
# white -> 0, red -> 1
type_to_num = {'white':0, 'red':1}
df['type'] = df['type'].replace(type_to_num)

In [6]:
# 퀄리티 변경
# 4 -> 0, 5 -> 1, 6 -> 2, 7 -> 3, 8 -> 4
quality_to_num = {4:0, 5:1, 6:2, 7:3, 8:4}
df['quality'] = df['quality'].replace(quality_to_num)

# Feature, Target 지정
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 
            'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 
            'pH', 'sulphates', 'alcohol', 'type']
target = 'quality'



In [7]:
# 교차 검증용 데이터셋 구성
df_trains = []
df_valids = []
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
for train_index, valid_index in skf.split(df[features], df[target]):
    df_train = df.loc[train_index]
    df_valid = df.loc[valid_index]
    df_trains.append(df_train)
    df_valids.append(df_valid)

In [8]:
def accuracy(true, pred):
    return np.mean(true==pred)

def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True), 
        'max_depth': trial.suggest_int('max_depth', 1, 10, step=1, log=False), 
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True), 
        'n_estimators': trial.suggest_int('n_estimators', 8, 1024, step=1, log=True), 
        'objective': 'multiclass', 
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50, step=1, log=False), 
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0), 
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
        'random_state': 0
    }
    
    score = []
    for df_train, df_valid in zip(df_trains, df_valids):
        clf = LGBMClassifier(**params)
        clf.fit(df_train[features], df_train[target])
        
        pred = clf.predict(df_valid[features])
        true = df_valid[target].values
        score.append(accuracy(true, pred))
    score = np.mean(score)
    return score


In [9]:
# Hyperparameter Tuning
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=0), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=50)

# Best Parameter로 모델 훈련
clfs = []
for df_train in df_trains:
    clf = LGBMClassifier(**study.best_params)
    clf.fit(df_train[features], df_train[target])
    clfs.append(clf)

[32m[I 2021-12-10 16:01:42,513][0m A new study created in memory with name: no-name-89ffbb91-b209-4f53-a13b-193731debba6[0m
[32m[I 2021-12-10 16:01:43,620][0m Trial 0 finished with value: 0.5456553466583086 and parameters: {'num_leaves': 54, 'max_depth': 8, 'learning_rate': 0.006431172050131994, 'n_estimators': 109, 'class_weight': None, 'min_child_samples': 27, 'subsample': 0.9675319002346239, 'colsample_bytree': 0.9890988281503088, 'reg_alpha': 0.3834415188257777, 'reg_lambda': 7.917250380826646}. Best is trial 0 with value: 0.5456553466583086.[0m
[32m[I 2021-12-10 16:01:43,719][0m Trial 1 finished with value: 0.47352869399610487 and parameters: {'num_leaves': 47, 'max_depth': 6, 'learning_rate': 0.05981221901152557, 'n_estimators': 11, 'class_weight': 'balanced', 'min_child_samples': 44, 'subsample': 0.9334470252849552, 'colsample_bytree': 0.9610036444740457, 'reg_alpha': 0.978618342232764, 'reg_lambda': 7.9915856421672355}. Best is trial 0 with value: 0.5456553466583086.[0

In [17]:
# 테스트 데이터 불러오기
df_test = pd.read_csv('../_data/dacon/wine/test.csv')
df_test['type'] = df_test['type'].replace(type_to_num)

# 예측 수행 (soft voting)
pred = [clf.predict_proba(df_test[features]) for clf in clfs]
pred = np.mean(pred, axis=0)
pred = np.argmax(pred, axis=1)+4

In [1]:
df_submit = pd.read_csv('../_data/dacon/wine/sample_submission.csv')
df_submit['quality'] = pred
df_submit.to_csv("../_data/dacon/wine/winequality1.csv", index=False) 

NameError: name 'pd' is not defined

In [20]:
from sklearn.metrics import accuracy_score
print(accuracy_score(df_test))

TypeError: accuracy_score() missing 1 required positional argument: 'y_pred'