### https://qiita.com/uratatsu/items/49d54484f9099bc3acbb#%E3%82%B9%E3%82%B3%E3%82%A2%E6%8E%A8%E7%A7%BB%E3%81%8B%E3%82%89%E3%81%BF%E3%82%8Bkaggle%E3%82%B3%E3%83%B3%E3%83%9A%E3%81%AE%E3%83%97%E3%83%AD%E3%82%BB%E3%82%B9


# In [5]:
import utils as Util

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

import lightgbm as lgb
import optuna

# In [2]:
# Load the competition files from ./data.
# The sample-submission file is read with header=None, i.e. its first row is
# treated as data rather than as column names.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
sample = pd.read_csv("./data/submit_sample.csv", header=None)

# In [None]:
# Convert the categorical features to numeric codes.
# NOTE(review): train and test are encoded by two independent calls. If
# Util.bulk_label_encoding fits a fresh encoder on each call, the same category
# may receive different codes in train and test -- confirm against the helper.
categorical_columns = ['region', 'manufacturer', 'condition', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color', 'state']
train = Util.bulk_label_encoding(train, categorical_columns)
test = Util.bulk_label_encoding(test, categorical_columns)

# Separate the features from the target variable.
# Despite its name, `target_columns` lists the model's input features;
# the prediction target is the "price" column.
target_columns = ["odometer", "fuel", "drive", "transmission", "year", "cylinders"]
X = train[target_columns]
y = train["price"]

# LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'mape',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Cross-validation with KFold
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

scores = []

for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    d_train = lgb.Dataset(X_train, label=y_train)
    # Bin the validation fold with the training fold's bin boundaries.
    d_val = lgb.Dataset(X_val, label=y_val, reference=d_train)

    # Early stopping is configured through a callback; without it every fold
    # trains the full 1000 rounds and `best_iteration` is not determined by the
    # validation score. The training set stays in `valid_sets` so its metric is
    # still evaluated, but early stopping only monitors the validation fold.
    model = lgb.train(
        params,
        d_train,
        num_boost_round=1000,
        valid_sets=[d_train, d_val],
        valid_names=["train", "valid"],
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
    )

    # Predict with the best round found on the validation fold.
    y_pred = model.predict(X_val, num_iteration=model.best_iteration)
    # sklearn returns MAPE as a fraction; scale it to a percentage.
    score = mean_absolute_percentage_error(y_val, y_pred) * 100
    scores.append(score)

mean_score = np.mean(scores)
print("Mean Cross-Validation MAPE:", mean_score)

# In [None]:
# Convert the categorical features to numeric codes.
categorical_columns = [
    'region', 'manufacturer', 'condition', 'cylinders', 'fuel', 'title_status',
    'transmission', 'drive', 'size', 'type', 'paint_color', 'state',
]
train = Util.bulk_label_encoding(train, categorical_columns)
test = Util.bulk_label_encoding(test, categorical_columns)

# Separate the features from the target variable ("price").
target_columns = [
    "odometer", "fuel", "drive", "transmission", "year", "cylinders",
]
X, y = train[target_columns], train["price"]

# Split into training and validation subsets (20% held out for validation).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
def objective(trial):
    """Optuna objective: validation MAPE (in percent) of one LightGBM model.

    Samples `max_bin` and `num_leaves` from the trial, trains on the
    module-level `X_train`/`y_train` with early stopping on
    `X_valid`/`y_valid`, and returns the hold-out MAPE * 100 (lower is better).

    Args:
        trial: the `optuna` trial used to sample hyperparameters.

    Returns:
        The validation MAPE as a percentage.
    """
    params = {
        'objective': 'regression',
        # Evaluate (and early-stop on) the same metric the study minimizes.
        'metric': 'mape',
        'max_bin': trial.suggest_int('max_bin', 255, 500),
        'learning_rate': 0.05,
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
    }

    # The datasets are rebuilt for every trial because `max_bin` affects how
    # the features are binned when the Dataset is constructed.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # Without early stopping every trial trains the full 1000 rounds and
    # `best_iteration` is not determined by the validation score. Early
    # stopping only monitors the validation set, not the training set.
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        valid_names=["train", "valid"],
        num_boost_round=1000,
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
    )

    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    # sklearn returns MAPE as a fraction; scale it to a percentage.
    score = mean_absolute_percentage_error(y_valid, y_pred_valid) * 100
    return score
# Random search (fixed seed) over the objective's hyperparameters; the study
# minimizes the value returned by `objective`.
sampler = optuna.samplers.RandomSampler(seed=0)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=40)
# Bare expression: shown as the cell output when run in a notebook.
study.best_params
# study.best_params recorded from an earlier run:
# {'max_bin': 426, 'num_leaves': 37}