#### ライブラリの読み込み

In [2]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import japanize_matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from tqdm.notebook import tqdm

import warnings
# Suppress every Python warning for the rest of the kernel session.
# Note: this blanket filter also hides deprecation warnings.
warnings.filterwarnings("ignore")

%matplotlib inline

#### データの読み込み

In [3]:
PATH = ""  # directory prefix of the data files (empty string = current working directory)

# Resolve both input locations first, then read them.
data_csv = PATH + "df.csv"
sample_submission_csv = PATH + 'sample_submission.csv'

df = pd.read_csv(data_csv)
sub_df = pd.read_csv(sample_submission_csv)

# Show the shapes of both frames as the cell output.
(df.shape, sub_df.shape)

((784713, 22), (19301, 2))

In [4]:
# Rows with a known target form the training set; rows whose target is
# missing are the ones to be predicted.
target_col = "取引価格（総額）_log"
has_target = df[target_col].notnull()
train = df[has_target]
test = df[~has_target]

# Feature matrix and label column vector for training, plus the feature
# matrix to predict on (all as plain NumPy arrays).
X = train.drop(columns=target_col).values
y = train[[target_col]].values
test = test.drop(columns=target_col).values

# Show the resulting shapes as the cell output.
(X.shape, y.shape, test.shape)

((765412, 21), (765412, 1), (19301, 21))

#### KFoldで学習

In [5]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import lightgbm as lgb

In [6]:
# --- Cross-validation / training-loop settings ------------------------------
FOLD = 5                     # number of KFold splits
# Upper bound on boosting iterations per fold. This used to be 100 while
# params also carried "num_boost_round": 30000, which silently overrode it;
# the effective value (30000) now lives in exactly one place.
NUM_ROUND = 30000
EARLY_STOPPING_ROUNDS = 100  # stop once the validation metric has not improved for this many rounds
VERBOSE_EVAL = -1            # logging period for per-iteration evaluation (<= 0 disables it)

# LightGBM model hyper-parameters only. Number of rounds and early stopping
# are passed to lgb.train() explicitly instead of being hidden in this dict.
params = {
    'objective': 'mae',        # objective function (L1 loss)
    'metric': 'mae',           # evaluation metric
    'num_leaves': 20,          # maximum number of leaves per tree
    'max_depth': 14,           # maximum tree depth
    "feature_fraction": 0.5,   # fraction of features sampled for each tree
    'subsample_freq': 1,       # bagging frequency (alias of bagging_freq): re-sample every iteration
    "bagging_fraction": 0.95,  # fraction of rows sampled when bagging
    'min_data_in_leaf': 100,   # minimum number of samples required in a leaf
    'learning_rate': 0.1,      # shrinkage rate
    "boosting": "gbdt",        # boosting algorithm
    "lambda_l1": 0.1,          # L1 regularisation strength
    "lambda_l2": 15,           # L2 regularisation strength
    "verbosity": -1,           # silence LightGBM's own info/warning logs
    "random_state": 42,        # random seed
}

valid_scores = []  # per-fold validation MAE
models = []        # per-fold trained boosters, reused later for test-time averaging
kf = KFold(n_splits=FOLD, shuffle=True, random_state=42)

for fold, (train_indices, valid_indices) in tqdm(enumerate(kf.split(X)), total=FOLD):
    X_train, X_valid = X[train_indices], X[valid_indices]
    # LightGBM expects 1-D labels; flatten in case y is an (n, 1) column vector.
    y_train, y_valid = y[train_indices].ravel(), y[valid_indices].ravel()

    lgb_train = lgb.Dataset(X_train, y_train)
    # Tie the validation set to the training set so both use the same feature binning.
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # Early stopping and logging go through callbacks: the `verbose_eval`
    # keyword was removed from lgb.train() in LightGBM 4.0.
    # (The callbacks below require LightGBM >= 3.3.)
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=NUM_ROUND,
        valid_sets=[lgb_eval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(period=VERBOSE_EVAL),
        ],
    )

    # predict() uses the best iteration found by early stopping by default.
    y_valid_pred = model.predict(X_valid)
    score = mean_absolute_error(y_valid, y_valid_pred)
    tqdm.write(f'fold {fold} MAE: {score}')
    valid_scores.append(score)

    models.append(model)

# Aggregate cross-validation score across all folds.
tqdm.write(f'CV MAE: {np.mean(valid_scores)} (std: {np.std(valid_scores)})')

  0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[29997]	valid_0's l1: 0.0737867
fold 0 MAE: 0.0737866885997491
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[30000]	valid_0's l1: 0.0735844
fold 1 MAE: 0.07358445261331459
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[30000]	valid_0's l1: 0.0729966
fold 2 MAE: 0.07299663688963381
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[29996]	valid_0's l1: 0.0736899
fold 3 MAE: 0.0736899107145412
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[30000]	valid_0's l1: 0.0731596
fold 4 MAE: 0.07315957981796929


#### 予測ラベルを作成

In [15]:
# Score the test features with every fold model, then average the per-fold
# predictions element-wise to obtain the ensemble prediction.
test_predictions = [
    fold_model.predict(test) for fold_model in tqdm(models, total=len(models))
]

pre = np.mean(test_predictions, axis=0)

  0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
# Wrap the averaged predictions in a one-column frame named after the target.
pd_pre = pd.DataFrame({"取引価格（総額）_log": pre})
pd_pre

Unnamed: 0,取引価格（総額）_log
0,7.747308
1,7.731821
2,7.131194
3,6.694156
4,7.668088
...,...
19296,7.554827
19297,7.555310
19298,7.557137
19299,7.242938


#### submissionの形式にする

In [17]:
# Attach the averaged predictions to the submission template.
# Assign by position rather than by index label, and fail loudly on a
# row-count mismatch: a label-aligned Series assignment would silently
# leave NaN in the submission instead.
# NOTE(review): assumes the test rows are in the same order as the rows of
# sample_submission.csv - confirm against the data preparation step.
assert len(pd_pre) == len(sub_df), (
    f"prediction count {len(pd_pre)} != submission rows {len(sub_df)}"
)
sub_df["取引価格（総額）_log"] = pd_pre["取引価格（総額）_log"].values
sub_df

Unnamed: 0,ID,取引価格（総額）_log
0,1000077,7.747308
1,1000081,7.731821
2,1000128,7.131194
3,1000129,6.694156
4,1000130,7.668088
...,...,...
19296,47006020,7.554827
19297,47006229,7.555310
19298,47006331,7.557137
19299,47006332,7.242938


#### CSV出力

In [18]:
# Write the submission next to the input data, without the index column.
submission_csv = PATH + 'submission.csv'
sub_df.to_csv(submission_csv, index=False)