# Tutorial

- Model
  - LightGBM
- Preprocess
  - Drop columns that contain NaN values

In [1]:
from pathlib import Path
import glob
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import pandas as pd
import japanize_matplotlib
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

from utils import load_data

## Constants

In [4]:
# Fixed settings shared by the rest of the notebook
SEED = 42
TARGET = "取引価格（総額）_log"

# Capture the current time once
now = datetime.now()

# Render it as a string (e.g. "2024-07-23_18-57-30")
now_str = now.strftime("%Y-%m-%d_%H-%M-%S")

# Input / output locations; RESULT_DIR is a timestamped sub-directory of OUTPUT_DIR
DATA_DIR = Path("./data/")
OUTPUT_DIR = Path("./logs/")
RESULT_DIR = OUTPUT_DIR.joinpath(now_str)

## Preprocess

In [None]:
# Read the three frames returned by load_data from DATA_DIR
train_df, test_df, sub_df = load_data(DATA_DIR)

# Stack train and test row-wise so the preprocessing below is applied to both at once
df = pd.concat((train_df, test_df), axis=0)

# Inspect dtypes and non-null counts
df.info()

In [None]:
# Columns that contain at least one null value are candidates for removal;
# the target variable and the ID column are always kept.
has_null = df.isnull().sum() >= 1
rm_cols = [col for col in df.columns[has_null] if col not in (TARGET, "ID")]

rm_cols

In [None]:
# Remove the columns listed in rm_cols, then re-check the remaining schema
df = df.drop(columns=rm_cols)
df.info()

In [None]:
# Order the rows by the "取引時点" (transaction period) column and renumber the index from 0
df = df.sort_values("取引時点").reset_index(drop=True)
df.head()

In [None]:
# List the distinct values found in the "取引時点" column
pd.unique(df["取引時点"])

In [None]:
# Find the first row label belonging to the validation period (2023 Q1-Q2)
# and to the test period (2023 Q3-Q4). These labels are used as positional
# split points further down.
period = df['取引時点']
is_val_period = period.str.contains('2023年第1四半期|2023年第2四半期', regex=True)
is_test_period = period.str.contains('2023年第3四半期|2023年第4四半期', regex=True)

val_min_idx = min(df.loc[is_val_period].index)
test_min_idx = min(df.loc[is_test_period].index)

val_min_idx, test_min_idx

In [None]:
# Names of all columns whose dtype is "object"
obj_cols = [name for name, dtype in df.dtypes.items() if dtype == "object"]
obj_cols

In [None]:
# Cast every object-dtype column to str so each holds a single value type
df = df.astype({name: str for name in obj_cols})
df.info()

In [None]:
# Replace each object-dtype column with integer codes, fitting a fresh encoder per column
for name in obj_cols:
    df[name] = LabelEncoder().fit_transform(df[name])

df.head()

In [None]:
# Split train, valid, test data by position, using the boundaries computed above.
# Each split is an explicit copy: a later cell assigns a new column to test_df,
# and doing that on a bare iloc slice of df raises SettingWithCopyWarning.
train_df = df.iloc[:val_min_idx, :].copy()
val_df = df.iloc[val_min_idx:test_min_idx, :].copy()
test_df = df.iloc[test_min_idx:, :].copy()

display(train_df.shape, val_df.shape, test_df.shape)

In [None]:
# Select the features: every column except the target and the ID
feat_cols = df.columns.drop([TARGET, "ID"]).tolist()
feat_cols

In [None]:
# Build the feature frames and target series for the train / validation / test splits
train_x, train_y = train_df[feat_cols], train_df[TARGET]
val_x, val_y = val_df[feat_cols], val_df[TARGET]
test_x, test_y = test_df[feat_cols], test_df[TARGET]

In [None]:
# LightGBM hyper-parameters. The iteration budget and the early-stopping
# patience are passed through params rather than as lgb.train arguments.
lgb_params = {
    'objective': 'regression',
    'metric': 'mae',  # set to the competition's evaluation metric
    'num_leaves': 42,
    'max_depth': 7,
    "feature_fraction": 0.8,
    'subsample_freq': 1,
    "bagging_fraction": 0.95,
    'min_data_in_leaf': 2,
    'learning_rate': 0.1,
    "boosting": "gbdt",
    "lambda_l1": 0.1,
    "lambda_l2": 10,
    "verbosity": -1,
    "random_state": SEED,  # reuse the notebook-wide seed instead of a second literal
    "num_boost_round": 50000,  # maximum number of boosting iterations
    "early_stopping_rounds": 100
}

# Only the label-encoded (originally object-dtype) columns are categorical.
# Declaring every feature categorical would make LightGBM cast numeric
# features to integers and split on them as unordered categories.
cat_cols = [col for col in feat_cols if col in obj_cols]

# The categorical columns are declared on the Dataset (not on lgb.train),
# and the validation set is bound to the training set via reference=.
train_data = lgb.Dataset(train_x, label=train_y, categorical_feature=cat_cols)
val_data = lgb.Dataset(val_x, label=val_y, categorical_feature=cat_cols, reference=train_data)

model = lgb.train(
    lgb_params,
    train_data,
    valid_sets=[train_data, val_data],
    callbacks=[lgb.log_evaluation(100)],
)

# Score the validation split with the best iteration
val_pred = model.predict(val_x, num_iteration=model.best_iteration)
score = mean_absolute_error(val_y, val_pred)

# Per-row predictions next to the actual values, ordered by row index
pred_df = pd.DataFrame(sorted(zip(val_x.index, val_pred, val_y)), columns=['index', 'predict', 'actual'])

# Feature importances in ascending order (model.feature_importance() default importance type)
feature_imp = pd.DataFrame(sorted(zip(model.feature_importance(), train_x.columns)), columns=['importance', 'feature'])

print(f'score: {score:.4f}')

In [None]:
# Plot the 50 most important features by gain, save the figure, then show and release it
ax = lgb.plot_importance(model, importance_type='gain', max_num_features=50, figsize=(12,8))
fig = ax.figure
fig.tight_layout()
fig.savefig('feature_importance.png')
plt.show()
plt.close()

In [None]:
# Predict the test split with the best iteration and check the returned type
best_iter = model.best_iteration
test_pred = model.predict(test_x, num_iteration=best_iter)
type(test_pred)

In [None]:
# Store the predictions in the target column. assign() builds a new frame, so
# nothing is written into a slice of df (which would raise SettingWithCopyWarning),
# and re-running this cell gives the same result.
test_df = test_df.assign(**{TARGET: test_pred})

In [None]:
# Attach the predictions to the submission IDs. A left join keeps every
# submission row; the default inner join would silently drop any ID that
# has no matching row in test_df.
sub_df = pd.merge(sub_df[['ID']], test_df[['ID', TARGET]], on='ID', how='left')

# Fail loudly instead of writing an incomplete submission file
n_missing = int(sub_df[TARGET].isna().sum())
if n_missing:
    raise ValueError(f"{n_missing} submission IDs have no prediction in test_df")

sub_df.to_csv('test_submission.csv', index=False)