<a href="https://colab.research.google.com/github/maskmo/igb-project/blob/Milestone-3A/Milestone3A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [182]:
!pip install shap
!pip install optuna
!pip install lightgbm



In [183]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
import numpy as np
import optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
from optuna.integration import LightGBMPruningCallback

In [184]:
x_data = pd.read_csv('/content/train.csv', index_col='Id')
x_test = pd.read_csv('/content/test.csv', index_col='Id')

cat_col_list = []
for col in x_data.columns:
    if x_data[col].dtype not in ['float', 'int']:
        cat_col_list.append(col)
num_col = x_data.columns.drop(cat_col_list)
cat_col = x_data.columns.drop(num_col)
x_data.drop(columns=cat_col)
x_data.drop(columns=["LotFrontage"])
x_data.dropna()

y = x_data.SalePrice
# x_data.drop(['SalePrice'], axis=1, inplace=True)

In [185]:
x_train, x_valid, y_train, y_valid = train_test_split(x_data, y, train_size=0.8, test_size=0.2, random_state=0)

In [256]:
from sklearn.model_selection import KFold
def objective(trial, X, y):
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 4020, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1)
    }
    cv = KFold(n_splits=5, shuffle=True, random_state=1121218)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = lgbm.LGBMRegressor(objective="regression", **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="l1",
            early_stopping_rounds=100,
            callbacks=[
                LightGBMPruningCallback(trial, "l1")
            ],  # Add a pruning callback
        )
        preds = model.predict(X_test)
        cv_scores[idx] = model.score(X_test, y_test)

    return np.mean(cv_scores)

In [254]:
ind = pd.Series(range(1,1169))
x_train = x_train.set_index(ind)
y_train = y_train.reset_index()

ValueError: ignored

In [257]:
from sklearn.preprocessing import LabelEncoder
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
func = lambda trial: objective(trial, X = x_train[num_col], y = y_train['SalePrice'])
study.optimize(func, n_trials=20)

[I 2023-07-20 05:35:01,206] A new study created in memory with name: LGBM Classifier








[I 2023-07-20 05:35:01,713] Trial 0 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.05333228052399561, 'num_leaves': 20, 'max_depth': 8, 'min_data_in_leaf': 1700, 'max_bin': 276, 'lambda_l1': 35, 'lambda_l2': 70, 'min_gain_to_split': 5.854954601621591, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 0 with value: -0.006543464652617503.








[I 2023-07-20 05:35:02,198] Trial 1 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.08924817126378787, 'num_leaves': 820, 'max_depth': 8, 'min_data_in_leaf': 1100, 'max_bin': 235, 'lambda_l1': 30, 'lambda_l2': 60, 'min_gain_to_split': 11.69832232480213, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 0 with value: -0.006543464652617503.








[I 2023-07-20 05:35:02,705] Trial 2 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.032982855125049566, 'num_leaves': 2060, 'max_depth': 6, 'min_data_in_leaf': 5900, 'max_bin': 248, 'lambda_l1': 5, 'lambda_l2': 55, 'min_gain_to_split': 7.06914302309792, 'bagging_fraction': 0.2, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: -0.006543464652617503.








[I 2023-07-20 05:35:03,233] Trial 3 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.037037698685149396, 'num_leaves': 3560, 'max_depth': 9, 'min_data_in_leaf': 9300, 'max_bin': 297, 'lambda_l1': 100, 'lambda_l2': 65, 'min_gain_to_split': 6.4180862441052735, 'bagging_fraction': 0.2, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 0 with value: -0.006543464652617503.








[I 2023-07-20 05:35:03,771] Trial 4 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.01699560032664188, 'num_leaves': 2320, 'max_depth': 5, 'min_data_in_leaf': 5100, 'max_bin': 259, 'lambda_l1': 60, 'lambda_l2': 70, 'min_gain_to_split': 5.06060389484179, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 0 with value: -0.006543464652617503.








[I 2023-07-20 05:35:04,480] Trial 5 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.08255449408776862, 'num_leaves': 440, 'max_depth': 12, 'min_data_in_leaf': 4300, 'max_bin': 277, 'lambda_l1': 15, 'lambda_l2': 50, 'min_gain_to_split': 8.524446458093248, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: -0.006543464652617503.
















[I 2023-07-20 05:35:05,394] Trial 6 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.04922063372390385, 'num_leaves': 1500, 'max_depth': 8, 'min_data_in_leaf': 6900, 'max_bin': 295, 'lambda_l1': 60, 'lambda_l2': 90, 'min_gain_to_split': 14.028061905142849, 'bagging_fraction': 0.2, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 0 with value: -0.006543464652617503.








[I 2023-07-20 05:35:06,278] Trial 7 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.18910176186825106, 'num_leaves': 1580, 'max_depth': 9, 'min_data_in_leaf': 4300, 'max_bin': 283, 'lambda_l1': 75, 'lambda_l2': 65, 'min_gain_to_split': 3.042576116118944, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 0 with value: -0.006543464652617503.












[I 2023-07-20 05:35:07,192] Trial 8 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.059822923444290174, 'num_leaves': 500, 'max_depth': 5, 'min_data_in_leaf': 8600, 'max_bin': 295, 'lambda_l1': 85, 'lambda_l2': 80, 'min_gain_to_split': 6.451109365245339, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 0 with value: -0.006543464652617503.













[I 2023-07-20 05:35:08,119] Trial 9 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.047872746878217985, 'num_leaves': 3500, 'max_depth': 11, 'min_data_in_leaf': 9400, 'max_bin': 218, 'lambda_l1': 35, 'lambda_l2': 55, 'min_gain_to_split': 14.532028479501832, 'bagging_fraction': 0.2, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 0 with value: -0.006543464652617503.












[1;30;43mStreaming output truncated to the last 5000 lines.[0m




[1;30;43mStreaming output truncated to the last 5000 lines.[0m




[1;30;43mStreaming output truncated to the last 5000 lines.[0m




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[I 2023-07-20 05:43:40,413] Trial 10 finished with value: -4.675766551333864 and parameters: {'n_estimators': 10000, 'learning_rate': 0.014014497502966943, 'num_leaves': 60, 'max_depth': 3, 'min_data_in_leaf': 200, 'max_bin': 262, 'lambda_l1': 40, 'lambda_l2': 15, 'min_gain_to_split': 0.8214164700347597, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 10 with value: -4.675766551333864.








[I 2023-07-20 05:43:41,279] Trial 11 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.010488745993742119, 'num_leaves': 40, 'max_depth': 3, 'min_data_in_leaf': 600, 'max_bin': 264, 'lambda_l1': 40, 'lambda_l2': 15, 'min_gain_to_split': 0.293590588292608, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 10 with value: -4.675766551333864.












[I 2023-07-20 05:43:42,134] Trial 12 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.020497495050747355, 'num_leaves': 40, 'max_depth': 3, 'min_data_in_leaf': 2300, 'max_bin': 269, 'lambda_l1': 20, 'lambda_l2': 0, 'min_gain_to_split': 1.1721746040575463, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 10 with value: -4.675766551333864.












[I 2023-07-20 05:43:43,036] Trial 13 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.010049781855006839, 'num_leaves': 1160, 'max_depth': 6, 'min_data_in_leaf': 2600, 'max_bin': 244, 'lambda_l1': 45, 'lambda_l2': 30, 'min_gain_to_split': 4.076718546669483, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 10 with value: -4.675766551333864.












[I 2023-07-20 05:43:44,034] Trial 14 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.02085510693452679, 'num_leaves': 2780, 'max_depth': 10, 'min_data_in_leaf': 200, 'max_bin': 201, 'lambda_l1': 60, 'lambda_l2': 40, 'min_gain_to_split': 2.2524651133357745, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 10 with value: -4.675766551333864.












[I 2023-07-20 05:43:44,905] Trial 15 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2655491528777317, 'num_leaves': 880, 'max_depth': 7, 'min_data_in_leaf': 2000, 'max_bin': 278, 'lambda_l1': 25, 'lambda_l2': 100, 'min_gain_to_split': 0.08975030859166289, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 10 with value: -4.675766551333864.












[I 2023-07-20 05:43:45,752] Trial 16 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.027514386159218696, 'num_leaves': 460, 'max_depth': 4, 'min_data_in_leaf': 3500, 'max_bin': 228, 'lambda_l1': 55, 'lambda_l2': 25, 'min_gain_to_split': 2.684289009828831, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 10 with value: -4.675766551333864.












[I 2023-07-20 05:43:46,585] Trial 17 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.11494640624833237, 'num_leaves': 2780, 'max_depth': 7, 'min_data_in_leaf': 1600, 'max_bin': 255, 'lambda_l1': 0, 'lambda_l2': 10, 'min_gain_to_split': 4.127790026379996, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 10 with value: -4.675766551333864.












[I 2023-07-20 05:43:47,435] Trial 18 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.015613434802092232, 'num_leaves': 1400, 'max_depth': 10, 'min_data_in_leaf': 3100, 'max_bin': 269, 'lambda_l1': 75, 'lambda_l2': 35, 'min_gain_to_split': 9.212618318361551, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 10 with value: -4.675766551333864.












[I 2023-07-20 05:43:48,286] Trial 19 finished with value: -0.006543464652617503 and parameters: {'n_estimators': 10000, 'learning_rate': 0.025860876807642364, 'num_leaves': 940, 'max_depth': 5, 'min_data_in_leaf': 300, 'max_bin': 285, 'lambda_l1': 45, 'lambda_l2': 75, 'min_gain_to_split': 1.6365932184808507, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 10 with value: -4.675766551333864.




In [258]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (rmse): -4.67577
	Best params:
		n_estimators: 10000
		learning_rate: 0.014014497502966943
		num_leaves: 60
		max_depth: 3
		min_data_in_leaf: 200
		max_bin: 262
		lambda_l1: 40
		lambda_l2: 15
		min_gain_to_split: 0.8214164700347597
		bagging_fraction: 0.4
		bagging_freq: 1
		feature_fraction: 0.7


In [267]:
params_grid = {
                'n_estimators': 10000,
                'learning_rate': 0.014014497502966943,
                'num_leaves': 60,
                'max_depth': 3,
                'min_data_in_leaf': 200,
                'max_bin': 262,
                'lambda_l1': 40,
                'lambda_l2': 15,
                'min_gain_to_split': 0.8214164700347597,
                'bagging_fraction': 0.4,
                'bagging_freq': 1,
                'feature_fraction': 0.7,
}

model = lgbm.LGBMRegressor(objective="regression", **params_grid)
reg = model.fit(
            x_train[num_col],
            y_train.SalePrice,
            eval_metric="l1",
            eval_set=[(x_valid[num_col], y_valid)],
            early_stopping_rounds=100,
        )
pred = reg.predict(x_valid[num_col])
reg.score(x_valid[num_col], y_valid)
print(pred)



[1]	valid_0's l1: 56730.1	valid_0's l2: 6.85167e+09
[2]	valid_0's l1: 56084.5	valid_0's l2: 6.76604e+09
[3]	valid_0's l1: 55781.5	valid_0's l2: 6.72559e+09
[4]	valid_0's l1: 55170.2	valid_0's l2: 6.64572e+09
[5]	valid_0's l1: 54874.4	valid_0's l2: 6.605e+09
[6]	valid_0's l1: 54234.1	valid_0's l2: 6.52451e+09
[7]	valid_0's l1: 53952.7	valid_0's l2: 6.47793e+09
[8]	valid_0's l1: 53378.4	valid_0's l2: 6.40117e+09
[9]	valid_0's l1: 52766.2	valid_0's l2: 6.32281e+09
[10]	valid_0's l1: 52248.4	valid_0's l2: 6.25101e+09
[11]	valid_0's l1: 51722.5	valid_0's l2: 6.17926e+09
[12]	valid_0's l1: 51209.2	valid_0's l2: 6.11335e+09
[13]	valid_0's l1: 50943.8	valid_0's l2: 6.07151e+09
[14]	valid_0's l1: 50381.8	valid_0's l2: 6.00209e+09
[15]	valid_0's l1: 50114.4	valid_0's l2: 5.95974e+09
[16]	valid_0's l1: 49543.1	valid_0's l2: 5.89374e+09
[17]	valid_0's l1: 49067.2	valid_0's l2: 5.82805e+09
[18]	valid_0's l1: 48790.2	valid_0's l2: 5.78691e+09
[19]	valid_0's l1: 48249	valid_0's l2: 5.72075e+09
[20]	v