In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory,
# then print them one per line in walk order.
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import lightgbm as lgb
import optuna.integration.lightgbm as oplgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import category_encoders as ce
import seaborn as sns

In [None]:
# Directory holding the competition CSVs. Keeping it in one constant means the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = "/kaggle/input/tabular-playground-series-feb-2021"

df_train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
df_test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
df_sample = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

In [None]:
# Preview the training frame as loaded from train.csv.
df_train

In [None]:
# Preview the test frame as loaded from test.csv.
df_test

In [None]:
# Set the row identifiers aside, then continue with frames that no longer
# carry the "id" column.
train_id, test_id = df_train["id"], df_test["id"]

df_train = df_train.drop(columns="id")
df_test = df_test.drop(columns="id")

# OneHotEncoder

In [None]:
# Names of the categorical columns: cat0 through cat9.
N_CAT_FEATURES = 10
cat_features = [f"cat{i}" for i in range(N_CAT_FEATURES)]

In [None]:
# Fit the encoder on train and test categoricals together so that both
# transformed frames share the same set of one-hot columns.
all_categoricals = pd.concat([df_train[cat_features], df_test[cat_features]], axis=0)
onehot_encoder = ce.one_hot.OneHotEncoder()
onehot_encoder.fit(all_categoricals)

# Transform each split and prefix every encoded column with "OHE_".
train_ohe = onehot_encoder.transform(df_train[cat_features]).rename(
    columns=lambda col: f"OHE_{col}"
)
test_ohe = onehot_encoder.transform(df_test[cat_features]).rename(
    columns=lambda col: f"OHE_{col}"
)

In [None]:
# Names of the continuous columns: cont0 through cont13.
N_CONT_FEATURES = 14
numerical_features = [f"cont{i}" for i in range(N_CONT_FEATURES)]

In [None]:
# Training feature matrix: continuous columns followed by the one-hot columns.
train_parts = [df_train[numerical_features], train_ohe]
train_x = pd.concat(train_parts, axis="columns")

In [None]:
# Test feature matrix, assembled the same way as the training one.
test_parts = [df_test[numerical_features], test_ohe]
test_x = pd.concat(test_parts, axis="columns")

In [None]:
# Regression target taken from the training frame.
train_y = df_train.loc[:, "target"]

In [None]:
# Inspect the assembled training feature matrix.
train_x

In [None]:
# Inspect the assembled test feature matrix.
test_x

# Optuna

In [None]:
# Wrap the features and target in the dataset object handed to the tuner.
oplgb_train_data = oplgb.Dataset(data=train_x, label=train_y)

In [None]:
# Fixed settings for the tuning run; the remaining hyperparameters are left
# for the tuner to search.
oplgb_params = dict(
    objective="regression",
    metric="root_mean_squared_error",
    verbosity=-1,
    learning_rate=0.01,
)

In [None]:
# Shuffled K-fold splitter with a fixed seed, so the splits are reproducible.
N_SPLITS = 5
SEED = 2021
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

In [None]:
# Cross-validated hyperparameter search over the shared folds.
tuner_cv = oplgb.LightGBMTunerCV(
    oplgb_params,
    oplgb_train_data,
    num_boost_round=1000,
    early_stopping_rounds=100,
    folds=folds,
    verbose_eval=100,
    time_budget=21600,  # presumably seconds (= 6 hours) -- confirm against the Optuna docs
)
tuner_cv.run()

In [None]:
# Show the best parameter set found by the tuner.
tuner_cv.best_params

In [None]:
class FoldsAverageLGBM:
    """Cross-validated LightGBM ensemble: one booster per fold, predictions averaged.

    Attributes set by ``fit``:
        models: list with one trained model per fold.
        oof_preds: float array of out-of-fold predictions, one per training row.
        train_x: the feature frame passed to ``fit``.
        train_y: the target passed to ``fit``, as a NumPy array.
    """

    def __init__(self, folds):
        """Store the splitter used by ``fit``.

        Args:
            folds: Object whose ``split(X)`` yields ``(train_idx, valid_idx)``
                index pairs.
        """
        self.folds = folds
        self.models = []

    def _check_fitted(self):
        """Raise ``RuntimeError`` when no fold model has been trained yet."""
        if not self.models:
            raise RuntimeError("FoldsAverageLGBM has no trained models; call fit() first.")

    def fit(self, lgb_params, train_x, train_y):
        """Train one model per fold and record out-of-fold predictions.

        Args:
            lgb_params: Parameter dict forwarded unchanged to ``lgb.train``.
            train_x: Feature frame; rows are selected with ``.iloc``.
            train_y: Target values aligned with ``train_x`` (Series or array-like).

        Returns:
            None. Results are stored on ``self.models`` and ``self.oof_preds``.
        """
        # Start from a clean slate: re-running fit (common in a notebook) must
        # not keep averaging in boosters left over from an earlier run.
        self.models = []

        self.train_x = train_x
        self.train_y = np.asarray(train_y)

        # Always a float buffer. Deriving it from the target's dtype would
        # truncate the predictions whenever the target is integer-typed.
        oof_preds = np.zeros(len(self.train_y), dtype=np.float64)

        # Use the splitter handed to the constructor, not a notebook global.
        for tr_idx, va_idx in tqdm(self.folds.split(train_x)):
            tr_x, va_x = self.train_x.iloc[tr_idx], self.train_x.iloc[va_idx]
            tr_y, va_y = self.train_y[tr_idx], self.train_y[va_idx]

            lgb_train_dataset = lgb.Dataset(tr_x, tr_y)
            lgb_valid_dataset = lgb.Dataset(va_x, va_y)
            model = lgb.train(
                lgb_params,
                lgb_train_dataset,
                valid_sets=[lgb_valid_dataset],
                verbose_eval=100,
            )
            self.models.append(model)

            # Each row is predicted only by the model that did not train on it.
            oof_preds[va_idx] = model.predict(va_x)

        self.oof_preds = oof_preds

    def predict(self, test_x):
        """Return the element-wise mean of every fold model's prediction.

        Args:
            test_x: Features to predict on, passed to each model's ``predict``.

        Raises:
            RuntimeError: If ``fit`` has not produced any model yet.
        """
        self._check_fitted()
        preds = [model.predict(test_x) for model in tqdm(self.models)]
        return np.mean(preds, axis=0)

    def get_feature_importance(self, importance_type="gain"):
        """Return per-fold feature importances as a DataFrame.

        Args:
            importance_type: Forwarded to each model's ``feature_importance``.

        Returns:
            DataFrame with one row per fold model and one column per feature,
            using the feature names reported by the first model.

        Raises:
            RuntimeError: If ``fit`` has not produced any model yet.
        """
        self._check_fitted()
        feature_names = self.models[0].feature_name()
        importances = [model.feature_importance(importance_type) for model in self.models]
        # Build the frame in one go instead of inserting columns one at a time.
        return pd.DataFrame(importances, columns=feature_names)

In [None]:
def plot_importance(importance_df, max_features=100):
    """Draw a horizontal boxen plot of the highest-ranked features.

    Columns of ``importance_df`` are ranked by their mean value, highest
    first, and only the first ``max_features`` of them are plotted, in that
    order.
    """
    mean_importance = importance_df.mean()
    ranked = mean_importance.sort_values(ascending=False)
    top_features = list(ranked.index[:max_features])
    sns.boxenplot(
        data=importance_df[top_features],
        orient="h",
        order=top_features,
    )

In [None]:
# Start from the tuned parameters and override the training schedule:
# a smaller learning rate with a much larger iteration cap and early stopping.
lgb_params = {
    **tuner_cv.best_params,
    "learning_rate": 0.005,
    "early_stopping_round": 200,
    "num_iterations": 20000,
}

In [None]:
# Build the fold-averaging ensemble on the same splitter used for tuning.
folds_average_lgbm = FoldsAverageLGBM(folds=folds)

In [None]:
# Train one model per fold with the final parameters.
folds_average_lgbm.fit(lgb_params=lgb_params, train_x=train_x, train_y=train_y)

In [None]:
# Gather the per-fold importances first, then open a large figure and plot them.
importance_df = folds_average_lgbm.get_feature_importance()
plt.figure(figsize=(20, 20))
plot_importance(importance_df)

In [None]:
# Out-of-fold RMSE: square root of the MSE between the target and the OOF predictions.
oof_mse = mean_squared_error(df_train.target, folds_average_lgbm.oof_preds)
np.sqrt(oof_mse)

In [None]:
# Fold-averaged predictions for the test features.
y_pred = folds_average_lgbm.predict(test_x=test_x)

In [None]:
# Copy the sample submission with its "target" column set to the predictions,
# write it out, and preview the first rows.
sub = df_sample.assign(target=y_pred)

sub.to_csv("submission_optuna_lgbm_ohe_v1.csv", index=False)

sub.head()

In [None]:
# Intentionally left empty. The bare name `nan` that stood here is not
# defined anywhere in the notebook and would raise NameError on Run All.