In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from dataprep.eda import create_report

# Load the two competition splits from the local data directory.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

# Keep the label and the number of training rows so the combined frame
# can be split back into its training part later.
n_train = len(train_df)
y = train_df["target"]

# Stack train on top of test and discard the label column, leaving features only.
all_data = pd.concat((train_df, test_df), axis=0).drop(columns=["target"])

all_data.head()

In [None]:
# Run dataprep's create_report on the training frame as loaded from CSV
# (the "target" column is still present here). Being the last expression
# of the cell, whatever it returns is rendered as the cell output.
create_report(train_df)

In [None]:
# One box plot per feature, split by target class, laid out on a 2x3 grid.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(8, 6))

# ax.flat walks the grid row by row, so the features fill the top row
# left to right and then the bottom row.
for panel, feature in zip(ax.flat, ["gravity", "ph", "osmo", "cond", "urea", "calc"]):
    sns.boxplot(data=train_df, x="target", y=feature, ax=panel)

plt.tight_layout()

In [None]:
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


# Parameter dictionary handed to lgbm.train as its first argument.
lgbm_params = dict(
    objective="binary",
    feature_pre_filter=False,
    lambda_l1=0.2,
    lambda_l2=1e-05,
    num_leaves=5,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=7,
    min_child_samples=25,
    num_iterations=200,
    early_stopping_round=100,
)

# Take back the training rows of the combined frame and drop the "id" column
# so it is not used as a feature.
X = all_data.iloc[:n_train].drop(columns=["id"])

# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
lgb_train = lgbm.Dataset(X_train, y_train)
lgb_eval = lgbm.Dataset(X_val, y_val, reference=lgb_train)

# Filled in place by the record_evaluation callback with the per-iteration
# metric history, keyed by the names given in valid_names.
evaluation_results = {}
model_lgbm = lgbm.train(
    lgbm_params,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train', 'valid'],
    # The `evals_result=` keyword of lgbm.train was deprecated and then removed
    # in LightGBM 4.0 (it raises TypeError there); the callback form is
    # accepted by both older and newer releases.
    callbacks=[lgbm.record_evaluation(evaluation_results)],
)