In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from dataprep.eda import create_report

# Read the train and test splits from the local data directory.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

# Remember the number of training rows and the target column so the stacked
# frame built below can later be split back into its two parts by position.
n_train = train_df.shape[0]
y = train_df["Strength"]

# Stack both splits row-wise (original row labels are kept as-is) and remove
# the target column from the combined frame.
all_data = pd.concat([train_df, test_df], axis=0)
del all_data["Strength"]

all_data.head()

In [None]:
# Build the dataprep EDA report for the training frame. Being the last
# expression in the cell, the value returned by create_report is what the
# notebook shows as this cell's output.
create_report(train_df)

In [None]:
# Columns for which a binary "value is greater than zero" indicator is derived.
zero_cols = ["BlastFurnaceSlag", "FlyAshComponent", "SuperplasticizerComponent"]

# Add one 0/1 "Has<column>" flag per listed column to the stacked frame.
for col in zero_cols:
    is_present = all_data[col].gt(0)
    all_data[f"Has{col}"] = is_present.astype(int)

all_data.head()

In [None]:
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


# LightGBM hyper-parameters: regression objective evaluated with RMSE.
# Early stopping is configured here through `early_stopping_round`.
lgbm_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'feature_pre_filter': False,
    'lambda_l1': 0.2,
    'lambda_l2': 1e-05,
    'num_leaves': 5,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 7,
    'min_child_samples': 25,
    'num_iterations': 200,
    'early_stopping_round': 100,
}

# The first n_train rows of the stacked frame are the training rows; the
# "id" column is excluded from the model features.
X = all_data.iloc[:n_train].drop(columns=["id"])

# Hold out 20% of the training rows for validation (fixed seed for
# a reproducible split).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

lgb_train = lgbm.Dataset(X_train, y_train)
lgb_eval = lgbm.Dataset(X_val, y_val, reference=lgb_train)

# Per-iteration metric history for the 'train' and 'valid' sets.
# It is filled through the record_evaluation callback: the `evals_result=`
# keyword of lgbm.train() was removed in LightGBM 4.0 and raises a TypeError
# there, while the callback is accepted by both older and newer releases.
evaluation_results = {}
model_lgbm = lgbm.train(
    lgbm_params,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train', 'valid'],
    callbacks=[lgbm.record_evaluation(evaluation_results)],
)

In [None]:
# Every row after the first n_train positions of the stacked frame belongs
# to the test split.
test_data = all_data.iloc[n_train:]

# Predict on all columns except "id".
test_features = test_data.drop(columns="id")
lgbm_pred = model_lgbm.predict(test_features)

# Pair each test id with its predicted value and write the result to disk
# without the frame's index.
submission = pd.DataFrame({"id": test_data["id"], "Strength": lgbm_pred})
submission.to_csv("./data/lgbm_submission.csv", index=False)