In [35]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
import lightgbm as lgb

RANDOM_SEED = 42

train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
test_ids = test["ID"] 

test["log_pSat_Pa"] = None
combined = pd.concat([train, test], ignore_index=True)

non_numeric_cols = combined.select_dtypes(include=['object']).columns
combined = pd.get_dummies(combined, columns=non_numeric_cols, drop_first=True)

train = combined[combined["log_pSat_Pa"].notnull()]
test = combined[combined["log_pSat_Pa"].isnull()]
y = train["log_pSat_Pa"]
X = train.drop(columns=["log_pSat_Pa"])
test = test.drop(columns=["log_pSat_Pa"])

imputer = SimpleImputer(strategy="mean")
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
test = pd.DataFrame(imputer.transform(test), columns=test.columns)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

  combined = pd.concat([train, test], ignore_index=True)


Then try the model with optimized hyperparameters

In [36]:
from lightgbm import early_stopping

lgb_model = lgb.LGBMRegressor(
    boosting_type='gbdt',
    num_leaves=27,
    learning_rate=0.01,
    n_estimators=3000,
    feature_fraction=0.8,
    bagging_fraction=0.91,
    max_depth=10,
    lambda_l1=0.1,
    subsample_for_bin=12000,
    subsample=0.8,
    min_child_samples=20,
    min_child_weight=10,
    random_state=RANDOM_SEED
)
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
)

y_val_pred = lgb_model.predict(X_val)

r2 = r2_score(y_val, y_val_pred)
print(f"R2 Score: {r2:.4f}")


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 924
[LightGBM] [Info] Number of data points in the train set: 21309, number of used features: 28
[LightGBM] [Info] Start training from score -5.539761
R2 Score: 0.7479


R2 score is 0.7479

Create submission file:

In [37]:
y_pred = lgb_model.predict(test)

results = pd.DataFrame({
    'ID': test_ids,
    'TARGET': y_pred
})

results.to_csv("submission.csv", index=False)

