In [12]:
import polars as pl
import pandas as pd

from sklearn.model_selection import train_test_split
import lightgbm as lgb
import kaggle

In [2]:
base = "/home/michael/Datasets/playground-series-s4e3"  # "/kaggle/input/playground-series-s4e3/train.csv"
data = pd.read_csv(f"{base}/train.csv")
test = pl.read_csv(f"{base}/test.csv")

X = data.drop(["id", "Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"], axis=1)
y = data[["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"]]

In [3]:
param = {
    "is_unbalance": True,
    "boosting_type": "dart",
    "xgboost_dart_mode": False,
    "learning_rate": 0.22250875397947384,
    "num_iterations": 133,
    "lambda_l1": 6.383093976221751e-08,
    "lambda_l2": 2.320950282189897e-06,
    "max_bin": 161,
    "num_leaves": 489,
    "feature_fraction": 0.6429369642878832,
    "bagging_fraction": 0.9367691198774809,
    "bagging_freq": 4,
    "min_child_samples": 963,
    "min_sum_hessian_in_leaf": 0.00575253384646295,
    "device_type": "gpu",
    "objective": "binary",
    "verbosity": 50,
}

In [5]:
# Split the data into train and test sets
models = {}

for column in ["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"]:
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y[column])
    train_data = lgb.Dataset(X_train, label=y_train[column])
    validation_data = lgb.Dataset(X_val, label=y_val[column])
    best = lgb.train(param, train_data, 1000, valid_sets=[validation_data], callbacks=[lgb.early_stopping(stopping_rounds=100)])
    models[column] = best


# Predicting with each model
predictions = {}
test_no_id = test.drop(["id"])
for column, model in models.items():
    predictions[column] = model.predict(test_no_id)

# Convert predictions dictionary to DataFrame for better visualization
submission_df = pl.DataFrame(predictions)

# Display the predictions
print(submission_df)



[LightGBM] [Info] Number of positive: 1173, number of negative: 14202
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3527
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 27
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.41 MB) transferred to GPU in 0.001030 secs. 0 sparse feature groups
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.076293 -> initscore=-2.493818
[LightGBM] [Info] Start training from score -2.493818
[LightGBM] [Debug] Re-bagging, using 14409 data to train
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.38 MB) transferred to GPU in 0.000988 secs. 0 sparse feature groups
[LightGB



[LightGBM] [Info] Number of positive: 920, number of negative: 14455
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3527
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 27
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.41 MB) transferred to GPU in 0.001044 secs. 0 sparse feature groups
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.059837 -> initscore=-2.754422
[LightGBM] [Info] Start training from score -2.754422
[LightGBM] [Debug] Re-bagging, using 14409 data to train
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.38 MB) transferred to GPU in 0.001024 secs. 0 sparse feature groups
[LightGBM



[LightGBM] [Info] Number of positive: 2746, number of negative: 12629
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3525
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 27
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.41 MB) transferred to GPU in 0.001046 secs. 0 sparse feature groups
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.178602 -> initscore=-1.525850
[LightGBM] [Info] Start training from score -1.525850
[LightGBM] [Debug] Re-bagging, using 14409 data to train
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.38 MB) transferred to GPU in 0.000996 secs. 0 sparse feature groups
[LightGB



[LightGBM] [Debug] Trained a tree with leaves = 16 and depth = 10
[LightGBM] [Debug] Trained a tree with leaves = 13 and depth = 9
[LightGBM] [Debug] Re-bagging, using 14398 data to train
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.38 MB) transferred to GPU in 0.001026 secs. 0 sparse feature groups
[LightGBM] [Debug] Trained a tree with leaves = 15 and depth = 12
[LightGBM] [Debug] Trained a tree with leaves = 15 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 16 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 16 and depth = 7
[LightGBM] [Debug] Re-bagging, using 14449 data to train
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.39 MB) transferred to GPU in 0.001044 secs. 0 sparse feature groups
[LightGBM] [Debug] Trained a tree with leaves = 18 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 22 and depth = 11
[LightGBM] [Debug] Trained a tree with 



[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.41 MB) transferred to GPU in 0.001340 secs. 0 sparse feature groups
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.025236 -> initscore=-3.653933
[LightGBM] [Info] Start training from score -3.653933
[LightGBM] [Debug] Re-bagging, using 14409 data to train
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.38 MB) transferred to GPU in 0.001273 secs. 0 sparse feature groups
[LightGBM] [Debug] Trained a tree with leaves = 10 and depth = 4
[LightGBM] [Debug] Trained a tree with leaves = 12 and depth = 5
[LightGBM] [Debug] Trained a tree with leaves = 13 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 14 and depth = 8
[LightGBM] [Debug] Re-bagging, using 14397 data to train
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.38 MB) transferred to GPU in 0.0



[LightGBM] [Info] Number of positive: 3810, number of negative: 11565
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3531
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 27
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.41 MB) transferred to GPU in 0.001053 secs. 0 sparse feature groups
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247805 -> initscore=-1.110354
[LightGBM] [Info] Start training from score -1.110354
[LightGBM] [Debug] Re-bagging, using 14409 data to train
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.38 MB) transferred to GPU in 0.000985 secs. 0 sparse feature groups
[LightGB



[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.41 MB) transferred to GPU in 0.001079 secs. 0 sparse feature groups
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.341203 -> initscore=-0.657937
[LightGBM] [Info] Start training from score -0.657937
[LightGBM] [Debug] Re-bagging, using 14409 data to train
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 26 dense feature groups (0.38 MB) transferred to GPU in 0.000986 secs. 0 sparse feature groups
[LightGBM] [Debug] Trained a tree with leaves = 11 and depth = 5
[LightGBM] [Debug] Trained a tree with leaves = 11 and depth = 5
[LightGBM] [Debug] Trained a tree with leaves = 12 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 12 and



shape: (12_814, 7)
┌──────────┬───────────┬──────────┬──────────┬───────────┬──────────┬──────────────┐
│ Pastry   ┆ Z_Scratch ┆ K_Scatch ┆ Stains   ┆ Dirtiness ┆ Bumps    ┆ Other_Faults │
│ ---      ┆ ---       ┆ ---      ┆ ---      ┆ ---       ┆ ---      ┆ ---          │
│ f64      ┆ f64       ┆ f64      ┆ f64      ┆ f64       ┆ f64      ┆ f64          │
╞══════════╪═══════════╪══════════╪══════════╪═══════════╪══════════╪══════════════╡
│ 0.923622 ┆ 0.012376  ┆ 0.002601 ┆ 0.000048 ┆ 0.31489   ┆ 0.348241 ┆ 0.5493       │
│ 0.669383 ┆ 0.15054   ┆ 0.008496 ┆ 0.000102 ┆ 0.81645   ┆ 0.331673 ┆ 0.450578     │
│ 0.012864 ┆ 0.168966  ┆ 0.097929 ┆ 0.000697 ┆ 0.019374  ┆ 0.604094 ┆ 0.623177     │
│ 0.500448 ┆ 0.002534  ┆ 0.003106 ┆ 0.000153 ┆ 0.155136  ┆ 0.587292 ┆ 0.630211     │
│ 0.016245 ┆ 0.004     ┆ 0.001892 ┆ 0.003458 ┆ 0.037307  ┆ 0.842734 ┆ 0.571252     │
│ …        ┆ …         ┆ …        ┆ …        ┆ …         ┆ …        ┆ …            │
│ 0.588349 ┆ 0.679932  ┆ 0.001538 ┆ 0.000015 ┆

In [6]:
# Display the predictions
submission_df = (
    submission_df.with_columns([pl.lit(test["id"]).alias("id")])
    .select(["id", "Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"])
    .sort(by=["id"])
)
submission_df

id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
i64,f64,f64,f64,f64,f64,f64,f64
19219,0.923622,0.012376,0.002601,0.000048,0.31489,0.348241,0.5493
19220,0.669383,0.15054,0.008496,0.000102,0.81645,0.331673,0.450578
19221,0.012864,0.168966,0.097929,0.000697,0.019374,0.604094,0.623177
19222,0.500448,0.002534,0.003106,0.000153,0.155136,0.587292,0.630211
19223,0.016245,0.004,0.001892,0.003458,0.037307,0.842734,0.571252
…,…,…,…,…,…,…,…
32028,0.588349,0.679932,0.001538,0.000015,0.222377,0.447488,0.459842
32029,0.703278,0.010797,0.029669,0.027952,0.699414,0.381377,0.577355
32030,0.000513,0.002964,0.982967,0.000161,0.004842,0.002732,0.155223
32031,0.860184,0.071143,0.042491,0.000135,0.629276,0.411444,0.528606


In [13]:
dest = f"{base}/20240314002.csv"
submission_comment = "The second subbission"
competition = "playground-series-s4e3"
submission_df.write_csv(dest)

kaggle.api.competition_submit(dest, submission_comment, competition)



100%|██████████| 1.81M/1.81M [00:02<00:00, 912kB/s] 


Successfully submitted to Steel Plate Defect Prediction