In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Load the processed training split and preview its first rows.
train_csv_path = "../data/processed/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,Clothing ID,Age,Positive Feedback Count,Division Name,Department Name,Class Name,text,labels,probability
0,0,25,4,General,Bottoms,Skirts,"title: 3-season skirt! [SEP] review: Adorable,...",1,0.993447
1,0,39,0,General,Bottoms,Skirts,title: Very cute [SEP] review: Love the asymme...,1,0.99457
2,0,42,5,General,Bottoms,Skirts,title: Beautiful! fruns small for typical reta...,1,0.994842
3,0,45,9,General,Bottoms,Skirts,title: none [SEP] review: I was really pleased...,1,0.876926
4,0,57,1,General,Bottoms,Skirts,"title: Unique, pretty asymmetric skirt [SEP] r...",1,0.993074


In [3]:
# Label-encode the categorical columns.
# Train and test are stacked first so that a single encoder is fitted on the
# categories of both splits and a given category gets the same code in each.
# NOTE: this cell overwrites `train_df` with its encoded version, so running
# it a second time without re-loading `train_df` would mix already-encoded
# train values with raw test strings in the encoder input.
test_df = pd.read_csv("../data/processed/test.csv")
combined_df = pd.concat([train_df, test_df], ignore_index=True)

cat_cols = ["Division Name", "Department Name", "Class Name"]
ordinal_encoder = OrdinalEncoder()
combined_df[cat_cols] = ordinal_encoder.fit_transform(combined_df[cat_cols])

# Split back by position: the first `train_length` rows are the train split.
# `.copy()` detaches each split from `combined_df`, so later modifications of
# either split never act on a slice of the combined frame (which pandas may
# flag with SettingWithCopyWarning).
train_length = len(train_df)
train_df = combined_df.iloc[:train_length].copy()
test_df = combined_df.iloc[train_length:].copy()

# Invariant: the two splits together account for every combined row.
assert len(train_df) + len(test_df) == len(combined_df)

In [4]:
# Renumber the test rows from 0; after the positional split their index still
# continues from where the train rows ended.
# Reassigning (instead of `inplace=True`) avoids mutating a frame that was
# obtained by slicing `combined_df` and produces the same result.
test_df = test_df.reset_index(drop=True)
test_df.head()

Unnamed: 0,Clothing ID,Age,Positive Feedback Count,Division Name,Department Name,Class Name,text,labels,probability
0,0,32,0,0.0,0.0,12.0,title: So happy i bought this skirt! [SEP] rev...,,0.993913
1,0,34,0,0.0,0.0,12.0,title: Runs small [SEP] review: Beautiful patt...,,0.896281
2,0,37,0,0.0,0.0,12.0,title: Love the comfort of thi skirt [SEP] rev...,,0.994328
3,0,39,10,0.0,0.0,12.0,title: Way too small [SEP] review: This is a b...,,0.907166
4,0,39,0,0.0,0.0,12.0,title: none [SEP] review: I usually wear a siz...,,0.994283


In [5]:
# LightGBM training parameters shared by every cross-validation fold.
# NOTE(review): a tree limited to max_depth=6 can hold at most 2**6 = 64
# leaves, so num_leaves=128 looks unreachable — confirm this is intended.
params = dict(
    # Task and evaluation metric
    objective="binary",
    metric="auc",
    # Boosting setup
    learning_rate=0.1,
    verbosity=-1,
    boosting_type="gbdt",
    # Regularisation
    lambda_l1=0.3,
    lambda_l2=0.3,
    # Tree shape
    max_depth=6,
    num_leaves=128,
    # Column / row subsampling
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    min_child_samples=20,
    # Reproducibility
    seed=42,
)

In [6]:
# Stratified K-fold cross-validation of the LightGBM model.
# Each fold trains on the remaining folds, fills its slice of the
# out-of-fold (OOF) predictions, and adds its share to the averaged
# test-set prediction.
features = ["Age", "Positive Feedback Count", "Division Name", "Department Name", "Class Name", "probability"]

# Tunable settings for this run.
N_TRAIN_ROWS = 1000          # only the first N rows are used; None = all rows
N_SPLITS = 5                 # number of CV folds
NUM_BOOST_ROUND = 1000       # upper bound on boosting rounds per fold
EARLY_STOPPING_ROUNDS = 50   # stop after this many rounds without improvement
LOG_PERIOD = 100             # print evaluation results every N rounds

if N_TRAIN_ROWS is None:
    X = train_df[features].copy()
    y = train_df["labels"].copy()
else:
    X = train_df[features].head(N_TRAIN_ROWS).copy()
    y = train_df["labels"].head(N_TRAIN_ROWS).copy()

# The test feature matrix is identical for every fold, so select it once.
X_test = test_df[features]

oof = np.zeros(X.shape[0])           # one OOF prediction per training row
preds = np.zeros(test_df.shape[0])   # fold-averaged test prediction
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # The training set is listed as a validation set too, so its AUC is
    # logged next to the held-out AUC. The callbacks are created anew for
    # every fold rather than shared between trainings.
    model = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, val_data],
        num_boost_round=NUM_BOOST_ROUND,
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(LOG_PERIOD),
        ],
    )

    oof[val_index] = model.predict(X_val)
    # Each fold contributes 1/n_splits of the final test prediction.
    preds += model.predict(X_test) / kf.n_splits

# NOTE: at this point `model` is the booster of the last fold only, so the
# line below would save a single fold's model, not an ensemble of all folds.
# model.save_model("../models/lightgbm.txt", num_iteration=model.best_iteration)
print(f"CV score: {roc_auc_score(y, oof)}")

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.948288	valid_1's auc: 0.945799
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	training's auc: 0.972065	valid_1's auc: 0.933096
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	training's auc: 0.978967	valid_1's auc: 0.948433
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.957657	valid_1's auc: 0.962112
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[9]	training's auc: 0.966528	valid_1's auc: 0.926215
CV score: 0.9055906329923273
