In [7]:
import pandas as pd

# Read the training and validation CSVs. Both paths are relative, so they are
# resolved against the kernel's current working directory.
train_df, val_df = (
    pd.read_csv(csv_name)
    for csv_name in ("ranking_train_df.csv", "ranking_val_df.csv")
)

In [75]:
# Columns pulled from the ranking frames. `features` keeps its original
# contents (target column included) so any other cell that reads it still
# sees the same list.
features = ["garment_group_name", "index_group_name",
            "cos", "age", "month_sin", "month_cos", "label"]
target_col = "label"
sample_seed = 42  # fixed so the preview below shows the same rows on every run

# Split predictors and target without mutating an intermediate slice:
# drop() returns a new frame, and the target is copied out on its own so it
# stays independent of train_df / val_df.
X_train = train_df[features].drop(columns=target_col)
X_val = val_df[features].drop(columns=target_col)
y_train = train_df[target_col].copy()
y_val = val_df[target_col].copy()

# Reproducible five-row preview of the training predictors.
X_train.sample(5, random_state=sample_seed)

Unnamed: 0,garment_group_name,index_group_name,cos,age,month_sin,month_cos
1474569,Swimwear,Ladieswear,-0.395468,40.0,0.5,0.8660254
6613393,"Under-, Nightwear",Ladieswear,-0.640921,20.0,1.0,6.123234000000001e-17
4498694,Jersey Basic,Divided,-0.452613,20.0,-0.866025,-0.5
3062018,Swimwear,Ladieswear,-0.393614,40.0,0.5,0.8660254
574130,"Under-, Nightwear",Ladieswear,-0.485977,39.0,-0.866025,0.5


In [76]:
from catboost import CatBoostClassifier, Pool

# String-valued columns that CatBoost should treat as categorical.
cat_feat_list = ["garment_group_name", "index_group_name"]

# Gradient-boosted classifier configuration.
model = CatBoostClassifier(
    iterations=200,            # upper bound on the number of trees
    depth=5,                   # tree depth
    scale_pos_weight=10,       # weight applied to class 1 (the recorded validation report shows a 10:1 class ratio)
    early_stopping_rounds=5,   # stop once the eval metric has not improved for 5 iterations
    use_best_model=True,       # keep the iteration that scored best on the eval set
)

# Wrap predictors and labels in CatBoost pools, declaring the categorical columns.
pool_train = Pool(data=X_train, label=y_train, cat_features=cat_feat_list)
pool_val = Pool(data=X_val, label=y_val, cat_features=cat_feat_list)

# Train on the training pool; the validation pool drives early stopping and
# best-model selection. fit() is left as the last expression, as in the original cell.
model.fit(pool_train, eval_set=pool_val)


Learning rate set to 0.5
0:	learn: 0.6515828	test: 0.6645605	best: 0.6645605 (0)	total: 2.79s	remaining: 9m 14s
1:	learn: 0.6364415	test: 0.6560014	best: 0.6560014 (1)	total: 5.32s	remaining: 8m 46s
2:	learn: 0.6310434	test: 0.6547751	best: 0.6547751 (2)	total: 7.3s	remaining: 7m 59s
3:	learn: 0.6274044	test: 0.6535945	best: 0.6535945 (3)	total: 9.43s	remaining: 7m 42s
4:	learn: 0.6221039	test: 0.6498609	best: 0.6498609 (4)	total: 11.8s	remaining: 7m 41s
5:	learn: 0.6206351	test: 0.6493037	best: 0.6493037 (5)	total: 13.8s	remaining: 7m 25s
6:	learn: 0.6190219	test: 0.6488043	best: 0.6488043 (6)	total: 16.3s	remaining: 7m 28s
7:	learn: 0.6161233	test: 0.6461680	best: 0.6461680 (7)	total: 19s	remaining: 7m 36s
8:	learn: 0.6151542	test: 0.6456326	best: 0.6456326 (8)	total: 22.1s	remaining: 7m 49s
9:	learn: 0.6145938	test: 0.6450292	best: 0.6450292 (9)	total: 24.3s	remaining: 7m 41s
10:	learn: 0.6134952	test: 0.6433945	best: 0.6433945 (10)	total: 27.4s	remaining: 7m 51s
11:	learn: 0.612933

<catboost.core.CatBoostClassifier at 0x14d298130>

In [77]:
from sklearn.metrics import classification_report

# Hard class labels (the default prediction type) for every validation row.
preds = model.predict(pool_val, prediction_type="Class")

# Per-class precision / recall / F1 on the validation set.
print(classification_report(y_true=y_val, y_pred=preds))

              precision    recall  f1-score   support

           0       0.95      0.61      0.74   2880680
           1       0.15      0.67      0.24    288068

    accuracy                           0.61   3168748
   macro avg       0.55      0.64      0.49   3168748
weighted avg       0.88      0.61      0.70   3168748



In [80]:
# Pair each training column with the importance score the fitted model
# reports at the same position.
feat_to_score = dict(zip(X_train.columns, model.feature_importances_))

feat_to_score

{'garment_group_name': 10.974865609222299,
 'index_group_name': 8.10157114052959,
 'cos': 42.32225394750771,
 'age': 6.217185919401023,
 'month_sin': 11.085311919595457,
 'month_cos': 21.298811463743842}