In [110]:
!pip install catboost



In [111]:
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm.sklearn import LGBMRanker
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import ndcg_score
import pandas as pd
import joblib
import lightgbm as lgb

In [112]:
train_df = pd.read_csv('train_df.csv')
train_df.head()

Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
0,758,9,0,0,1,20,3,40,0,3,...,0.204682,0.271755,0.055623,0,0,0,0.38648,0.0,0.0,0
1,758,9,0,0,1,20,3,40,0,3,...,0.195531,0.188787,0.036914,0,0,0,0.10982,0.0,0.0,0
2,758,9,0,0,1,20,3,40,0,3,...,0.148609,0.186517,0.027718,0,0,0,0.03674,0.0,0.0,0
3,758,9,0,0,1,20,3,40,0,3,...,0.223748,0.229039,0.051247,0,0,0,0.0,0.0,0.0,0
4,758,9,0,0,1,20,3,40,0,3,...,0.170935,0.249031,0.042568,0,0,0,0.0,0.0,0.0,0


In [113]:
dtypes = list(zip(train_df.dtypes.keys(), train_df.dtypes))
joblib.dump(str(dtypes), "./dtypes.txt")

['./dtypes.txt']

In [114]:
test_df = pd.read_csv('test_df.csv')
test_df.head()

Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
0,10655,9,0,0,1,20,4,40,0,0,...,0.14883,0.196644,0.029267,0,0,0,0.03674,0.0,0.0,0
1,10655,9,0,0,1,20,4,40,0,0,...,0.119724,0.174199,0.020856,0,0,0,0.0,0.0,0.0,0
2,10655,9,0,0,1,20,4,40,0,0,...,0.160606,0.19878,0.031925,0,0,0,0.0,0.0,0.0,0
3,10655,9,0,0,1,20,4,40,0,0,...,0.180191,0.187882,0.033855,0,0,0,0.0,0.0,0.0,0
4,10655,9,0,0,1,20,4,40,0,0,...,0.117308,0.153586,0.018017,0,0,0,0.0,0.0,0.0,0


In [115]:
dic = test_df.iloc[0].to_dict()

In [116]:
train_df.target.value_counts(), test_df.target.value_counts(),

(0    14759
 1      322
 Name: target, dtype: int64,
 0    1495
 1      34
 Name: target, dtype: int64)

Как часто бывает в подобных задачах, в данных имеется большой дисбаланс классов.

In [117]:
X_train = train_df.drop(columns=['target'])
y_train = train_df['target']
X_test = test_df.drop(columns=['target'])
y_test = test_df['target']

In [118]:
def evaluate(model, X_train, y_train, X_test, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    ndcg = ndcg_score(y_test.values.reshape(1, -1), y_pred_proba.reshape(1, -1), k=X_test.shape[0])
    print(f'NDCG на тестовых данных c использованием {model_name}: {ndcg}')


In [121]:
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
)
evaluate(model, X_train, y_train, X_test, y_test, model_name='RandomForestClassifier')

NDCG на тестовых данных c использованием RandomForestClassifier: 0.5093955175621377


In [119]:
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
)

evaluate(model, X_train, y_train, X_test, y_test, model_name='XGBoostClassifier')

NDCG на тестовых данных c использованием XGBoostClassifier: 0.48580860296132844


In [122]:
model = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform'
)
evaluate(model, X_train, y_train, X_test, y_test, model_name='KNNClassifier')

NDCG на тестовых данных c использованием KNNClassifier: 0.41750638285686853


In [123]:
model = SVC(probability=True)
evaluate(model, X_train, y_train, X_test, y_test, model_name='SVCClassifier')

NDCG на тестовых данных c использованием SVCClassifier: 0.39092368450449655


In [124]:
model = LogisticRegression()
evaluate(model, X_train, y_train, X_test, y_test, model_name='LogisticRegression')

NDCG на тестовых данных c использованием LogisticRegression: 0.41291761830445706


In [125]:
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
)
model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)

y_pred_proba = model.predict_proba(X_test)[:, 1]

ndcg = ndcg_score(y_test.values.reshape(1, -1), y_pred_proba.reshape(1, -1), k=X_test.shape[0])
print(f'NDCG на тестовых данных с использованием CatBoost: {ndcg}')

NDCG на тестовых данных с использованием CatBoost: 0.5681496636375397


In [126]:
groups = train_df.groupby(['search_id'])['feature_0'].count().values

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    max_depth=7,
    n_estimators=300,
    importance_type='gain',
    verbose=10,
    num_leaves=100
)

ranker = ranker.fit(
    X_train,
    y_train,
    group=groups
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.815210
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.395985
[LightGBM] [Debug] init for col-wise cost 0.004430 seconds, init for row-wise cost 0.016404 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 12703
[LightGBM] [Info] Number of data points in the train set: 15081, number of used features: 76
[LightGBM] [Debug] Trained a tree with leaves = 59 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 89 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 89 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 73 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 65 and depth = 7
[LightGBM] [Debug]

In [127]:
preds = []
batch_size = 1_000_000
for bucket in tqdm(range(0, len(test_df), batch_size)):
  outputs = ranker.predict(
      test_df.iloc[bucket: bucket+batch_size]
      .drop(columns = ['target'])
      )
  preds.append(outputs)

100%|██████████| 1/1 [00:00<00:00, 21.79it/s]


In [128]:
ndcg_score(y_test.values.reshape(1, -1), preds[0].reshape(1, -1),k=y_test.values.shape[0])

0.4393037901110992

Лучший результат у CatBoost, найдем наиболее оптимальные параметры для модели

In [129]:
from catboost import Pool

train_group_id = X_train['search_id']
test_group_id = X_test['search_id']

train_pool = Pool(data=X_train, label=y_train, group_id=train_group_id)
test_pool = Pool(data=X_test, label=y_test, group_id=test_group_id)

In [130]:
model = CatBoostClassifier(
    iterations=100,
    depth=6,
    random_strength=1.01,
    scale_pos_weight=0.75,
    custom_metric='NDCG'
)

In [131]:
model.fit(train_pool, eval_set=test_pool)

Learning rate set to 0.168648
0:	learn: 0.4137357	test: 0.4130147	best: 0.4130147 (0)	total: 44.2ms	remaining: 4.37s
1:	learn: 0.2658690	test: 0.2657496	best: 0.2657496 (1)	total: 65.3ms	remaining: 3.2s
2:	learn: 0.1860858	test: 0.1881705	best: 0.1881705 (2)	total: 87.4ms	remaining: 2.83s
3:	learn: 0.1430477	test: 0.1453859	best: 0.1453859 (3)	total: 108ms	remaining: 2.58s
4:	learn: 0.1171404	test: 0.1195294	best: 0.1195294 (4)	total: 130ms	remaining: 2.46s
5:	learn: 0.1023756	test: 0.1055622	best: 0.1055622 (5)	total: 152ms	remaining: 2.38s
6:	learn: 0.0936377	test: 0.0968182	best: 0.0968182 (6)	total: 173ms	remaining: 2.29s
7:	learn: 0.0869786	test: 0.0895334	best: 0.0895334 (7)	total: 199ms	remaining: 2.29s
8:	learn: 0.0833955	test: 0.0859753	best: 0.0859753 (8)	total: 222ms	remaining: 2.24s
9:	learn: 0.0804645	test: 0.0827659	best: 0.0827659 (9)	total: 243ms	remaining: 2.18s
10:	learn: 0.0785968	test: 0.0811581	best: 0.0811581 (10)	total: 263ms	remaining: 2.13s
11:	learn: 0.0772483

<catboost.core.CatBoostClassifier at 0x7ec35cecc430>

In [132]:
y_pred_proba = model.predict_proba(X_test)[:, 1]

ndcg = ndcg_score(y_test.values.reshape(1, -1), y_pred_proba.reshape(1, -1), k=X_test.shape[0])
print(f'NDCG на тестовых данных с использованием CatBoost: {ndcg}')

NDCG на тестовых данных с использованием CatBoost: 0.7026311288146562


In [133]:
# сохранение модели
joblib.dump(model, "./model.joblib")

['./model.joblib']