In [None]:
import os
import numpy as np
import pandas as pd
import time
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Directory holding the competition CSV files. Defaults to the original Colab
# location; set the DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get('DATA_DIR', '/content')

# `raw` is pivoted on its `cart` column further down; `sub` supplies the `id`
# values ("<user_id>;<category>") that the final submission must contain.
raw = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
sub = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [None]:
%%time

# One-hot encode `cart` (empty prefix, so the new column names are the cart
# values themselves) and collapse to one row per (user_id, order_completed_at):
# a flag is True when that cart value occurred anywhere in the order.
train_raw = pd.get_dummies(raw, columns = ['cart'], prefix='', prefix_sep='', dtype='bool')
train_raw = train_raw.groupby(['user_id', 'order_completed_at']).any().reset_index()

# 0-based position of every order inside its user's group. The groupby above
# sorts by its keys, so this follows `order_completed_at` order
# (assumes the timestamps sort chronologically -- TODO confirm).
train_raw['order_number'] = train_raw.groupby(['user_id']).cumcount()
train_raw = train_raw.drop('order_completed_at', axis=1)

# Boolean mask selecting each user's last order. The string alias 'max' is used
# instead of the builtin `max`: passing builtins to groupby.transform is
# deprecated in recent pandas and emits a FutureWarning.
last_order = train_raw.groupby(['user_id'])['order_number'].transform('max') == train_raw['order_number']

# History (all orders except the last) summed per user: for a category column
# the sum is the number of earlier orders that contained it. Note that the
# `order_number` column is summed as well.
train = train_raw[~last_order].groupby('user_id').sum().reset_index()
# The held-out last order of every user supplies the target.
valid = train_raw[last_order].reset_index(drop=True)

# Long format: one row per (user_id, column). `order_number` is melted as a
# "category" too, because only `user_id` is declared as an id variable.
train_melt = pd.melt(train, id_vars=['user_id'], var_name='category', value_name='ordered')
valid_melt = pd.melt(valid, id_vars=['user_id'], var_name='category', value_name='target')

Train = train_melt.copy()

# Total purchase counter for each user: the 0-based `order_number` of the
# held-out last order equals the number of orders that precede it.
# Selecting the column directly always yields a Series (unlike `.squeeze()`,
# which collapses to a scalar when `valid` has a single row).
order_number = valid.set_index('user_id')['order_number']
Train['orders_total'] = Train['user_id'].map(order_number)

# Share of the user's earlier orders that contained the category.
Train['rating'] = Train['ordered'] / Train['orders_total']

# Key in the "<user_id>;<category>" format used by the submission ids.
Train['id'] = Train['user_id'].astype(str) + ';' + Train['category']

# Keep only the pairs requested by the submission file. Filtering first keeps
# the key-based join below small on the left side.
sub_ids = sub['id'].unique()
Train = Train[Train['id'].isin(sub_ids)].reset_index(drop=True)

# Attach the target by key rather than by positional index: positional
# assignment is only correct when `train_melt` and `valid_melt` list exactly
# the same users in the same order, which breaks silently as soon as a user
# has a single order (present in `valid`, absent from `train`).
Train = Train.merge(valid_melt, on=['user_id', 'category'], how='left')
Train['target'] = Train['target'].astype(int)

# Sanity check: Train must cover exactly the ids requested by the submission
# file (the previous check compared Train's ids with themselves).
print(np.array_equal(np.sort(Train['id'].values), np.sort(sub_ids)))

# Popularity of each category across all retained users.
total_ordered = Train.groupby('category')['ordered'].sum()
Train['total_ordered'] = Train['category'].map(total_ordered)



True
CPU times: user 29 s, sys: 3.7 s, total: 32.7 s
Wall time: 32.8 s


In [None]:
# Roll the feature window forward by one order: the held-out last order (the
# training target) is folded into the purchase history, and the target column
# itself is removed from the inference table.
Test = Train.drop(columns='target').assign(
    ordered=Train['ordered'] + Train['target'],
    orders_total=Train['orders_total'] + 1,
)

# Recompute the derived features from the updated counts.
test_total_ordered = Test.groupby('category')['ordered'].sum()
Test = Test.assign(
    total_ordered=Test['category'].map(test_total_ordered),
    rating=Test['ordered'] / Test['orders_total'],
)

In [None]:
# Inspect the assembled training table (bare expression -> rich display).
Train

Unnamed: 0,user_id,category,ordered,orders_total,rating,id,target,total_ordered
0,7,0,0,10,0.000000,7;0,1,12922
1,8,0,1,7,0.142857,8;0,0,12922
2,9,0,1,45,0.022222,9;0,0,12922
3,12,0,1,20,0.050000,12;0,1,12922
4,13,0,3,16,0.187500,13;0,0,12922
...,...,...,...,...,...,...,...,...
790444,3238,880,2,70,0.028571,3238;880,0,7
790445,4816,880,1,22,0.045455,4816;880,0,7
790446,10280,880,2,8,0.250000,10280;880,0,7
790447,13281,880,1,3,0.333333,13281;880,0,7


In [None]:
# Random 80/20 row-level hold-out of the training pairs (no stratification);
# the fixed seed keeps the split reproducible.
Train_set, Valid_set = train_test_split(
    Train,
    test_size=0.2,
    random_state=17,
    stratify=None,
)

In [None]:
!pip install catboost -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score

In [None]:
# Columns handed to CatBoost as categorical; a candidate is kept only when it
# is actually present in the training split.
# NOTE(review): `id` is built as "<user_id>;<category>", so it is presumably
# unique per row -- confirm it is meant to be a model feature.
categorical_features = [name for name in ('id',) if name in Train_set.columns]

In [None]:
# Training labels and features (every column of the split except the label).
y_train = Train_set['target']
X_train = Train_set.drop(columns='target', errors='ignore')

In [None]:
# Hold-out labels and features (every column of the split except the label).
y_valid = Valid_set['target']
X_valid = Valid_set.drop(columns='target', errors='ignore')


In [None]:
def _make_pool(features, labels):
    """Wrap a feature frame and its labels in a CatBoost Pool using the shared categorical columns."""
    return Pool(data=features, label=labels, cat_features=categorical_features)


train_pool = _make_pool(X_train, y_train)
valid_pool = _make_pool(X_valid, y_valid)

In [None]:
# Hyper-parameters collected in one place so they are easy to inspect and tune.
catboost_params = {
    'iterations': 500,
    'learning_rate': 0.01,
    'depth': 6,
    'eval_metric': 'F1',           # metric reported in the training log
    'random_seed': 42,
    'logging_level': 'Verbose',
    'early_stopping_rounds': 50,
}
model = CatBoostClassifier(**catboost_params)

In [None]:
# Train on the training pool and evaluate on the hold-out pool each iteration;
# `use_best_model=True` keeps the iteration with the best eval-set score.
model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

0:	learn: 0.5470523	test: 0.5448745	best: 0.5448745 (0)	total: 533ms	remaining: 4m 26s
1:	learn: 0.5767092	test: 0.5760344	best: 0.5760344 (1)	total: 953ms	remaining: 3m 57s
2:	learn: 0.5758719	test: 0.5745042	best: 0.5760344 (1)	total: 1.36s	remaining: 3m 45s
3:	learn: 0.5846489	test: 0.5845699	best: 0.5845699 (3)	total: 1.8s	remaining: 3m 43s
4:	learn: 0.5842848	test: 0.5840633	best: 0.5845699 (3)	total: 2.28s	remaining: 3m 45s
5:	learn: 0.5829938	test: 0.5822452	best: 0.5845699 (3)	total: 2.66s	remaining: 3m 39s
6:	learn: 0.5801369	test: 0.5791903	best: 0.5845699 (3)	total: 3.08s	remaining: 3m 37s
7:	learn: 0.5804421	test: 0.5796964	best: 0.5845699 (3)	total: 3.5s	remaining: 3m 35s
8:	learn: 0.5809085	test: 0.5802791	best: 0.5845699 (3)	total: 3.94s	remaining: 3m 35s
9:	learn: 0.5821008	test: 0.5814342	best: 0.5845699 (3)	total: 4.38s	remaining: 3m 34s
10:	learn: 0.5827537	test: 0.5824022	best: 0.5845699 (3)	total: 4.77s	remaining: 3m 32s
11:	learn: 0.5824867	test: 0.5822162	best: 0

<catboost.core.CatBoostClassifier at 0x7aa670ea09b0>

In [None]:
# Model predictions for the training and hold-out feature frames, in that order.
train_pred, valid_pred = (model.predict(features) for features in (X_train, X_valid))

In [None]:
# F1 on the hold-out split at the default 0.5 cut-off (despite its name,
# `test_f1` is a validation score). `model.predict` already returns hard class
# labels, so comparing its output with 0.5 was a no-op; thresholding the
# positive-class probability makes the cut-off explicit.
y_pred_val = (model.predict_proba(X_valid)[:, 1] > 0.5).astype(int)
test_f1 = f1_score(y_valid, y_pred_val)

In [None]:
import numpy as np
from sklearn.metrics import f1_score


# Positive-class probabilities on the hold-out split.
valid_probs = model.predict_proba(X_valid)[:, 1]

# Scan 101 evenly spaced cut-offs in [0, 1] and keep the one with the best F1.
thresholds = np.linspace(0, 1, 101)
f1_scores = [f1_score(y_valid, (valid_probs > t).astype(int)) for t in thresholds]

best_idx = int(np.argmax(f1_scores))
optimal_threshold = thresholds[best_idx]
optimal_f1 = f1_scores[best_idx]

print(f"Оптимальный порог: {optimal_threshold:.2f}")
print(f"Максимальный F1-Score: {optimal_f1:.4f}")

# Score the inference table on exactly the training feature columns. This cell
# adds `target` to `Test`, so passing the whole frame would feed that column
# to the model on any re-run; selecting the columns keeps the cell idempotent.
test_probs = model.predict_proba(Test[X_train.columns])[:, 1]
Test['target'] = (test_probs > optimal_threshold).astype(int)


Оптимальный порог: 0.49
Максимальный F1-Score: 0.6155


In [None]:
# Apply the tuned threshold to positive-class probabilities. `model.predict`
# returns hard class labels, so thresholding its output ignored
# `optimal_threshold`. Only the training feature columns are passed, because
# `Test` may already carry a `target` column from a previous prediction.
test_probs = model.predict_proba(Test[X_train.columns])[:, 1]
Test['target'] = (test_probs > optimal_threshold).astype(int)


In [None]:
# Number of (user, category) pairs predicted as positive.
Test['target'].sum()

np.int64(64283)

In [None]:
# A left merge keeps every id of the sample submission, in its original order.
# The previous inner merge silently dropped ids missing from `Test`, which
# would yield a submission with too few rows. Ids without a prediction
# default to 0.
submit = sub[['id']].merge(Test[['id', 'target']], on='id', how='left')
submit['target'] = submit['target'].fillna(0).astype(int)

In [None]:
# Inspect the submission table before writing it to disk.
submit

Unnamed: 0,id,target
0,0;133,0
1,0;5,0
2,0;10,0
3,0;396,0
4,0;14,1
...,...,...
790444,19998;26,0
790445,19998;31,0
790446,19998;29,0
790447,19998;798,0


In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv('submission.csv', index=False)