In [1]:
from ozon_matching.kopatych_solution.utils import read_parquet, extract_category_levels
import polars as pl
from sklearn.model_selection import StratifiedKFold, train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm
import numpy as np
from matplotlib import pyplot as plt

In [2]:
data = pl.concat(
    [
        read_parquet('../data/test_data.parquet', columns=['variantid', 'categories']),
        read_parquet('../data/train_data.parquet', columns=['variantid', 'categories']),
    ]
)
data = data.unique(subset=['variantid'])

[32m2023-05-23 18:48:56.221[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m54[0m - [1mRead Parquet from ../data/test_data.parquet[0m
[32m2023-05-23 18:48:56.233[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m56[0m - [1mN Rows - 35730, N Cols - 2[0m
[32m2023-05-23 18:48:56.233[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m54[0m - [1mRead Parquet from ../data/train_data.parquet[0m
[32m2023-05-23 18:48:56.348[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m56[0m - [1mN Rows - 457063, N Cols - 2[0m


In [3]:
data = extract_category_levels(data, [3, 4])
data = data.select(pl.col(['variantid', 'category_level_3', 'category_level_4']))

In [4]:
data = data.join(
    (
        data
        .select(pl.col('category_level_3'))
        .unique()
        .with_row_count(name='category_level_3_id')
    ),
    on=['category_level_3']
)

data = data.join(
    (
        data
        .select(pl.col('category_level_4'))
        .unique()
        .with_row_count(name='category_level_4_id')
    ),
    on=['category_level_4']
)

data = data.drop(['category_level_3', 'category_level_4'])

In [15]:
order_columns = [
    'variantid1', 'variantid2', 'target', 'is_train'
]

train = read_parquet('../data/train_pairs.parquet')
train = train.with_columns(
    [
        pl.lit(1).cast(pl.Int8).alias('is_train'),
        pl.col('target').cast(pl.Int8).alias('target')
    ]
)

test = read_parquet('../data/test_pairs_wo_target.parquet')
test = test.drop(['__index_level_0__']).with_columns(
    [
        pl.lit(0).cast(pl.Int8).alias('is_train'),
        pl.lit(None).cast(pl.Int8).alias('target')
    ]
)

train = train.select(pl.col(order_columns))
test = test.select(pl.col(order_columns))

[32m2023-05-25 08:07:57.869[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m54[0m - [1mRead Parquet from ../data/train_pairs.parquet[0m
[32m2023-05-25 08:07:57.879[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m56[0m - [1mN Rows - 306540, N Cols - 3[0m
[32m2023-05-25 08:07:57.879[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m57[0m - [1mshape: (5, 3)
┌────────┬────────────┬────────────┐
│ target ┆ variantid1 ┆ variantid2 │
│ ---    ┆ ---        ┆ ---        │
│ f64    ┆ i64        ┆ i64        │
╞════════╪════════════╪════════════╡
│ 0.0    ┆ 51197862   ┆ 51198054   │
│ 1.0    ┆ 53062686   ┆ 536165289  │
│ 1.0    ┆ 53602615   ┆ 587809782  │
│ 1.0    ┆ 53888651   ┆ 89598677   │
│ 0.0    ┆ 56930698   ┆ 551526166  │
└────────┴────────────┴────────────┘[0m
[32m2023-05-25 08:07:57.882[0m | [1mINFO    [0m | [36mozon_matchi

In [16]:
train.head(1)

variantid1,variantid2,target,is_train
i64,i64,i8,i8
51197862,51198054,0,1


In [17]:
test.head(1)

variantid1,variantid2,target,is_train
i64,i64,i8,i8
52076340,290590137,,0


In [18]:
pairs = pl.concat([train, test])

In [25]:
a = np.array(pairs['variantid1'].to_list())
b = np.array(pairs['variantid2'].to_list())

In [26]:
np.random.shuffle(a)
np.random.shuffle(b)

In [30]:
new_pairs = [
    [v1, v2]
    for v1, v2 in zip(a, b)
]

new_pairs = pl.DataFrame(new_pairs, orient='row', schema={'variantid1': pl.Int64, 'variantid2': pl.Int64})

In [33]:
new_pairs.join(
    pairs,
    on=['variantid1', 'variantid2'],
    how='left'
).filter(
    (pl.col('target').is_null()) & (pl.col('is_train').is_null())
).fill_null(0).drop(['is_train']).write_parquet('adversarial_v4.parquet')

In [34]:
np.quantile(np.random.rand(1000), 0.95)

0.9508266781518807

In [7]:
pairs = pairs.join(
    data.rename(
        {
            'variantid': 'variantid1',
            'category_level_3_id': 'category_level_3_id_1',
            'category_level_4_id': 'category_level_4_id_1',
        }
    ),
    on=['variantid1']
).join(
    data.rename(
        {
            'variantid': 'variantid2',
            'category_level_3_id': 'category_level_3_id_2',
            'category_level_4_id': 'category_level_4_id_2',
        }
    ),
    on=['variantid2']
).to_pandas()

In [8]:
18084

18084

In [9]:
import pandas as pd

In [10]:
cv1 = StratifiedKFold(n_splits=3, random_state=13, shuffle=True)
cv2 = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)

X = pairs[
    [
        'category_level_3_id_1', 
        'category_level_4_id_1',
        'category_level_3_id_2', 
        'category_level_4_id_2'
    ]
].values
y = pairs['is_train'].values

adversarial_dataset = np.zeros((X.shape[0], 15))
adversarial_dataset[:] = np.nan

for n, (cv_index, holdout_index) in tqdm(enumerate(cv1.split(X, y))):
    X_cv, X_holdout = X[cv_index], X[holdout_index]
    y_cv, y_holdout = y[cv_index], y[holdout_index]
    models = []
    for m, (train_index, valid_index) in tqdm(enumerate(cv2.split(X_cv, y_cv))):
        X_train, X_valid = X_cv[train_index], X_cv[valid_index]
        y_train, y_valid = y_cv[train_index], y_cv[valid_index]

        model = LGBMClassifier(
            n_estimators=5000,
        )
        model.fit(
            X=X_train,
            y=y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric=['auc'],
            categorical_feature=[0,1,2,3],
            early_stopping_rounds=50
        )
        adversarial_dataset[holdout_index, m + n] = model.predict_proba(X_holdout)[:, 1]
        
pairs['adversarial'] = np.nanmean(adversarial_dataset, axis=1)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

New categorical_feature is [0, 1, 2, 3]


[1]	valid_0's auc: 0.786526	valid_0's binary_logloss: 0.207354
[2]	valid_0's auc: 0.788255	valid_0's binary_logloss: 0.202286
[3]	valid_0's auc: 0.788563	valid_0's binary_logloss: 0.198579
[4]	valid_0's auc: 0.788467	valid_0's binary_logloss: 0.195697
[5]	valid_0's auc: 0.788663	valid_0's binary_logloss: 0.193403
[6]	valid_0's auc: 0.789329	valid_0's binary_logloss: 0.191517
[7]	valid_0's auc: 0.789601	valid_0's binary_logloss: 0.189977
[8]	valid_0's auc: 0.789739	valid_0's binary_logloss: 0.188718
[9]	valid_0's auc: 0.790043	valid_0's binary_logloss: 0.187598
[10]	valid_0's auc: 0.790113	valid_0's binary_logloss: 0.186682
[11]	valid_0's auc: 0.790114	valid_0's binary_logloss: 0.185923
[12]	valid_0's auc: 0.790239	valid_0's binary_logloss: 0.185275
[13]	valid_0's auc: 0.790087	valid_0's binary_logloss: 0.184704
[14]	valid_0's auc: 0.790216	valid_0's binary_logloss: 0.184231
[15]	valid_0's auc: 0.790249	valid_0's binary_logloss: 0.183805
[16]	valid_0's auc: 0.790287	valid_0's binary_log

New categorical_feature is [0, 1, 2, 3]


[7]	valid_0's auc: 0.780063	valid_0's binary_logloss: 0.191084
[8]	valid_0's auc: 0.779995	valid_0's binary_logloss: 0.189883
[9]	valid_0's auc: 0.780271	valid_0's binary_logloss: 0.188856
[10]	valid_0's auc: 0.780472	valid_0's binary_logloss: 0.187999
[11]	valid_0's auc: 0.780333	valid_0's binary_logloss: 0.187303
[12]	valid_0's auc: 0.780413	valid_0's binary_logloss: 0.1867
[13]	valid_0's auc: 0.780444	valid_0's binary_logloss: 0.186157
[14]	valid_0's auc: 0.780185	valid_0's binary_logloss: 0.185705
[15]	valid_0's auc: 0.780122	valid_0's binary_logloss: 0.185316
[16]	valid_0's auc: 0.780138	valid_0's binary_logloss: 0.184971
[17]	valid_0's auc: 0.780641	valid_0's binary_logloss: 0.184666
[18]	valid_0's auc: 0.78071	valid_0's binary_logloss: 0.184403
[19]	valid_0's auc: 0.780738	valid_0's binary_logloss: 0.18418
[20]	valid_0's auc: 0.780822	valid_0's binary_logloss: 0.183982
[21]	valid_0's auc: 0.780907	valid_0's binary_logloss: 0.183818
[22]	valid_0's auc: 0.780975	valid_0's binary_l

New categorical_feature is [0, 1, 2, 3]


[18]	valid_0's auc: 0.789255	valid_0's binary_logloss: 0.183304
[19]	valid_0's auc: 0.789267	valid_0's binary_logloss: 0.183056
[20]	valid_0's auc: 0.789373	valid_0's binary_logloss: 0.182847
[21]	valid_0's auc: 0.789312	valid_0's binary_logloss: 0.182657
[22]	valid_0's auc: 0.789139	valid_0's binary_logloss: 0.182493
[23]	valid_0's auc: 0.789121	valid_0's binary_logloss: 0.182355
[24]	valid_0's auc: 0.789184	valid_0's binary_logloss: 0.182222
[25]	valid_0's auc: 0.789123	valid_0's binary_logloss: 0.182107
[26]	valid_0's auc: 0.789088	valid_0's binary_logloss: 0.182005
[27]	valid_0's auc: 0.78914	valid_0's binary_logloss: 0.181913
[28]	valid_0's auc: 0.788987	valid_0's binary_logloss: 0.181825
[29]	valid_0's auc: 0.788849	valid_0's binary_logloss: 0.181755
[30]	valid_0's auc: 0.788719	valid_0's binary_logloss: 0.181692
[31]	valid_0's auc: 0.78865	valid_0's binary_logloss: 0.181647
[32]	valid_0's auc: 0.78861	valid_0's binary_logloss: 0.181613
[33]	valid_0's auc: 0.788597	valid_0's bina

New categorical_feature is [0, 1, 2, 3]


[11]	valid_0's auc: 0.787197	valid_0's binary_logloss: 0.186954
[12]	valid_0's auc: 0.787141	valid_0's binary_logloss: 0.1863
[13]	valid_0's auc: 0.787294	valid_0's binary_logloss: 0.18574
[14]	valid_0's auc: 0.787127	valid_0's binary_logloss: 0.185248
[15]	valid_0's auc: 0.787222	valid_0's binary_logloss: 0.184822
[16]	valid_0's auc: 0.787366	valid_0's binary_logloss: 0.184436
[17]	valid_0's auc: 0.787434	valid_0's binary_logloss: 0.184117
[18]	valid_0's auc: 0.787324	valid_0's binary_logloss: 0.183846
[19]	valid_0's auc: 0.787345	valid_0's binary_logloss: 0.183622
[20]	valid_0's auc: 0.787456	valid_0's binary_logloss: 0.183405
[21]	valid_0's auc: 0.787605	valid_0's binary_logloss: 0.183227
[22]	valid_0's auc: 0.787574	valid_0's binary_logloss: 0.183053
[23]	valid_0's auc: 0.787642	valid_0's binary_logloss: 0.182894
[24]	valid_0's auc: 0.787762	valid_0's binary_logloss: 0.182761
[25]	valid_0's auc: 0.787861	valid_0's binary_logloss: 0.182629
[26]	valid_0's auc: 0.787877	valid_0's bina

New categorical_feature is [0, 1, 2, 3]


[15]	valid_0's auc: 0.782195	valid_0's binary_logloss: 0.186032
[16]	valid_0's auc: 0.781996	valid_0's binary_logloss: 0.185705
[17]	valid_0's auc: 0.782161	valid_0's binary_logloss: 0.185401
[18]	valid_0's auc: 0.782105	valid_0's binary_logloss: 0.185146
[19]	valid_0's auc: 0.782069	valid_0's binary_logloss: 0.184927
[20]	valid_0's auc: 0.782206	valid_0's binary_logloss: 0.184731
[21]	valid_0's auc: 0.782119	valid_0's binary_logloss: 0.184576
[22]	valid_0's auc: 0.782155	valid_0's binary_logloss: 0.184437
[23]	valid_0's auc: 0.782227	valid_0's binary_logloss: 0.184311
[24]	valid_0's auc: 0.782306	valid_0's binary_logloss: 0.184193
[25]	valid_0's auc: 0.782388	valid_0's binary_logloss: 0.184092
[26]	valid_0's auc: 0.782162	valid_0's binary_logloss: 0.184007
[27]	valid_0's auc: 0.78185	valid_0's binary_logloss: 0.18392
[28]	valid_0's auc: 0.781869	valid_0's binary_logloss: 0.183852
[29]	valid_0's auc: 0.781913	valid_0's binary_logloss: 0.183797
[30]	valid_0's auc: 0.781867	valid_0's bin

0it [00:00, ?it/s]

[1]	valid_0's auc: 0.775723	valid_0's binary_logloss: 0.207954
[2]	valid_0's auc: 0.777224	valid_0's binary_logloss: 0.203247
[3]	valid_0's auc: 0.777728	valid_0's binary_logloss: 0.199771
[4]	valid_0's auc: 0.777703	valid_0's binary_logloss: 0.197129
[5]	valid_0's auc: 0.777835	valid_0's binary_logloss: 0.195
[6]	valid_0's auc: 0.778061	valid_0's binary_logloss: 0.193263


New categorical_feature is [0, 1, 2, 3]


[7]	valid_0's auc: 0.778249	valid_0's binary_logloss: 0.19187
[8]	valid_0's auc: 0.778175	valid_0's binary_logloss: 0.190686
[9]	valid_0's auc: 0.778073	valid_0's binary_logloss: 0.189702
[10]	valid_0's auc: 0.778222	valid_0's binary_logloss: 0.188855
[11]	valid_0's auc: 0.778137	valid_0's binary_logloss: 0.188132
[12]	valid_0's auc: 0.778039	valid_0's binary_logloss: 0.187511
[13]	valid_0's auc: 0.778091	valid_0's binary_logloss: 0.186985
[14]	valid_0's auc: 0.77798	valid_0's binary_logloss: 0.186517
[15]	valid_0's auc: 0.778	valid_0's binary_logloss: 0.186103
[16]	valid_0's auc: 0.778106	valid_0's binary_logloss: 0.185746
[17]	valid_0's auc: 0.77797	valid_0's binary_logloss: 0.185445
[18]	valid_0's auc: 0.778088	valid_0's binary_logloss: 0.185178
[19]	valid_0's auc: 0.778354	valid_0's binary_logloss: 0.184937
[20]	valid_0's auc: 0.778326	valid_0's binary_logloss: 0.184728
[21]	valid_0's auc: 0.778349	valid_0's binary_logloss: 0.184535
[22]	valid_0's auc: 0.778416	valid_0's binary_log

New categorical_feature is [0, 1, 2, 3]


[7]	valid_0's auc: 0.785088	valid_0's binary_logloss: 0.191173
[8]	valid_0's auc: 0.784985	valid_0's binary_logloss: 0.189988
[9]	valid_0's auc: 0.785128	valid_0's binary_logloss: 0.188967
[10]	valid_0's auc: 0.785001	valid_0's binary_logloss: 0.188122
[11]	valid_0's auc: 0.784987	valid_0's binary_logloss: 0.187406
[12]	valid_0's auc: 0.785064	valid_0's binary_logloss: 0.186782
[13]	valid_0's auc: 0.785212	valid_0's binary_logloss: 0.186246
[14]	valid_0's auc: 0.785441	valid_0's binary_logloss: 0.185769
[15]	valid_0's auc: 0.785398	valid_0's binary_logloss: 0.185384
[16]	valid_0's auc: 0.785729	valid_0's binary_logloss: 0.185049
[17]	valid_0's auc: 0.785909	valid_0's binary_logloss: 0.18474
[18]	valid_0's auc: 0.785799	valid_0's binary_logloss: 0.184481
[19]	valid_0's auc: 0.785857	valid_0's binary_logloss: 0.184253
[20]	valid_0's auc: 0.785916	valid_0's binary_logloss: 0.184037
[21]	valid_0's auc: 0.785912	valid_0's binary_logloss: 0.183867
[22]	valid_0's auc: 0.785852	valid_0's binar

New categorical_feature is [0, 1, 2, 3]


[16]	valid_0's auc: 0.778453	valid_0's binary_logloss: 0.185738
[17]	valid_0's auc: 0.778393	valid_0's binary_logloss: 0.185453
[18]	valid_0's auc: 0.778335	valid_0's binary_logloss: 0.185212
[19]	valid_0's auc: 0.778259	valid_0's binary_logloss: 0.185022
[20]	valid_0's auc: 0.778196	valid_0's binary_logloss: 0.18485
[21]	valid_0's auc: 0.778269	valid_0's binary_logloss: 0.184707
[22]	valid_0's auc: 0.778281	valid_0's binary_logloss: 0.184568
[23]	valid_0's auc: 0.778289	valid_0's binary_logloss: 0.184438
[24]	valid_0's auc: 0.778234	valid_0's binary_logloss: 0.184321
[25]	valid_0's auc: 0.77819	valid_0's binary_logloss: 0.184236
[26]	valid_0's auc: 0.778274	valid_0's binary_logloss: 0.184146
[27]	valid_0's auc: 0.778254	valid_0's binary_logloss: 0.184088
[28]	valid_0's auc: 0.778323	valid_0's binary_logloss: 0.184014
[29]	valid_0's auc: 0.778263	valid_0's binary_logloss: 0.183978
[30]	valid_0's auc: 0.778213	valid_0's binary_logloss: 0.183927
[31]	valid_0's auc: 0.778065	valid_0's bin

New categorical_feature is [0, 1, 2, 3]


[2]	valid_0's auc: 0.785302	valid_0's binary_logloss: 0.202632
[3]	valid_0's auc: 0.786098	valid_0's binary_logloss: 0.199002
[4]	valid_0's auc: 0.786894	valid_0's binary_logloss: 0.196183
[5]	valid_0's auc: 0.787105	valid_0's binary_logloss: 0.193922
[6]	valid_0's auc: 0.787447	valid_0's binary_logloss: 0.192075
[7]	valid_0's auc: 0.78728	valid_0's binary_logloss: 0.190599
[8]	valid_0's auc: 0.78693	valid_0's binary_logloss: 0.18934
[9]	valid_0's auc: 0.787479	valid_0's binary_logloss: 0.188259
[10]	valid_0's auc: 0.787241	valid_0's binary_logloss: 0.187371
[11]	valid_0's auc: 0.787212	valid_0's binary_logloss: 0.186595
[12]	valid_0's auc: 0.787324	valid_0's binary_logloss: 0.185887
[13]	valid_0's auc: 0.787362	valid_0's binary_logloss: 0.1853
[14]	valid_0's auc: 0.787439	valid_0's binary_logloss: 0.184799
[15]	valid_0's auc: 0.787471	valid_0's binary_logloss: 0.184353
[16]	valid_0's auc: 0.787559	valid_0's binary_logloss: 0.183956
[17]	valid_0's auc: 0.787619	valid_0's binary_logloss

New categorical_feature is [0, 1, 2, 3]


[1]	valid_0's auc: 0.783158	valid_0's binary_logloss: 0.207628
[2]	valid_0's auc: 0.784536	valid_0's binary_logloss: 0.202842
[3]	valid_0's auc: 0.784401	valid_0's binary_logloss: 0.199312
[4]	valid_0's auc: 0.785057	valid_0's binary_logloss: 0.196549
[5]	valid_0's auc: 0.785661	valid_0's binary_logloss: 0.19435
[6]	valid_0's auc: 0.785693	valid_0's binary_logloss: 0.192571
[7]	valid_0's auc: 0.786178	valid_0's binary_logloss: 0.191093
[8]	valid_0's auc: 0.786304	valid_0's binary_logloss: 0.189843
[9]	valid_0's auc: 0.786347	valid_0's binary_logloss: 0.188805
[10]	valid_0's auc: 0.786584	valid_0's binary_logloss: 0.187914
[11]	valid_0's auc: 0.786558	valid_0's binary_logloss: 0.187165
[12]	valid_0's auc: 0.786689	valid_0's binary_logloss: 0.186506
[13]	valid_0's auc: 0.786745	valid_0's binary_logloss: 0.185958
[14]	valid_0's auc: 0.786843	valid_0's binary_logloss: 0.18548
[15]	valid_0's auc: 0.786877	valid_0's binary_logloss: 0.185059
[16]	valid_0's auc: 0.786946	valid_0's binary_loglo

0it [00:00, ?it/s]

New categorical_feature is [0, 1, 2, 3]


[1]	valid_0's auc: 0.776104	valid_0's binary_logloss: 0.207796
[2]	valid_0's auc: 0.777289	valid_0's binary_logloss: 0.203162
[3]	valid_0's auc: 0.778019	valid_0's binary_logloss: 0.19972
[4]	valid_0's auc: 0.7781	valid_0's binary_logloss: 0.197042
[5]	valid_0's auc: 0.778395	valid_0's binary_logloss: 0.194934
[6]	valid_0's auc: 0.77915	valid_0's binary_logloss: 0.193197
[7]	valid_0's auc: 0.779224	valid_0's binary_logloss: 0.191779
[8]	valid_0's auc: 0.779112	valid_0's binary_logloss: 0.190618
[9]	valid_0's auc: 0.779158	valid_0's binary_logloss: 0.189649
[10]	valid_0's auc: 0.779462	valid_0's binary_logloss: 0.188806
[11]	valid_0's auc: 0.779246	valid_0's binary_logloss: 0.188079
[12]	valid_0's auc: 0.779459	valid_0's binary_logloss: 0.187433
[13]	valid_0's auc: 0.779284	valid_0's binary_logloss: 0.186876
[14]	valid_0's auc: 0.779402	valid_0's binary_logloss: 0.186399
[15]	valid_0's auc: 0.779455	valid_0's binary_logloss: 0.185991
[16]	valid_0's auc: 0.779584	valid_0's binary_logloss

New categorical_feature is [0, 1, 2, 3]


[15]	valid_0's auc: 0.781495	valid_0's binary_logloss: 0.18581
[16]	valid_0's auc: 0.781628	valid_0's binary_logloss: 0.18548
[17]	valid_0's auc: 0.781713	valid_0's binary_logloss: 0.185163
[18]	valid_0's auc: 0.781838	valid_0's binary_logloss: 0.18488
[19]	valid_0's auc: 0.782023	valid_0's binary_logloss: 0.184637
[20]	valid_0's auc: 0.781936	valid_0's binary_logloss: 0.184431
[21]	valid_0's auc: 0.781763	valid_0's binary_logloss: 0.184256
[22]	valid_0's auc: 0.78176	valid_0's binary_logloss: 0.184097
[23]	valid_0's auc: 0.781642	valid_0's binary_logloss: 0.183979
[24]	valid_0's auc: 0.78163	valid_0's binary_logloss: 0.183875
[25]	valid_0's auc: 0.781589	valid_0's binary_logloss: 0.183788
[26]	valid_0's auc: 0.781586	valid_0's binary_logloss: 0.18371
[27]	valid_0's auc: 0.781583	valid_0's binary_logloss: 0.183614
[28]	valid_0's auc: 0.781654	valid_0's binary_logloss: 0.183553
[29]	valid_0's auc: 0.781623	valid_0's binary_logloss: 0.183503
[30]	valid_0's auc: 0.781588	valid_0's binary_

New categorical_feature is [0, 1, 2, 3]


[1]	valid_0's auc: 0.776514	valid_0's binary_logloss: 0.207792
[2]	valid_0's auc: 0.778142	valid_0's binary_logloss: 0.203038
[3]	valid_0's auc: 0.779248	valid_0's binary_logloss: 0.19955
[4]	valid_0's auc: 0.780026	valid_0's binary_logloss: 0.196832
[5]	valid_0's auc: 0.779798	valid_0's binary_logloss: 0.194712
[6]	valid_0's auc: 0.779945	valid_0's binary_logloss: 0.192966
[7]	valid_0's auc: 0.780061	valid_0's binary_logloss: 0.191555
[8]	valid_0's auc: 0.780189	valid_0's binary_logloss: 0.190374
[9]	valid_0's auc: 0.780056	valid_0's binary_logloss: 0.189385
[10]	valid_0's auc: 0.780062	valid_0's binary_logloss: 0.188563
[11]	valid_0's auc: 0.780072	valid_0's binary_logloss: 0.187855
[12]	valid_0's auc: 0.780255	valid_0's binary_logloss: 0.187246
[13]	valid_0's auc: 0.780282	valid_0's binary_logloss: 0.186737
[14]	valid_0's auc: 0.780482	valid_0's binary_logloss: 0.186282
[15]	valid_0's auc: 0.7804	valid_0's binary_logloss: 0.185909
[16]	valid_0's auc: 0.780471	valid_0's binary_loglos

New categorical_feature is [0, 1, 2, 3]


[1]	valid_0's auc: 0.780625	valid_0's binary_logloss: 0.207549
[2]	valid_0's auc: 0.781825	valid_0's binary_logloss: 0.202717
[3]	valid_0's auc: 0.782165	valid_0's binary_logloss: 0.199134
[4]	valid_0's auc: 0.782518	valid_0's binary_logloss: 0.196381
[5]	valid_0's auc: 0.782582	valid_0's binary_logloss: 0.194182
[6]	valid_0's auc: 0.782681	valid_0's binary_logloss: 0.192405
[7]	valid_0's auc: 0.783131	valid_0's binary_logloss: 0.190917
[8]	valid_0's auc: 0.783519	valid_0's binary_logloss: 0.189686
[9]	valid_0's auc: 0.783687	valid_0's binary_logloss: 0.18864
[10]	valid_0's auc: 0.783833	valid_0's binary_logloss: 0.187752
[11]	valid_0's auc: 0.783892	valid_0's binary_logloss: 0.187009
[12]	valid_0's auc: 0.783822	valid_0's binary_logloss: 0.186367
[13]	valid_0's auc: 0.783989	valid_0's binary_logloss: 0.185802
[14]	valid_0's auc: 0.784064	valid_0's binary_logloss: 0.18532
[15]	valid_0's auc: 0.784093	valid_0's binary_logloss: 0.184924
[16]	valid_0's auc: 0.784177	valid_0's binary_loglo

New categorical_feature is [0, 1, 2, 3]


[1]	valid_0's auc: 0.787706	valid_0's binary_logloss: 0.207627
[2]	valid_0's auc: 0.789272	valid_0's binary_logloss: 0.202574
[3]	valid_0's auc: 0.789734	valid_0's binary_logloss: 0.19891
[4]	valid_0's auc: 0.78952	valid_0's binary_logloss: 0.196099
[5]	valid_0's auc: 0.789782	valid_0's binary_logloss: 0.193821
[6]	valid_0's auc: 0.790008	valid_0's binary_logloss: 0.191941
[7]	valid_0's auc: 0.790469	valid_0's binary_logloss: 0.190398
[8]	valid_0's auc: 0.790412	valid_0's binary_logloss: 0.189115
[9]	valid_0's auc: 0.790708	valid_0's binary_logloss: 0.188028
[10]	valid_0's auc: 0.790622	valid_0's binary_logloss: 0.187093
[11]	valid_0's auc: 0.790608	valid_0's binary_logloss: 0.186305
[12]	valid_0's auc: 0.790469	valid_0's binary_logloss: 0.18562
[13]	valid_0's auc: 0.79058	valid_0's binary_logloss: 0.18502
[14]	valid_0's auc: 0.79076	valid_0's binary_logloss: 0.184498
[15]	valid_0's auc: 0.790697	valid_0's binary_logloss: 0.184064
[16]	valid_0's auc: 0.79081	valid_0's binary_logloss: 0

In [11]:
pl.from_pandas(
    pairs[pairs['is_train'] == 1].sort_values('adversarial').head(18084)[['variantid1', 'variantid2']]
).join(
    read_parquet('../data/train_pairs.parquet'),
    on=['variantid1', 'variantid2']
).write_parquet('../data/adversarial.parquet')

[32m2023-05-23 18:49:49.853[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m54[0m - [1mRead Parquet from ../data/train_pairs.parquet[0m
[32m2023-05-23 18:49:49.864[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m56[0m - [1mN Rows - 306540, N Cols - 3[0m


In [12]:
read_parquet('../data/adversarial.parquet')

[32m2023-05-23 19:00:54.775[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m54[0m - [1mRead Parquet from ../data/adversarial.parquet[0m
[32m2023-05-23 19:00:54.778[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m56[0m - [1mN Rows - 18084, N Cols - 3[0m


variantid1,variantid2,target
i64,i64,f64
85836778,85838082,0.0
85890302,85939258,1.0
85940318,148607642,1.0
85975076,508197093,1.0
89403250,90341703,0.0
89722250,177811799,1.0
100009852,750360172,1.0
129052413,177224924,1.0
159929290,553255415,0.0
167327697,576839429,0.0


In [None]:
scores_cv

In [None]:
np.mean(scores_cv)

In [None]:
cv2 = StratifiedKFold(n_splits=3, random_state=13, shuffle=True)

scores_cv = []

for fold_number in range(1, 6):
    pairs_test = pl.read_parquet(
        f'../experiments/v5/cv_{fold_number}/test/pairs.parquet', 
        columns=['variantid1', 'variantid2']
    )
    pairs_train = pl.read_parquet(
        f'../experiments/v5/cv_{fold_number}/train/pairs.parquet',
        columns=['variantid1', 'variantid2']
    )
    pairs_test = pairs_test.with_columns([pl.lit(0).cast(pl.Int8).alias('is_train')])
    pairs_train = pairs_train.with_columns([pl.lit(1).cast(pl.Int8).alias('is_train')])
    
    pairs = pl.concat([pairs_test, pairs_train])
    
    pairs = (
        pairs
        .join(
            data.rename(
                {
                    'variantid': 'variantid1',
                    'category_level_3_id': 'category_level_3_id_1',
                    'category_level_4_id': 'category_level_4_id_1',
                }
            ),
            on=['variantid1']
        )
        .join(
            data.rename(
                {
                    'variantid': 'variantid2',
                    'category_level_3_id': 'category_level_3_id_2',
                    'category_level_4_id': 'category_level_4_id_2',
                }
            ),
            on=['variantid2']
        )
        .drop(['variantid1', 'variantid2'])
        .to_pandas()
    )
    
    X = pairs.drop(columns=['is_train']).values
    y = pairs['is_train'].values
    
    X_cv, X_holdout, y_cv, y_holdout = train_test_split(X, y, test_size=0.2, random_state=13)
    for train_index, valid_index in tqdm(cv2.split(X_cv, y_cv)):
        X_train, X_valid = X_cv[train_index], X_cv[valid_index]
        y_train, y_valid = y_cv[train_index], y_cv[valid_index]

    
        model = LGBMClassifier(
            n_estimators=1000,
        )
        model.fit(
            X=X_train,
            y=y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric=['auc'],
            categorical_feature=[0,1,2,3],
            early_stopping_rounds=50
        )
        scores_cv.append(roc_auc_score(y_holdout, model.predict_proba(X_holdout)[:, 1]))

In [None]:
np.mean(scores_cv)

In [None]:
dataset = pl.read_parquet('../data/train_pairs.parquet')
dataset = dataset.join(
    data.rename(
        {
            'variantid': 'variantid1',
            'category_level_3_id': 'category_level_3_id_1',
            'category_level_4_id': 'category_level_4_id_1',
        }
    ),
    on=['variantid1']
).join(
    data.rename(
        {
            'variantid': 'variantid2',
            'category_level_3_id': 'category_level_3_id_2',
            'category_level_4_id': 'category_level_4_id_2',
        }
    ),
    on=['variantid2']
)

In [None]:
dataset

In [None]:
table = pl.DataFrame(schema={'variantid1': pl.Int64, 'variantid2': pl.Int64})
for fold_number in range(1, 6):
    for fold_type in ['train', 'test']:
        t = pl.read_csv(f'../experiments/v5/cv_{fold_number}/{fold_type}/prediction.csv')
        t = t.rename({'scores': f'fold_{fold_number}_{fold_type}'})
        table = table.join(t, on=['variantid1', 'variantid2'], how='outer')

In [None]:
pl.read_parquet('../data/train_pairs.parquet').shape

In [None]:
from ozon_matching.kopatych_solution.utils import write_parquet
write_parquet(table, '../experiments/v5/cv_pivot.parquet')

In [13]:
cats = np.random.randint(1, 100, 10000)
unique_cats, counts = np.unique(cats, return_counts=True)
unique_cats

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [14]:
counts

array([129, 111,  93, 112, 104,  98, 102,  88,  99,  98, 103, 116, 104,
        93, 111, 111, 103,  88, 105,  96, 104, 115, 102,  93,  86, 103,
       116,  83, 118,  89,  87, 105,  96, 108, 112, 108, 108,  98,  85,
        82, 102,  88, 106,  86,  94,  93,  95,  83, 101, 102,  93, 100,
        89, 110, 113,  94,  92,  95,  87,  92, 101, 119, 104, 102,  82,
       109,  92, 118,  99, 106, 104, 119, 102,  67, 111,  97,  90, 107,
       123, 120, 107, 116, 105,  98,  98, 109, 131,  98,  86, 119, 111,
        95,  91,  96, 104,  95,  94, 105,  93])

In [15]:
data = pl.concat(
    [
        read_parquet('../data/test_data.parquet', columns=['variantid', 'categories']),
        read_parquet('../data/train_data.parquet', columns=['variantid', 'categories']),
    ]
)
data = data.unique(subset=['variantid'])

[32m2023-05-23 21:12:35.328[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m54[0m - [1mRead Parquet from ../data/test_data.parquet[0m
[32m2023-05-23 21:12:35.336[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m56[0m - [1mN Rows - 35730, N Cols - 2[0m
[32m2023-05-23 21:12:35.336[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m54[0m - [1mRead Parquet from ../data/train_data.parquet[0m
[32m2023-05-23 21:12:35.459[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m56[0m - [1mN Rows - 457063, N Cols - 2[0m


In [16]:
data = extract_category_levels(data, [3, 4])
data = data.select(pl.col(['variantid', 'category_level_3', 'category_level_4']))

In [17]:
data = data.join(
    (
        data
        .select(pl.col('category_level_3'))
        .unique()
        .with_row_count(name='category_level_3_id')
    ),
    on=['category_level_3']
)

data = data.join(
    (
        data
        .select(pl.col('category_level_4'))
        .unique()
        .with_row_count(name='category_level_4_id')
    ),
    on=['category_level_4']
)

data = data.drop(['category_level_3', 'category_level_4'])

In [18]:
train = read_parquet('../data/train_pairs.parquet')
train = train.drop(['target']).with_columns([pl.lit(1).cast(pl.Int8).alias('is_train')])

test = read_parquet('../data/test_pairs_wo_target.parquet')
test = test.drop(['__index_level_0__']).with_columns([pl.lit(0).cast(pl.Int8).alias('is_train')])

[32m2023-05-23 21:12:36.660[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m54[0m - [1mRead Parquet from ../data/train_pairs.parquet[0m
[32m2023-05-23 21:12:36.672[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m56[0m - [1mN Rows - 306540, N Cols - 3[0m
[32m2023-05-23 21:12:36.674[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m54[0m - [1mRead Parquet from ../data/test_pairs_wo_target.parquet[0m
[32m2023-05-23 21:12:36.677[0m | [1mINFO    [0m | [36mozon_matching.kopatych_solution.utils[0m:[36mread_parquet[0m:[36m56[0m - [1mN Rows - 18084, N Cols - 3[0m


In [19]:
pairs = pl.concat([train, test])

In [20]:
pairs = pairs.join(
    data.rename(
        {
            'variantid': 'variantid1',
            'category_level_3_id': 'category_level_3_id_1',
            'category_level_4_id': 'category_level_4_id_1',
        }
    ),
    on=['variantid1']
).join(
    data.rename(
        {
            'variantid': 'variantid2',
            'category_level_3_id': 'category_level_3_id_2',
            'category_level_4_id': 'category_level_4_id_2',
        }
    ),
    on=['variantid2']
)

In [21]:
pairs

variantid1,variantid2,is_train,category_level_3_id_1,category_level_4_id_1,category_level_3_id_2,category_level_4_id_2
i64,i64,i8,u32,u32,u32,u32
91616256,91618048,1,87,120,87,120
195867339,195996128,1,43,329,43,329
532527673,538263776,1,47,352,47,352
795470824,795525568,1,68,208,68,208
93805858,190378432,1,68,208,68,208
190373379,190378432,1,68,208,68,208
692284332,721805472,1,70,198,70,198
256395898,487503808,0,72,155,72,34
234014245,234027232,1,22,278,22,278
226405958,234027232,1,22,278,22,278
