In [1]:
import sys
import os
import warnings
# Set before numpy / scipy / implicit are imported below — presumably so the
# OpenBLAS thread pool is limited to one thread when the library is loaded.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Silences every Python warning for the whole session (including deprecation
# and pandas chained-assignment warnings).
warnings.filterwarnings('ignore')
import pickle

# NOTE(review): several names imported below (e.g. implicit, bisect, seaborn)
# are not referenced in the cells visible here; they may be used further down.
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.preprocessing import MaxAbsScaler
import vaex
import seaborn as sns
from umap import UMAP


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# --- Notebook configuration -------------------------------------------------
# Seed passed to train_test_split so the hold-out split is repeatable.
SPLIT_SEED = 42

# Data directory; the loading cells prepend '../' to it.
LOCAL_DATA_PATH = "context_data"

# File names inside LOCAL_DATA_PATH.
DATA_FILE = "competition_data_final_pqt"   # raw per-request log, opened with vaex
TARGET_FILE = "public_train.pqt"           # labelled users
SUBMISSION_FILE = "submit_2.pqt"           # users to score

In [3]:
# Read the submission file into a pandas DataFrame.
id_to_submit = (
    pq.read_table(f'../{LOCAL_DATA_PATH}/{SUBMISSION_FILE}')
    .to_pandas()
)

In [4]:
# Open the raw request log with vaex.
df = vaex.open(
    f'../{LOCAL_DATA_PATH}/{DATA_FILE}'
)

In [5]:
# Labelled users. Kept as a pyarrow Table here; later cells call
# `.to_pandas()` on it whenever a DataFrame is needed.
targets = pq.read_table(
    f'../{LOCAL_DATA_PATH}/{TARGET_FILE}'
)

In [6]:
%%time
data_agg = df[['user_id', 'url_host', 'request_cnt']].\
    groupby(['user_id', 'url_host']).agg([('request_cnt', "sum")])

CPU times: total: 5min 41s
Wall time: 44.8 s


In [7]:
# Preview the aggregated table: one row per (user_id, url_host) with the
# summed request_cnt.
data_agg

#,user_id,url_host,request_cnt
0,45098,ssp.otm-r.com,2
1,79395,ad.adriver.ru,16
2,79395,aif-ru.cdn.ampproject.org,1
3,91294,zen.yandex.ru,44
4,91294,s3.amazonaws.com,11
...,...,...,...
32277664,97498,lenta-ru.turbopages.org,1
32277665,227188,4251.tech,15
32277666,227188,click.mail.ru,12
32277667,227188,thesame.tv,36


In [7]:
# Assign every distinct url_host / user_id a dense 0-based index; these become
# the column / row coordinates of the sparse user-by-url matrix.
#
# The ids are enumerated in *sorted* order rather than in set-iteration order:
# iteration order of a set of str depends on the per-process hash seed, so the
# mapping would otherwise differ between kernel sessions and could not be
# matched against artifacts saved from an earlier run.
url_set = set(data_agg['url_host'].unique())
print(f'{len(url_set)} urls')
# key=str keeps the sort well-defined even if a missing host shows up as None.
url_dict = {url: idurl for idurl, url in enumerate(sorted(url_set, key=str))}
usr_set = set(data_agg['user_id'].unique())
print(f'{len(usr_set)} users')
usr_dict = {usr: user_id for user_id, usr in enumerate(sorted(usr_set))}

199683 urls
415317 users


In [8]:
%%time
values = data_agg['request_cnt'].values
rows = data_agg['user_id'].map(usr_dict).to_numpy()
cols = data_agg['url_host'].map(url_dict).to_numpy()
mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
mat

CPU times: total: 20.2 s
Wall time: 3.36 s


<415317x199683 sparse matrix of type '<class 'numpy.int64'>'
	with 32277669 stored elements in COOrdinate format>

In [9]:
# Rescale each url column by its largest absolute value. MaxAbsScaler does not
# centre the data, so the matrix stays sparse.
# Note that `mat` is rebound to the scaled matrix; the raw counts are no longer
# reachable under this name after the cell runs.
sc = MaxAbsScaler()
mat = sc.fit(mat).transform(mat)
mat

<415317x199683 sparse matrix of type '<class 'numpy.float64'>'
	with 32277669 stored elements in Compressed Sparse Row format>

In [10]:
# Build one gender label per row of `mat`, for use as the semi-supervised
# target `y` of UMAP: 1 / 0 for known labels, -1 for users without a label
# ('NA', missing value, or user absent from the targets table).
#
# Labels are aligned to matrix rows explicitly via user_id -> usr_dict, rather
# than through the pandas index of the targets frame and a hard-coded row
# count, so row i of the result always describes the user stored in row i of
# `mat`.
tgt_sorted = targets.to_pandas().sort_index()
is_male_by_user = tgt_sorted.set_index('user_id')['is_male']
# reindex() needs unique labels; keep the first record if a user repeats.
is_male_by_user = is_male_by_user[~is_male_by_user.index.duplicated()]

# User ids ordered by their matrix row index (i.e. the inverse of usr_dict).
row_user_ids = sorted(usr_dict, key=usr_dict.get)
row_labels = (
    is_male_by_user
    .reindex(row_user_ids)      # users without a target row become NaN
    .replace({'NA': -1})
    .fillna(-1)
    .astype(int)
)

# The nested list is deliberate: it yields single-level MultiIndex columns, so
# `lbl_is_male.is_male` is a one-column DataFrame. The UMAP cell depends on
# that when it takes `.values[:,0]`.
lbl_is_male = pd.DataFrame(row_user_ids, columns=[['user_id']])
lbl_is_male['is_male'] = row_labels.to_numpy()
assert len(lbl_is_male) == mat.shape[0], "need exactly one label per matrix row"
lbl_is_male.is_male.value_counts()

(is_male,)
-1            150991
 1            135332
 0            128994
dtype: int64

In [11]:
%%time
reducer = UMAP(n_neighbors=15, n_components=500, low_memory=True, random_state=722, verbose=True)
emb = reducer.fit_transform(mat, y=lbl_is_male.is_male.values[:,0])

UMAP(n_components=500, random_state=722, verbose=True)
Fri Feb 24 02:13:09 2023 Construct fuzzy simplicial set
Fri Feb 24 02:13:09 2023 Finding Nearest Neighbors
Fri Feb 24 02:13:09 2023 Building RP forest with 37 trees
Fri Feb 24 02:51:29 2023 metric NN descent for 19 iterations
	 1  /  19
	 2  /  19
	 3  /  19
	 4  /  19
	 5  /  19
	 6  /  19
	 7  /  19
	 8  /  19
	 9  /  19
	Stopping threshold met -- exiting after 9 iterations
Fri Feb 24 02:55:04 2023 Finished Nearest Neighbor Search
Fri Feb 24 02:55:09 2023 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

Fri Feb 24 05:18:34 2023 Finished embedding
CPU times: total: 17h 24min 31s
Wall time: 3h 5min 26s


In [12]:
# Persist the embedding so the expensive UMAP fit does not have to be redone.
# Create the target directory first: open(..., 'wb') does not create missing
# parent directories and would otherwise raise FileNotFoundError.
# NOTE(review): only the array is written; the row -> user_id mapping
# (usr_dict) is not stored alongside it.
emb_path = 'umap_semisv_urls_gender/emb.pickle'
os.makedirs(os.path.dirname(emb_path), exist_ok=True)
with open(emb_path, 'wb') as outp:
    pickle.dump(emb, outp, pickle.HIGHEST_PROTOCOL)

In [11]:
emb  # NOTE(review): tagged "old" by the author — presumably the output kept below is from an earlier UMAP run; confirm, or drop this cell

array([[ 7.617141  ,  2.2451036 ,  4.1770544 , ...,  6.2679186 ,
         4.2914505 ,  4.2745457 ],
       [ 1.6534604 ,  0.66642445,  4.225202  , ...,  6.2068906 ,
         4.27293   ,  4.3046308 ],
       [ 4.569186  ,  1.1020229 ,  3.6174378 , ...,  6.342753  ,
         4.278675  ,  4.330285  ],
       ...,
       [15.145679  ,  5.3674865 ,  8.654407  , ...,  6.3377323 ,
         4.3462677 ,  4.339196  ],
       [15.885858  ,  5.1889668 ,  8.792454  , ...,  6.309725  ,
         4.359164  ,  4.355493  ],
       [15.463209  ,  5.429692  ,  8.773679  , ...,  6.260381  ,
         4.27645   ,  4.2941146 ]], dtype=float32)

In [13]:
# Inspect the embedding returned by reducer.fit_transform (numpy abbreviates
# the repr of large arrays).
emb

array([[ 9.939966 ,  5.2275515,  5.195164 , ...,  4.7920566,  4.8716373,
         5.2537074],
       [ 9.924918 ,  5.429859 ,  5.227916 , ...,  4.8446813,  4.9683547,
         4.7377415],
       [ 9.947503 ,  5.286981 ,  5.2022257, ...,  4.819232 ,  4.8912954,
         4.8721023],
       ...,
       [ 9.973839 ,  5.4201193,  5.0543013, ...,  4.975858 ,  5.024999 ,
         5.0366693],
       [10.031597 ,  7.4354243,  4.8231645, ...,  5.166925 ,  5.0702267,
         5.083732 ],
       [10.017691 ,  8.455213 ,  5.3100743, ...,  5.1622486,  5.0164614,
         4.923465 ]], dtype=float32)

In [16]:
# Sanity check: expect (number of matrix rows, n_components).
emb.shape

(415317, 500)

In [61]:
# Inspect the label frame that was passed to UMAP as `y`; -1 marks rows
# without a known gender.
lbl_is_male

Unnamed: 0,user_id,is_male
0,0,0
1,1,0
2,2,0
3,3,1
4,4,0
...,...,...
415312,415312,1
415313,415313,0
415314,415314,-1
415315,415315,-1


In [64]:
%%time
inv_usr_map = {v: k for k, v in usr_dict.items()}
usr_emb = pd.DataFrame(emb)
usr_emb['user_id'] = usr_emb.index.map(inv_usr_map)
usr_targets = targets.to_pandas()
df_train = usr_targets.merge(usr_emb, how = 'inner', on = ['user_id'])
df_train = df_train[df_train['is_male'] != 'NA']
df_train = df_train.dropna()
df_train['is_male'] = df_train['is_male'].map(int)
df_train['is_male'].value_counts()

Wall time: 936 ms


1    135331
0    128994
Name: is_male, dtype: int64

In [65]:
%%time
x_train, x_test, y_train, y_test = train_test_split(\
    df_train.drop(['user_id', 'age', 'is_male'], axis = 1), \
        df_train['is_male'], test_size = 0.33, random_state = SPLIT_SEED)
clf = CatBoostClassifier()
clf.fit(x_train, y_train, verbose = True)
print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf.predict_proba(x_test)[:,1]) - 1:2.3f}')

Learning rate set to 0.093955
0:	learn: 0.6908729	total: 59ms	remaining: 58.9s
1:	learn: 0.6888128	total: 100ms	remaining: 50s
2:	learn: 0.6870159	total: 142ms	remaining: 47.3s
3:	learn: 0.6854511	total: 196ms	remaining: 48.7s
4:	learn: 0.6842263	total: 240ms	remaining: 47.8s
5:	learn: 0.6831757	total: 284ms	remaining: 47s
6:	learn: 0.6821972	total: 326ms	remaining: 46.2s
7:	learn: 0.6811657	total: 370ms	remaining: 45.8s
8:	learn: 0.6803783	total: 413ms	remaining: 45.5s
9:	learn: 0.6796008	total: 462ms	remaining: 45.7s
10:	learn: 0.6789335	total: 507ms	remaining: 45.6s
11:	learn: 0.6783016	total: 554ms	remaining: 45.6s
12:	learn: 0.6777863	total: 603ms	remaining: 45.8s
13:	learn: 0.6772777	total: 655ms	remaining: 46.1s
14:	learn: 0.6768293	total: 697ms	remaining: 45.8s
15:	learn: 0.6764571	total: 737ms	remaining: 45.3s
16:	learn: 0.6760281	total: 781ms	remaining: 45.2s
17:	learn: 0.6756864	total: 823ms	remaining: 44.9s
18:	learn: 0.6753396	total: 867ms	remaining: 44.8s
19:	learn: 0.675

161:	learn: 0.6596223	total: 7.15s	remaining: 37s
162:	learn: 0.6595076	total: 7.2s	remaining: 37s
163:	learn: 0.6594507	total: 7.24s	remaining: 36.9s
164:	learn: 0.6593222	total: 7.28s	remaining: 36.8s
165:	learn: 0.6592496	total: 7.32s	remaining: 36.8s
166:	learn: 0.6591588	total: 7.36s	remaining: 36.7s
167:	learn: 0.6590954	total: 7.4s	remaining: 36.7s
168:	learn: 0.6590342	total: 7.44s	remaining: 36.6s
169:	learn: 0.6589466	total: 7.49s	remaining: 36.6s
170:	learn: 0.6588410	total: 7.53s	remaining: 36.5s
171:	learn: 0.6587769	total: 7.57s	remaining: 36.5s
172:	learn: 0.6586792	total: 7.62s	remaining: 36.4s
173:	learn: 0.6586038	total: 7.66s	remaining: 36.4s
174:	learn: 0.6585196	total: 7.7s	remaining: 36.3s
175:	learn: 0.6584442	total: 7.74s	remaining: 36.2s
176:	learn: 0.6583881	total: 7.78s	remaining: 36.2s
177:	learn: 0.6583005	total: 7.83s	remaining: 36.1s
178:	learn: 0.6582119	total: 7.87s	remaining: 36.1s
179:	learn: 0.6581194	total: 7.91s	remaining: 36s
180:	learn: 0.6580530

321:	learn: 0.6488539	total: 14.3s	remaining: 30s
322:	learn: 0.6488006	total: 14.3s	remaining: 30s
323:	learn: 0.6487455	total: 14.3s	remaining: 29.9s
324:	learn: 0.6486908	total: 14.4s	remaining: 29.9s
325:	learn: 0.6486394	total: 14.4s	remaining: 29.8s
326:	learn: 0.6485590	total: 14.5s	remaining: 29.8s
327:	learn: 0.6485075	total: 14.5s	remaining: 29.8s
328:	learn: 0.6484696	total: 14.6s	remaining: 29.7s
329:	learn: 0.6484232	total: 14.6s	remaining: 29.7s
330:	learn: 0.6483667	total: 14.7s	remaining: 29.7s
331:	learn: 0.6483016	total: 14.7s	remaining: 29.6s
332:	learn: 0.6482409	total: 14.8s	remaining: 29.6s
333:	learn: 0.6481848	total: 14.8s	remaining: 29.5s
334:	learn: 0.6481370	total: 14.8s	remaining: 29.5s
335:	learn: 0.6480779	total: 14.9s	remaining: 29.4s
336:	learn: 0.6480222	total: 14.9s	remaining: 29.4s
337:	learn: 0.6479544	total: 15s	remaining: 29.3s
338:	learn: 0.6479019	total: 15s	remaining: 29.3s
339:	learn: 0.6478671	total: 15s	remaining: 29.2s
340:	learn: 0.6477929	

481:	learn: 0.6402474	total: 21.6s	remaining: 23.2s
482:	learn: 0.6401772	total: 21.6s	remaining: 23.1s
483:	learn: 0.6401191	total: 21.7s	remaining: 23.1s
484:	learn: 0.6400830	total: 21.7s	remaining: 23.1s
485:	learn: 0.6400304	total: 21.8s	remaining: 23s
486:	learn: 0.6399934	total: 21.8s	remaining: 23s
487:	learn: 0.6399558	total: 21.8s	remaining: 22.9s
488:	learn: 0.6399018	total: 21.9s	remaining: 22.9s
489:	learn: 0.6398642	total: 21.9s	remaining: 22.8s
490:	learn: 0.6398229	total: 22s	remaining: 22.8s
491:	learn: 0.6397677	total: 22s	remaining: 22.7s
492:	learn: 0.6397368	total: 22.1s	remaining: 22.7s
493:	learn: 0.6397018	total: 22.1s	remaining: 22.6s
494:	learn: 0.6396596	total: 22.2s	remaining: 22.6s
495:	learn: 0.6396247	total: 22.2s	remaining: 22.6s
496:	learn: 0.6395743	total: 22.3s	remaining: 22.5s
497:	learn: 0.6395200	total: 22.3s	remaining: 22.5s
498:	learn: 0.6394704	total: 22.3s	remaining: 22.4s
499:	learn: 0.6394342	total: 22.4s	remaining: 22.4s
500:	learn: 0.639387

640:	learn: 0.6328600	total: 28.7s	remaining: 16.1s
641:	learn: 0.6328258	total: 28.7s	remaining: 16s
642:	learn: 0.6327739	total: 28.8s	remaining: 16s
643:	learn: 0.6327341	total: 28.8s	remaining: 15.9s
644:	learn: 0.6326881	total: 28.9s	remaining: 15.9s
645:	learn: 0.6326264	total: 28.9s	remaining: 15.8s
646:	learn: 0.6325896	total: 29s	remaining: 15.8s
647:	learn: 0.6325537	total: 29s	remaining: 15.8s
648:	learn: 0.6325143	total: 29.1s	remaining: 15.7s
649:	learn: 0.6324700	total: 29.1s	remaining: 15.7s
650:	learn: 0.6324268	total: 29.1s	remaining: 15.6s
651:	learn: 0.6323929	total: 29.2s	remaining: 15.6s
652:	learn: 0.6323480	total: 29.2s	remaining: 15.5s
653:	learn: 0.6323009	total: 29.3s	remaining: 15.5s
654:	learn: 0.6322479	total: 29.3s	remaining: 15.4s
655:	learn: 0.6321987	total: 29.4s	remaining: 15.4s
656:	learn: 0.6321577	total: 29.4s	remaining: 15.4s
657:	learn: 0.6321267	total: 29.5s	remaining: 15.3s
658:	learn: 0.6320689	total: 29.5s	remaining: 15.3s
659:	learn: 0.632021

801:	learn: 0.6260457	total: 36.1s	remaining: 8.91s
802:	learn: 0.6260084	total: 36.1s	remaining: 8.86s
803:	learn: 0.6259677	total: 36.2s	remaining: 8.82s
804:	learn: 0.6259303	total: 36.2s	remaining: 8.77s
805:	learn: 0.6258889	total: 36.3s	remaining: 8.73s
806:	learn: 0.6258452	total: 36.3s	remaining: 8.68s
807:	learn: 0.6258055	total: 36.3s	remaining: 8.64s
808:	learn: 0.6257605	total: 36.4s	remaining: 8.59s
809:	learn: 0.6257226	total: 36.4s	remaining: 8.55s
810:	learn: 0.6256769	total: 36.5s	remaining: 8.51s
811:	learn: 0.6256263	total: 36.5s	remaining: 8.46s
812:	learn: 0.6255793	total: 36.6s	remaining: 8.42s
813:	learn: 0.6255313	total: 36.6s	remaining: 8.37s
814:	learn: 0.6254996	total: 36.7s	remaining: 8.33s
815:	learn: 0.6254598	total: 36.7s	remaining: 8.28s
816:	learn: 0.6254288	total: 36.8s	remaining: 8.23s
817:	learn: 0.6253857	total: 36.8s	remaining: 8.19s
818:	learn: 0.6253418	total: 36.8s	remaining: 8.14s
819:	learn: 0.6252884	total: 36.9s	remaining: 8.1s
820:	learn: 0

963:	learn: 0.6198437	total: 43.5s	remaining: 1.62s
964:	learn: 0.6198002	total: 43.6s	remaining: 1.58s
965:	learn: 0.6197535	total: 43.6s	remaining: 1.53s
966:	learn: 0.6197111	total: 43.6s	remaining: 1.49s
967:	learn: 0.6196797	total: 43.7s	remaining: 1.44s
968:	learn: 0.6196445	total: 43.8s	remaining: 1.4s
969:	learn: 0.6196075	total: 43.8s	remaining: 1.35s
970:	learn: 0.6195715	total: 43.9s	remaining: 1.31s
971:	learn: 0.6195369	total: 43.9s	remaining: 1.26s
972:	learn: 0.6194932	total: 44s	remaining: 1.22s
973:	learn: 0.6194553	total: 44s	remaining: 1.17s
974:	learn: 0.6194224	total: 44s	remaining: 1.13s
975:	learn: 0.6193892	total: 44.1s	remaining: 1.08s
976:	learn: 0.6193518	total: 44.1s	remaining: 1.04s
977:	learn: 0.6193080	total: 44.2s	remaining: 994ms
978:	learn: 0.6192763	total: 44.2s	remaining: 949ms
979:	learn: 0.6192517	total: 44.3s	remaining: 904ms
980:	learn: 0.6192205	total: 44.3s	remaining: 858ms
981:	learn: 0.6191821	total: 44.4s	remaining: 813ms
982:	learn: 0.61914

In [67]:
# Tuned CatBoost run with AUC-based early stopping.
# NOTE(review): the same (x_test, y_test) split is used both for early
# stopping / best-iteration selection and as the reported score, so `bestTest`
# is selected on the data it is measured on.
params = dict(
    iterations=10000,
    depth=8,
    random_strength=2,
    l2_leaf_reg=10,
    learning_rate=0.1,
    custom_metric="AUC",
    eval_metric="AUC",
    score_function='L2',
    task_type="CPU",
    verbose=2000,  # print a progress line every 2000 iterations
    random_seed=722,
)
cbc_001 = CatBoostClassifier(**params)
cbc_001.fit(
    x_train, y_train,
    eval_set=(x_test, y_test),   # data used for validation during training
    use_best_model=True,         # drop trees built after the best validation iteration
    early_stopping_rounds=400,
    plot=True,                   # interactive training-curve widget
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.5919393	best: 0.5919393 (0)	total: 68.2ms	remaining: 11m 21s
2000:	test: 0.6798113	best: 0.6798167 (1994)	total: 2m 13s	remaining: 8m 55s
4000:	test: 0.6862173	best: 0.6862628 (3995)	total: 4m 37s	remaining: 6m 56s
6000:	test: 0.6890952	best: 0.6891112 (5976)	total: 7m 14s	remaining: 4m 49s
8000:	test: 0.6898367	best: 0.6900502 (7786)	total: 10m 2s	remaining: 2m 30s
Stopped by overfitting detector  (400 iterations wait)

bestTest = 0.6900501543
bestIteration = 7786

Shrink model to first 7787 iterations.
