In [12]:
import os

import torch
import numpy as np
import torch.nn.functional as F
import pandas as pd
import pytorch_lightning as pl

from pathlib import Path

from transformers import AutoTokenizer, AutoModel

In [13]:
# Notebook-wide configuration: CUDA device index and the data directory.
GPU = 0
data_path = Path('data')

# Must be set before any tokenizer is used; presumably avoids the tokenizers
# fork warning once DataLoader workers are spawned -- TODO confirm.
os.environ.update(TOKENIZERS_PARALLELISM='false')

# Kept as the last expression so the cell still displays the returned seed.
pl.seed_everything(42)

Global seed set to 42


42

# Prepare data

In [14]:
# Read only the event-level columns that later cells use.
event_columns = ['user_id', 'url_host', 'request_cnt', 'part_of_day', 'date']
df = pd.read_parquet(
    data_path / 'competition_data_final_pqt',
    columns=event_columns,
    engine='pyarrow',
)

### Create event_time from date + part_of_day

In [15]:
# Offset (in hours) added to the date for each part of the day.
part_of_day_offset_hours = {'morning': 0, 'day': 6, 'evening': 12, 'night': 18}

# Casting to datetime64[h] and then int64 yields whole hours since the epoch.
date_hours = pd.to_datetime(df["date"]).values.astype('datetime64[h]').astype('int64')
df['event_time'] = date_hours + df['part_of_day'].map(part_of_day_offset_hours)

# The raw date is now folded into event_time and is no longer needed.
df = df.drop(columns='date')

# Create transactional data

In [16]:
def embed(texts, batch_size=10000, model_name='muhtasham/olm-bert-tiny-december-2022', device=None):
    """Embed each text with a pretrained transformer and return a text -> vector dict.

    The embedding of a text is the L2-normalised hidden state of the first
    token of the model's last layer.

    Args:
        texts: list of strings to embed (sliced in chunks of ``batch_size``).
        batch_size: number of texts tokenised and encoded per forward pass.
        model_name: checkpoint name passed to both ``AutoTokenizer`` and
            ``AutoModel``; the default is the checkpoint originally hard-coded here.
        device: torch device for the model; defaults to ``cuda:{GPU}`` using the
            notebook-level ``GPU`` index.

    Returns:
        dict mapping every input text to a 1-D CPU tensor. Empty input returns
        an empty dict without loading any model.
    """
    res = dict()
    if len(texts) == 0:
        # Nothing to embed: skip downloading/loading the model altogether.
        return res

    if device is None:
        device = f'cuda:{GPU}'

    # One name for both objects guarantees tokenizer and model always match.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    bert = AutoModel.from_pretrained(model_name).to(device)
    bert.eval()

    for i in range(0, len(texts), batch_size):
        b_texts = texts[i:i+batch_size]
        tokens = tokenizer(b_texts, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            out = bert(**{k: v.to(bert.device) for k, v in tokens.items()})
            # First-token hidden state, normalised to unit length per row.
            embeddings = F.normalize(out.last_hidden_state[:, 0, :]).cpu()
        res.update(zip(b_texts, embeddings))
    return res

In [17]:
# Embed each distinct host once; rows are mapped to vectors in the next cell.
unique_hosts = df['url_host'].unique().tolist()
embs = embed(unique_hosts)

Some weights of the model checkpoint at AustinCarthy/BERT_generated_url_classification_v1_12_1_23 were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Replace every host string by its precomputed embedding (KeyError on an unknown host).
df['url_host'] = df['url_host'].apply(embs.__getitem__)

In [7]:
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset

# Column roles for the ptls preprocessor; url_host is passed through unchanged
# (cols_identity) because it already holds embedding vectors.
preprocessor_kwargs = {
    'col_id': 'user_id',
    'col_event_time': 'event_time',
    'event_time_transformation': 'none',
    'cols_category': ['part_of_day'],
    'cols_numerical': ['request_cnt'],
    'cols_identity': ['url_host'],
    'return_records': True,
}
preprocessor = PandasDataPreprocessor(**preprocessor_kwargs)

In [8]:
# Fit the preprocessor on the event frame and wrap the resulting records.
user_records = preprocessor.fit_transform(df)
dataset = MemoryMapDataset(data=user_records)

In [9]:
# 90 / 10 random split of the dataset; the validation part takes the remainder.
n_records = len(dataset)
TRAIN_SIZE = int(n_records * 0.9)
VAL_SIZE = n_records - TRAIN_SIZE

train, val = torch.utils.data.random_split(dataset, lengths=[TRAIN_SIZE, VAL_SIZE])

In [10]:
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles
from ptls.frames.coles.coles_dataset import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

def _make_coles_dataset(records):
    """Wrap ``records`` in a ColesDataset with a fresh SampleSlices splitter (5 slices of 20-200 events)."""
    return ColesDataset(
        data=records,
        splitter=SampleSlices(split_count=5, cnt_min=20, cnt_max=200),
    )


train_data = _make_coles_dataset(train)
val_data = _make_coles_dataset(val)

# Train and validation loaders share the same worker count and batch size.
dl = PtlsDataModule(
    train_data=train_data,
    valid_data=val_data,
    train_num_workers=16,
    valid_num_workers=16,
    train_batch_size=256,
    valid_batch_size=256,
)

## Train COLES

### Model definition

In [11]:
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.nn.trx_encoder.encoders import IdentityEncoder

# Transaction encoder settings: request_cnt is fed as a numeric value, no
# learned categorical embeddings are configured, and url_host (already a
# 128-d vector) goes through an IdentityEncoder.
trx_encoder_params = {
    'embeddings_noise': 0.003,
    'numeric_values': {'request_cnt': 'identity'},
    'embeddings': {},
    'custom_embeddings': {'url_host': IdentityEncoder(128)},
}

# GRU sequence encoder on top of the transaction encoder.
trx_encoder = TrxEncoder(**trx_encoder_params)
seq_encoder = RnnSeqEncoder(trx_encoder=trx_encoder, hidden_size=512, type='gru')

# Optimizer / scheduler factories; CoLESModule receives them as partials.
make_optimizer = partial(torch.optim.Adam, lr=0.001)
make_lr_scheduler = partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.9)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=make_optimizer,
    lr_scheduler_partial=make_lr_scheduler,
)

### Trainer

In [49]:
import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint


# Checkpoint on the validation recall metric. ModelCheckpoint defaults to
# mode='min', which would keep the checkpoint with the LOWEST recall; recall is
# higher-is-better, so the mode must be 'max'.
checkpoint_callback = ModelCheckpoint(
    monitor='recall_top_k',
    mode='max',
    dirpath='model/',
    filename='{epoch}-{recall_top_k:.2f}',
    every_n_epochs=100,
)

trainer = pl.Trainer(
    max_epochs=200,
    limit_val_batches=100,
    gpus=[GPU],
    enable_progress_bar=False,
    logger=TensorBoardLogger('lightning_logs', name='url_host_embeddigns_tsne'),
    callbacks=[checkpoint_callback],
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [50]:
# Print the logger version first so the run can be matched to its log directory.
print('logger.version =', trainer.logger.version)
trainer.fit(model, dl)
print(trainer.logged_metrics)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 988 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
988 K     Trainable params
0         Non-trainable params
988 K     Total params
3.954     Total estimated model params size (MB)


logger.version = 8




{'loss': tensor(275.7948, device='cuda:0'), 'seq_len': tensor(84.1578, device='cuda:0')}


In [16]:
# Persist only the weights (state_dict), not the whole module object.
trained_weights = model.state_dict()
torch.save(trained_weights, "model/coles-emb-text-001-stepLR-09-tsne.pt")

# Inference

In [12]:
# Restore the saved weights; load_state_dict stays the last expression so the
# key-matching report is still displayed.
saved_weights = torch.load("model/coles-emb-text-001-stepLR-09-tsne.pt")
model.load_state_dict(saved_weights)

<All keys matched successfully>

In [13]:
import tqdm
from ptls.data_load.datasets import inference_data_loader
import numpy as np

def pooling_inference(model, dl, device=f'cuda:{GPU}'):
    """Compute pooled transaction-encoder features for every batch of ``dl``.

    For each batch the output of ``model.seq_encoder.trx_encoder`` is reduced
    over dim 1 with max, min, mean and std, and the four results are
    concatenated along dim 1.

    Args:
        model: module exposing ``seq_encoder.trx_encoder``; moved to ``device``.
        dl: iterable of batches that support ``.to(device)``.
        device: torch device on which inference runs.

    Returns:
        list with one feature tensor per batch, left on ``device``.
    """
    model.to(device)
    # Run in eval mode so train-only behaviour (e.g. dropout, embedding noise,
    # batch-norm statistics updates) cannot leak into the extracted features;
    # the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    X = []
    try:
        with torch.no_grad():
            for batch in tqdm.tqdm(dl):
                x = model.seq_encoder.trx_encoder(batch.to(device)).payload
                # NOTE(review): dim 1 is presumably the (padded) sequence axis, in
                # which case padding positions take part in these statistics -- confirm.
                out_max = torch.max(x, dim=1)[0]
                out_min = torch.min(x, dim=1)[0]
                out_mean = torch.mean(x, dim=1)
                # torch.std is the unbiased estimator by default: an extent of 1 along dim 1 gives NaN.
                out_std = torch.std(x, dim=1)
                features = torch.cat([out_max, out_min, out_mean, out_std], dim=1)
                X.append(features)
    finally:
        model.train(was_training)
    return X

def embed_inference(model, dl, device=f'cuda:{GPU}'):
    """Run ``model.seq_encoder`` over every batch of ``dl`` and collect the outputs.

    Args:
        model: module exposing ``seq_encoder``; moved to ``device``.
        dl: iterable of batches that support ``.to(device)``.
        device: torch device on which inference runs.

    Returns:
        list with one output tensor per batch, left on ``device``.
    """
    model.to(device)
    # Run in eval mode so train-only behaviour (e.g. dropout, embedding noise,
    # batch-norm statistics updates) cannot leak into the embeddings; the
    # previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    X = []
    try:
        with torch.no_grad():
            for batch in tqdm.tqdm(dl):
                features = model.seq_encoder(batch.to(device))
                X.append(features)
    finally:
        model.train(was_training)
    return X

# Use a dedicated name for the inference loader: rebinding `dl` would overwrite
# the PtlsDataModule that the training cell passes to trainer.fit.
inference_dl = inference_data_loader(dataset.processed_data, num_workers=0, batch_size=64)

# Sequence-encoder embeddings and pooled transaction features, one row per user.
X_coles = torch.vstack(embed_inference(model, inference_dl)).cpu().numpy()
X_pool = torch.vstack(pooling_inference(model, inference_dl)).cpu().numpy()
X_embeds = np.concatenate([X_coles, X_pool], axis=1)


df_embeds = pd.DataFrame(X_embeds, columns=[f"embed_{e}" for e in range(X_embeds.shape[1])])
# user_id is attached positionally -- assumes the inference loader yields
# records in the order of dataset.processed_data (no shuffling); TODO confirm.
df_embeds['user_id'] = [rec['user_id'] for rec in dataset.processed_data]
df_embeds.to_csv('./data/coles_512.csv', index=False)

6490it [17:14,  6.27it/s]
6490it [06:27, 16.75it/s]


## Downstream

## Targets

In [14]:
%%time

import bisect
import numpy as np
import pyarrow.parquet as pq

# Labelled users, ordered by user_id.
public_table = pq.read_table('data/public_train.pqt')
df_public = public_table.to_pandas().sort_values(by='user_id')

# Boundaries between the seven age buckets.
_AGE_BUCKET_BOUNDS = (18, 25, 35, 45, 55, 65)


def age_bucket(x):
    """Return the index (0-6) of the age bucket that ``x`` falls into.

    The index equals the number of boundaries strictly below ``x``, so a
    boundary value belongs to the lower bucket (18 -> 0, 19 -> 1). NaN compares
    False against every boundary and therefore also maps to bucket 0.
    """
    bucket_index = bisect.bisect_left(_AGE_BUCKET_BOUNDS, x)
    return bucket_index

# Age target: bucket index of every user's age, in df_public row order.
y_age = np.array([age_bucket(age) for age in df_public['age']])
# Gender target: the raw is_male column as an array, in the same row order.
y_gender = np.array(df_public['is_male'])

CPU times: user 221 ms, sys: 34 ms, total: 255 ms
Wall time: 243 ms


## Add features

In [15]:
# Features come from the embeddings computed above
# (they can be reloaded instead with pd.read_csv('./data/coles_512.csv')).
X_embeddings = df_embeds

X = df_public.merge(X_embeddings, on="user_id", how='inner')

# y_age / y_gender were built from df_public row order and are matched to X
# purely by position, so fail loudly if the inner merge dropped, duplicated or
# reordered any user.
assert X['user_id'].tolist() == df_public['user_id'].tolist(), \
    "feature rows are not aligned with df_public; targets would be mismatched"

# Keep only feature columns: drop the join key and both targets.
X = X.drop(columns=['user_id', 'age', 'is_male'])

In [16]:
# Preview the first rows of the downstream feature matrix.
X.head()

Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_1018,embed_1019,embed_1020,embed_1021,embed_1022,embed_1023,embed_1024,embed_1025,embed_1026,embed_1027
0,-0.78704,-0.457125,-0.017737,0.005416,-0.397249,0.067654,-0.32571,0.061905,0.060056,0.043753,...,0.076959,0.128632,0.062424,0.08759,0.13793,0.065165,0.053161,0.089808,0.119454,0.206273
1,-0.274681,0.160478,0.0538,0.011052,-0.136415,0.083284,-0.381442,0.067448,0.167852,-0.10443,...,0.180769,0.297384,0.13126,0.213208,0.297388,0.139177,0.113681,0.223502,0.251975,0.47131
2,-0.813374,0.218355,-0.022478,-0.005189,-0.261558,0.067715,-0.307461,-0.046547,0.101327,-0.144751,...,0.119493,0.209573,0.123451,0.160019,0.222549,0.090359,0.074468,0.17979,0.193492,0.237386
3,-0.968491,0.313913,-0.179472,0.002859,-0.235855,0.129345,-0.315666,0.007441,0.704675,-0.103729,...,0.092443,0.162889,0.095323,0.120299,0.162929,0.069529,0.052288,0.118071,0.13368,0.233652
4,-0.416571,0.221322,-0.171212,0.000957,-0.267172,0.175585,-0.34214,0.009744,0.124708,-0.25041,...,0.157901,0.28726,0.15448,0.221048,0.273447,0.130451,0.105877,0.217656,0.254213,0.357098


## Gender

In [17]:
# (rows, columns) of the downstream feature matrix.
X.shape

(270000, 1028)

In [18]:
# Columns handed to CatBoost as categorical in the fit calls below; empty here.
# The trailing commented-out list names candidate columns that are not in use.
cat_features = []#  ['region_name', 'city_name', 'cpe_manufacturer_name', 'cpe_model_name', 'cpe_type_cd', 'cpe_model_os_type', 'part_of_day', 'price']

In [19]:
# %%time

from catboost import CatBoostClassifier, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Keep only rows with a known gender: drop the literal 'NA' marker and any
# missing value. pd.isna catches both None and NaN, whereas an elementwise
# `!= None` comparison lets NaN through.
not_na_gender = (y_gender != 'NA') & ~pd.isna(y_gender)
x_train, x_test_gender, y_train, y_test_gender = train_test_split(
    X[not_na_gender], y_gender[not_na_gender], test_size=0.1, random_state=42)

clf_gender = CatBoostClassifier(
    iterations=5000,
    custom_metric=[metrics.AUC()],
    use_best_model=False,
    random_seed=42)
# The held-out split doubles as eval_set; metrics are reported every 100 iterations.
clf_gender.fit(x_train, y_train, metric_period=100, eval_set=(x_test_gender, y_test_gender), cat_features=cat_features)

Learning rate set to 0.060789
0:	learn: 0.6721727	test: 0.6718698	best: 0.6718698 (0)	total: 105ms	remaining: 8m 45s
100:	learn: 0.4991232	test: 0.4972384	best: 0.4972384 (100)	total: 4.97s	remaining: 4m 1s
200:	learn: 0.4896866	test: 0.4905966	best: 0.4905966 (200)	total: 9.42s	remaining: 3m 44s
300:	learn: 0.4811727	test: 0.4856596	best: 0.4856596 (300)	total: 13.8s	remaining: 3m 35s
400:	learn: 0.4745008	test: 0.4828917	best: 0.4828917 (400)	total: 18.4s	remaining: 3m 31s
500:	learn: 0.4686178	test: 0.4807002	best: 0.4807002 (500)	total: 23.1s	remaining: 3m 27s
600:	learn: 0.4631824	test: 0.4791940	best: 0.4791940 (600)	total: 27.8s	remaining: 3m 23s
700:	learn: 0.4581485	test: 0.4779488	best: 0.4779488 (700)	total: 32.5s	remaining: 3m 19s
800:	learn: 0.4533999	test: 0.4769587	best: 0.4769587 (800)	total: 37.2s	remaining: 3m 15s
900:	learn: 0.4489002	test: 0.4760786	best: 0.4760786 (900)	total: 41.6s	remaining: 3m 9s
1000:	learn: 0.4445326	test: 0.4751830	best: 0.4751830 (1000)	tota

<catboost.core.CatBoostClassifier at 0x7f706823fee0>

In [20]:
# Gini coefficient on the held-out gender split: 2 * ROC AUC - 1.
gender_auc = roc_auc_score(y_test_gender, clf_gender.predict_proba(x_test_gender)[:, 1])
gender_gini = 2 * gender_auc - 1
print(f'GINI по полу {gender_gini:2.3f}')

GINI по полу 0.715


## Age

In [21]:
%%time

from sklearn.metrics import classification_report

# Build the mask from the raw ages. y_age holds integer bucket indices (a NaN
# age is bucketed as 0 by bisect), so np.isnan(y_age) is always False and would
# silently keep users with a missing age as class 0.
not_na_age = df_public['age'].notna().to_numpy()
x_train, x_test_age, y_train, y_test_age = train_test_split(
    X[not_na_age], y_age[not_na_age], test_size=0.1, random_state=42)

clf_age = CatBoostClassifier(iterations=5000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42)
# The held-out split doubles as eval_set (and, with use_best_model=True, selects
# the final iteration); metrics are reported every 100 iterations.
clf_age.fit(x_train, y_train, metric_period=100, eval_set=(x_test_age, y_test_age), cat_features=cat_features)

Learning rate set to 0.065168
0:	learn: 1.8833790	test: 1.8824755	best: 1.8824755 (0)	total: 366ms	remaining: 30m 28s
100:	learn: 1.3201749	test: 1.3154945	best: 1.3154945 (100)	total: 28.7s	remaining: 23m 11s
200:	learn: 1.2943310	test: 1.2967469	best: 1.2967469 (200)	total: 54.5s	remaining: 21m 41s
300:	learn: 1.2756351	test: 1.2859790	best: 1.2859790 (300)	total: 1m 19s	remaining: 20m 34s
400:	learn: 1.2614717	test: 1.2800287	best: 1.2800287 (400)	total: 1m 43s	remaining: 19m 48s
500:	learn: 1.2504714	test: 1.2762930	best: 1.2762930 (500)	total: 2m 6s	remaining: 18m 55s
600:	learn: 1.2398501	test: 1.2741194	best: 1.2741194 (600)	total: 2m 30s	remaining: 18m 19s
700:	learn: 1.2304381	test: 1.2719352	best: 1.2719352 (700)	total: 2m 52s	remaining: 17m 40s
800:	learn: 1.2216216	test: 1.2701447	best: 1.2701447 (800)	total: 3m 15s	remaining: 17m 6s
900:	learn: 1.2127716	test: 1.2684004	best: 1.2684004 (900)	total: 3m 38s	remaining: 16m 34s
1000:	learn: 1.2044731	test: 1.2671780	best: 1.26

<catboost.core.CatBoostClassifier at 0x7f7068211550>

In [22]:
# Per-bucket precision / recall / F1 on the held-out age split.
age_bucket_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']
age_predictions = clf_age.predict(x_test_age)
print(classification_report(y_test_age, age_predictions, target_names=age_bucket_names))

              precision    recall  f1-score   support

         <18       0.00      0.00      0.00       109
       18-25       0.54      0.40      0.46      3238
       25-34       0.51      0.64      0.57      8863
       35-44       0.42      0.51      0.46      7773
       45-54       0.39      0.26      0.31      4218
       55-65       0.40      0.26      0.32      2254
         65+       0.28      0.02      0.04       545

    accuracy                           0.47     27000
   macro avg       0.37      0.30      0.31     27000
weighted avg       0.46      0.47      0.45     27000



In [None]:
# NOTE(review): scratch arithmetic, presumably gini + 2 * age score following the
# formula in the "Score submit" section. The 0.760 does not match the 0.715 Gini
# printed above, so these literals look stale -- confirm before relying on the result.
0.760 + 2*0.47

1.7

# Score submit 

(2 * f1_weighted(по 6 возрастным бакетам) + gini)

In [24]:
%%time

# Users to score, ordered by user_id.
submit_table = pq.read_table('data/submit_2.pqt')
df_submit = submit_table.to_pandas().sort_values(by='user_id')

# Left join keeps every submission user even when no embedding row exists.
X_submit = pd.merge(df_submit, X_embeddings, on="user_id", how='left')

CPU times: user 2.63 s, sys: 1.08 s, total: 3.71 s
Wall time: 3.71 s


In [25]:
# Sanity check: the merge must leave users in the same order as df_submit.
X_submit['user_id'].tolist() == df_submit['user_id'].tolist()

True

In [26]:
# user_id is a join key, not a model feature.
X_submit = X_submit.drop(columns=['user_id'])

In [27]:
# Age is submitted as the predicted bucket, gender as the probability of the
# second class returned by predict_proba.
age_pred = clf_age.predict(X_submit)
male_proba = clf_gender.predict_proba(X_submit)[:, 1]

df_submit['age'] = age_pred
df_submit['is_male'] = male_proba
df_submit.head()

Unnamed: 0,user_id,age,is_male
0,6,2,0.194695
7,7,2,0.898154
9,9,2,0.12613
10,10,3,0.05588
4,11,5,0.678688


In [29]:
# Write the submission without the DataFrame index column.
submission_path = 'data/submission_xlarge_embed_tsne.csv'
df_submit.to_csv(submission_path, index=False)