In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
%load_ext autoreload
%autoreload 2

### Pretrain datasets

In [3]:
%%time

import tqdm, torch

df_trans = pq.read_table('data/trans_filtered.pq').select(['user_id', 'url_host']).to_pandas()

cols = ['url_host']

for col in tqdm.tqdm(cols):
    df_trans[col] = df_trans[col].apply(torch.tensor)

100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.19s/it]

CPU times: user 21.3 s, sys: 25.6 s, total: 47 s
Wall time: 20.6 s





In [4]:
from sklearn.model_selection import train_test_split

df_train_trans, df_valid_trans = train_test_split(df_trans, test_size = 0.1, random_state = 42)
df_train_trans = df_train_trans.to_dict(orient='records')
df_valid_trans = df_valid_trans.to_dict(orient='records')

In [5]:
len(df_train_trans), len(df_valid_trans)

(373785, 41532)

## Train tabformer

### Model definition

In [6]:
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, LongformerEncoder, TabFormerFeatureEncoder, TransformerEncoder
from ptls.nn import PBLinear, PBL2Norm, PBLayerNorm
from ptls.frames.bert import MLMPretrainModule
from ptls.frames.tabformer.tabformer_module import TabformerPretrainModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    embeddings={ 
        'url_host': {'in': 132025, 'out': 512}
    }
)


trx_encoder = TrxEncoder(**trx_encoder_params)
feature_encoder = TabFormerFeatureEncoder(n_cols=1, emb_dim=512)
seq_encoder = LongformerEncoder(input_size=512, num_attention_heads=2, num_hidden_layers=4, max_position_embeddings=1024)

model = TabformerPretrainModule(
    trx_encoder=trx_encoder,
    seq_encoder=seq_encoder,
    feature_encoder=feature_encoder,
    total_steps=200001,
    mask_prob=0.15
)



### Dataloader

In [7]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.frames.bert import MlmDataset
from ptls.frames.tabformer.tabformer_dataset import TabformerDataset


drop_feature_names = ['request_cnt', 'part_of_day', 'event_time']

train_ds = MemoryMapDataset(data=df_train_trans, i_filters=[FeatureFilter(drop_feature_names=drop_feature_names)])
valid_ds = MemoryMapDataset(data=df_valid_trans, i_filters=[FeatureFilter(drop_feature_names=drop_feature_names)])

train_data=TabformerDataset(train_ds, min_len=25, max_len=512)
valid_data=TabformerDataset(valid_ds, min_len=25, max_len=512)


dl = PtlsDataModule(train_data=train_data, valid_data=valid_data, train_num_workers=4, train_batch_size=8)

### Trainer

In [8]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_steps=200000,
    limit_val_batches=100,
    gpus=[0],
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [9]:
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, dl)
print(trainer.logged_metrics)

logger.version = 79


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name                 | Type                    | Params
-----------------------------------------------------------------
0 | trx_encoder          | TrxEncoder              | 67.6 M
1 | feature_encoder      | TabFormerFeatureEncoder | 1.4 M 
2 | head                 | ModuleList              | 67.7 M
3 | _seq_encoder         | LongformerEncoder       | 8.4 M 
4 | loss                 | CrossEntropyLoss        | 0     
5 | train_tabformer_loss | MeanMetric              | 0     
6 | valid_tabformer_loss | MeanMetric              | 0     
7 | lin_proj             | Sequential              | 263 K 
-----------------------------------------------------------------
145 M     Trainable params
0         Non-trainable params
145 M     Total params
581.560   Total estimated model params size (MB)


{'tabformer/loss': tensor(4.5296, device='cuda:0'), 'tabformer/valid_tabformer_loss': tensor(4.4922, device='cuda:0'), 'tabformer/train_tabformer_loss': tensor(4.4722, device='cuda:0')}


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [10]:
torch.save(model.state_dict(), "tabformer-emb.pt")

# Inference

In [11]:
model.load_state_dict(torch.load("tabformer-emb.pt"))

<All keys matched successfully>

In [12]:
%%time

import tqdm, torch

df_trans = pq.read_table('data/trans_filtered.pq').to_pandas()

cols = ['url_host']

for col in tqdm.tqdm(cols):
    df_trans[col] = df_trans[col].apply(torch.tensor)

100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.98s/it]

CPU times: user 30.6 s, sys: 42.7 s, total: 1min 13s
Wall time: 47.5 s





In [13]:
%%time
import tqdm
from ptls.data_load.datasets import inference_data_loader
import numpy as np

def pooling_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            x = model.trx_encoder(batch.to(device)).payload
            out_max = torch.max(x, dim=1)[0]
            out_min = torch.min(x, dim=1)[0]
            out_mean = torch.mean(x, dim=1)
            out_std = torch.std(x, dim=1)
            features = torch.cat([out_max, out_min, out_mean, out_std], dim=1)      
            X += [features]
    return X

def embed_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            z = model.trx_encoder(batch.to(device))
            features = model.seq_encoder(z)
            X += [features]
    return X

dl = inference_data_loader(df_trans.to_dict(orient='records'), num_workers=0, batch_size=128)
#X_mlm = torch.vstack(embed_inference(model, dl, )).cpu().numpy()
X_pool = torch.vstack(pooling_inference(model, dl, )).cpu().numpy()
#X_embeds = np.concatenate([X_mlm, X_pool], axis=1)
X_embeds = X_pool

df_embeds = pd.DataFrame(X_embeds, columns=[f"tab_{e}" for e in range(X_embeds.shape[1])])
df_embeds['user_id'] = df_trans['user_id']
df_embeds.to_csv('./data/tabformer.csv', index=False)

3245it [01:17, 41.62it/s]


CPU times: user 22min 34s, sys: 29.3 s, total: 23min 4s
Wall time: 11min 6s


## Downstream

In [14]:
%%time

import bisect
import numpy as np

df_embeds = pd.read_csv('./data/tabformer.csv')
df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

y_age = df_public['age']
y_age = np.array(list(map(age_bucket, y_age)))
y_gender = np.array(df_public['is_male'])

X = df_public
X = X.merge(df_embeds, on="user_id", how='inner')
del X['user_id'], X['age'], X['is_male']

CPU times: user 2min 1s, sys: 14.3 s, total: 2min 16s
Wall time: 2min 16s


## Gender

In [15]:
# %%time

from catboost import CatBoostClassifier, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
not_na_gender = (y_gender != 'NA') & (y_gender != None)
x_train, x_test_gender, y_train, y_test_gender = train_test_split(X[not_na_gender], y_gender[not_na_gender], test_size = 0.1, random_state = 42)

clf_gender = CatBoostClassifier(
    iterations=1000,
    custom_metric=[metrics.AUC()],
    use_best_model=True,
    random_seed=42)
clf_gender.fit(x_train, y_train, metric_period=100, eval_set=(x_test_gender, y_test_gender))

Learning rate set to 0.122426
0:	learn: 0.6751815	test: 0.6754598	best: 0.6754598 (0)	total: 171ms	remaining: 2m 51s
100:	learn: 0.5203476	test: 0.5230942	best: 0.5230942 (100)	total: 10.3s	remaining: 1m 31s
200:	learn: 0.4928468	test: 0.5000588	best: 0.5000588 (200)	total: 20.9s	remaining: 1m 22s
300:	learn: 0.4771739	test: 0.4897786	best: 0.4897786 (300)	total: 31.4s	remaining: 1m 13s
400:	learn: 0.4655176	test: 0.4841904	best: 0.4841904 (400)	total: 42.2s	remaining: 1m 3s
500:	learn: 0.4553358	test: 0.4808018	best: 0.4808018 (500)	total: 52.7s	remaining: 52.5s
600:	learn: 0.4463135	test: 0.4788439	best: 0.4788439 (600)	total: 1m 3s	remaining: 41.9s
700:	learn: 0.4378026	test: 0.4769616	best: 0.4769616 (700)	total: 1m 13s	remaining: 31.3s
800:	learn: 0.4298296	test: 0.4755379	best: 0.4755379 (800)	total: 1m 23s	remaining: 20.8s
900:	learn: 0.4221736	test: 0.4748350	best: 0.4748350 (900)	total: 1m 34s	remaining: 10.3s
999:	learn: 0.4147652	test: 0.4744257	best: 0.4744257 (999)	total: 

<catboost.core.CatBoostClassifier at 0x7f08fb184970>

In [16]:
print(f'GINI по полу {2 * roc_auc_score(y_test_gender, clf_gender.predict_proba(x_test_gender)[:,1]) - 1:2.3f}')

GINI по полу 0.713


# Age

In [17]:
%%time

from sklearn.metrics import classification_report

not_na_age = ~np.isnan(y_age)
x_train, x_test_age, y_train, y_test_age = train_test_split(X[not_na_age], y_age[not_na_age], test_size = 0.1, random_state = 42)

clf_age = CatBoostClassifier(iterations=1000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42)
clf_age.fit(x_train, y_train, metric_period=100, eval_set=(x_test_age, y_test_age))

Learning rate set to 0.120515
0:	learn: 1.8444123	test: 1.8426211	best: 1.8426211 (0)	total: 915ms	remaining: 15m 14s
100:	learn: 1.3254545	test: 1.3264097	best: 1.3264097 (100)	total: 1m 12s	remaining: 10m 43s
200:	learn: 1.2875972	test: 1.3026698	best: 1.3026698 (200)	total: 2m 19s	remaining: 9m 13s
300:	learn: 1.2644711	test: 1.2934260	best: 1.2934260 (300)	total: 3m 25s	remaining: 7m 57s
400:	learn: 1.2451778	test: 1.2880599	best: 1.2880599 (400)	total: 4m 27s	remaining: 6m 40s
500:	learn: 1.2287270	test: 1.2850381	best: 1.2850381 (500)	total: 5m 32s	remaining: 5m 30s
600:	learn: 1.2131467	test: 1.2827807	best: 1.2827807 (600)	total: 6m 37s	remaining: 4m 23s
700:	learn: 1.1983100	test: 1.2808273	best: 1.2808273 (700)	total: 7m 42s	remaining: 3m 17s
800:	learn: 1.1845334	test: 1.2793951	best: 1.2793951 (800)	total: 8m 47s	remaining: 2m 11s
900:	learn: 1.1715021	test: 1.2784615	best: 1.2784615 (900)	total: 9m 52s	remaining: 1m 5s
999:	learn: 1.1586239	test: 1.2779272	best: 1.2779272 

<catboost.core.CatBoostClassifier at 0x7ed6e383e790>

In [18]:
print(classification_report(y_test_age, clf_age.predict(x_test_age), \
                            target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']))

              precision    recall  f1-score   support

         <18       0.00      0.00      0.00       109
       18-25       0.54      0.36      0.43      3238
       25-34       0.50      0.65      0.56      8863
       35-44       0.41      0.50      0.45      7773
       45-54       0.37      0.23      0.29      4218
       55-65       0.40      0.22      0.28      2254
         65+       0.35      0.02      0.04       545

    accuracy                           0.46     27000
   macro avg       0.37      0.28      0.29     27000
weighted avg       0.45      0.46      0.44     27000



In [21]:
0.713 + 2*0.44

1.593

In [20]:
0.760 + 2*0.47

1.7