In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## Train COLES

### Model definition

In [None]:
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'request_cnt': 'identity'
    },
    embeddings={
        #'price': {'in': 10, 'out': 2},
        #'region_name': {'in': 81, 'out': 4},
        #'city_name': {'in': 985, 'out': 16},
        #'cpe_manufacturer_name': {'in': 37, 'out': 4},
        #'cpe_model_name': {'in': 599, 'out': 16},
        #'cpe_type_cd': {'in': 4, 'out': 2}, 
        #'cpe_model_os_type': {'in': 3, 'out': 2}, 
        'part_of_day': {'in': 4, 'out': 1},
        'url_host': {'in': 132025, 'out': 512}
    }
)


seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    num_layers=3,
    type='lstm',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.9),
)

In [None]:
model.load_state_dict(torch.load("coles-emb-lstm.pt"))

# Finetune

In [None]:
%%time

import tqdm, torch


df_trans = pq.read_table('./data/trans_filtered.pq').to_pandas()

cols = ['url_host', 'request_cnt', 'part_of_day', 'event_time']
for col in tqdm.tqdm(cols):
    df_trans[col] = df_trans[col].apply(torch.tensor)

In [None]:
SEED = 0  # todo 0, 1, 2, 3, 4 done 
torch.manual_seed(SEED)

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

import bisect

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')

# Combined target age_gender
df_public['age'] = list(map(age_bucket, df_public['age'] ))
df_public = df_public[(df_public['age'] != 'NA') & (df_public['is_male'] != 'NA')]
df_public = df_public.dropna()
df_public['target'] = df_public['age'].astype(int) + 7 * df_public['is_male'].astype(int)

# Merge
df_finetune = df_trans.merge(df_public[['user_id', 'target']], on='user_id')

train_ft, valid_ft = train_test_split(df_finetune, test_size = 0.1, random_state = SEED)

train_ft = train_ft.to_dict(orient='records')
valid_ft = valid_ft.to_dict(orient='records')

In [None]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.frames.supervised import SeqToTargetDataset
from ptls.frames import PtlsDataModule
from ptls.data_load.iterable_processing import SeqLenFilter

from ptls.data_load.augmentations import AllTimeShuffle, DropoutTrx
from  ptls.data_load.datasets import AugmentationDataset

def get_dataset(data, aug=False):
    ds = MemoryMapDataset(data=data, i_filters=[SeqLenFilter(max_seq_len=1000),])
    if aug:
        ds = AugmentationDataset(ds, f_augmentations = [AllTimeShuffle(), DropoutTrx(trx_dropout=0.01)])
    return SeqToTargetDataset(ds, target_col_name='target',)

finetune_dm = PtlsDataModule(
    train_data=get_dataset(train_ft, aug=True),
    valid_data=get_dataset(valid_ft),
    train_num_workers=4,
    train_batch_size=128,)

In [None]:
from functools import partial
import torch
import torchmetrics
from ptls.frames.supervised import SequenceToTarget
from ptls.nn import Head

model_finetuned = SequenceToTarget(
    seq_encoder=model.seq_encoder,
    head=Head(
        input_size=model.seq_encoder.embedding_size,
        use_batch_norm=True,
        objective='classification',
        num_classes=14,
    ),
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(compute_on_step=False),
    pretrained_lr=0.0001,
    optimizer_partial=partial(torch.optim.Adam, lr=0.01, weight_decay=1e-5),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.9),
)

In [None]:
from pytorch_lightning.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor="val_Accuracy",
    min_delta=0.001,
    patience=5,
    mode='max'
)

trainer_ft = pl.Trainer(
    max_epochs=50,
    callbacks=[early_stopping],
    gpus=[0],
    enable_progress_bar=False,
)

In [None]:
print(f'logger.version = {trainer_ft.logger.version}')
trainer_ft.fit(model_finetuned, finetune_dm)
print(trainer_ft.logged_metrics)

# Infernece

In [None]:
%%time
import tqdm

from ptls.data_load.datasets import inference_data_loader

def pooling_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            x = model.seq_encoder.trx_encoder(batch.to(device)).payload
            out_max = torch.max(x, dim=1)[0]
            out_min = torch.min(x, dim=1)[0]
            out_mean = torch.mean(x, dim=1)
            out_std = torch.std(x, dim=1)
            features = torch.cat([out_max, out_min, out_mean, out_std], dim=1)      
            features = out_max     
            X += [features]
    return X

def embed_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            features = model.seq_encoder(batch.to(device))
            X += [features]
    return X

dl = inference_data_loader(df_trans.to_dict(orient='records'), num_workers=0, batch_size=64)
X_coles = torch.vstack(embed_inference(model_finetuned, dl, )).cpu().numpy()
X_pool = torch.vstack(pooling_inference(model_finetuned, dl, )).cpu().numpy()

In [None]:
import numpy as np

X_embeds = np.concatenate([X_coles, X_pool], axis=1)


df_embeds = pd.DataFrame(X_embeds, columns=[f"embed_{e}" for e in range(X_embeds.shape[1])])
df_embeds['user_id'] = df_trans['user_id']
df_embeds.to_csv(f'./data/coles_finetuned_{SEED}.csv', index=False)

## Downstream

## Targets

In [None]:
%%time

from ptls.data_load.datasets import inference_data_loader
import bisect
import numpy as np

df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

y_age = df_public['age']
y_age = np.array(list(map(age_bucket, y_age)))
y_gender = np.array(df_public['is_male'])

X = df_public.merge(df_embeds, on="user_id", how='inner')

del X['user_id'], X['age'], X['is_male']

In [None]:
X.head()

## Gender

In [None]:
%%time
from catboost import CatBoostClassifier, metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

not_na_gender = (y_gender != 'NA') & (y_gender != None)

x_train, x_test_gender, y_train, y_test_gender = train_test_split(X[not_na_gender], y_gender[not_na_gender], test_size = 0.1, random_state = 42)

clf_gender = CatBoostClassifier(
    iterations=1000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42)
clf_gender.fit(x_train, y_train, metric_period=100, eval_set=(x_test_gender, y_test_gender))

In [19]:
print(f'GINI по полу {2 * roc_auc_score(y_test_gender, clf_gender.predict_proba(x_test_gender)[:,1]) - 1:2.3f}')

GINI по полу 0.794


# Age

In [20]:
%%time

from sklearn.metrics import classification_report

not_na_age = ~np.isnan(y_age)
x_train, x_test_age, y_train, y_test_age = train_test_split(X[not_na_age], y_age[not_na_age], test_size = 0.1, random_state = 42)

clf_age = CatBoostClassifier(iterations=1000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42)
clf_age.fit(x_train, y_train, metric_period=100, eval_set=(x_test_age, y_test_age))

Learning rate set to 0.120515
0:	learn: 1.7976433	test: 1.7967610	best: 1.7967610 (0)	total: 351ms	remaining: 5m 50s
100:	learn: 1.1848231	test: 1.1844693	best: 1.1844693 (100)	total: 25.8s	remaining: 3m 49s
200:	learn: 1.1661629	test: 1.1773333	best: 1.1773333 (200)	total: 50.5s	remaining: 3m 20s
300:	learn: 1.1518798	test: 1.1748456	best: 1.1748456 (300)	total: 1m 16s	remaining: 2m 57s
400:	learn: 1.1386098	test: 1.1734652	best: 1.1734652 (400)	total: 1m 41s	remaining: 2m 31s
500:	learn: 1.1263486	test: 1.1728772	best: 1.1728772 (500)	total: 2m 5s	remaining: 2m 5s
600:	learn: 1.1144286	test: 1.1729456	best: 1.1728772 (500)	total: 2m 29s	remaining: 1m 39s
700:	learn: 1.1030784	test: 1.1727424	best: 1.1727424 (700)	total: 2m 53s	remaining: 1m 14s
800:	learn: 1.0917255	test: 1.1728851	best: 1.1727424 (700)	total: 3m 17s	remaining: 49s
900:	learn: 1.0806047	test: 1.1731725	best: 1.1727424 (700)	total: 3m 41s	remaining: 24.3s
999:	learn: 1.0697039	test: 1.1735223	best: 1.1727424 (700)	tot

<catboost.core.CatBoostClassifier at 0x7face4515fd0>

In [21]:
print(classification_report(y_test_age, clf_age.predict(x_test_age), \
                            target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']))

              precision    recall  f1-score   support

         <18       0.00      0.00      0.00       109
       18-25       0.56      0.48      0.51      3238
       25-34       0.56      0.63      0.59      8863
       35-44       0.46      0.54      0.50      7773
       45-54       0.43      0.34      0.38      4218
       55-65       0.43      0.35      0.39      2254
         65+       0.35      0.04      0.07       545

    accuracy                           0.50     27000
   macro avg       0.40      0.34      0.35     27000
weighted avg       0.50      0.50      0.49     27000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
0.781 + 2*0.48

1.741