In [1]:
import re
from typing import Tuple, List, Dict

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import dask.dataframe as dd

from tld import get_tld


# Prepare data

In [2]:
df = dd.read_parquet('data/competition_data_final_pqt', columns=['user_id', 'url_host', 'request_cnt', 'part_of_day', 'date'])

In [3]:
def is_url_ip_address(url: str) -> bool:
    match = re.search(
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  # IPv4
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  # IPv4 with port
        '((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)' # IPv4 in hexadecimal
        '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}|'
        '([0-9]+(?:\.[0-9]+){3}:[0-9]+)|'
        '((?:(?:\d|[01]?\d\d|2[0-4]\d|25[0-5])\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d|\d)(?:\/\d{1,2})?)', url)  # Ipv6
    return 1 if match else 0
    
df['is_ip'] = df['url_host'].apply(lambda url: is_url_ip_address(url))

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('url_host', 'int64'))



In [4]:
def process_tld(url_host: str, fix_protos: bool = True) -> Tuple[str, str, str, str]:
    """
    Takes a URL string and uses the tld library to extract subdomain, domain, top
    level domain and full length domain
    """
    try:
        res = get_tld(url_host, as_object = True, fail_silently=False, fix_protocol=fix_protos)
        subdomain = res.subdomain
        domain = res.domain
        tld = res.tld
        fld = res.fld
        return [subdomain, domain, tld, fld]
    except:
        return [None, None, None, None]

In [5]:
def process_tlds(url_hosts: List[str], fix_protocol: bool = True) -> Dict[str, List[Tuple[str, str, str, str]]]:
    return {
        url_host: process_tld(url_host, fix_protocol)
        for url_host in url_hosts
    }

In [6]:
processed_hosts = process_tlds(df['url_host'].unique().compute(scheduler='multiprocessing'))

In [None]:
df['subdomain'] = df['url_host'].apply(lambda url: processed_hosts[url][0], meta=pd.Series(['esf']))
df['domain'] = df['url_host'].apply(lambda url: processed_hosts[url][1], meta=pd.Series(['esf']))
df['tld'] = df['url_host'].apply(lambda url: processed_hosts[url][2], meta=pd.Series(['esf']))

In [None]:
def contains_shortening_service(url: str) -> int:
    """
    Checks to see whether URL contains a shortening service
    """
    match = re.search('^(' + 'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net' + ')',
                      url)
    return 1 if match else 0

In [None]:
df['contains_shortener'] = df['url_host'].apply(lambda url: contains_shortening_service(url))

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('url_host', 'int64'))



In [None]:
df = df.compute(scheduler='multiprocessing')

In [None]:
df.part_of_day.value_counts()

day        107328399
evening     96239286
morning     85236015
night       34095735
Name: part_of_day, dtype: int64

### Impute price, create date+day_part event_time

In [None]:
# df.price = df.price.fillna(0)
# df.price = pd.qcut(df.price, 10, labels=False)

In [None]:
df['event_time'] = pd.to_datetime(df["date"]).values.astype('datetime64[h]').view('int64')

In [None]:
df['event_time'] += df['part_of_day'].map({'morning': 0, 'day': 6, 'evening': 12, 'night': 18})

In [None]:
del df['date']

## Handcrafted aggreagates

# Create transactional data

In [None]:
from ptls.preprocessing import PandasDataPreprocessor

preprocessor = PandasDataPreprocessor(
    col_id='user_id',
    col_event_time='event_time',
    event_time_transformation='none',
    cols_category=['url_host', 'part_of_day', 'is_ip', 'contains_shortener', 'subdomain', 'domain', 'tld'],
    #cols_category=['price', 'region_name', 'city_name', 'cpe_manufacturer_name', 'cpe_model_name', 'url_host', 'cpe_type_cd', 'cpe_model_os_type', 'part_of_day'],
    cols_numerical=['request_cnt'],
    return_records=False,
)

In [None]:
df_trans = preprocessor.fit_transform(df)

### Pretrain datasets

In [None]:
from sklearn.model_selection import train_test_split

df_train_trans, df_valid_trans = train_test_split(df_trans, test_size = 0.1, random_state = 42)
df_train_trans = df_train_trans.to_dict(orient='records')
df_valid_trans = df_valid_trans.to_dict(orient='records')

In [None]:
len(df_train_trans), len(df_valid_trans)

(373785, 41532)

## Train COLES

### Model definition

In [None]:
df.isna().sum()

user_id                    0
url_host                   0
request_cnt                0
part_of_day                0
is_ip                      0
subdomain             341841
domain                341841
tld                   341841
contains_shortener         0
event_time                 0
dtype: int64

In [None]:
df[['subdomain', 'domain', 'tld']] = df[['subdomain', 'domain', 'tld']].fillna(value='#')

In [None]:
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'request_cnt': 'identity'
    },
    embeddings={
        #'price': {'in': 10, 'out': 2},
        #'region_name': {'in': 81, 'out': 4},
        #'city_name': {'in': 985, 'out': 16},
        #'cpe_manufacturer_name': {'in': 37, 'out': 4},
        #'cpe_model_name': {'in': 599, 'out': 16},
        #'cpe_type_cd': {'in': 4, 'out': 2}, 
        #'cpe_model_os_type': {'in': 3, 'out': 2}, 
        #'part_of_day': {'in': 4, 'out': 1},
        'url_host': {'in': 131994, 'out': 512},
        'subdomain': {'in': 46722, 'out': 128},
        'domain': {'in': 127241, 'out': 512},
        'tld': {'in': 515, 'out': 16},
        'is_ip': {'in': 2, 'out': 1},
        'contains_shortener': {'in': 2, 'out': 1},
    }
)


seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=512,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.AdamW, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.CyclicLR, base_lr=0.0001, max_lr=0.001, cycle_momentum=False),
)

### Dataloader

In [None]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule


train_ds = MemoryMapDataset(data=df_train_trans)
valid_ds = MemoryMapDataset(data=df_valid_trans)

train_data=ColesDataset(train_ds, splitter = SampleSlices(split_count=5, cnt_min=20, cnt_max=200))
valid_data=ColesDataset(valid_ds, splitter = SampleSlices(split_count=5, cnt_min=20, cnt_max=200))

dl = PtlsDataModule(
    train_data=train_data, train_num_workers=16, train_batch_size=256, 
    valid_data=valid_data, valid_num_workers=16, valid_batch_size=256
)

### Trainer

In [None]:
import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

trainer = pl.Trainer(
    max_epochs=200,
    gpus=[0],
    enable_progress_bar=False,
    logger=TensorBoardLogger('lightning_logs', name='url_host_features'),
    callbacks=[ModelCheckpoint(
        monitor='recall_top_k',
        dirpath='model/',
        filename='{epoch}-{recall_top_k:.2f}',
        save_top_k=5,
        mode='max',
        save_weights_only=True,
    )]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
#print(f'logger.version = {trainer.logger.version}')
#trainer.fit(model, dl)
#print(trainer.logged_metrics)

In [None]:
#torch.save(model.state_dict(), "model/coles-emb-feature-001-cyclicLR-1e-4-1e-3.pt")

# Inference

In [None]:
#model.load_state_dict(torch.load("model/coles-emb-feature-001-cyclicLR-1e-4-1e-3.pt"))
checkpoint = torch.load('model/epoch=121-recall_top_k=0.92.ckpt')
model.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [None]:
# %%time

# import tqdm, torch

# df_trans = pq.read_table('data/trans_filtered.pq').to_pandas()

# cols = ['url_host', 'request_cnt', 'part_of_day', 'event_time']

# for col in tqdm.tqdm(cols):
#     df_trans[col] = df_trans[col].apply(torch.tensor)

In [None]:
%%time
import tqdm
from ptls.data_load.datasets import inference_data_loader
import numpy as np

def pooling_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            x = model.seq_encoder.trx_encoder(batch.to(device)).payload
            out_max = torch.max(x, dim=1)[0]
            out_min = torch.min(x, dim=1)[0]
            out_mean = torch.mean(x, dim=1)
            out_std = torch.std(x, dim=1)
            features = torch.cat([out_max, out_min, out_mean, out_std], dim=1)      
            X += [features]
    return X

def embed_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            features = model.seq_encoder(batch.to(device))
            X += [features]
    return X

dl = inference_data_loader(df_trans.to_dict(orient='records'), num_workers=0, batch_size=64)
X_coles = torch.vstack(embed_inference(model, dl, )).cpu().numpy()
X_pool = torch.vstack(pooling_inference(model, dl, )).cpu().numpy()
X_embeds = np.concatenate([X_coles, X_pool], axis=1)


df_embeds = pd.DataFrame(X_embeds, columns=[f"embed_{e}" for e in range(X_embeds.shape[1])])
df_embeds['user_id'] = df_trans['user_id']
df_embeds.to_csv('data/coles_512_feature.csv', index=False)

6490it [12:06,  8.93it/s]
6490it [02:05, 51.81it/s]


CPU times: user 1h 59min 30s, sys: 3min 28s, total: 2h 2min 58s
Wall time: 41min 39s


## Downstream

## Targets

In [30]:
%%time

import bisect
import numpy as np

df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

y_age = df_public['age']
y_age = np.array(list(map(age_bucket, y_age)))
y_gender = np.array(df_public['is_male'])

CPU times: user 188 ms, sys: 28 ms, total: 216 ms
Wall time: 214 ms


## Add features

In [31]:
#X_embeddings=df_embeds
X_embeddings = pd.read_csv('data/coles_512_feature.csv')
#X_factors = pd.read_csv('./data/user_factors.csv')
#X_aggregates = pd.read_csv('./data/aggregates.csv')
#X_embeddings['user_id'] = X_aggregates['user_id']
X = df_public
X = X.merge(X_embeddings, on="user_id", how='inner')
# X = X.merge(X_factors, on="user_id", how='left').fillna(0)
# X = X.merge(X_aggregates, on="user_id", how='inner')

del X['user_id'], X['age'], X['is_male']

In [32]:
X.head()

Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_5186,embed_5187,embed_5188,embed_5189,embed_5190,embed_5191,embed_5192,embed_5193,embed_5194,embed_5195
0,-0.401901,0.040664,0.356776,0.285869,0.179293,-0.948565,-0.072344,0.001979,0.843977,-0.848923,...,0.024881,0.055764,0.065482,0.037987,0.148053,0.053345,0.051573,0.076169,0.049984,0.614058
1,0.474354,-0.179691,0.626575,0.643749,0.260619,-0.987184,-0.930312,-0.312049,0.84696,0.464052,...,0.103082,0.140181,0.15232,0.108916,0.320273,0.132142,0.132284,0.170922,0.112164,1.403053
2,-0.047594,0.615767,0.133633,0.180548,0.123601,-0.906348,-0.844166,-0.203295,0.923673,-0.933795,...,0.034125,0.088067,0.081374,0.063938,0.20572,0.08678,0.081251,0.124125,0.081455,0.706681
3,-0.218762,0.770747,0.495254,0.243223,0.324211,-0.895869,-0.802742,0.087466,0.356542,-0.589794,...,0.031798,0.06801,0.057427,0.04404,0.168938,0.067471,0.06033,0.090984,0.059706,0.695565
4,-0.423964,0.47987,0.396467,0.151632,0.021593,-0.981009,-0.880682,0.058634,0.866789,-0.739819,...,0.060639,0.120209,0.109767,0.09774,0.27492,0.119586,0.113837,0.157969,0.103664,1.063054


## Gender

In [33]:
X.shape

(270000, 5196)

In [34]:
cat_features = []#  ['region_name', 'city_name', 'cpe_manufacturer_name', 'cpe_model_name', 'cpe_type_cd', 'cpe_model_os_type', 'part_of_day', 'price']

In [35]:
# %%time

from catboost import CatBoostClassifier, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
not_na_gender = (y_gender != 'NA') & (y_gender != None)
x_train, x_test_gender, y_train, y_test_gender = train_test_split(X[not_na_gender], y_gender[not_na_gender], test_size = 0.1, random_state = 42)

clf_gender = CatBoostClassifier(
    iterations=5000,
    custom_metric=[metrics.AUC()],
    use_best_model=True,
    random_seed=42)
clf_gender.fit(x_train, y_train, metric_period=100, eval_set=(x_test_gender, y_test_gender))

Learning rate set to 0.060789
0:	learn: 0.6665186	test: 0.6662811	best: 0.6662811 (0)	total: 266ms	remaining: 22m 7s
100:	learn: 0.4622145	test: 0.4606331	best: 0.4606331 (100)	total: 20.4s	remaining: 16m 30s
200:	learn: 0.4533938	test: 0.4544491	best: 0.4544491 (200)	total: 41.4s	remaining: 16m 28s
300:	learn: 0.4452482	test: 0.4501575	best: 0.4501575 (300)	total: 1m 2s	remaining: 16m 12s
400:	learn: 0.4385246	test: 0.4475111	best: 0.4475111 (400)	total: 1m 23s	remaining: 15m 54s
500:	learn: 0.4326498	test: 0.4455714	best: 0.4455714 (500)	total: 1m 44s	remaining: 15m 34s
600:	learn: 0.4274111	test: 0.4444478	best: 0.4444478 (600)	total: 2m 4s	remaining: 15m 12s
700:	learn: 0.4225264	test: 0.4436871	best: 0.4436871 (700)	total: 2m 25s	remaining: 14m 53s
800:	learn: 0.4178355	test: 0.4430454	best: 0.4430454 (800)	total: 2m 47s	remaining: 14m 36s
900:	learn: 0.4133630	test: 0.4425543	best: 0.4425543 (900)	total: 3m 8s	remaining: 14m 16s
1000:	learn: 0.4089374	test: 0.4420090	best: 0.4420

<catboost.core.CatBoostClassifier at 0x7f1734c6e040>

In [36]:
print(f'GINI по полу {2 * roc_auc_score(y_test_gender, clf_gender.predict_proba(x_test_gender)[:,1]) - 1:2.3f}')

GINI по полу 0.755


# Age

In [37]:
%%time

from sklearn.metrics import classification_report

not_na_age = ~np.isnan(y_age)
x_train, x_test_age, y_train, y_test_age = train_test_split(X[not_na_age], y_age[not_na_age], test_size = 0.1, random_state = 42)

clf_age = CatBoostClassifier(iterations=5000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42)
clf_age.fit(x_train, y_train, metric_period=100, eval_set=(x_test_age, y_test_age))

Learning rate set to 0.065168
0:	learn: 1.8791325	test: 1.8782257	best: 1.8782257 (0)	total: 1.9s	remaining: 2h 38m 35s
100:	learn: 1.2883887	test: 1.2852628	best: 1.2852628 (100)	total: 2m 18s	remaining: 1h 51m 57s
200:	learn: 1.2646798	test: 1.2681450	best: 1.2681450 (200)	total: 4m 26s	remaining: 1h 46m 2s
300:	learn: 1.2468173	test: 1.2577821	best: 1.2577821 (300)	total: 6m 26s	remaining: 1h 40m 35s
400:	learn: 1.2338263	test: 1.2521672	best: 1.2521672 (400)	total: 8m 29s	remaining: 1h 37m 20s
500:	learn: 1.2223616	test: 1.2483838	best: 1.2483838 (500)	total: 10m 26s	remaining: 1h 33m 46s
600:	learn: 1.2121243	test: 1.2454129	best: 1.2454129 (600)	total: 12m 24s	remaining: 1h 30m 51s
700:	learn: 1.2028210	test: 1.2435612	best: 1.2435612 (700)	total: 14m 22s	remaining: 1h 28m 6s
800:	learn: 1.1938839	test: 1.2420133	best: 1.2420133 (800)	total: 16m 20s	remaining: 1h 25m 37s
900:	learn: 1.1853099	test: 1.2409321	best: 1.2409321 (900)	total: 18m 20s	remaining: 1h 23m 24s
1000:	learn: 

<catboost.core.CatBoostClassifier at 0x7f1734881580>

In [38]:
print(classification_report(y_test_age, clf_age.predict(x_test_age), \
                            target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']))

              precision    recall  f1-score   support

         <18       1.00      0.01      0.02       109
       18-25       0.55      0.42      0.48      3238
       25-34       0.53      0.64      0.58      8863
       35-44       0.44      0.51      0.47      7773
       45-54       0.39      0.28      0.33      4218
       55-65       0.41      0.28      0.33      2254
         65+       0.33      0.02      0.04       545

    accuracy                           0.48     27000
   macro avg       0.52      0.31      0.32     27000
weighted avg       0.47      0.48      0.46     27000



In [39]:
0.760 + 2*0.47

1.7

# Score submit 

(2 * f1_weighted(по 6 возрастным бакетам) + gini)

In [40]:
%%time
import pyarrow.parquet as pq

df_submit = pq.read_table('data/submit_2.pqt').to_pandas().sort_values(by='user_id')

X_embeddings=df_embeds
# X_embeddings = pd.read_csv('./data/coles_256.csv')
#X_factors = pd.read_csv('./data/user_factors.csv')
#X_aggregates = pd.read_csv('./data/aggregates.csv')
X_submit = df_submit.merge(X_embeddings, on="user_id", how='left')
#X_submit = X_submit.merge(X_factors, on="user_id", how='left')
#X_submit = X_submit.merge(X_aggregates, on="user_id", how='inner')

CPU times: user 13 s, sys: 5.03 s, total: 18 s
Wall time: 18 s


In [41]:
list(X_submit['user_id']) == list(df_submit['user_id'])

True

In [42]:
del X_submit['user_id']

In [43]:
df_submit['age'] = clf_age.predict(X_submit)
df_submit['is_male'] = clf_gender.predict_proba(X_submit)[:, 1]
df_submit.head()

Unnamed: 0,user_id,age,is_male
0,6,2,0.263546
7,7,2,0.874413
9,9,2,0.136029
10,10,3,0.009366
4,11,5,0.927592


In [44]:
df_submit.to_csv(f'data/submission_url_host_features_early_stop.csv', index = False)