In [1]:
import os
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix, vstack

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from sys import getsizeof
from catboost import CatBoostRegressor, cv, Pool, sum_models
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import vaex
import pyarrow.parquet as pq
import bisect

import pickle
from random import shuffle

import tensorflow as tf
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import regularizers as R
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras import layers as L
from tensorflow.keras import optimizers as O
from tensorflow.keras import backend as K
from tensorflow.keras.losses import mse

  from pandas import MultiIndex, Int64Index
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# --- Data locations -------------------------------------------------------
# The raw dataset is opened from '../<LOCAL_DATA_PATH>/<DATA_FILE>'.
LOCAL_DATA_PATH = 'context_data'
DATA_FILE = 'competition_data_final_pqt'
# NOTE(review): presumably the labelled-target and submission file names;
# confirm against the cells that actually read/write them.
TARGET_FILE = 'public_train.pqt'
SUBMISSION_FILE = 'submit_2.pqt'

# --- Reproducibility ------------------------------------------------------
SPLIT_SEED = 42

In [3]:
# Open the raw request log with vaex and preview its first rows.
data_path = f'../{LOCAL_DATA_PATH}/{DATA_FILE}'
df = vaex.open(data_path)
df.head()

#,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368,2022-05-30,day,1,45098
5,Краснодарский край,Краснодар,Apple,iPhone 7,i.ytimg.com,smartphone,iOS,20368,2022-03-29,evening,2,45098
6,Краснодарский край,Краснодар,Apple,iPhone 7,icloud.com,smartphone,iOS,20368,2022-03-17,morning,1,45098
7,Краснодарский край,Краснодар,Apple,iPhone 7,m.avito.ru,smartphone,iOS,20368,2022-05-19,morning,1,45098
8,Краснодарский край,Краснодар,Apple,iPhone 7,relap.io,smartphone,iOS,20368,2022-03-29,night,1,45098
9,Краснодарский край,Краснодар,Apple,iPhone 7,sun9-5.userapi.com,smartphone,iOS,20368,2022-06-16,day,1,45098


In [4]:
%%time
# Collapse the log to one row per (user, host) pair holding the summed
# request count for that pair.
pair_keys = ['user_id', 'url_host']
data_agg = (
    df[pair_keys + ['request_cnt']]
    .groupby(pair_keys)
    .agg([('request_cnt', "sum")])
)

CPU times: total: 5min 14s
Wall time: 41 s


In [5]:
# Give every distinct host and every distinct user a dense 0-based integer id
# (used later as column / row positions of the sparse matrix).
#
# Ids are assigned over the *sorted* distinct values. Numbering in raw set
# iteration order would make the host -> column mapping depend on string
# hashing, which Python randomizes per interpreter session, so a model saved
# in one session could not be matched to a mapping rebuilt in another.


def _missing_last(value):
    """Sort key that places a missing (None) entry after all real values."""
    return (value is None, value)


url_set = set(data_agg['url_host'].unique())
print(f'{len(url_set)} urls')
url_dict = {url: idurl for idurl, url in enumerate(sorted(url_set, key=_missing_last))}
usr_set = set(data_agg['user_id'].unique())
print(f'{len(usr_set)} users')
usr_dict = {usr: user_id for user_id, usr in enumerate(sorted(usr_set, key=_missing_last))}

199683 urls
415317 users


In [6]:
%%time
# Translate raw user / host identifiers into dense row / column positions.
rows = data_agg['user_id'].map(usr_dict).to_numpy()
cols = data_agg['url_host'].map(url_dict).to_numpy()
# Cell values are the summed request counts, stored as 32-bit integers.
values = data_agg['request_cnt'].values.astype(np.int32)

# users x hosts matrix; its extent is taken from the largest position seen.
mat = csr_matrix(
    (values, (rows, cols)),
    shape=(rows.max() + 1, cols.max() + 1),
)
mat

CPU times: total: 20.9 s
Wall time: 4.15 s


<415317x199683 sparse matrix of type '<class 'numpy.intc'>'
	with 32277669 stored elements in Compressed Sparse Row format>

In [7]:
# Hold out 10% of the matrix rows for validation (fixed random_state), then
# put each index array back into ascending order.
row_ids = np.arange(mat.shape[0])
train_idx, val_idx = (
    np.sort(part)
    for part in train_test_split(row_ids, test_size=0.1, shuffle=True, random_state=1)
)

In [8]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that serves dense ``(x, x)`` autoencoder batches.

    Rows are taken from a sparse matrix and densified one batch at a time, so
    only ``batch_size`` rows are ever held in dense form.

    Args:
        x_vals: Row-indexable sparse matrix; must support
            ``x_vals[row_index_array].toarray()`` (e.g. a SciPy CSR matrix).
        batch_size: Maximum number of rows per batch; must be positive.
        split_idx: 1-D collection of row positions this generator may serve.
            It is copied, so the caller's array is never reordered.
        seed: Optional seed for the shuffling RNG. ``None`` (the default)
            gives a different row order on every run.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(self, x_vals, batch_size, split_idx, seed=None):
        super().__init__()
        if batch_size <= 0:
            raise ValueError(f'batch_size must be positive, got {batch_size}')
        self.x_vals = x_vals
        self.batch_size = batch_size
        # Private copy: the indices are shuffled in place below and after
        # every epoch, which must not scramble the array owned by the caller.
        self.inds = np.array(split_idx)
        self._rng = np.random.default_rng(seed)
        self._rng.shuffle(self.inds)

    def __getitem__(self, item):
        """Return batch number ``item`` as an ``(inputs, targets)`` pair.

        Inputs and targets are the same dense array (reconstruction task).
        The last batch may contain fewer than ``batch_size`` rows.
        """
        start = self.batch_size * item
        stop = start + self.batch_size
        # Row positions are sorted within the batch before slicing the matrix.
        batch_rows = np.sort(self.inds[start:stop])
        # toarray() yields a plain ndarray (todense() would give np.matrix).
        batch = self.x_vals[batch_rows].toarray()
        return (batch, batch)

    def on_epoch_end(self):
        """Reshuffle the row order so the next epoch sees different batches."""
        self._rng.shuffle(self.inds)

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        return int(np.ceil(len(self.inds) / float(self.batch_size)))

In [9]:
# Both generators read from the same sparse matrix and differ only in the
# set of rows they are allowed to serve.
train_gen = DataGenerator(x_vals=mat, batch_size=512, split_idx=train_idx)
val_gen = DataGenerator(x_vals=mat, batch_size=512, split_idx=val_idx)

In [10]:
# Size of the bottleneck representation produced by the encoder.
latent_dim = 256
# Input/output width follows the matrix (one feature per host column), so the
# model stays consistent with the data instead of relying on a literal.
n_features = mat.shape[1]

# Encoder: n_features -> 2048 -> 1024 -> 512 -> latent_dim, with batch
# normalization and progressively lighter dropout between the dense layers.
encoder = Sequential([
    L.Dense(2048, activation='relu', input_shape=(n_features,)),
    L.BatchNormalization(),
    L.Dropout(0.3),
    L.Dense(1024, activation='relu'),
    L.BatchNormalization(),
    L.Dropout(0.2),
    L.Dense(512, activation='relu'),
    L.BatchNormalization(),
    L.Dropout(0.1),
    L.Dense(latent_dim, activation='relu')
])

# Decoder: latent_dim -> 512 -> 768 -> 1536 -> n_features with a linear output.
# The input shape is declared on the FIRST layer; on any later layer Keras
# ignores it and the decoder would have no declared input of its own.
decoder = Sequential([
    L.BatchNormalization(input_shape=(latent_dim,)),
    L.Dense(512, activation='relu'),
    L.BatchNormalization(),
    L.Dropout(0.1),
    L.Dense(768, activation='relu'),
    L.BatchNormalization(),
    L.Dropout(0.2),
    L.Dense(1536, activation='relu'),
    L.BatchNormalization(),
    L.Dropout(0.3),
    L.Dense(n_features, activation=None)
])

# Chain the two halves; `encoder` shares its layers with `autoencoder`, so
# training the autoencoder also trains the encoder.
autoencoder = Model(inputs=encoder.input, outputs=decoder(encoder.output))
autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_input (InputLayer)    [(None, 199683)]          0         
                                                                 
 dense (Dense)               (None, 2048)              408952832 
                                                                 
 batch_normalization (BatchN  (None, 2048)             8192      
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 2048)              0         
                                                                 
 dense_1 (Dense)             (None, 1024)              2098176   
                                                                 
 batch_normalization_1 (Batc  (None, 1024)             4096      
 hNormalization)                                             

In [11]:
# Both callbacks watch the validation loss (lower is better).
# Stop after 4 epochs without improvement and roll back to the best weights.
es = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=4,
    restore_best_weights=True,
    verbose=1,
)
# Halve the learning rate after 2 epochs without improvement.
lr = ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
    factor=0.5,
    patience=2,
    verbose=1,
)

# Reconstruction objective and optimizer.
loss = mse
opt = O.Adam(learning_rate=1e-3)
autoencoder.compile(optimizer=opt, loss=loss)

# 500 is only an upper bound on epochs; early stopping ends training sooner.
history = autoencoder.fit(
    train_gen,
    validation_data=val_gen,
    epochs=500,
    callbacks=[es, lr],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 26: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 29: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 33: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 34/500
Epoch 35/500

Epoch 35: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 35: early stopping


In [12]:
# Persist the full trained autoencoder (encoder and decoder together).
model_dir = 'ae/model_1_rs_1'
autoencoder.save(model_dir)

INFO:tensorflow:Assets written to: v5/model_1_rs_1\assets


INFO:tensorflow:Assets written to: v5/model_1_rs_1\assets


In [15]:
# Run every row of the sparse matrix through the encoder half only, giving
# one latent vector per row; the trailing expression displays the resulting
# embedding array's shape.
emb = encoder.predict(mat)
emb.shape



(415317, 256)

In [16]:
# Store the embedding array on disk. Row i of `emb` corresponds to row i of
# `mat`, i.e. to the user whose id in `usr_dict` is i.
emb_path = 'ae/emb_1.pickle'
with open(emb_path, mode='wb') as sink:
    pickle.dump(emb, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
# Reset the Keras backend's global state now that the model and embeddings
# have been written out.
K.clear_session()