In [1]:
import os
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix, vstack, load_npz

import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
from tqdm.notebook import tqdm
from sys import getsizeof
import gc
#from catboost import CatBoostRegressor, cv, Pool, sum_models
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder, LabelEncoder

import vaex
import pyarrow.parquet as pq
import bisect

import pickle
from random import shuffle

import tensorflow as tf
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback, ProgbarLogger
from tensorflow.keras import regularizers as R
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras import layers as L
from tensorflow.keras import optimizers as O
from tensorflow.keras import backend as K
from tensorflow.keras.losses import mse, binary_crossentropy, categorical_crossentropy
from tensorflow.keras import mixed_precision
import tensorflow_addons as tfa
from tensorflow_addons.metrics import F1Score
tf.random.set_seed(722)

C:\ProgramData\Miniconda3\envs\ml_env_v1\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\ProgramData\Miniconda3\envs\ml_env_v1\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# --- Data location ---------------------------------------------------------
# Directory (relative to the parent of the working directory) holding the
# competition parquet files.
LOCAL_DATA_PATH = 'context_data'

# File names inside LOCAL_DATA_PATH.
DATA_FILE = 'competition_data_final_pqt'
TARGET_FILE = 'public_train.pqt'
SUBMISSION_FILE = 'submit_2.pqt'

# --- Reproducibility -------------------------------------------------------
SPLIT_SEED = 42

In [3]:
def _load_frame(file_name):
    """Read one parquet file from the local data directory into a pandas DataFrame."""
    return pq.read_table(f'../{LOCAL_DATA_PATH}/{file_name}').to_pandas()


# Users to predict for (its `user_id` column is used below) and the labelled users.
id_to_submit = _load_frame(SUBMISSION_FILE)
tgt = _load_frame(TARGET_FILE)

def age_bucket(x):
    """Map an age to the index of its age bucket.

    The bucket upper bounds are 18, 25, 35, 45, 55 and 65 (inclusive), so the
    result ranges from 0 (age <= 18) to 6 (age > 65).
    """
    upper_bounds = (18, 25, 35, 45, 55, 65)
    # bisect_left counts how many bounds are strictly smaller than x.
    return bisect.bisect_left(upper_bounds, x)

In [4]:
# Sparse feature matrix stored on disk; its rows are selected with idx_tr and
# idx_test in the next cell.
mat = load_npz('../utils/mat.npz')

In [5]:
# Labelled rows: only ages above 15 are kept (rows with a missing age fail the
# comparison and are dropped as well).
labelled_age = tgt.loc[tgt['age'] > 15, 'age']
idx_tr = labelled_age.index.values
# NOTE(review): idx_tr is tgt's index, whereas idx_test below is the user_id
# column - this presumably relies on tgt being indexed by user_id; confirm.
y_train = labelled_age.map(age_bucket).values.astype(np.int8)
# Merge bucket 0 into bucket 1, then shift so that the classes start at 0.
y_train[y_train == 0] = 1
y_train = y_train - 1

idx_test = id_to_submit.user_id.values
mat_train, mat_test = mat[idx_tr], mat[idx_test]

# For every column: number of rows with a non-zero entry, per split.
cols_countsum_tr = np.asarray(mat_train.astype(bool).sum(axis=0)).flatten()
cols_countsum_test = np.asarray(mat_test.astype(bool).sum(axis=0)).flatten()
# Keep columns seen in at least two training rows and at least one test row.
mask = (cols_countsum_tr > 1) & (cols_countsum_test > 0)

mat_train = mat_train[:, mask]
mat_test = mat_test[:, mask]
print(mat_train.shape, mat_test.shape)

(269903, 88072) (144724, 88072)


In [6]:
# Per-user engineered features, indexed by user_id. The `os` column is encoded
# as iOS -> 0, Android -> 1; any other value becomes NaN.
feat_df = (
    pd.read_csv('../utils/feat_gen_df3.csv', index_col='user_id')
    .assign(os=lambda frame: frame['os'].map({'iOS': 0, 'Android': 1}))
)
feat_df.head()

Unnamed: 0_level_0,day,evening,morning,night,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,...,company,model,os,region_name_count,city_name_count,req_max,req_sum,id_rows,days,dates_range
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.554404,0.321244,0.119171,0.005181,0.056995,0.020725,0.134715,0.108808,0.036269,0.440415,...,Samsung,Galaxy J1 2016 LTE Dual,1,1,1,5,193,131,17,18
1,0.346705,0.295129,0.322827,0.035339,0.127985,0.209169,0.102197,0.098376,0.122254,0.150907,...,Xiaomi,Mi 9,1,3,6,6,1047,700,19,20
2,0.481752,0.316302,0.187348,0.014599,0.153285,0.128954,0.148418,0.150852,0.104623,0.128954,...,Huawei,Honor 9 Lite,1,1,1,4,411,356,50,57
3,0.352727,0.454545,0.178182,0.014545,0.24,0.185455,0.065455,0.116364,0.123636,0.090909,...,Huawei Device Company Limited,P Smart 2021,1,1,1,5,275,188,15,16
4,0.348777,0.265122,0.371943,0.014157,0.212355,0.164736,0.185328,0.14157,0.118404,0.072072,...,Huawei,Nova 3,1,5,9,5,777,591,20,42


In [7]:
# Everything except the four string-valued columns is used as a numeric feature.
categorical_columns = ['region_name', 'city_name', 'company', 'model']
cont_feat = feat_df.drop(columns=categorical_columns).values
# NOTE(review): rows are picked by position with idx_tr / idx_test, which
# presumably assumes feat_df is ordered by user_id starting at 0 - confirm.
cont_feat_train, cont_feat_test = cont_feat[idx_tr], cont_feat[idx_test]

In [8]:
cat_df = feat_df.loc[:, ['region_name', 'city_name', 'company', 'model']]

# Integer-encode each categorical column; pandas assigns code -1 to missing values.
code_columns = []
for col in cat_df:
    code_columns.append(cat_df[col].astype('category').cat.codes.values)

# One row per user, one column per categorical feature.
cat_feat = np.stack(code_columns).T
cat_feat_train, cat_feat_test = cat_feat[idx_tr], cat_feat[idx_test]

In [9]:
# Second sparse feature matrix, split into train/test rows with idx_tr / idx_test.
mat_pod = load_npz('../utils/mat_pod.npz')
mat_pod_train, mat_pod_test = mat_pod[idx_tr], mat_pod[idx_test]

# For every column: number of rows with a non-zero entry, per split.
mat_pod_cols_countsum_tr = np.asarray(mat_pod_train.astype(bool).sum(axis=0)).flatten()
mat_pod_cols_countsum_test = np.asarray(mat_pod_test.astype(bool).sum(axis=0)).flatten()
# Keep columns seen in at least two training rows and at least one test row.
mat_pod_mask = (mat_pod_cols_countsum_tr > 1) & (mat_pod_cols_countsum_test > 0)

mat_pod_train = mat_pod_train[:, mat_pod_mask]
mat_pod_test = mat_pod_test[:, mat_pod_mask]
print(mat_pod_train.shape, mat_pod_test.shape)

(269903, 173452) (144724, 173452)


In [10]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` yielding ``([x1, x2, x3, x4], one_hot(y))`` training batches.

    Parameters
    ----------
    x1_vals, x4_vals : sparse matrices
        Row-indexable sparse inputs; each batch is densified with ``.toarray()``.
    x2_vals, x3_vals : array-like
        Row-indexable dense inputs.
    y_vals : array-like of int
        Class index per row; one-hot encoded per batch.
    batch_size : int
        Maximum number of rows per batch (the last batch may be shorter).
    split_idx : array-like of int
        Row positions this generator draws from.
    shuffle_idx : bool, default False
        Shuffle the row order at construction and after every epoch.
    num_classes : int, default 6
        Depth of the one-hot encoded target.
    """

    def __init__(self, x1_vals, x2_vals, x3_vals, x4_vals, y_vals, batch_size, split_idx,
                 shuffle_idx=False, num_classes=6):
        super().__init__()
        self.x1_vals = x1_vals
        self.x2_vals = x2_vals
        self.x3_vals = x3_vals
        self.x4_vals = x4_vals
        self.y_vals = y_vals
        # Private copy: shuffling is done in place and must not reorder the
        # array object owned by the caller.
        self.inds = np.array(split_idx)
        self.shuffle_idx = shuffle_idx
        self.batch_size = batch_size
        self.num_classes = num_classes
        if shuffle_idx:
            shuffle(self.inds)

    def __getitem__(self, item):
        """Return batch number ``item`` as ``(inputs, one_hot_targets)``."""
        start = self.batch_size * item
        # Sort once and reuse, so all inputs and the target share one row order.
        rows = np.sort(self.inds[start:start + self.batch_size])
        # toarray() gives a plain ndarray (todense() would give np.matrix).
        batch_x1 = self.x1_vals[rows].toarray()
        batch_x2 = self.x2_vals[rows]
        batch_x3 = self.x3_vals[rows]
        batch_x4 = self.x4_vals[rows].toarray()
        batch_y = self.y_vals[rows]
        return ([batch_x1, batch_x2, batch_x3, batch_x4],
                tf.one_hot(batch_y, depth=self.num_classes))

    def on_epoch_end(self):
        """Reshuffle the row order between epochs when shuffling is enabled."""
        if self.shuffle_idx:
            shuffle(self.inds)

    def __len__(self):
        """Number of batches per epoch (the final partial batch is included)."""
        return int(np.ceil(len(self.inds) / float(self.batch_size)))
    
    
class DataGenerator_test(Sequence):
    """Keras ``Sequence`` yielding ``([x1, x2, x3, x4],)`` batches without targets.

    Parameters
    ----------
    x1_vals, x4_vals : sparse matrices
        Row-indexable sparse inputs; each batch is densified with ``.toarray()``.
    x2_vals, x3_vals : array-like
        Row-indexable dense inputs.
    batch_size : int
        Maximum number of rows per batch (the last batch may be shorter).
    split_idx : array-like of int
        Row positions this generator draws from.
    shuffle_idx : bool, default False
        Shuffle the row order at construction and after every epoch. Leave it
        off when the predictions have to line up with ``split_idx``.
    """

    def __init__(self, x1_vals, x2_vals, x3_vals, x4_vals, batch_size, split_idx, shuffle_idx=False):
        super().__init__()
        self.x1_vals = x1_vals
        self.x2_vals = x2_vals
        self.x3_vals = x3_vals
        self.x4_vals = x4_vals
        # Private copy: shuffling is done in place and must not reorder the
        # array object owned by the caller.
        self.inds = np.array(split_idx)
        self.shuffle_idx = shuffle_idx
        self.batch_size = batch_size
        if shuffle_idx:
            shuffle(self.inds)

    def __getitem__(self, item):
        """Return batch number ``item`` as a one-element tuple holding the inputs."""
        start = self.batch_size * item
        # Sort once and reuse, so all four inputs share one row order.
        rows = np.sort(self.inds[start:start + self.batch_size])
        # toarray() gives a plain ndarray (todense() would give np.matrix).
        batch_x1 = self.x1_vals[rows].toarray()
        batch_x2 = self.x2_vals[rows]
        batch_x3 = self.x3_vals[rows]
        batch_x4 = self.x4_vals[rows].toarray()
        return ([batch_x1, batch_x2, batch_x3, batch_x4],)

    def on_epoch_end(self):
        """Reshuffle the row order between epochs when shuffling is enabled."""
        if self.shuffle_idx:
            shuffle(self.inds)

    def __len__(self):
        """Number of batches per pass (the final partial batch is included)."""
        return int(np.ceil(len(self.inds) / float(self.batch_size)))

In [11]:
class GatedLinearUnit(L.Layer):
    """Element-wise product of a linear projection and a sigmoid gate of the same input."""

    def __init__(self, units):
        super().__init__()
        self.linear = L.Dense(units)
        self.sigmoid = L.Dense(units, activation="sigmoid")

    def call(self, inputs):
        projected = self.linear(inputs)
        gate = self.sigmoid(inputs)
        return projected * gate
    
    
class GatedResidualNetwork(L.Layer):
    """Residual block: ReLU dense -> linear dense -> dropout -> gated linear unit,
    added to the (optionally projected) input and layer-normalised."""

    def __init__(self, units, dropout_rate):
        super().__init__()
        self.units = units
        self.relu_dense = L.Dense(units, activation="relu")
        self.linear_dense = L.Dense(units)
        self.dropout = L.Dropout(dropout_rate)
        self.gated_linear_unit = GatedLinearUnit(units)
        self.layer_norm = L.LayerNormalization()
        # Only applied when the input width differs from `units`.
        self.project = L.Dense(units)

    def call(self, inputs):
        hidden = self.dropout(self.linear_dense(self.relu_dense(inputs)))
        # The skip connection needs matching widths; project the input if necessary.
        if inputs.shape[-1] == self.units:
            residual = inputs
        else:
            residual = self.project(inputs)
        return self.layer_norm(residual + self.gated_linear_unit(hidden))
    
    
class VariableSelection(L.Layer):
    """Softmax-weighted sum of per-feature GRN outputs.

    `call` expects a list of `num_features` tensors. The weights come from a GRN
    applied to the concatenation of all features, followed by a softmax layer.
    """

    def __init__(self, num_features, units, dropout_rate):
        super().__init__()
        # One independent GRN per feature.
        self.grns = [GatedResidualNetwork(units, dropout_rate) for _ in range(num_features)]
        # One GRN over all features together; it feeds the weighting softmax.
        self.grn_concat = GatedResidualNetwork(units, dropout_rate)
        self.softmax = L.Dense(units=num_features, activation="softmax")

    def call(self, inputs):
        joint = self.grn_concat(L.concatenate(inputs))
        # (batch, num_features, 1) selection weights.
        weights = tf.expand_dims(self.softmax(joint), axis=-1)

        # (batch, num_features, units) per-feature representations.
        per_feature = tf.stack(
            [self.grns[position](feature) for position, feature in enumerate(inputs)],
            axis=1,
        )

        # weights^T @ per_feature -> (batch, 1, units); drop the singleton axis.
        return tf.squeeze(tf.matmul(weights, per_feature, transpose_a=True), axis=1)
    

class VariableSelectionFlow(L.Layer):
    """Split a tensor into `num_features` equal chunks along the last axis and
    run them through `VariableSelection`.

    Parameters
    ----------
    num_features : int
        Number of chunks; the last axis of the input must be divisible by it.
    units : int
        Output width of the variable-selection block.
    dropout_rate : float
        Dropout rate used inside the gated residual networks.
    dense_units : int or None, default None
        When set, every chunk first passes through its own linear `Dense` layer
        of this width.
    """

    def __init__(self, num_features, units, dropout_rate, dense_units=None):
        super().__init__()
        self.variableselection = VariableSelection(num_features, units, dropout_rate)
        self.split = L.Lambda(lambda t: tf.split(t, num_features, axis=-1))
        self.dense = dense_units
        if dense_units:
            # Plain Python range: there is no need to materialise a tensor just
            # to count the layers being created.
            self.dense_list = [L.Dense(dense_units, activation='linear')
                               for _ in range(num_features)]

    def call(self, inputs):
        split_input = self.split(inputs)
        if self.dense:
            features = [dense(chunk) for dense, chunk in zip(self.dense_list, split_input)]
        else:
            features = split_input
        return self.variableselection(features)
    
    
def smish(x):
    """Smish activation: x * tanh(log(1 + sigmoid(x)))."""
    squashed = K.sigmoid(x)
    return x * K.tanh(K.log(1 + squashed))


def create_mlp(hidden_units, dropout_rate, activation, normalization_layer, name=None):
    """Build a `Sequential` of (Dense -> normalization -> Dropout) stages.

    One stage is added per entry of `hidden_units`. The single
    `normalization_layer` object that is passed in is inserted into every
    stage, i.e. no copy is made per stage.
    """
    stages = []
    for width in hidden_units:
        stages += [
            L.Dense(width, activation=activation),
            normalization_layer,
            L.Dropout(dropout_rate),
        ]
    return tf.keras.Sequential(stages, name=name)


class TransformerBlock(L.Layer):
    """Self-attention + feed-forward block applied `num_transformer_blocks` times.

    The same attention, feed-forward and layer-normalisation layers are reused
    on every pass, so all passes share one set of weights.

    Parameters
    ----------
    embed_dim : int
        Passed to `MultiHeadAttention` as `key_dim`.
    num_heads : int
        Number of attention heads.
    ff_dim : sequence of int
        Hidden widths of the feed-forward MLP built by `create_mlp`.
    dropout_rate : float, default 0.15
        Dropout used in the attention layer and in the MLP.
    num_transformer_blocks : int, default 3
        How many times the block is applied in `call`.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate=0.15, num_transformer_blocks=3):
        super().__init__()
        self.num_transformer_blocks = num_transformer_blocks
        self.att = L.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim, dropout=dropout_rate)
        self.ffn = create_mlp(
            hidden_units=ff_dim,
            dropout_rate=dropout_rate,
            activation=tf.keras.activations.gelu,
            normalization_layer=L.LayerNormalization(epsilon=1e-6),
        )
        self.layernorm1 = L.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = L.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        x = inputs
        # Use the value given to the constructor; the previous code read a
        # module-level global of the same name, ignoring the argument.
        for _ in range(self.num_transformer_blocks):
            attn_output = self.att(x, x)
            out1 = self.layernorm1(x + attn_output)
            ffn_output = self.ffn(out1)
            x = self.layernorm2(out1 + ffn_output)
        return x

    
class Wt_Add(L.Layer):
    """Learned weighted sum of two tensors: input1 * w1 + input2 * w2.

    Both weights have shape (input_dim, units) and are drawn from a normal
    initializer with mean 1.0.
    """

    def __init__(self, units=1, input_dim=1):
        super().__init__()
        initializer = tf.random_normal_initializer(mean=1.0)
        weight_shape = (input_dim, units)
        self.w1 = tf.Variable(
            initial_value=initializer(shape=weight_shape, dtype="float32"),
            trainable=True,
        )
        self.w2 = tf.Variable(
            initial_value=initializer(shape=weight_shape, dtype="float32"),
            trainable=True,
        )

    def call(self, input1, input2):
        first_term = tf.multiply(input1, self.w1)
        second_term = tf.multiply(input2, self.w2)
        return first_term + second_term

In [12]:
# --- Batching --------------------------------------------------------------
batch_size = 256
# Equals ceil(269903 / 256), i.e. the number of training batches per epoch for
# the 269903 training rows printed above.
steps_per_epoch = 1055

# --- Variable-selection blocks ---------------------------------------------
# Output widths.
units_1 = 256
units_2 = 64
units_22 = 128
# Dropout rates.
dropout_1 = 0.1
dropout_2 = 0.1
dropout_22 = 0.1

# --- Cyclical learning-rate bounds -----------------------------------------
INIT_LR = 1e-5
MAX_LR = 1e-3

# --- Transformer over the categorical embeddings ---------------------------
dropout_rate = 0.1
num_transformer_blocks = 3  # Number of transformer blocks.
num_heads = 4  # Number of attention heads.
embedding_dims = 32  # Embedding dimensions of the categorical features.
# Embedding vocabulary sizes, used as input_dim of the four Embedding layers.
vocab_len = [80, 950, 37, 599]

In [13]:
#K.clear_session()

In [14]:
###__--__###

In [14]:
%%time


#policy_m16 = mixed_precision.Policy('mixed_float16')
#policy_32 = mixed_precision.Policy('float32')

test_gen = DataGenerator_test(mat_test,\
                              cont_feat_test,\
                              cat_feat_test,\
                              mat_pod_test,\
                              batch_size,
                              np.arange(mat_test.shape[0])
                           )

for n in range(10):
    print(f'______fold {n+1}______')
    train_gen = DataGenerator(mat_train,\
                              cont_feat_train,\
                              cat_feat_train,\
                              mat_pod_train,\
                              y_train,\
                              batch_size,\
                              np.arange(mat_train.shape[0]),\
                              shuffle_idx=True
                             )
    
    #mixed_precision.set_global_policy(policy_m16)

    inputs_1 = tf.keras.Input(shape=(88072,))
    r1_1 = L.Reshape((1,88072,1))(inputs_1)
    cnn_1 = L.Conv2D(16, (1,41), strides=2, activation=smish)(r1_1)
    
    #mixed_precision.set_global_policy(policy_32)

    d_1 = L.Dense(1, activation=smish)(cnn_1)
    r2_1 = L.Reshape((44016,))(d_1)
    features_1 = VariableSelectionFlow(336, units_1, dropout_1)(r2_1)
    
   
    inputs_2 = tf.keras.Input(shape=(20,), dtype=tf.int32)
    features_2 = VariableSelectionFlow(20, units_2, dropout_2, dense_units=1)(inputs_2)

    
    inputs_3 = tf.keras.Input(shape=(4,), dtype=tf.int16)
    n_0 = L.Lambda(lambda t: tf.split(t, 4, axis=-1))(inputs_3)
    emb = [L.Embedding(input_dim=vocab_len[n], output_dim=embedding_dims)(l) for n, l in enumerate(n_0)]
    cat_emb = tf.concat(emb, axis=1)    
    transf_cat = TransformerBlock(embed_dim=embedding_dims, \
                                 num_heads=num_heads, \
                                 ff_dim=[embedding_dims], \
                                 dropout_rate=dropout_rate, \
                                 num_transformer_blocks=num_transformer_blocks
                                )(cat_emb)

    # Flatten the "contextualized" embeddings of the categorical features.
    cat_features = L.Flatten()(transf_cat)

    #mixed_precision.set_global_policy(policy_m16)    
    
    inputs_4 = tf.keras.Input(shape=(173452,))
    r1_4 = L.Reshape((1,173452,1))(inputs_4)
    cnn_4 = L.Conv2D(6, (1,53), strides=2, activation=smish)(r1_4)
    
    #mixed_precision.set_global_policy(policy_32)
    
    d_4 = L.Dense(1, activation=smish)(cnn_4)    
    r2_4 = L.Reshape((86700,))(d_4)
    features_4 = VariableSelectionFlow(425, units_1, dropout_1)(r2_4)   

    
    add_1_4 = Wt_Add(units=units_1)(features_1, features_4)
    
    
    concat1 = L.Concatenate()([features_2, cat_features, add_1_4])
    
    features_22 = VariableSelectionFlow(concat1.shape[-1], units_22, dropout_22)(concat1)
    
    dense_out = L.Dense(6)(features_22)
    outputs = L.Activation("softmax", dtype='float32')(dense_out)

    model = Model(inputs=[inputs_1, inputs_2, inputs_3, inputs_4], outputs=outputs)
                
    clr = tfa.optimizers.CyclicalLearningRate(initial_learning_rate=INIT_LR,
        maximal_learning_rate=MAX_LR,
        scale_fn=lambda x: 1/(2.**(x-1)),
        step_size=1 * steps_per_epoch
        )
    
    opt = O.Adam(learning_rate=clr, epsilon=1e-09)
    loss = categorical_crossentropy

    model.compile(optimizer=opt, 
                    loss=loss,
                    metrics=[F1Score(num_classes=6, average='weighted')]
                 )
    
    model.fit(train_gen,
                epochs=2
            )

    y_test_df = pd.DataFrame(idx_test).rename({0: 'user_id'}, axis=1)
    y_test_df[[str(i) for i in (range(1,7))]] = model.predict(test_gen)
    y_test_df = y_test_df.set_index('user_id', drop=True)
    y_test_df.to_csv(f'v132/fold_{n+1}/y_test.csv')
    
    del model, clr, opt, loss
    gc.collect()
    K.clear_session()
    

______fold 1______
Epoch 1/2
Epoch 2/2
______fold 2______
Epoch 1/2
Epoch 2/2
______fold 3______
Epoch 1/2
Epoch 2/2
______fold 4______
Epoch 1/2
Epoch 2/2
______fold 5______
Epoch 1/2
Epoch 2/2
______fold 6______
Epoch 1/2
Epoch 2/2
______fold 7______
Epoch 1/2
Epoch 2/2
______fold 8______
Epoch 1/2
Epoch 2/2
______fold 9______
Epoch 1/2
Epoch 2/2

ResourceExhaustedError: Graph execution error:

OOM when allocating tensor with shape[256,16,1,44016] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node gradient_tape/model/conv2d/mul/Mul_1-0-TransposeNHWCToNCHW-LayoutOptimizer}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_8411640]

In [13]:
%%time

# Same pipeline as the ten-run training cell above, restricted to
# range(8, 10), i.e. runs 9 and 10. Presumably re-executed to finish the runs
# that the earlier execution did not complete before its
# ResourceExhaustedError - confirm.
# Each run builds a fresh model, trains it for two epochs on all training rows
# and writes the test-set class probabilities to v132/fold_<n+1>/y_test.csv.

#policy_m16 = mixed_precision.Policy('mixed_float16')
#policy_32 = mixed_precision.Policy('float32')

test_gen = DataGenerator_test(mat_test,\
                              cont_feat_test,\
                              cat_feat_test,\
                              mat_pod_test,\
                              batch_size,
                              np.arange(mat_test.shape[0])
                           )

for n in range(8,10):
    print(f'______fold {n+1}______')
    # Training batches are drawn from every row of the training matrices, reshuffled each epoch.
    train_gen = DataGenerator(mat_train,\
                              cont_feat_train,\
                              cat_feat_train,\
                              mat_pod_train,\
                              y_train,\
                              batch_size,\
                              np.arange(mat_train.shape[0]),\
                              shuffle_idx=True
                             )
    
    #mixed_precision.set_global_policy(policy_m16)

    # Branch 1: first sparse matrix -> strided convolution -> variable selection.
    inputs_1 = tf.keras.Input(shape=(88072,))
    r1_1 = L.Reshape((1,88072,1))(inputs_1)
    cnn_1 = L.Conv2D(16, (1,41), strides=2, activation=smish)(r1_1)
    
    #mixed_precision.set_global_policy(policy_32)

    d_1 = L.Dense(1, activation=smish)(cnn_1)
    # (88072 - 41) // 2 + 1 == 44016 positions remain after the convolution.
    r2_1 = L.Reshape((44016,))(d_1)
    features_1 = VariableSelectionFlow(336, units_1, dropout_1)(r2_1)
    
   
    # Branch 2: continuous per-user features.
    # NOTE(review): declared int32 although cont_feat holds fractional values;
    # Keras presumably casts the batch to this dtype and truncates them - confirm.
    inputs_2 = tf.keras.Input(shape=(20,), dtype=tf.int32)
    features_2 = VariableSelectionFlow(20, units_2, dropout_2, dense_units=1)(inputs_2)

    
    # Branch 3: categorical codes -> one embedding per column -> transformer.
    inputs_3 = tf.keras.Input(shape=(4,), dtype=tf.int16)
    n_0 = L.Lambda(lambda t: tf.split(t, 4, axis=-1))(inputs_3)
    # The comprehension's `n` is local to it and does not change the loop counter.
    emb = [L.Embedding(input_dim=vocab_len[n], output_dim=embedding_dims)(l) for n, l in enumerate(n_0)]
    cat_emb = tf.concat(emb, axis=1)    
    transf_cat = TransformerBlock(embed_dim=embedding_dims, \
                                 num_heads=num_heads, \
                                 ff_dim=[embedding_dims], \
                                 dropout_rate=dropout_rate, \
                                 num_transformer_blocks=num_transformer_blocks
                                )(cat_emb)

    # Flatten the "contextualized" embeddings of the categorical features.
    cat_features = L.Flatten()(transf_cat)

    #mixed_precision.set_global_policy(policy_m16)    
    
    # Branch 4: second sparse matrix, same recipe as branch 1.
    inputs_4 = tf.keras.Input(shape=(173452,))
    r1_4 = L.Reshape((1,173452,1))(inputs_4)
    cnn_4 = L.Conv2D(6, (1,53), strides=2, activation=smish)(r1_4)
    
    #mixed_precision.set_global_policy(policy_32)
    
    d_4 = L.Dense(1, activation=smish)(cnn_4)    
    # (173452 - 53) // 2 + 1 == 86700 positions remain after the convolution.
    r2_4 = L.Reshape((86700,))(d_4)
    features_4 = VariableSelectionFlow(425, units_1, dropout_1)(r2_4)   

    
    # Learned weighted sum of the two sparse branches.
    add_1_4 = Wt_Add(units=units_1)(features_1, features_4)
    
    
    # Fuse all branches and run a final variable selection over the result.
    concat1 = L.Concatenate()([features_2, cat_features, add_1_4])
    
    features_22 = VariableSelectionFlow(concat1.shape[-1], units_22, dropout_22)(concat1)
    
    # Six-way softmax head.
    dense_out = L.Dense(6)(features_22)
    outputs = L.Activation("softmax", dtype='float32')(dense_out)

    model = Model(inputs=[inputs_1, inputs_2, inputs_3, inputs_4], outputs=outputs)
                
    # Cyclical learning rate between INIT_LR and MAX_LR with a half-cycle of one epoch.
    clr = tfa.optimizers.CyclicalLearningRate(initial_learning_rate=INIT_LR,
        maximal_learning_rate=MAX_LR,
        scale_fn=lambda x: 1/(2.**(x-1)),
        step_size=1 * steps_per_epoch
        )
    
    opt = O.Adam(learning_rate=clr, epsilon=1e-09)
    loss = categorical_crossentropy

    model.compile(optimizer=opt, 
                    loss=loss,
                    metrics=[F1Score(num_classes=6, average='weighted')]
                 )
    
    model.fit(train_gen,
                epochs=2
            )

    # One probability column per class ('1'..'6'), indexed by user_id.
    # The target directory must already exist; to_csv does not create it.
    y_test_df = pd.DataFrame(idx_test).rename({0: 'user_id'}, axis=1)
    y_test_df[[str(i) for i in (range(1,7))]] = model.predict(test_gen)
    y_test_df = y_test_df.set_index('user_id', drop=True)
    y_test_df.to_csv(f'v132/fold_{n+1}/y_test.csv')
    
    # Release this run's model and Keras state before the next run.
    del model, clr, opt, loss
    gc.collect()
    K.clear_session()
    

______fold 9______
Epoch 1/2
Epoch 2/2
______fold 10______
Epoch 1/2
Epoch 2/2
CPU times: total: 2h 41min 9s
Wall time: 4h 17min 1s
