In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from dataclasses import dataclass, asdict
import glob
import tensorflow as tf
import pandas as pd
from pprint import pprint

## Load parameters

In [43]:
@dataclass
class Config:
    """Hyperparameters for the BERT model built in this notebook.

    The first eight fields are forwarded verbatim to `BertModel(...)`;
    `learning_rate` and `batch_size` describe the training run.
    """

    # --- model architecture -------------------------------------------------
    num_layers: int = 12
    num_heads: int = 12
    d_model: int = 128
    dropout: float = 0.1
    layer_norm_eps: float = 1e-12
    activation: str = "gelu"
    vocab_size: int = 40857
    # NOTE(review): the TFRecord schema in this notebook parses 256 tokens per
    # record, while this default is 25 — confirm which length is intended.
    max_seq_len: int = 25

    # --- training -----------------------------------------------------------
    learning_rate: float = 1e-4
    # A batch size is a count, so it is annotated as int (was `float`).
    batch_size: int = 64


config = Config()

# Prepare data

In [4]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# that any positional lookup into `filenames` is stable across machines and runs.
filenames = sorted(glob.glob('../data/processed/*'))
filenames

['../data/processed/test.tfrecords',
 '../data/processed/training.tfrecords',
 '../data/processed/vocab.pkl']

In [5]:
# Pick the record files by name instead of by list position: the order of the
# glob result is not guaranteed, so `filenames[1]` / `filenames[0]` could silently
# swap the training and test sets (or pick up vocab.pkl).
train_path = next(path for path in filenames if path.endswith('training.tfrecords'))
test_path = next(path for path in filenames if path.endswith('test.tfrecords'))

train_data = tf.data.TFRecordDataset(train_path)
test_data = tf.data.TFRecordDataset(test_path)

2022-11-22 22:59:27.482814: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
def _parse_tf_records(element):
    """Decode one serialized `tf.train.Example` into a dict of int64 tensors.

    Args:
        element: a scalar string tensor holding the serialized example.

    Returns:
        dict with keys "info" (length 1), "x_masked_tokens", "y_tokens" and
        "mask_layer" (length 256 each).
    """
    def int64_feature(length):
        # Fixed-length int64 feature of the given size.
        return tf.io.FixedLenFeature([length], tf.int64)

    feature_spec = {
        "info": int64_feature(1),  # [user]
        "x_masked_tokens": int64_feature(256),
        "y_tokens": int64_feature(256),
        "mask_layer": int64_feature(256),
    }
    return tf.io.parse_single_example(element, feature_spec)

In [7]:
# Lazily decode every serialized training record into a dict of tensors.
parsed_tf_records = train_data.map(map_func=_parse_tf_records)

In [8]:
# Display the dataset repr to check the parsed feature names, shapes and dtypes.
parsed_tf_records

<MapDataset shapes: {info: (1,), mask_layer: (256,), x_masked_tokens: (256,), y_tokens: (256,)}, types: {info: tf.int64, mask_layer: tf.int64, x_masked_tokens: tf.int64, y_tokens: tf.int64}>

In [9]:
# Materialise the parsed records in memory as a DataFrame, one row per record.
record_columns = ['info', 'x_masked_tokens', 'y_tokens', 'mask_layer']
df = pd.DataFrame(data=parsed_tf_records.as_numpy_iterator(), columns=record_columns)
df.head()

Unnamed: 0,info,x_masked_tokens,y_tokens,mask_layer
0,[0],"[22, 6, 5, 25, 12, 40857, 26, 14, 34, 21, 1, 4...","[22, 6, 5, 25, 12, 3, 26, 14, 34, 21, 1, 4, 32...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,[2],"[170, 171, 298, 282, 316, 330, 148, 301, 304, ...","[170, 171, 298, 282, 316, 330, 148, 301, 304, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,[3],"[165, 40857, 512, 181, 185, 475, 471, 476, 473...","[165, 470, 512, 181, 185, 475, 471, 476, 473, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ..."
3,[4],"[40, 40857, 533, 540, 527, 528, 40857, 151, 52...","[40, 0, 533, 540, 527, 528, 530, 151, 529, 517...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, ..."
4,[5],"[558, 162, 562, 43, 51, 44, 158, 557, 40857, 5...","[558, 162, 562, 43, 51, 44, 158, 557, 58, 561,...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, ..."


In [10]:
import numpy as np

In [11]:
# Each DataFrame cell holds one 1-D array; stack every column into a single 2-D
# array (one row per record) so the three arrays can be sliced together.
masked_inputs = np.vstack(df.x_masked_tokens.to_numpy())
target_tokens = np.vstack(df.y_tokens.to_numpy())
mask_flags = np.vstack(df.mask_layer.to_numpy())

movies_ds = tf.data.Dataset.from_tensor_slices((masked_inputs, target_tokens, mask_flags))
# Batch size comes from the shared config instead of a second hard-coded 64;
# int() guards against a float-typed config value, which Dataset.batch rejects.
movies_ds = movies_ds.shuffle(1000).batch(int(config.batch_size))

## Model

In [33]:
from bert4rec.bert import BertModel
from bert4rec.trainer import BertTrainer
# %autoreload 2
# from bert4rec.trainer import BertTrainer

In [40]:
# Display the active configuration.
# NOTE(review): the saved output below this cell is a plain dict with max_seq_len 256,
# which does not match the current `Config` dataclass defaults — re-run to refresh it.
config

{'num_layers': 12,
 'num_heads': 12,
 'd_model': 128,
 'dropout': 0.1,
 'layer_norm_eps': 1e-12,
 'activation': 'gelu',
 'vocab_size': 40857,
 'max_seq_len': 256}

In [45]:
# Names of the Config fields that BertModel takes as keyword arguments.
bert_hparams = (
    "num_layers",
    "num_heads",
    "d_model",
    "dropout",
    "layer_norm_eps",
    "activation",
    "vocab_size",
    "max_seq_len",
)
model = BertModel(**{name: getattr(config, name) for name in bert_hparams})

# NOTE(review): the build shape is hard-coded to (64, 512, 1); the records in this
# notebook are 256 tokens long and config.max_seq_len is 25 — confirm the intended shape.
model.build((64, 512, 1))
model.summary()

Model: "bert_model_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 positional_embedding_15 (Po  multiple                 5232896   
 sitionalEmbedding)                                              
                                                                 
 layer_normalization_555 (La  multiple                 256       
 yerNormalization)                                               
                                                                 
 transformer_encoder_layer_1  multiple                 194408    
 80 (TransformerEncoderLayer                                     
 )                                                               
                                                                 
 transformer_encoder_layer_1  multiple                 194408    
 81 (TransformerEncoderLayer                                     
 )                                                   

In [46]:
# Wrap the model in the custom trainer, then attach optimizer and loss.
bert_trainer = BertTrainer(model)

# NOTE(review): 3e-4 differs from config.learning_rate (1e-4) — confirm which is intended.
adam = tf.keras.optimizers.Adam(learning_rate=3e-4)
# The loss is configured for raw logits (from_logits=True).
sparse_xent = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

bert_trainer.compile(optimizer=adam, loss=sparse_xent)

In [38]:
# Display the trainer created above (`train.er` referenced an undefined name).
bert_trainer

In [39]:
# Train the compiled trainer. `movies_ds` is already batched, and Keras documents that
# `batch_size` must not be specified for tf.data inputs, so it is omitted here.
bert_trainer.fit(movies_ds, epochs=2)

AttributeError: 'BertTrainer' object has no attribute 'loss_tracker'

In [None]:
# `CustomFit` is not imported or defined in this notebook; the trainer class that
# is imported (and that the earlier traceback names) is BertTrainer.
training = BertTrainer(model)
training.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)


In [None]:
# `movies_ds` is a batched tf.data.Dataset; Keras documents that `batch_size` must not
# be passed for dataset inputs, so only the epoch count is given.
training.fit(movies_ds, epochs=2)

In [None]:
# NOTE(review): this evaluates on the training dataset — confirm that is intended.
# `batch_size` is omitted because `movies_ds` is already a batched tf.data.Dataset.
training.evaluate(movies_ds)

In [None]:
# Debug check: is the live trainer an instance of the currently imported class?
# (`self` and `CustomFit` are undefined at notebook top level, so the check uses the
# names this notebook actually defines.)
print(BertTrainer)
print(isinstance(bert_trainer, BertTrainer))

In [None]:
# Re-compile the trainer with a fresh Adam optimizer (learning rate 1e-4) and no
# explicit loss, then print its layer summary.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
training.compile(optimizer=optimizer)
training.summary()

In [None]:
# Fit the bare model directly for 5 epochs.
# NOTE(review): no `model.compile(...)` call is visible in this notebook (only the
# trainer wrappers are compiled) — confirm this cell is still meant to run.
model.fit(movies_ds, epochs=5)

In [None]:
# Error message kept for reference (as bare text it was a syntax error in a code cell):
#   TypeError: super(type, obj): obj must be an instance or subtype of type
# NOTE(review): presumably raised after %autoreload re-imported the trainer module while
# an instance of the previous class object was still in use — re-create the trainer
# after a reload; confirm.

In [None]:
def get_bert4rec_model(config):
    """Build a BertModel from `config`, wrap it in a BertTrainer and compile it.

    The previous body referenced names that do not exist in this notebook
    (`Bert`, `keras`, `config.LR`), called `BertModel()` as if it were a
    functional-model wrapper, and hard-coded hyperparameters instead of reading
    them from `config`. This version follows the construction pattern used by the
    other cells: `BertModel(**hyperparameters)` -> `BertTrainer(model)` -> `compile`.

    Args:
        config: object exposing num_layers, num_heads, d_model, dropout,
            layer_norm_eps, activation, vocab_size, max_seq_len and
            learning_rate attributes (e.g. the `Config` dataclass).

    Returns:
        The compiled BertTrainer wrapping the newly created BertModel.
    """
    bert4rec_model = BertModel(
        num_layers=config.num_layers,
        num_heads=config.num_heads,
        d_model=config.d_model,
        dropout=config.dropout,
        layer_norm_eps=config.layer_norm_eps,
        activation=config.activation,
        vocab_size=config.vocab_size,
        max_seq_len=config.max_seq_len,
    )
    model = BertTrainer(bert4rec_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate=config.learning_rate)
    # Same loss as the other compile calls in this notebook.
    # NOTE(review): the original compile passed no loss — confirm BertTrainer expects one.
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    )
    return model

In [None]:
# Build the model from the shared config and print its layer summary.
bert_masked_model = get_bert4rec_model(config)
bert_masked_model.summary()

In [None]:
# `mlm_ds` and `generator_callback` are not defined anywhere in this notebook; train on
# the dataset that is actually built here and drop the undefined callback.
bert_masked_model.fit(movies_ds, epochs=5)
# NOTE(review): Keras only supports HDF5 (.h5) saving for Functional/Sequential models;
# if the returned model is a subclassed model this call may fail — confirm.
bert_masked_model.save("bert_mlm_imdb.h5")

In [None]:
# Example customer record used to demonstrate tf.train.Example serialization.
watched_movies = ['The Shawshank Redemption', 'Fight Club']
watched_ratings = [9.0, 9.7]

data = {
    'Age': 29,
    'Movie': watched_movies,
    'Movie Ratings': watched_ratings,
    'Suggestion': 'Inception',
    'Suggestion Purchased': 1.0,
    'Purchase Price': 9.99,
}

print(data)

In [None]:
# Build the tf.train.Example from `data`, one Feature per dictionary key.
def _int64_feature(numbers):
    """Wrap a list of ints in an int64 Feature."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=numbers))


def _float_feature(numbers):
    """Wrap a list of floats in a float Feature."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=numbers))


def _bytes_feature(texts):
    """UTF-8 encode a list of strings and wrap them in a bytes Feature."""
    encoded = [text.encode('utf-8') for text in texts]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=encoded))


feature_map = {
    'Age': _int64_feature([data['Age']]),
    'Movie': _bytes_feature(data['Movie']),
    'Movie Ratings': _float_feature(data['Movie Ratings']),
    'Suggestion': _bytes_feature([data['Suggestion']]),
    'Suggestion Purchased': _float_feature([data['Suggestion Purchased']]),
    'Purchase Price': _float_feature([data['Purchase Price']]),
}
example = tf.train.Example(features=tf.train.Features(feature=feature_map))

print(example)

In [None]:
# Serialize the example and write it as the only record of a TFRecord file.
serialized_example = example.SerializeToString()
with tf.io.TFRecordWriter('customer_1.tfrecord') as writer:
    writer.write(serialized_example)

In [None]:
# NOTE(review): `dataset` is first assigned in a later cell of this notebook, so on a
# fresh top-to-bottom run this display raises NameError.
dataset

In [None]:
# NOTE(review): `result` is not assigned anywhere in the visible notebook — leftover
# scratch cell; confirm before keeping.
result

In [None]:
# Print the working directory via the shell (relative paths such as
# 'customer_1.tfrecord' are resolved against it).
!pwd

In [None]:
# Read the TFRecord file back and print every feature of every record.
#
# The previous version mixed TF1 graph-mode calls (queue runners, an undefined `sess`,
# `tensor.eval()`) with a tf.data pipeline and passed the whole Dataset to
# `tf.io.parse_single_example`, which expects one serialized example. Records are
# now parsed one at a time through `Dataset.map` and read eagerly.
dataset = tf.data.TFRecordDataset(['customer_1.tfrecord'])

# Feature schema: scalars are FixedLenFeature, variable-length lists are VarLenFeature.
read_features = {
    'Age': tf.io.FixedLenFeature([], dtype=tf.int64),
    'Movie': tf.io.VarLenFeature(dtype=tf.string),
    'Movie Ratings': tf.io.VarLenFeature(dtype=tf.float32),
    'Suggestion': tf.io.FixedLenFeature([], dtype=tf.string),
    'Suggestion Purchased': tf.io.FixedLenFeature([], dtype=tf.float32),
    'Purchase Price': tf.io.FixedLenFeature([], dtype=tf.float32)
}


def _parse_customer_example(serialized_example):
    """Decode one serialized tf.train.Example using `read_features`."""
    return tf.io.parse_single_example(serialized_example, read_features)


# Extract features from serialized data (a dataset of feature dicts).
read_data = dataset.map(_parse_customer_example)

# Print features
for record in read_data:
    for name, tensor in record.items():
        # VarLenFeature entries come back as SparseTensor; densify them for printing.
        if isinstance(tensor, tf.SparseTensor):
            tensor = tf.sparse.to_dense(tensor)
        print('{}: {}'.format(name, tensor.numpy()))