In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from dataclasses import dataclass, asdict
import glob
import tensorflow as tf
import pandas as pd
from pprint import pprint

## Load parameters

In [None]:
@dataclass
class Config:
    """Hyperparameters for the BERT4Rec experiment in this notebook."""

    # Model architecture
    num_layers: int = 12
    num_heads: int = 32
    d_model: int = 128
    dropout: float = 0.1
    layer_norm_eps: float = 1e-12
    activation: str = "gelu"
    vocab_size: int = 40857
    max_seq_len: int = 256
    # Optimisation
    learning_rate: float = 1e-4
    # A batch size is a sample count, so it is annotated as an int.
    batch_size: int = 64


config = Config()

## Prepare data

In [None]:
# glob.glob returns matches in arbitrary order; sort them so that positional
# indexing into `filenames` is stable between runs and machines.
filenames = sorted(glob.glob('../data/processed/*'))
filenames

In [None]:
# Train/test files are picked by position in `filenames`
# (index 1 -> train, index 0 -> test).
# NOTE(review): this depends on the order of `filenames` — confirm which
# file each index actually refers to.
train_data = tf.data.TFRecordDataset(filenames[1])
test_data = tf.data.TFRecordDataset(filenames[0])

In [None]:
def _parse_tf_records(element, seq_len: int = 256):
    """Parse one serialized `tf.train.Example` into a dict of tensors.

    Args:
        element: A single serialized `tf.train.Example` record.
        seq_len: Length of the fixed-size token features. Defaults to 256,
            the same value as `Config.max_seq_len`.

    Returns:
        Dict with keys "info", "x_masked_tokens", "y_tokens" and
        "mask_layer", each parsed as an int64 fixed-length feature.
    """
    # The three token features share one shape, so describe it once.
    token_feature = tf.io.FixedLenFeature([seq_len], tf.int64)
    schema = {
        "info": tf.io.FixedLenFeature([1], tf.int64),  # [user]
        "x_masked_tokens": token_feature,
        "y_tokens": token_feature,
        "mask_layer": token_feature,
    }
    return tf.io.parse_single_example(element, schema)

In [None]:
train_tf_records = train_data.map(_parse_tf_records)

In [None]:
train_tf_records

In [None]:
# Pull every parsed record out of the tf.data pipeline as numpy values and
# collect them in a DataFrame, one column per feature of the parsing schema.
df_train = pd.DataFrame(
    train_tf_records.as_numpy_iterator(),
    columns=['info', 'x_masked_tokens', 'y_tokens', 'mask_layer']
)
df_train.head()

In [None]:
import numpy as np

In [None]:
# Stack each column's per-record arrays into one 2-D array, then let tf.data
# slice the three arrays row by row into (x_masked, y, mask) tuples.
movies_ds = tf.data.Dataset.from_tensor_slices(
    tuple(
        np.vstack(df_train[column].to_numpy())
        for column in ("x_masked_tokens", "y_tokens", "mask_layer")
    )
)

In [None]:
# Preview the stacked masked-token matrix built from the training frame.
np.vstack(df_train.x_masked_tokens.to_numpy())

In [None]:
# A tf.data.Dataset cannot be indexed; inspect its element structure instead.
movies_ds.element_spec

In [None]:
# Shuffle with a 1000-element buffer, then group into batches.
# NOTE(review): `movies_ds` is rebound from itself, so re-running this cell
# shuffles and batches the already-batched dataset again.
movies_ds = movies_ds.shuffle(1000).batch(config.batch_size)

In [None]:
test = movies_ds.take(1)
list(test.as_numpy_iterator())[0][0]

## Model

In [None]:
from bert4rec.bert import BertModel
from bert4rec.trainer import BertTrainer
# %autoreload 2
# from bert4rec.trainer import BertTrainer

In [None]:
# Instantiate the model from the shared hyperparameters, then build it for a
# (batch_size, max_seq_len) input before printing the summary.
model = BertModel(
    num_layers=config.num_layers,
    num_heads=config.num_heads,
    d_model=config.d_model,
    dropout=config.dropout,
    layer_norm_eps=config.layer_norm_eps,
    activation=config.activation,
    vocab_size=config.vocab_size,
    max_seq_len=config.max_seq_len,
)
model.build((config.batch_size, config.max_seq_len))
model.summary()

In [None]:
# Wrap the model in the project's trainer and configure optimizer and loss.
# NOTE(review): the learning rate is hard-coded to 3e-4 here, while
# `config.learning_rate` is 1e-4 — confirm which one is intended.
bert_trainer = BertTrainer(model)
bert_trainer.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

In [None]:
config

In [None]:
# `movies_ds` is already batched, so `batch_size` is not passed: Keras asks
# callers not to specify it when the input is a dataset.
# NOTE(review): assumes BertTrainer follows the keras.Model.fit contract — confirm.
bert_trainer.fit(movies_ds, epochs=14)

In [None]:
# NOTE(review): `CustomFit` is not defined or imported anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel —
# confirm where it is supposed to come from.
training = CustomFit(model)
training.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)


In [None]:
# `movies_ds` is already batched, so `batch_size` is not passed: Keras asks
# callers not to specify it when the input is a dataset.
training.fit(movies_ds, epochs=2)

In [None]:
# `movies_ds` is already batched, so `batch_size` is not passed: Keras asks
# callers not to specify it when the input is a dataset.
training.evaluate(movies_ds)

In [None]:
# Debug check: is the existing `training` object an instance of the class
# object that `CustomFit` currently names?
# `self` does not exist at cell level, so the instance is referenced directly.
print(CustomFit)
print(isinstance(training, CustomFit))

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
training.compile(optimizer=optimizer)
training.summary()

In [None]:
model.fit(movies_ds, epochs=5)

In [None]:
# NOTE: pasted error message, kept as a comment so the cell stays valid Python:
# TypeError: super(type, obj): obj must be an instance or subtype of type

In [None]:
def get_bert4rec_model(config):
    """Build and compile the masked BERT4Rec model described by `config`.

    Args:
        config: Object exposing the hyperparameters read below
            (`num_layers`, `num_heads`, `d_model`, `dropout`,
            `layer_norm_eps`, `activation`, `vocab_size`, `max_seq_len`,
            `learning_rate`), e.g. a `Config` instance.

    Returns:
        A `tf.keras.Model` named "masked_bert_model", compiled with Adam.
    """
    inputs = tf.keras.layers.Input((config.max_seq_len,), dtype=tf.int64)
    # Every hyperparameter is read from `config`, so the encoder's
    # max_seq_len always agrees with the Input shape above.
    # NOTE(review): assumes `BertModel` (imported from bert4rec.bert) is the
    # intended encoder and can be called on a symbolic Keras input — confirm.
    bert4rec_model = BertModel(
        num_layers=config.num_layers,
        num_heads=config.num_heads,
        d_model=config.d_model,
        dropout=config.dropout,
        layer_norm_eps=config.layer_norm_eps,
        activation=config.activation,
        vocab_size=config.vocab_size,
        max_seq_len=config.max_seq_len,
    )
    model = tf.keras.Model(
        inputs, bert4rec_model(inputs), name="masked_bert_model"
    )
    optimizer = tf.keras.optimizers.Adam(learning_rate=config.learning_rate)
    # NOTE(review): no loss is passed to compile() — confirm that the
    # training step supplies one.
    model.compile(optimizer=optimizer)
    return model

In [None]:
bert_masked_model = get_bert4rec_model(config)
bert_masked_model.summary()

In [None]:
# Train the masked model for 5 epochs, then save it to an HDF5 file.
# NOTE(review): `mlm_ds` and `generator_callback` are not defined anywhere in
# the visible notebook, so this cell raises NameError on a fresh kernel —
# confirm which dataset and callback are meant.
bert_masked_model.fit(mlm_ds, epochs=5, callbacks=[generator_callback])
bert_masked_model.save("bert_mlm_imdb.h5")

In [None]:
# Create example data
data = {
    'Age': 29,
    'Movie': ['The Shawshank Redemption', 'Fight Club'],
    'Movie Ratings': [9.0, 9.7],
    'Suggestion': 'Inception',
    'Suggestion Purchased': 1.0,
    'Purchase Price': 9.99
}

print(data)

In [None]:
# Create the Example
example = tf.train.Example(features=tf.train.Features(feature={
    'Age': tf.train.Feature(
        int64_list=tf.train.Int64List(value=[data['Age']])),
    'Movie': tf.train.Feature(
        bytes_list=tf.train.BytesList(
            value=[m.encode('utf-8') for m in data['Movie']])),
    'Movie Ratings': tf.train.Feature(
        float_list=tf.train.FloatList(value=data['Movie Ratings'])),
    'Suggestion': tf.train.Feature(
        bytes_list=tf.train.BytesList(
            value=[data['Suggestion'].encode('utf-8')])),
    'Suggestion Purchased': tf.train.Feature(
        float_list=tf.train.FloatList(
            value=[data['Suggestion Purchased']])),
    'Purchase Price': tf.train.Feature(
        float_list=tf.train.FloatList(value=[data['Purchase Price']]))
}))

print(example)

In [None]:
# Write TFrecord file
with tf.io.TFRecordWriter('customer_1.tfrecord') as writer:
    writer.write(example.SerializeToString())

In [None]:
dataset

In [None]:
result

In [None]:
!pwd

In [None]:
# Read TFRecord file

dataset = tf.data.TFRecordDataset(['customer_1.tfrecord'])

# Define features
read_features = {
    'Age': tf.io.FixedLenFeature([], dtype=tf.int64),
    'Movie': tf.io.VarLenFeature(dtype=tf.string),
    'Movie Ratings': tf.io.VarLenFeature(dtype=tf.float32),
    'Suggestion': tf.io.FixedLenFeature([], dtype=tf.string),
    'Suggestion Purchased': tf.io.FixedLenFeature([], dtype=tf.float32),
    'Purchase Price': tf.io.FixedLenFeature([], dtype=tf.float32)
}


def _parse_customer_example(serialized_example):
    """Parse one serialized `tf.train.Example` using `read_features`."""
    return tf.io.parse_single_example(serialized_example, read_features)


# Extract features from serialized data. parse_single_example works on one
# serialized record at a time, so it is mapped over the dataset instead of
# being applied to the Dataset object itself.
parsed_dataset = dataset.map(_parse_customer_example)

# Print features. Iterating the dataset eagerly replaces the TF1
# queue-runner / session / Tensor.eval() machinery.
for read_data in parsed_dataset:
    for name, tensor in read_data.items():
        # VarLenFeature entries come back as SparseTensors; show their values.
        if isinstance(tensor, tf.sparse.SparseTensor):
            tensor = tensor.values
        print('{}: {}'.format(name, tensor.numpy()))