In [1]:
import numpy as np 
import pandas as pd 

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/malinphy/datasets/main/ml_1M/ratings.dat', sep="::", header=None, engine='python')
df.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
df['timestamp']= pd.to_datetime(df['timestamp'], unit='s')
def data_splitter(x):
  return str(x).split()[0]

df['timestamp'] = df['timestamp'].apply(data_splitter)
df['timestamp'] =pd.to_datetime(df['timestamp'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   user_id    1000209 non-null  int64         
 1   movie_id   1000209 non-null  int64         
 2   rating     1000209 non-null  int64         
 3   timestamp  1000209 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(3)
memory usage: 30.5 MB


In [4]:
users = df['user_id'].unique()
movies = df['movie_id'].unique()
num_users = len(users)
num_movies = len(movies)

print('num_users',num_users)
print('num_movies',num_movies)

movie_2enc = {i+1:j for i,j in enumerate(movies)}
enc_2movies = {j:i+1 for i,j in enumerate(movies)}

user_2enc = {i+1:j for i,j in enumerate(users)}
enc_2users = {j:i+1 for i,j in enumerate(users)}

num_users 6040
num_movies 3706


In [5]:
encoded_movies = df['movie_id'].map(enc_2movies)
encoded_users = df['user_id'].map(enc_2users)

In [6]:
df2 = pd.DataFrame({'user_id_enc':encoded_users, 'movie_id_enc':encoded_movies, 'timestamp':df['timestamp'].copy()})
x = df2.set_index(['user_id_enc','movie_id_enc']).sort_index()
x = x.reset_index()

x.head(3)

Unnamed: 0,user_id_enc,movie_id_enc,timestamp
0,1,1,2000-12-31
1,1,2,2000-12-31
2,1,3,2000-12-31


In [7]:
x = x.drop(columns= ['timestamp'])  

In [8]:
#### 
df_enc_seq = x.groupby('user_id_enc').aggregate(lambda tdf: tdf.unique().tolist())
df_enc_seq = df_enc_seq.reset_index()


In [9]:
### masked language model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from dataclasses import dataclass
import pandas as pd
import numpy as np
import glob
import re
from pprint import pprint
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
MAX_LEN = 256
BATCH_SIZE = 32
LR = 0.001
VOCAB_SIZE = num_movies + 4
EMBED_DIM = 256
NUM_HEAD = 8  # used in bert model
FF_DIM = 256 +128  # used in bert model
NUM_LAYERS = 2

In [11]:
# MAX_LEN = 256
# BATCH_SIZE = 32
# LR = 0.001
# VOCAB_SIZE = num_movies + 4
# EMBED_DIM = 256
# NUM_HEAD = 8  # used in bert model
# FF_DIM = 256   # used in bert model
# NUM_LAYERS = 2
# epocsh = 25

In [12]:
arranged_seqs = pad_sequences(
    df_enc_seq['movie_id_enc'], maxlen=MAX_LEN, dtype='int32', padding='pre',
    truncating='pre', value=0.0
)

In [13]:
mask_token_id = num_movies+1
print(mask_token_id)

3707


In [14]:
def get_masked_input_and_labels(encoded_texts):
    # 15% BERT masking
    inp_mask = np.random.rand(*encoded_texts.shape) < 0.15
    # Do not mask special tokens
    inp_mask[encoded_texts <= 2] = False
    # Set targets to -1 by default, it means ignore
    labels = -1 * np.ones(encoded_texts.shape, dtype=int)
    # Set labels for masked tokens
    labels[inp_mask] = encoded_texts[inp_mask]

    # Prepare input
    encoded_texts_masked = np.copy(encoded_texts)
    # Set input to [MASK] which is the last token for the 90% of tokens
    # This means leaving 10% unchanged
    inp_mask_2mask = inp_mask & (np.random.rand(*encoded_texts.shape) < 0.90)
    encoded_texts_masked[
        inp_mask_2mask
    ] = mask_token_id  # mask token is the last in the dict

    # Set 10% to a random token
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*encoded_texts.shape) < 1 / 9)
    encoded_texts_masked[inp_mask_2random] = np.random.randint(
        3, mask_token_id, inp_mask_2random.sum()
    )

    # Prepare sample_weights to pass to .fit() method
    sample_weights = np.ones(labels.shape)
    sample_weights[labels == -1] = 0

    # y_labels would be same as encoded_texts i.e input tokens
    y_labels = np.copy(encoded_texts)

    return encoded_texts_masked, y_labels

In [15]:
x_input, y_input = get_masked_input_and_labels(arranged_seqs)

In [16]:
def get_pos_encoding_matrix(max_len, d_emb):
    pos_enc = np.array(
        [
            [pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]
            if pos != 0
            else np.zeros(d_emb)
            for pos in range(max_len)
        ]
    )
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc

def bert_module(query, key, value, i):
    # Multi headed self-attention
    attention_output = layers.MultiHeadAttention(
        num_heads=NUM_HEAD,
        key_dim=EMBED_DIM // NUM_HEAD,
        name="encoder_{}/multiheadattention".format(i),
    )(query, key, value)
    attention_output = layers.Dropout(0.1, name="encoder_{}/att_dropout".format(i))(
        attention_output
    )
    attention_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}/att_layernormalization".format(i)
    )(query + attention_output)

    # Feed-forward layer
    ffn = keras.Sequential(
        [
            layers.Dense(FF_DIM, activation="relu"),
            layers.Dense(EMBED_DIM),
        ],
        name="encoder_{}/ffn".format(i),
    )
    ffn_output = ffn(attention_output)
    ffn_output = layers.Dropout(0.1, name="encoder_{}/ffn_dropout".format(i))(
        ffn_output
    )
    sequence_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}/ffn_layernormalization".format(i)
    )(attention_output + ffn_output)
    return sequence_output    


loss_fn = keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE
)
loss_tracker = tf.keras.metrics.Mean(name="loss")



class MaskedLanguageModel(tf.keras.Model):
    def train_step(self, inputs):
        if len(inputs) == 3:
            features, labels, sample_weight = inputs
        else:
            features, labels = inputs
            sample_weight = None

        with tf.GradientTape() as tape:
            predictions = self(features, training=True)
            loss = loss_fn(labels, predictions, sample_weight=sample_weight)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Compute our own metrics
        loss_tracker.update_state(loss, sample_weight=sample_weight)

        # Return a dict mapping metric names to current value
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]



def create_masked_language_bert_model():
    inputs = layers.Input((MAX_LEN,), dtype=tf.int64)

    word_embeddings = layers.Embedding(
        VOCAB_SIZE, EMBED_DIM, name="word_embedding"
    )(inputs)
    position_embeddings = layers.Embedding(
        input_dim=MAX_LEN,
        output_dim=EMBED_DIM,
        weights=[get_pos_encoding_matrix(MAX_LEN, EMBED_DIM)],
        name="position_embedding",
    )(tf.range(start=0, limit=MAX_LEN, delta=1))
    embeddings = word_embeddings + position_embeddings

    encoder_output = embeddings
    for i in range(NUM_LAYERS):
        encoder_output = bert_module(encoder_output, encoder_output, encoder_output, i)

    mlm_output = layers.Dense(VOCAB_SIZE, name="mlm_cls", activation="softmax")(
        encoder_output
    )
    mlm_model = MaskedLanguageModel(inputs, mlm_output, name="masked_bert_model")

    optimizer = keras.optimizers.Adam(learning_rate=LR)
    mlm_model.compile(optimizer=optimizer)
    return mlm_model

bert_masked_model = create_masked_language_bert_model()

In [17]:
bert_masked_model.fit(x_input, y_input, epochs=25,
                      # callbacks=[generator_callback]
                      )
# bert_masked_model.save("bert_mlm_imdb.h5")


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f8be00a2850>

In [18]:
# x = y_input[100].reshape(-1,1)
# len(x)
# np.where(y_input[100] == 0 )[0][-1]
# np.random.randint(149,256)

In [19]:
from sklearn.model_selection import train_test_split
train_x_input, test_x_input, train_y_input, test_y_input = train_test_split(x_input, y_input, 
                                                                            train_size = 0.9, test_size = 0.1,
                                                                            random_state = 42)

print('\n','train_x_input.shape',train_x_input.shape,'\n', 'train_y_input.shape',train_y_input.shape,'\n',
      'test_x_input.shape',test_x_input.shape,'\n','test_y_input.shape',test_y_input.shape,'\n')


 train_x_input.shape (5436, 256) 
 train_y_input.shape (5436, 256) 
 test_x_input.shape (604, 256) 
 test_y_input.shape (604, 256) 



In [20]:
# print(test_x_input[0])
# print(test_y_input[0])

In [21]:
#### tesk masker is designed to randomly select mask position. Since all inputs were pre-padded with 0 values,
#### mask position will be selected from the non-zero element

# def test_masker(test_input):
#   x = test_input.copy()
#   # x = x.reshape(-1,1)
#   non_zero_elemnt = len(x)
#   non_zero_pos = list(np.where(x != 0 )[0])[0]
#   random_masking = np.random.randint(non_zero_pos,len(x))
#   # print('nons',non_zero_pos)
#   # print(x)
#   x[random_masking] = mask_token_id

#   # print(random_masking)
#   return (x,3)


# masked_test_inputs,asf = np.apply_along_axis(test_masker, 1, test_y_input)
# (bir, iki) = np.apply_along_axis(test_masker, 1, test_y_input)

In [22]:
def mask_pos(test_input):
  x=test_input.copy()
  non_zero_element = len(x)
  non_zero_pos = list(np.where(x!=0)[0])[0]
  random_masking = np.random.randint(non_zero_pos,len(x))

  return random_masking

random_masking_positions = np.apply_along_axis(mask_pos, 1, test_y_input)

def mask_filler(test_input):
  x = test_input.copy()
  for i in range(len(x)):
    # x[random_masking_positions[i]][i] = mask_token_id
    y = x[i]
    y[random_masking_positions[i]] =mask_token_id
    x[i] = y
  return x

In [23]:
masked_test_inputs = mask_filler(test_y_input)

In [24]:
# print((preds.shape))

In [25]:
test_y_input[0]

array([1067, 1068, 1073, 1076, 1077, 1082, 1083, 1084, 1085, 1089, 1091,
       1092, 1094, 1095, 1096, 1100, 1110, 1116, 1119, 1140, 1142, 1151,
       1156, 1166, 1173, 1176, 1177, 1183, 1186, 1190, 1193, 1195, 1197,
       1201, 1204, 1209, 1211, 1212, 1214, 1217, 1219, 1222, 1223, 1227,
       1232, 1248, 1257, 1259, 1264, 1265, 1266, 1272, 1275, 1276, 1277,
       1290, 1293, 1300, 1302, 1303, 1305, 1309, 1310, 1311, 1312, 1315,
       1329, 1330, 1332, 1339, 1340, 1341, 1345, 1347, 1351, 1355, 1360,
       1361, 1364, 1369, 1385, 1386, 1392, 1394, 1396, 1397, 1399, 1401,
       1407, 1415, 1416, 1417, 1420, 1424, 1434, 1435, 1468, 1493, 1516,
       1518, 1519, 1527, 1529, 1532, 1538, 1539, 1552, 1554, 1565, 1567,
       1574, 1591, 1596, 1597, 1599, 1600, 1608, 1630, 1643, 1658, 1660,
       1669, 1674, 1681, 1682, 1684, 1686, 1687, 1689, 1707, 1711, 1732,
       1749, 1750, 1767, 1772, 1773, 1776, 1784, 1787, 1791, 1794, 1797,
       1798, 1802, 1804, 1823, 1836, 1844, 1849, 18

In [26]:
masked_test_inputs[0]

array([1067, 1068, 1073, 1076, 1077, 1082, 1083, 1084, 1085, 1089, 1091,
       1092, 1094, 1095, 1096, 1100, 1110, 1116, 1119, 1140, 1142, 1151,
       1156, 1166, 1173, 1176, 1177, 1183, 1186, 1190, 1193, 1195, 1197,
       1201, 1204, 1209, 1211, 1212, 1214, 1217, 1219, 1222, 1223, 1227,
       1232, 1248, 1257, 1259, 1264, 1265, 1266, 1272, 1275, 1276, 1277,
       1290, 1293, 1300, 1302, 1303, 1305, 1309, 1310, 1311, 1312, 1315,
       1329, 1330, 1332, 1339, 1340, 1341, 1345, 1347, 1351, 1355, 1360,
       1361, 1364, 1369, 1385, 1386, 1392, 1394, 1396, 1397, 1399, 1401,
       1407, 1415, 1416, 1417, 1420, 1424, 1434, 1435, 1468, 1493, 1516,
       1518, 1519, 1527, 1529, 1532, 1538, 1539, 1552, 1554, 1565, 1567,
       1574, 1591, 1596, 1597, 1599, 1600, 1608, 1630, 1643, 1658, 1660,
       1669, 1674, 1681, 1682, 1684, 1686, 1687, 1689, 1707, 1711, 1732,
       1749, 1750, 1767, 1772, 1773, 1776, 1784, 1787, 1791, 1794, 1797,
       1798, 1802, 1804, 1823, 1836, 1844, 1849, 18

In [27]:
preds = bert_masked_model.predict([masked_test_inputs[0:500]])
mask_positions = np.where(masked_test_inputs[0:500] == mask_token_id)[1]
# print('mask_positions',mask_positions)

hit_rate_counter = 0
for i in  range(len(preds)):
  actual_token = test_y_input[i][mask_positions[i]] 
  # print('actual_token',actual_token)

  pred_mask_pos = preds[i][mask_positions[i]]
  top_pos = list(np.array((tf.math.top_k(pred_mask_pos, k = 10)[1])))
  # print((top_pos))
  if actual_token in top_pos:
    hit_rate_counter +=1



In [28]:
hit_rate_counter

310

In [29]:
# def predictor():
#   preds = bert_masked_model.predict([masked_test_inputs[0:4]])
#   mask_positions = np.where(masked_test_inputs[0:4] == mask_token_id)[1]
#   t = []
#   for i in  range(len(preds)):
#     actual_token = np.where(test_y_input[i] == mask_token_id)[0]
  
#     pred_mask_pos = preds[i][mask_positions[i]]
#     top_pos = list(np.array((tf.math.top_k(pred_mask_pos, k = 6)[1])))
#   t.append(top_pos)
#   return(t)
# predictor()
# preds = bert_masked_model.predict([masked_test_inputs[0:2]])
# mask_positions = np.where(masked_test_inputs[0:2] == mask_token_id)[1]
# mask_positions
# preds[:,mask_positions].shape
# preds.shape
# tf.math.top_k(preds[:,mask_positions], k=5)[1]