In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras.layers import StringLookup
from sklearn.model_selection import train_test_split
import os
import math

In [None]:
# Load the ratings file and keep only its first 50,000 rows.
df = pd.read_csv('train.csv').head(50000)

In [None]:
# Turn the raw ids into prefixed string tokens and make sure ratings are floats.
df["userId"] = df["userId"].map("user_{}".format)
df["itemId"] = df["itemId"].map("item_{}".format)
df["rating"] = df["rating"].map(float)

# Group each user's interactions after ordering the whole frame by date, so
# the per-user lists built below follow that ordering.
ratings_group = df.sort_values(by=["date"]).groupby("userId")

# One row per user; every other column holds that user's full history as a list.
ratings_data = pd.DataFrame(
    data=dict(
        userId=list(ratings_group.groups.keys()),
        itemId=list(ratings_group["itemId"].apply(list)),
        ratings=list(ratings_group["rating"].apply(list)),
        date=list(ratings_group["date"].apply(list)),
    )
)
     

In [None]:
# Windowing configuration: number of items per window and the stride between
# the starts of consecutive windows.
sequence_length, step_size = 6, 2
     

def create_sequences(values, window_size, step_size):
    """Cut ``values`` into windows of ``window_size`` items, one every ``step_size``.

    Windows start at indices 0, step_size, 2 * step_size, ... for as long as a
    full window fits. If the last of those windows does not end on the final
    element, one extra window covering the last ``window_size`` elements is
    appended so the tail of the history is still used. That tail window is
    only added when it differs from the last regular window, so no window is
    emitted twice.

    Args:
        values: Sliceable sequence (e.g. a list) to be windowed.
        window_size: Number of elements per window; must be positive.
        step_size: Distance between consecutive window starts; must be positive.

    Returns:
        A list of slices of ``values``, each of length ``window_size``. Empty
        when ``values`` holds fewer than ``window_size`` elements.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive (the
            sliding loop could never terminate for such values).
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")

    # Largest start index at which a full window still fits.
    last_full_start = len(values) - window_size
    if last_full_start < 0:
        return []

    sequences = [
        values[start:start + window_size]
        for start in range(0, last_full_start + 1, step_size)
    ]

    # The regular windows reach the end of `values` only when the stride lands
    # exactly on `last_full_start`; otherwise add the trailing window.
    if last_full_start % step_size != 0:
        sequences.append(values[-window_size:])
    return sequences


def _to_windows(values):
    """Window one user's history with the notebook-level length and stride."""
    return create_sequences(values, sequence_length, step_size)


# Replace each user's full history with its list of fixed-length windows.
# Item ids and ratings go through the same windowing so they stay aligned.
ratings_data["itemId"] = ratings_data["itemId"].apply(_to_windows)
ratings_data["ratings"] = ratings_data["ratings"].apply(_to_windows)

# The dates are not used after this point.
ratings_data.drop(columns=["date"], inplace=True)




In [None]:

# One row per (user, window): explode the per-user lists of windows. Both
# explodes reset the index, so the two frames line up row by row when they are
# concatenated side by side.
ratings_data_item = ratings_data[["userId", "itemId"]].explode(
    "itemId", ignore_index=True
)
ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True)

ratings_data_transformed = (
    pd.concat([ratings_data_item, ratings_data_rating], axis=1)
    # A user with no windows explodes to a single NaN row; drop those rows.
    .dropna(subset=["ratings"])
    # Serialise every window as a comma-separated string for the CSV files.
    .assign(
        itemId=lambda frame: frame["itemId"].str.join(","),
        ratings=lambda frame: frame["ratings"].apply(
            lambda window: ",".join([str(v) for v in window])
        ),
    )
    .rename(columns={"itemId": "sequence_item_ids", "ratings": "sequence_ratings"})
)

# Random ~80/20 train/test split. The generator is seeded so that re-running
# the notebook writes the same two files.
SPLIT_SEED = 42
random_selection = (
    np.random.default_rng(SPLIT_SEED).random(len(ratings_data_transformed.index))
    <= 0.80
)
train_data = ratings_data_transformed[random_selection]
test_data = ratings_data_transformed[~random_selection]

# Headerless, pipe-delimited files (the windows themselves contain commas).
train_data.to_csv("train_new.csv", index=False, sep="|", header=False)
test_data.to_csv("test_new.csv", index=False, sep="|", header=False)

# Column order of the headerless CSV files written from ratings_data_transformed.
CSV_HEADER = [*ratings_data_transformed.columns]

# Distinct (already prefixed) id strings per categorical column of `df`.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    column: list(df[column].unique()) for column in ("userId", "itemId")
}


def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Build a batched ``tf.data`` pipeline of ``(features, target)`` pairs.

    The file is read as a headerless, ``|``-delimited CSV whose columns are
    named by ``CSV_HEADER``. Each batch is passed through ``process`` below.

    Args:
        csv_file_path: Path of the CSV file to read.
        shuffle: Forwarded to ``make_csv_dataset``.
        batch_size: Number of rows per batch.

    Returns:
        The mapped dataset; it makes a single pass over the file
        (``num_epochs=1``).
    """

    def process(features):
        """Split the serialised windows into model inputs and the target rating."""
        item_id_matrix = tf.strings.split(
            features["sequence_item_ids"], ","
        ).to_tensor()

        # The final item of every window is the one whose rating is predicted;
        # everything before it is the input sequence.
        features["target_item_id"] = item_id_matrix[:, -1]
        features["sequence_item_ids"] = item_id_matrix[:, :-1]

        rating_tokens = tf.strings.split(features["sequence_ratings"], ",")
        rating_matrix = tf.strings.to_number(
            rating_tokens, tf.dtypes.float32
        ).to_tensor()

        # Likewise the final rating is the regression target and the earlier
        # ratings stay in the features.
        features["sequence_ratings"] = rating_matrix[:, :-1]
        return features, rating_matrix[:, -1]

    csv_batches = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        num_epochs=1,
        header=False,
        field_delim="|",
        shuffle=shuffle,
    )
    return csv_batches.map(process)

In [None]:

def encode_input_features(inputs, include_user_id=True, include_user_features=False, include_item_features=False):
    """Embed the model inputs into transformer features and optional side features.

    Args:
        inputs: Mapping from feature name to Keras input tensor. The keys
            "target_item_id", "sequence_item_ids" and "sequence_ratings" are
            always read; "userId" is read when ``include_user_id`` is true.
        include_user_id: Embed the "userId" input as a side feature.
        include_user_features: Also embed every feature listed in the
            module-level ``USER_FEATURES``.
        include_item_features: Concatenate the vector returned by the
            module-level ``item_genres_lookup`` to each item embedding and
            project the result through a dense layer.

    Returns:
        A pair ``(encoded_transformer_features, encoded_other_features)``:
        the first is the encoded item sequence (position embedding added,
        then multiplied by the ratings) concatenated along axis 1 with the
        encoded target item; the second is the embedding of the side features
        (concatenated when there are several) or ``None`` when there are none.
    """
    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_user_id:
        # The key must be "userId": that is the name used both in
        # CATEGORICAL_FEATURES_WITH_VOCABULARY and in the model inputs.
        other_feature_names.append("userId")
    if include_user_features:
        # NOTE(review): USER_FEATURES is not defined in the visible part of
        # the notebook; enabling this flag requires it to exist.
        other_feature_names.extend(USER_FEATURES)

    ## Encode the side (non-sequence) features.
    for feature_name in other_feature_names:
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        # Convert the string input values into integer indices.
        idx = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)(
            inputs[feature_name]
        )
        # Embedding width grows with the square root of the vocabulary size.
        embedding_dims = int(math.sqrt(len(vocabulary)))
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )
        encoded_other_features.append(embedding_encoder(idx))

    # Collapse the list to a single tensor, or None when nothing was encoded.
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    ## Create an item embedding encoder.
    item_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["itemId"]
    item_embedding_dims = int(math.sqrt(len(item_vocabulary)))
    # Create a lookup to convert string values to integer indices.
    item_index_lookup = StringLookup(
        vocabulary=item_vocabulary,
        mask_token=None,
        num_oov_indices=0,
        name="item_index_lookup",
    )
    item_embedding_encoder = layers.Embedding(
        input_dim=len(item_vocabulary),
        output_dim=item_embedding_dims,
        name="item_embedding",
    )
    # Only applied when include_item_features is true.
    item_embedding_processor = layers.Dense(
        units=item_embedding_dims,
        activation="relu",
        name="process_item_embedding_with_genres",
    )

    def encode_item(item_id):
        """Embed item id strings with the lookup and embedding shared by all items."""
        # Convert the string input values into integer indices.
        item_idx = item_index_lookup(item_id)
        item_embedding = item_embedding_encoder(item_idx)
        encoded_item = item_embedding
        if include_item_features:
            # NOTE(review): item_genres_lookup is not defined in the visible
            # part of the notebook; enabling this flag requires it to exist.
            item_genres_vector = item_genres_lookup(item_idx)
            encoded_item = item_embedding_processor(
                layers.concatenate([item_embedding, item_genres_vector])
            )
        return encoded_item

    ## Encode the target item.
    encoded_target_item = encode_item(inputs["target_item_id"])

    ## Encode the sequence of item ids.
    encoded_sequence_items = encode_item(inputs["sequence_item_ids"])

    # Create positional embeddings for the sequence_length - 1 input positions.
    position_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=item_embedding_dims,
        name="position_embedding",
    )
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encoded_positions = position_embedding_encoder(positions)

    # Add a trailing axis so the ratings broadcast over the embedding axis.
    sequence_ratings = tf.expand_dims(inputs["sequence_ratings"], -1)

    # Add the position embedding to each item embedding, then multiply by its rating.
    encoded_sequence_items_with_position_and_rating = layers.Multiply()(
        [(encoded_sequence_items + encoded_positions), sequence_ratings]
    )

    # Construct the transformer inputs: one slice per sequence position,
    # followed by the encoded target item.
    for encoded_item in tf.unstack(
        encoded_sequence_items_with_position_and_rating, axis=1
    ):
        encoded_transformer_features.append(tf.expand_dims(encoded_item, 1))
    encoded_transformer_features.append(encoded_target_item)

    encoded_transformer_features = layers.concatenate(
        encoded_transformer_features, axis=1
    )

    return encoded_transformer_features, encoded_other_features
    
    
    
    
def create_model_inputs():
    """Declare the Keras ``Input`` placeholders, keyed by feature name.

    The two sequence inputs have ``sequence_length - 1`` entries; the user id
    and the target item id have a single entry each.
    """
    history_length = sequence_length - 1
    input_specs = (
        ("userId", (1,), tf.string),
        ("sequence_item_ids", (history_length,), tf.string),
        ("target_item_id", (1,), tf.string),
        ("sequence_ratings", (history_length,), tf.float32),
    )
    return {
        feature: layers.Input(name=feature, shape=shape, dtype=dtype)
        for feature, shape, dtype in input_specs
    }
     

    
# Feature switches that create_model forwards to encode_input_features.
include_user_id = include_user_features = include_item_features = False

# Hyperparameters read by create_model.
hidden_units = [256, 128]  # widths of the fully-connected layers, in order
dropout_rate = 0.1
num_heads = 3  # attention heads

In [None]:
def create_model():
    """Assemble the Keras model: one self-attention block followed by an MLP head.

    Reads the notebook-level settings ``include_user_id``,
    ``include_user_features``, ``include_item_features``, ``num_heads``,
    ``dropout_rate`` and ``hidden_units``.

    Returns:
        An uncompiled ``keras.Model`` mapping the inputs from
        ``create_model_inputs`` to a single-unit linear output.
    """
    model_inputs = create_model_inputs()
    sequence_features, side_features = encode_input_features(
        model_inputs, include_user_id, include_user_features, include_item_features
    )

    # Multi-headed self-attention: the sequence attends to itself.
    embedding_width = sequence_features.shape[2]
    self_attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embedding_width, dropout=dropout_rate
    )
    attended = self_attention(sequence_features, sequence_features)
    attended = layers.Dropout(dropout_rate)(attended)

    # First residual connection and normalisation.
    residual = layers.Add()([sequence_features, attended])
    residual = layers.LayerNormalization()(residual)

    # Activation, same-width dense projection and dropout.
    projected = layers.LeakyReLU()(residual)
    projected = layers.Dense(units=projected.shape[-1])(projected)
    projected = layers.Dropout(dropout_rate)(projected)

    # Second residual connection and normalisation, then flatten for the head.
    block_output = layers.Add()([residual, projected])
    block_output = layers.LayerNormalization()(block_output)
    head = layers.Flatten()(block_output)

    # Append the side features (if any) to the flattened sequence features.
    if side_features is not None:
        flat_side_features = layers.Reshape([side_features.shape[-1]])(side_features)
        head = layers.concatenate([head, flat_side_features])

    # Fully-connected stack: Dense -> BatchNorm -> LeakyReLU -> Dropout per width.
    for width in hidden_units:
        head = layers.Dense(width)(head)
        head = layers.BatchNormalization()(head)
        head = layers.LeakyReLU()(head)
        head = layers.Dropout(dropout_rate)(head)

    rating_prediction = layers.Dense(units=1)(head)
    return keras.Model(inputs=model_inputs, outputs=rating_prediction)

In [None]:
# Build the (still uncompiled) Keras model; create_model reads the feature
# flags and hyperparameters defined at notebook level.
model = create_model()


In [None]:
# Regression set-up: mean squared error as the loss, mean absolute error
# reported as the metric, optimised with Adagrad.
adagrad_optimizer = keras.optimizers.Adagrad(learning_rate=0.01)
model.compile(
    optimizer=adagrad_optimizer,
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.MeanAbsoluteError()],
)

# Stream the training windows (shuffled) and fit for five passes.
# NOTE(review): batch_size=265 may have been meant as 256 - confirm.
train_dataset = get_dataset_from_csv("train_new.csv", batch_size=265, shuffle=True)
model.fit(train_dataset, epochs=5)

# Held-out windows, read in file order with the same batch size.
test_dataset = get_dataset_from_csv("test_new.csv", batch_size=265)