### Na początku brakowało linijki z `adapt` dla timestamps, co powodowało, że jedna z wartości wektora była rzędu e+08, podczas gdy pozostałe mieściły się w przedziale (-1, 1)

In [1]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann

[K     |████████████████████████████████| 85 kB 2.6 MB/s 
[K     |████████████████████████████████| 4.0 MB 4.3 MB/s 
[K     |████████████████████████████████| 10.6 MB 6.4 MB/s 
[?25h

In [2]:
import os
import pprint
import tempfile
import pickle
from collections import Counter

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [3]:
import tensorflow_recommenders as tfrs

In [4]:
# Load the MovieLens 100k interaction log and the movie catalogue (train split).
ratings = tfds.load('movielens/100k-ratings', split="train")
movies = tfds.load('movielens/100k-movies', split="train")

# Keep only the features used further down: interaction records retain the
# title, the user id and the rating; the catalogue is reduced to bare titles.
ratings = ratings.map(lambda example: {
    feature: example[feature]
    for feature in ("movie_title", "user_id", "user_rating")
})
movies = movies.map(lambda example: example["movie_title"])

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling movielens-train.tfrecord...:   0%|          | 0/100000 [00:00<?, ? examples/s]

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m
[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /root/tensorflow_datasets/movielens/100k-movies/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1682 [00:00<?, ? examples/s]

Shuffling movielens-train.tfrecord...:   0%|          | 0/1682 [00:00<?, ? examples/s]

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-movies/0.1.0. Subsequent calls will reuse this data.[0m


In [5]:
# Seed TensorFlow and shuffle once with a fixed order, so the train/test
# split below is the same on every iteration over `shuffled`.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000, seed=42, reshuffle_each_iteration=False)

# The first 80k shuffled interactions are for training, the next 20k for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Batch the raw features so they can be collected into NumPy arrays.
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda batch: batch["user_id"])

# Sorted, de-duplicated vocabularies of user ids and movie titles.
unique_user_ids = np.unique(np.concatenate(list(user_ids)))
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))

In [6]:
# Lookup layer shared by the user tower; its vocabulary is learned from the
# user ids of every record in `ratings`.
user_id_lookup = tf.keras.layers.StringLookup()
user_id_lookup.adapt(ratings.map(lambda rating: rating["user_id"]))

In [7]:
class MovieModel(tf.keras.Model):
  """Embeds a movie title in two ways and concatenates the results.

  One branch looks the whole title up in `unique_movie_titles` and embeds its
  index; the other tokenizes the title text, embeds every token and averages
  the token embeddings. Both branches use 32-dimensional embeddings.
  """

  def __init__(self):
    super().__init__()

    text_vocabulary_cap = 10000
    embedding_width = 32

    # Branch 1: whole-title lookup. The table has one row more than the
    # vocabulary (presumably for the out-of-vocabulary index).
    whole_title_layers = [
        tf.keras.layers.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None),
        tf.keras.layers.Embedding(
            len(unique_movie_titles) + 1, embedding_width),
    ]
    self.title_embedding = tf.keras.Sequential(whole_title_layers)

    # Branch 2: token-level embedding of the title text, averaged per title.
    self.title_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=text_vocabulary_cap)

    token_layers = [
        self.title_vectorizer,
        tf.keras.layers.Embedding(
            text_vocabulary_cap, embedding_width, mask_zero=True),
        tf.keras.layers.GlobalAveragePooling1D(),
    ]
    self.title_text_embedding = tf.keras.Sequential(token_layers)

    # Learn the token vocabulary from the module-level `movies` dataset.
    self.title_vectorizer.adapt(movies)

  def call(self, inputs):
    """Returns both title embeddings joined along axis 1."""
    whole_title = self.title_embedding(inputs)
    averaged_tokens = self.title_text_embedding(inputs)
    return tf.concat([whole_title, averaged_tokens], axis=1)

In [8]:
class CandidateModel(tf.keras.Model):
  """Movie tower: `MovieModel` embeddings followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Builds the movie tower.

    Args:
      layer_sizes:
        A list of integers; the i-th entry is the number of units in the
        i-th dense layer. Every layer except the last uses a ReLU
        activation; the last one is linear.
    """
    super().__init__()

    self.embedding_model = MovieModel()
    self.dense_layers = tf.keras.Sequential()

    final_position = len(layer_sizes) - 1
    for position, units in enumerate(layer_sizes):
      # Only the hidden layers are activated; the output layer stays linear.
      activation = "relu" if position < final_position else None
      self.dense_layers.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Embeds `inputs` and passes the embedding through the dense stack."""
    return self.dense_layers(self.embedding_model(inputs))

In [10]:
class UserModel(tf.keras.Model):
  """Embeds raw user ids using the module-level `user_id_lookup` vocabulary."""

  def __init__(self):
    super().__init__()

    self.user_embedding = tf.keras.Sequential([
        user_id_lookup,
        # `vocabulary_size()` is the supported accessor; `vocab_size()` is a
        # deprecated alias of it.
        tf.keras.layers.Embedding(user_id_lookup.vocabulary_size(), 32),
    ])

  def call(self, inputs):
    """Returns the 32-dimensional embedding of each user id in `inputs`."""
    # The concat wraps a single tensor; it is kept so further user features
    # can be appended to the list later.
    return tf.concat([
        self.user_embedding(inputs)
    ], axis=1)

In [11]:
class QueryModel(tf.keras.Model):
  """User tower: `UserModel` embeddings followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Builds the user tower.

    Args:
      layer_sizes:
        A list of integers; the i-th entry is the number of units in the
        i-th dense layer. Every layer except the last uses a ReLU
        activation; the last one is linear.
    """
    super().__init__()

    self.embedding_model = UserModel()
    self.dense_layers = tf.keras.Sequential()

    final_position = len(layer_sizes) - 1
    for position, units in enumerate(layer_sizes):
      # Only the hidden layers are activated; the output layer stays linear.
      activation = "relu" if position < final_position else None
      self.dense_layers.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Embeds `inputs` and passes the embedding through the dense stack."""
    return self.dense_layers(self.embedding_model(inputs))

In [68]:
class MovielensModel(tfrs.models.Model):
  """Two-tower model that predicts a user's rating for a movie.

  The user and movie towers each produce a 16-dimensional embedding; a small
  dense network maps their concatenation to a scalar rating prediction.
  Only the ranking task is active: the retrieval task is commented out, so
  `retrieval_weight` is stored but does not contribute to the loss.
  """

  def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
    """Builds the towers, the rating head and the ranking task.

    Args:
      rating_weight: Multiplier applied to the ranking (rating) loss.
      retrieval_weight: Multiplier intended for the retrieval loss; unused
        while the retrieval task is disabled.
    """
    super().__init__()

    # User and movie models.
    self.movie_model = CandidateModel([32,16])
    self.user_model = QueryModel([32, 16])
    self.movies = movies

    # A small model to take in user and movie embeddings and predict ratings.
    # We can make this as complicated as we want as long as we output a scalar
    # as our prediction.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    # self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
    #     metrics=tfrs.metrics.FactorizedTopK(
    #         candidates=movies.batch(128).map(self.movie_model)
    #     )
    # )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    """Runs both towers and the rating head.

    Args:
      features: Mapping that contains at least "user_id" and "movie_title".

    Returns:
      A tuple `(user_embeddings, movie_embeddings, rating_predictions)`.
    """
    # We pick out the user features and pass them into the user model.
    # user_embeddings = self.user_model({
    #     "user_id": features["user_id"],
    #     # "timestamp": features["timestamp"],
    # })
    user_embeddings = self.user_model(features['user_id'])
    # And pick out the movie features and pass them into the movie model.
    movie_embeddings = self.movie_model(features["movie_title"])

    return (
        user_embeddings,
        movie_embeddings,
        # We apply the multi-layered rating model to a concatentation of
        # user and movie embeddings.
        self.rating_model(
            tf.concat([user_embeddings, movie_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Computes the weighted rating loss for one batch.

    Args:
      features: Mapping with "user_id", "movie_title" and the "user_rating"
        label. The mapping is left unmodified.
      training: Accepted for interface compatibility; not used here.

    Returns:
      The ranking loss multiplied by `rating_weight`.
    """
    # Read the label without popping it, so the caller's dict is not mutated;
    # the model still receives the same inputs as before (everything but the
    # label).
    ratings = features["user_rating"]
    model_inputs = {
        name: value for name, value in features.items()
        if name != "user_rating"
    }

    user_embeddings, movie_embeddings, rating_predictions = self(model_inputs)

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )
    # retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings, compute_metrics=not training)

    # And combine them using the loss weights.
    # return (self.rating_weight * rating_loss
    #         + self.retrieval_weight * retrieval_loss)
    return (self.rating_weight * rating_loss)

In [69]:
# Give the rating loss full weight and the retrieval loss none.
model = MovielensModel(retrieval_weight=0.0, rating_weight=1.0)
#model.compile(optimizer=tf.keras.optimizers.Adagrad(0.005))
# Adam with its default settings is the optimizer currently in use.
model.compile(optimizer=tf.keras.optimizers.Adam())





In [70]:
# Training batches: shuffle, batch by 8192, then cache the batched result.
cached_train = (
    train
    .shuffle(buffer_size=100_000)
    .batch(batch_size=8192)
    .cache()
)
# Evaluation batches: no shuffling, batches of 4096, cached as well.
cached_test = test.batch(batch_size=4096).cache()

In [71]:
# Train for a single epoch, validating against the cached test batches.
one_layer_history = model.fit(
    x=cached_train,
    epochs=1,
    validation_data=cached_test,
    # validation_freq=5,
    verbose=1,
)
# metrics = model.evaluate(cached_test, return_dict=True)

# print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
# print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")



In [72]:
# Export the trained model in SavedModel format to the local "export" directory.
tf.saved_model.save(obj=model, export_dir="export")



INFO:tensorflow:Assets written to: export/assets


INFO:tensorflow:Assets written to: export/assets


In [25]:
class InferenceMovieModel:
  """Two-stage recommender: ScaNN retrieval followed by rating-based re-ranking.

  Stage one asks a ScaNN index (queried through the base model's user tower)
  for `n_ret` candidate titles. Stage two scores every candidate with the
  base model's rating head and keeps the `n_ran` best.
  """

  def __init__(self, base_model:MovielensModel, movies, n_ran:int = 5, n_ret:int = 150) -> None:
    """Builds the ScaNN index over the movie catalogue.

    Args:
      base_model: Trained model providing `user_model`, `movie_model` and
        rating predictions.
      movies: Dataset of movie titles to index.
      n_ran: Number of re-ranked recommendations to return.
      n_ret: Number of candidates to retrieve from the index.
    """
    self.base_model = base_model
    self.scann_index = tfrs.layers.factorized_top_k.ScaNN(base_model.user_model)
    # Embed the candidates with the model that was passed in, not with a
    # notebook-global `model`, so the wrapper works for any base model.
    batched_titles = movies.batch(100)
    self.scann_index.index_from_dataset(
        tf.data.Dataset.zip(
            (batched_titles, batched_titles.map(base_model.movie_model))))
    self.n_ran = n_ran
    self.n_ret = n_ret

  def call(self, id:str) -> list:
    """Recommends movies for one user.

    Args:
      id: Raw user id, e.g. '11'.

    Returns:
      Up to `n_ran` `(title, predicted_rating)` pairs, ordered from the
      highest predicted rating to the lowest.
    """
    retrieved = self.scann_index(tf.constant([id]), self.n_ret)
    titles = retrieved[1].numpy().flatten().tolist()
    if not titles:
      return []

    # Score all candidates in a single forward pass, using rank-1 batches
    # of user ids and titles like the batches the model was trained on.
    predictions = self.base_model({
        "user_id": tf.constant([id] * len(titles)),
        "movie_title": tf.constant(titles),
    })
    scores = predictions[2].numpy().reshape(-1).tolist()

    # Duplicate titles collapse to one entry (the last score wins).
    ranking = Counter(dict(zip(titles, scores)))
    return ranking.most_common(self.n_ran)

  def __call__(self, id:str) -> list:
    """Makes the wrapper callable, so `obj('11')` is the same as `obj.call('11')`."""
    return self.call(id)

In [26]:
# Wrap the trained model and index the movie catalogue, keeping the default
# retrieval and re-ranking sizes.
obj = InferenceMovieModel(base_model=model, movies=movies)

In [22]:
# NOTE(review): the recorded output of this cell is a NameError, and the
# InferenceMovieModel class in this notebook defines no `saved_models`
# attribute. This looks like a leftover experiment; TODO confirm the intended
# export call or remove the cell.
obj.saved_models.save('./m')

NameError: ignored

In [None]:
# Print every record in `ratings` whose user id is '11'.
for x in ratings.filter(
    lambda rating: rating['user_id'] == '11').as_numpy_iterator():
  print(x)

In [None]:
%%time
# Time one recommendation request for user '11'.
# NOTE(review): this invokes `obj` directly, so it relies on
# InferenceMovieModel instances being callable — TODO confirm.
a = obj('11')

In [None]:
a

In [27]:
# Try to persist the whole inference wrapper with pickle.
# NOTE(review): the recorded output of this cell is a FailedPreconditionError.
# Presumably pickle cannot serialize the TensorFlow objects held by `obj`;
# consider exporting with tf.saved_model.save instead — TODO confirm.
# NOTE(review): the target is a hardcoded Google Drive path, which presumably
# requires Drive to be mounted beforehand — TODO confirm.
with open('/content/drive/MyDrive/models/model_pkl', 'wb') as files:
  pickle.dump(obj, files)





FailedPreconditionError: ignored