## IMPORT

In [1]:
# Install the extra packages this notebook needs (quiet mode).
# NOTE(review): no versions are pinned, so a re-run may pull newer releases.
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[K     |████████████████████████████████| 85 kB 3.8 MB/s 
[K     |████████████████████████████████| 462 kB 56.5 MB/s 
[K     |████████████████████████████████| 4.2 MB 18.2 MB/s 
[?25h

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd

import tensorflow_recommenders as tfrs

# getting data
from google.colab import auth
import gspread
from google.auth import default
from gspread_dataframe import get_as_dataframe, set_with_dataframe

# Authenticate the Colab user, fetch the default Google credentials, and
# build the gspread client (`gc`) used later to open Google Sheets.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [3]:
# Switch to the project folder so the local dataset modules can be imported.
# NOTE(review): hard-coded, machine-specific Drive path — adjust per environment.
cd /content/drive/Othercomputers/My Laptop/Bangkit/Capstone/Recommender system/Notogo-ML/model_v2.0 

/content/drive/Othercomputers/My Laptop/Bangkit/Capstone/Recommender system/Notogo-ML/model_v2.0


In [4]:
# Print the current working directory to confirm the `cd` above took effect.
print(os.getcwd())

/content/drive/Othercomputers/My Laptop/Bangkit/Capstone/Recommender system/Notogo-ML/model_v2.0


## Preparing the dataset

In [5]:
# Import the local `userFeatures` module — presumably this registers the
# 'Userfeatures' dataset builder with TFDS; confirm in userFeatures.py.
import userFeatures
# NOTE(review): `builder` is never read in the visible notebook.
builder = tfds.builder('Userfeatures')
# Load the 'train' split of the user-feature dataset.
userFeatureDs = tfds.load('Userfeatures',split='train')

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/userfeatures/1.0.0...[0m




Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/userfeatures/1.0.0.incompleteL6LTU4/userfeatures-train.tfrecord*...:   0%|…

[1mDataset userfeatures downloaded and prepared to /root/tensorflow_datasets/userfeatures/1.0.0. Subsequent calls will reuse this data.[0m


In [6]:
# Import the local `wishEmbedding` module — presumably this registers the
# 'Wishembedding' dataset builder with TFDS; confirm in wishEmbedding.py.
import wishEmbedding
# NOTE(review): `builder` is never read in the visible notebook.
builder = tfds.builder('Wishembedding')
# Load the 'train' split of the wish-embedding dataset.
wishEmbeddingDs = tfds.load('Wishembedding',split='train')

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/wishembedding/1.0.0...[0m




Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/wishembedding/1.0.0.incomplete1GN10E/wishembedding-train.tfrecord*...:   0…

[1mDataset wishembedding downloaded and prepared to /root/tensorflow_datasets/wishembedding/1.0.0. Subsequent calls will reuse this data.[0m


In [7]:
# Keep only the three fields the model consumes: the location name, the user
# id, and the `add` field (used as the rating label in the model below).
ratings = userFeatureDs.map(lambda x: {
    "location_name": x["location_name"],
    "user_id": x["user_id"],
    "add" : x["add"]
})
# Candidate set for retrieval: just the location names.
locations = wishEmbeddingDs.map(lambda x: x["location_name"])

In [8]:
# Peek at the first two interaction records as plain NumPy/Python values.
for x in ratings.take(2).as_numpy_iterator():
  pprint.pprint(x)

{'add': 0, 'location_name': b'JAKARTA', 'user_id': b'93'}
{'add': 0, 'location_name': b'DUBAI', 'user_id': b'112'}


In [9]:
# Fix the global TensorFlow seed so the split below is reproducible.
tf.random.set_seed(42)

# Number of interaction records. `len()` raises if the dataset's cardinality
# is unknown or infinite, so a bad pipeline fails here instead of silently.
NUM_DATA = len(ratings)

# Shuffle once with a fixed order (no reshuffling between iterations) so the
# train and test subsets below never overlap.
shuffled = ratings.shuffle(NUM_DATA, seed=42, reshuffle_each_iteration=False)

# 80/20 split. `take`/`skip` expect an integer element count, so the size is
# truncated to an int explicitly instead of passing a float.
trainset_size = int(0.8 * NUM_DATA)

train = shuffled.take(trainset_size)
# Everything after the training prefix is the test set.
test = shuffled.skip(trainset_size)

# Build the vocabularies used by the StringLookup layers of the model.
location_name = locations.batch(1000)
user_ids = ratings.batch(1000).map(lambda x: x["user_id"])

unique_location_name = np.unique(np.concatenate(list(location_name)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

## MODEL

There are two critical parts to multi-task recommenders:

1. They optimize for two or more objectives, and so have two or more losses.
2. They share variables between the tasks, allowing for transfer learning.

In [10]:
class NoToGoModel(tfrs.models.Model):
  """Multi-task recommender combining a rating task and a retrieval task.

  A user tower and a location tower each map a string id to an embedding.
  The two embeddings feed both a retrieval task and a small rating network
  whose scalar output is compared with the `add` label. The final loss is a
  weighted sum of the two task losses.

  Args:
    rating_weight: Multiplier applied to the rating (ranking) loss.
    retrieval_weight: Multiplier applied to the retrieval loss.
  """

  def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
    # We take the loss weights in the constructor: this allows us to instantiate
    # several model objects with different loss weights.

    super().__init__()

    embedding_dimension = 32

    # Location and user towers. Both end in a Dense layer of the same width so
    # the retrieval task can compare their outputs.
    self.location_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_location_name, mask_token=None),
      # +1 row for the out-of-vocabulary index added by StringLookup.
      tf.keras.layers.Embedding(len(unique_location_name) + 1, embedding_dimension),
      tf.keras.layers.Dense(embedding_dimension, activation="relu")
    ])
    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      # +1 row for the out-of-vocabulary index added by StringLookup.
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
      tf.keras.layers.Dense(embedding_dimension, activation="relu")
    ])

    # A small model to take in user and location embeddings and predict the
    # `add` label. The sigmoid keeps the scalar prediction in (0, 1).
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1, activation = "sigmoid"),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            # Candidates are all locations, embedded by the location tower.
            candidates=locations.batch(128).map(self.location_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    """Embed a batch and predict its ratings.

    Args:
      features: Batch dict; reads the "user_id" and "location_name" entries.

    Returns:
      A tuple `(user_embeddings, location_embeddings, rating_predictions)`.
    """
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the location features and pass them into the location model.
    location_embeddings = self.location_model(features["location_name"])

    return (
        user_embeddings,
        location_embeddings,
        # We apply the multi-layered rating model to a concatenation of
        # user and location embeddings.
        self.rating_model(
            tf.concat([user_embeddings,location_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the weighted sum of the rating loss and the retrieval loss.

    Args:
      features: Batch dict with "user_id", "location_name" and the "add" label.
      training: Unused here; kept for the Keras/TFRS training-loop signature.

    Returns:
      `rating_weight * rating_loss + retrieval_weight * retrieval_loss`.
    """
    # Read the label without popping it, so the caller's batch dict is left
    # untouched, and hand the model only the inputs it actually consumes.
    ratings = features["add"]
    model_inputs = {
        "user_id": features["user_id"],
        "location_name": features["location_name"],
    }

    user_embeddings, location_embeddings, rating_predictions = self(model_inputs)

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )
    retrieval_loss = self.retrieval_task(user_embeddings, location_embeddings)

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)

### SKIP (optional scratch cells for inspecting the lookup layers)

In [11]:
# Scratch copy of the location tower (lookup -> embedding -> dense) built
# outside the model class for manual inspection.
testModel = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_location_name, mask_token=None),
      tf.keras.layers.Embedding(len(unique_location_name) + 1, 32),
      tf.keras.layers.Dense(32, activation="relu")
])

In [12]:
# Stand-alone lookup layer over the location vocabulary, used to check the
# string -> integer index mapping by hand.
cek = tf.keras.layers.StringLookup(vocabulary=unique_location_name, mask_token=None)

In [None]:
# Display the full location vocabulary (sorted, unique byte strings).
unique_location_name

array([b'AIR TERJUN SIPISO-PISO', b'AKIHABARA', b'ALASKA',
       b'AMERIKA SERIKAT', b'ANCOL', b'ASIA TIMUR', b'BALI',
       b'BALI TOWER', b'BANDUNG', b'BANGKA BELITUNG', b'BHUTAN',
       b'BRANCHSTO BSD', b'CANDI BOROBUDUR', b'CANDI PRAMBANAN',
       b'CANGGU', b'CAPADOCIA', b'DANAU SALAR DE UYUNI', b'DANAU TOBA',
       b'DENMARK', b'DIENG', b'DISNEY SEA', b'DISNEYLAND PARIS', b'DUBAI',
       b'DUFAN', b'EROPA', b'ES KRIM RAGUSA', b'EUROPE',
       b'GAMPLONG STUDIO', b'GEOPARK CILETUH', b'GUA GONG',
       b'GUA TABUHAN', b'GUMUK PASIR PARANGKUSUMO', b'GUNUNG',
       b'GUNUNG AGUNG', b'GUNUNG BROMO', b'GUNUNG FUJI', b'GUNUNG GEDE',
       b'GUNUNG PANGRANGO', b'GUNUNG PARANG', b'GUNUNG RINJANI',
       b'GUNUNG SEMERU', b'INDONESIA', b'ISRAEL', b'JAKARTA',
       b'JAWA BARAT', b'JEPANG', b'JERMAN', b'KATULAMPA RAFTING BOGOR',
       b'KENDARI', b'KOREA SELATAN', b'LABUAN BAJO', b'LAMPUNG',
       b'LOMBOK', b'LONDON', b'MALANG', b'MALAYSIA', b'MALDIVES',
       b'MALIOBORO',

In [None]:
# Look up two known locations; index 0 is reserved for out-of-vocabulary
# strings, so vocabulary entries start at 1.
cek(tf.constant([['GUNUNG BROMO','DANAU TOBA']])) # 0 means out of vocabulary

<tf.Tensor: shape=(1, 2), dtype=int64, numpy=array([[35, 18]])>

### Rating-specialized model

Depending on the weights we assign, the model will encode a different balance of the tasks. Let's start with a model that only considers ratings.

In [20]:
# Rating-only configuration: the retrieval loss is multiplied by 0, so it
# contributes nothing to the total loss.
model = NoToGoModel(rating_weight=1.0, retrieval_weight=0.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [11]:
# Shuffle and batch the training split, batch the test split, and cache both.
# NOTE(review): `.cache()` comes after `.shuffle()`, so the cached batch order
# is presumably replayed unchanged every epoch — confirm this is intended.
cached_train = train.shuffle(NUM_DATA).batch(512).cache()
cached_test = test.batch(256).cache()

In [15]:
# Train for 3 epochs, then evaluate on the held-out batches.
model.fit(cached_train, epochs=3)
metrics = model.evaluate(cached_test, return_dict=True)

# Report one metric per task: top-100 retrieval accuracy and rating RMSE.
print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.826.
Ranking RMSE: 0.446.


### Retrieval-specialized model

Let's now try a model that focuses on retrieval only.

In [12]:
# Retrieval-only configuration: the rating loss is multiplied by 0, so it
# contributes nothing to the total loss.
model = NoToGoModel(rating_weight=0.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [13]:
# Train for 10 epochs, then evaluate on the held-out batches.
model.fit(cached_train, epochs=10)
metrics = model.evaluate(cached_test, return_dict=True)

# Report one metric per task: top-100 retrieval accuracy and rating RMSE.
print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Retrieval top-100 accuracy: 0.945.
Ranking RMSE: 0.511.


In [14]:
# Re-run evaluation and display every metric as a dict.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_100_categorical_accuracy': 0.9453850388526917,
 'factorized_top_k/top_10_categorical_accuracy': 0.23539049923419952,
 'factorized_top_k/top_1_categorical_accuracy': 0.10067358613014221,
 'factorized_top_k/top_50_categorical_accuracy': 0.6379027962684631,
 'factorized_top_k/top_5_categorical_accuracy': 0.1452758014202118,
 'loss': 558.7587280273438,
 'regularization_loss': 0,
 'root_mean_squared_error': 0.5109630823135376,
 'total_loss': 558.7587280273438}

In [15]:
# Brute-force top-K layer that uses the trained user tower as its query model.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# Index all locations as (location name, location embedding) pairs, so
# queries return location names rather than row numbers.
index.index_from_dataset(
  tf.data.Dataset.zip((locations.batch(100), locations.batch(100).map(model.location_model)))
)

# Get recommendations for one hard-coded user id and print the top 7.
user_id = "183"
_, titles = index(tf.constant([user_id]))
print(f"Recommendations for Dimas : {titles[0, :7]}")

Recommendations for Dimas : [b'GUNUNG FUJI' b'WAKATOBI' b'PRANCIS' b'LABUAN BAJO' b'TAWANGMANGU'
 b'BALI TOWER' b'NUSA TENGGARA BARAT']


### Joint model

Let's now train a model that assigns positive weights to both tasks.

In [None]:
# Joint configuration: both task losses contribute with equal weight.
# The model class in this notebook is `NoToGoModel`; `MovielensModel` is not
# defined here and raised a NameError on a fresh kernel.
model = NoToGoModel(rating_weight=1.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [None]:
# Train for 3 epochs, then evaluate on the held-out batches.
model.fit(cached_train, epochs=3)
metrics = model.evaluate(cached_test, return_dict=True)

# Report one metric per task: top-100 retrieval accuracy and rating RMSE.
print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.965.
Ranking RMSE: 0.447.


## INTERFACE FOR NEW USER

In [16]:
# Open the spreadsheet once and reuse the handle for both worksheets instead
# of issuing a second, identical `gc.open` request.
capstone_sheet = gc.open('capstone_dataset')

# Category list: the first row is the header, and blank cells are dropped.
worksheetCat = capstone_sheet.worksheet('category')
rows = worksheetCat.get_all_values()
dfCategories = pd.DataFrame.from_records(rows[1:], columns=rows[0])
list_categories = [i for i in dfCategories['category'] if i != '']

# Existing user-feature rows from the cold-start worksheet (header in row 1).
worksheetUserFeat = capstone_sheet.worksheet('userFeatures(coldstartsol)')
rows = worksheetUserFeat.get_all_values()
dfUserFeat = pd.DataFrame.from_records(rows[1:], columns=rows[0])

In [None]:
# The id for the next new user is one past the user_id in the sheet's last row.
last_user_id = int(dfUserFeat['user_id'].iloc[-1]) + 1

def input_user_data():
  """Interactively collect a new user's preferred categories.

  Reads the module-level `last_user_id` (the id assigned to this user) and
  `list_categories` (one yes/no prompt per entry). An answer other than
  "0" or "1" triggers a re-prompt instead of raising or being counted as "no".

  Returns:
    A tuple `(user_data, next_user_id)`, where `user_data` is a dict with the
    keys "user_id" (str) and "categories" (list of chosen category names), and
    `next_user_id` is `last_user_id + 1`.
  """
  user_data = {}
  # input_name = input("Nama: ")
  user_data['user_id'] = str(last_user_id)
  next_user_id = last_user_id + 1
  user_categories = []
  for category in list_categories:
    prompt = "{} ? (1: yes, 0: no) : ".format(category)
    answer = input(prompt).strip()
    # Keep asking until the answer is exactly one of the two valid options.
    while answer not in ("0", "1"):
      print("Please answer with 1 (yes) or 0 (no).")
      answer = input(prompt).strip()
    if answer == "1":
      user_categories.append(category)
  user_data["categories"] = user_categories
  return user_data, next_user_id
# Prompt for the new user's categories and advance `last_user_id` to the
# value returned by the function.
user_data, last_user_id = input_user_data()

In [62]:
def encode_dummy_category(location):
  """Return the placeholder location id for a category name.

  The match is exact and case-sensitive. Any value that matches no known
  category yields None.
  """
  # (category name, placeholder id) pairs, checked in order.
  category_ids = (
      ("beach", 121),
      ("mountain", 123),
      ("lake", 124),
      ("zoo", 125),
      ("river", 126),
      ("conservation", 127),
      ("waterpark", 128),
      ("waterfall", 129),
      ("artGallery", 130),
      ("amusementPark", 131),
      ("mall", 132),
      ("HistoricalPlace", 133),
      ("religious", 134),
      ("outbond", 135),
      ("culinary", 136),
      ("photoHunting", 137),
      ("sightSeeing", 138),
      ("shopping", 139),
  )
  for name, code in category_ids:
    if location == name:
      return code
  return None

In [81]:
def create_data_frame(user_data, dfUserFeatcold):
  """Build the new user's feature rows and append them to the existing frame.

  One row is produced per entry of the module-level `list_categories`. The
  "add" column is "1" for categories listed in `user_data['categories']` and
  "0" otherwise; "like" is always '0'. The category name fills both the
  "category" and "location" columns, and "location_id" comes from
  `encode_dummy_category`.

  Returns:
    A tuple `(df, df_concat)`: the new user's rows alone, and those rows
    appended to `dfUserFeatcold` with a fresh integer index.
  """
  columns = {
      'user_id': [user_data['user_id'] for _ in list_categories],
      'like': ['0'] * len(list_categories),
      "add": ["1" if name in user_data['categories'] else "0"
              for name in list_categories],
      "category": list_categories,
      "location": list_categories,
      "location_id": [encode_dummy_category(name) for name in list_categories],
  }
  new_rows = pd.DataFrame(columns)
  combined = pd.concat([dfUserFeatcold, new_rows], ignore_index=True)
  return new_rows, combined


In [84]:
# Build the new user's rows and the combined frame (existing rows + new rows).
df, df_concat = create_data_frame(user_data, dfUserFeat)
# Write the combined frame to the 'userFeatures' worksheet.
# NOTE(review): the existing rows were read from 'userFeatures(coldstartsol)'
# but the result is written to 'userFeatures' — confirm this is intended.
worksheetUser = gc.open('capstone_dataset').worksheet('userFeatures')
set_with_dataframe(worksheetUser, df_concat)