<a href="https://colab.research.google.com/github/louis-not/Notogo-ML/blob/main/model_v2.2/Retrieval_Model_Ver_2_(Multitask_recommender)_Experimental.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMPORT

In [1]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[K     |████████████████████████████████| 85 kB 2.8 MB/s 
[K     |████████████████████████████████| 462 kB 37.7 MB/s 
[K     |████████████████████████████████| 4.2 MB 5.2 MB/s 
[?25h

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd

import tensorflow_recommenders as tfrs

# getting data
from google.colab import auth
import gspread
from google.auth import default
from gspread_dataframe import get_as_dataframe, set_with_dataframe

auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd /content/drive/MyDrive/Bangkit/Capstone/ML/ML-GH/Notogo-ML/"model_v2.1"

/content/drive/MyDrive/Bangkit/Capstone/ML/ML-GH/Notogo-ML/model_v2.1


In [5]:
print(os.getcwd())

/content/drive/MyDrive/Bangkit/Capstone/ML/ML-GH/Notogo-ML/model_v2.1


## Preparing the dataset

In [6]:
import userFeatures
builder = tfds.builder('Userfeatures')
userFeatureDs = tfds.load('Userfeatures',split='train')

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/userfeatures/1.0.0...[0m




Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/userfeatures/1.0.0.incomplete2S8L9E/userfeatures-train.tfrecord*...:   0%|…

[1mDataset userfeatures downloaded and prepared to /root/tensorflow_datasets/userfeatures/1.0.0. Subsequent calls will reuse this data.[0m


In [7]:
import wishEmbedding
builder = tfds.builder('Wishembedding')
wishEmbeddingDs = tfds.load('Wishembedding',split='train')

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/wishembedding/1.0.0...[0m




Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/wishembedding/1.0.0.incomplete4NSTYB/wishembedding-train.tfrecord*...:   0…

[1mDataset wishembedding downloaded and prepared to /root/tensorflow_datasets/wishembedding/1.0.0. Subsequent calls will reuse this data.[0m


In [8]:
ratings = userFeatureDs.map(lambda x: {
    "location_name": x["location_name"],
    "user_id": x["user_id"],
    "add" : x["add"],
    "like" : x['like']
})
locations = wishEmbeddingDs.map(lambda x: x["location_name"])

In [9]:
for x in ratings.take(2).as_numpy_iterator():
  pprint.pprint(x)

{'add': 0, 'like': 1, 'location_name': b'JAKARTA', 'user_id': b'93'}
{'add': 0, 'like': 1, 'location_name': b'DUBAI', 'user_id': b'112'}


In [10]:
tf.random.set_seed(42)

NUM_DATA = ratings.__len__().numpy()

shuffled = ratings.shuffle(NUM_DATA, seed=42, reshuffle_each_iteration=False)

trainset_size = 0.8 * NUM_DATA

train = shuffled.take(trainset_size)
test = shuffled.skip(trainset_size).take(NUM_DATA - trainset_size)

location_name = locations.batch(1000)
user_ids = ratings.batch(1000).map(lambda x: x["user_id"])

unique_location_name = np.unique(np.concatenate(list(location_name)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [11]:
cached_train = train.shuffle(NUM_DATA).batch(512).cache()
cached_test = test.batch(256).cache()

## MODEL

There are two critical parts to multi-task recommenders:

1. They optimize for two or more objectives, and so have two or more losses.
2. They share variables between the tasks, allowing for transfer learning.

In [12]:
class NoToGoModel(tfrs.models.Model):

  def __init__(self, rating_weight: float, like_weight: float,retrieval_weight: float) -> None:
    # We take the loss weights in the constructor: this allows us to instantiate
    # several model objects with different loss weights.

    super().__init__()

    embedding_dimension = 32

    # User and movie models.
    self.location_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_location_name, mask_token=None),
      tf.keras.layers.Embedding(len(unique_location_name) + 1, embedding_dimension),
      tf.keras.layers.Dense(16, activation="relu")
    ])

    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
      tf.keras.layers.Dense(16, activation="relu")
    ])

    # A small model to take in user and movie embeddings and predict ratings.
    # We can make this as complicated as we want as long as we output a scalar
    # as our prediction.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1, activation = "sigmoid"),
    ])

    self.like_model = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1, activation = "sigmoid"),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )

    self.like_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=locations.batch(128).map(self.location_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight
    self.like_weight = like_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the movie features and pass them into the movie model.
    location_embeddings = self.location_model(features["location_name"])
    
    return (
        user_embeddings,
        location_embeddings,
        # We apply the multi-layered rating model to a concatentation of
        # user and movie embeddings.
        self.rating_model(
            tf.concat([user_embeddings,location_embeddings], axis=1)
        ),
        self.like_model(
            tf.concat([user_embeddings,location_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

    ratings = features.pop("add","like")
    like = features.pop("like", "add")

    user_embeddings, location_embeddings, rating_predictions, like_predictions = self(features)

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )

    like_loss = self.like_task(
        labels=like,
        predictions=like_predictions,
    )
    retrieval_loss = self.retrieval_task(user_embeddings, location_embeddings)

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss + like_loss*self.like_weight)

### SKIP

In [13]:
testModel = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_location_name, mask_token=None),
      tf.keras.layers.Embedding(len(unique_location_name) + 1, 32),
      tf.keras.layers.Dense(32, activation="relu")
])

In [14]:
cek = tf.keras.layers.StringLookup(vocabulary=unique_location_name, mask_token=None)

In [15]:
unique_location_name

array([b'AIR TERJUN SIPISO-PISO', b'AKIHABARA', b'ALASKA',
       b'AMERIKA SERIKAT', b'ANCOL', b'ASIA TIMUR', b'BALI',
       b'BALI TOWER', b'BANDUNG', b'BANGKA BELITUNG', b'BHUTAN',
       b'BRANCHSTO BSD', b'CANDI BOROBUDUR', b'CANDI PRAMBANAN',
       b'CANGGU', b'CAPADOCIA', b'DANAU SALAR DE UYUNI', b'DANAU TOBA',
       b'DENMARK', b'DIENG', b'DISNEY SEA', b'DISNEYLAND PARIS', b'DUBAI',
       b'DUFAN', b'EROPA', b'ES KRIM RAGUSA', b'EUROPE',
       b'GAMPLONG STUDIO', b'GEOPARK CILETUH', b'GUA GONG',
       b'GUA TABUHAN', b'GUMUK PASIR PARANGKUSUMO', b'GUNUNG',
       b'GUNUNG AGUNG', b'GUNUNG BROMO', b'GUNUNG FUJI', b'GUNUNG GEDE',
       b'GUNUNG PANGRANGO', b'GUNUNG PARANG', b'GUNUNG RINJANI',
       b'GUNUNG SEMERU', b'INDONESIA', b'ISRAEL', b'JAKARTA',
       b'JAWA BARAT', b'JEPANG', b'JERMAN', b'KATULAMPA RAFTING BOGOR',
       b'KENDARI', b'KOREA SELATAN', b'LABUAN BAJO', b'LAMPUNG',
       b'LOMBOK', b'LONDON', b'MALANG', b'MALAYSIA', b'MALDIVES',
       b'MALIOBORO',

In [16]:
cek(tf.constant([['GUNUNG BROMO','DANAU TOBA']])) # 0 means out of vocabulary

<tf.Tensor: shape=(1, 2), dtype=int64, numpy=array([[35, 18]])>

### Rating-specialized model

Depending on the weights we assign, the model will encode a different balance of the tasks. Let's start with a model that only considers ratings.

In [17]:
model = NoToGoModel(rating_weight=1.0, retrieval_weight=0.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

TypeError: ignored

In [None]:
model.fit(cached_train, epochs=3)
metrics = model.evaluate(cached_test, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

### Retrieval-specialized model

Let's now try a model that focuses on retrieval only.

In [13]:
model = NoToGoModel(rating_weight=0.0, like_weight = 0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [14]:
model.fit(cached_train, epochs=50)
metrics = model.evaluate(cached_test, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Retrieval top-100 accuracy: 1.000.
Ranking RMSE: 0.498.


In [15]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'factorized_top_k/top_10_categorical_accuracy': 0.6279258728027344,
 'factorized_top_k/top_1_categorical_accuracy': 0.31518206000328064,
 'factorized_top_k/top_50_categorical_accuracy': 0.9507477283477783,
 'factorized_top_k/top_5_categorical_accuracy': 0.33078673481941223,
 'loss': 16.856651306152344,
 'regularization_loss': 0,
 'root_mean_squared_error': 0.4982590079307556,
 'total_loss': 16.856651306152344}

In [18]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((locations.batch(100), locations.batch(100).map(model.location_model)))
)

# Get recommendations.
user_id = "183"
_, titles = index(tf.constant([user_id]))
print(f"Recommendations for New User : {titles[0, :7]}")

Recommendations for New User : [b'CANGGU' b'CAPADOCIA' b'SINGAPURA' b'GUA GONG'
 b'MARINA BAY STREET CIRCUIT' b'GUA TABUHAN' b'MUSEUM MOJA']


### Joint model

Let's now train a model that assigns positive weights to both tasks.

In [None]:
model = MovielensModel(rating_weight=1.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [None]:
model.fit(cached_train, epochs=3)
metrics = model.evaluate(cached_test, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

## INTERFACE FOR NEW USER

In [26]:
worksheetCat = gc.open('capstone_dataset').worksheet('category')
rows = worksheetCat.get_all_values()
dfCategories = pd.DataFrame.from_records(rows[1:], columns=rows[0])
list_categories = [i for i in dfCategories['category'] if i != '']
worksheetUserFeat = gc.open('capstone_dataset').worksheet('userFeatures(coldstartsol)')
rows = worksheetUserFeat.get_all_values()
dfUserFeat = pd.DataFrame.from_records(rows[1:], columns=rows[0])

In [28]:
last_user_id = dfUserFeat['user_id'].to_list()
last_user_id = int(last_user_id[-1]) + 1 

def input_user_data():
  user_data = {}
  # input_name = input("Nama: ")
  user_data['user_id'] = str(last_user_id)
  last_user_ids = last_user_id + 1
  user_categories = list()
  for i in list_categories:
    input_category = int(input("{} ? (1: yes, 0: no) : ".format(i)))
    if input_category == 1:
      user_categories.append(i)
  user_data["categories"] = user_categories
  return user_data, last_user_ids
user_data, last_user_id = input_user_data()

beach ? (1: yes, 0: no) : 1
mountain ? (1: yes, 0: no) : 1
lake ? (1: yes, 0: no) : 0
zoo ? (1: yes, 0: no) : 0
river ? (1: yes, 0: no) : 0
conservation ? (1: yes, 0: no) : 0
waterpark ? (1: yes, 0: no) : 0
waterfall ? (1: yes, 0: no) : 0
artGallery ? (1: yes, 0: no) : 1
amusementPark ? (1: yes, 0: no) : 0
mall ? (1: yes, 0: no) : 1
HistoricalPlace ? (1: yes, 0: no) : 1
religious ? (1: yes, 0: no) : 0
outbond ? (1: yes, 0: no) : 1
culinary ? (1: yes, 0: no) : 1
photoHunting ? (1: yes, 0: no) : 1
sightSeeing ? (1: yes, 0: no) : 1
shopping ? (1: yes, 0: no) : 0


In [29]:
def encode_dummy_category(location):
  if location == "beach":
    return 121
  if location == "mountain":
    return 123
  if location == "lake":
    return 124
  if location == "zoo":
    return 125
  if location == "river":
    return 126
  if location == "conservation":
    return 127
  if location == "waterpark":
    return 128
  if location == "waterfall":
    return 129
  if location == "artGallery":
    return 130
  if location == "amusementPark":
    return 131
  if location == "mall":
    return 132
  if location == "HistoricalPlace":
    return 133
  if location == "religious":
    return 134
  if location == "outbond":
    return 135
  if location == "culinary":
    return 136
  if location == "photoHunting":
    return 137
  if location == "sightSeeing":
    return 138
  if location == "shopping":
    return 139

In [30]:
def create_data_frame(user_data, dfUserFeatcold):
  user_id = [user_data['user_id'] for i in list_categories]
  like = ['0' for i in list_categories]
  categories = list_categories
  location = categories
  location_id = [encode_dummy_category(i) for i in list_categories]
  add = list()
  for i in list_categories:
    if i in user_data['categories']:
      add.append("1")
    else:
      add.append("0")
  data = {'user_id' : user_id, 'like' : like, "add" : add, "category" : categories, "location" : location, "location_id" : location_id}
  df = pd.DataFrame(data)
  df_concat = pd.concat([dfUserFeatcold,df], ignore_index= True)
  return df, df_concat   


In [31]:
df, df_concat = create_data_frame(user_data, dfUserFeat)
worksheetUser = gc.open('capstone_dataset').worksheet('userFeatures(coldstartsol)')
set_with_dataframe(worksheetUser, df_concat)