<a href="https://colab.research.google.com/github/louis-not/Notogo-ML/blob/main/model_v2.2/Hyperparameter_Tuning_V2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RUN

## Import Module

### For Optimization

In [1]:
!pip install bayesian-optimization

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bayesian-optimization
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11685 sha256=ee7d80a339d29fe0e4316cfc0e8fd471da3009e2b499a4d878e53e1eb8049d4a
  Stored in directory: /root/.cache/pip/wheels/fd/9b/71/f127d694e02eb40bcf18c7ae9613b88a6be4470f57a8528c5b
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [2]:
import numpy as np
from bayes_opt import BayesianOptimization

### For Model

In [3]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[K     |████████████████████████████████| 85 kB 3.6 MB/s 
[K     |████████████████████████████████| 462 kB 39.8 MB/s 
[K     |████████████████████████████████| 4.2 MB 14.7 MB/s 
[?25h

In [4]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd

import tensorflow_recommenders as tfrs

# getting data
from google.colab import auth
import gspread
from google.auth import default
from gspread_dataframe import get_as_dataframe, set_with_dataframe

auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
cd /content/drive/MyDrive/Bangkit/Capstone/ML/ML-GH/Notogo-ML/"model_v2.2"

/content/drive/MyDrive/Bangkit/Capstone/ML/ML-GH/Notogo-ML/model_v2.2


In [7]:
print(os.getcwd())

/content/drive/MyDrive/Bangkit/Capstone/ML/ML-GH/Notogo-ML/model_v2.2


## Retrieval Model

### Preparing Dataset

In [21]:
import userFeatures
builder = tfds.builder('Userfeatures')
userFeatureDs = tfds.load('Userfeatures',split='train')

import wishEmbedding
builder = tfds.builder('Wishembedding')
wishEmbeddingDs = tfds.load('Wishembedding',split='train')

ratings = userFeatureDs.map(lambda x: {
    "location_id": x["location_id"],
    "user_id": x["user_id"],
    "add" : x["add"],
    "like" : x['like']
})
locations = wishEmbeddingDs.map(lambda x: x["location_id"])

for x in ratings.take(2).as_numpy_iterator():
  pprint.pprint(x)

{'add': 0, 'like': 1, 'location_id': b'61', 'user_id': b'93'}
{'add': 0, 'like': 1, 'location_id': b'53', 'user_id': b'112'}


In [22]:
tf.random.set_seed(42)

NUM_DATA = ratings.__len__().numpy()

shuffled = ratings.shuffle(NUM_DATA, seed=42, reshuffle_each_iteration=False)

trainset_size = 0.8 * NUM_DATA

train = shuffled.take(trainset_size)
test = shuffled.skip(trainset_size).take(NUM_DATA - trainset_size)

location_name = locations.batch(1000)
user_ids = ratings.batch(1000).map(lambda x: x["user_id"])

unique_location_name = np.unique(np.concatenate(list(location_name)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

cached_train = train.shuffle(NUM_DATA).batch(512).cache()
cached_test = test.batch(256).cache()

### Model
change this if needed for a different model

In [24]:
class NoToGoModel(tfrs.models.Model):

  def __init__(self, rating_weight: float, like_weight: float,retrieval_weight: float, 
               BATCH_SIZE, EMB_DIM
               ) -> None:
    # We take the loss weights in the constructor: this allows us to instantiate
    # several model objects with different loss weights.

    super().__init__()

    embedding_dimension = int(EMB_DIM)

    # User and movie models.
    self.location_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_location_name, mask_token=None),
      tf.keras.layers.Embedding(len(unique_location_name) + 1, embedding_dimension),
      tf.keras.layers.Dense(16, activation="relu")
    ])

    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
      tf.keras.layers.Dense(16, activation="relu")
    ])

    # A small model to take in user and movie embeddings and predict ratings.
    # We can make this as complicated as we want as long as we output a scalar
    # as our prediction.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1, activation = "sigmoid"),
    ])

    self.like_model = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1, activation = "sigmoid"),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )

    self.like_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=locations.batch(BATCH_SIZE).map(self.location_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight
    self.like_weight = like_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the movie features and pass them into the movie model.
    location_embeddings = self.location_model(features["location_id"])
    
    return (
        user_embeddings,
        location_embeddings,
        # We apply the multi-layered rating model to a concatentation of
        # user and movie embeddings.
        self.rating_model(
            tf.concat([user_embeddings,location_embeddings], axis=1)
        ),
        self.like_model(
            tf.concat([user_embeddings,location_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

    ratings = features.pop("add","like")
    like = features.pop("like", "add")

    user_embeddings, location_embeddings, rating_predictions, like_predictions = self(features)

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )

    like_loss = self.like_task(
        labels=like,
        predictions=like_predictions,
    )
    retrieval_loss = self.retrieval_task(user_embeddings, location_embeddings)

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss + like_loss*self.like_weight)

## Bayes Optimization

In [11]:
def black_box(lr,emb_dim, epoch):
  """black box function for optimization used for Retrieval-specialized model"""
  METRIC = 'factorized_top_k/top_1_categorical_accuracy'
  batch = 180


  # Initialize model
  model = NoToGoModel(rating_weight=0.0, like_weight = 0, retrieval_weight=1.0,
                      BATCH_SIZE=batch, EMB_DIM=emb_dim)
  model.compile(optimizer=tf.keras.optimizers.Adagrad(lr))

  # training model
  model.fit(cached_train, epochs=int(epoch), verbose=0)
  result = model.evaluate(cached_train, return_dict=True, verbose=0)

  return result[METRIC]

In [12]:
param_bounds = {
    # 'batch' : (150, 200),
    'lr' : (0.001,0.1),
    'emb_dim' : (16, 128),
    'epoch' : (5, 25)
}

optimizer = BayesianOptimization(
    f=black_box,
    pbounds=param_bounds,
    random_state=1,
)

In [13]:
optimizer.maximize(
    init_points=2, #How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.
    n_iter=50, #How many steps of bayesian optimization you want to perform. The more steps the more likely to find a good maximum you are.
)

|   iter    |  target   |  emb_dim  |   epoch   |    lr     |
-------------------------------------------------------------


KeyboardInterrupt: ignored

In [None]:
print(optimizer.max)

## Evaluate

In [25]:
# Result :
batch = 180
emb_dim = 41
epoch = 8
lr = 0.09478

In [45]:
model = NoToGoModel(rating_weight=0.0, like_weight = 0, retrieval_weight=1.0,
                    BATCH_SIZE=batch, EMB_DIM=emb_dim)
model.compile(optimizer=tf.keras.optimizers.Adagrad(lr))

# training model
model.fit(cached_train, epochs=int(epoch))
model.evaluate(cached_train, return_dict=True)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


{'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'factorized_top_k/top_10_categorical_accuracy': 0.8165711760520935,
 'factorized_top_k/top_1_categorical_accuracy': 0.4781583845615387,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_5_categorical_accuracy': 0.6423665881156921,
 'loss': 115.41682434082031,
 'regularization_loss': 0,
 'root_mean_squared_error': 0.5019081830978394,
 'total_loss': 115.41682434082031}

In [44]:
model.evaluate(cached_train, return_dict=True)



{'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'factorized_top_k/top_10_categorical_accuracy': 0.8775244951248169,
 'factorized_top_k/top_1_categorical_accuracy': 0.844122052192688,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_5_categorical_accuracy': 0.861229658126831,
 'loss': 115.39244842529297,
 'regularization_loss': 0,
 'root_mean_squared_error': 0.49841198325157166,
 'total_loss': 115.39244842529297}

In [82]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((locations.batch(100), locations.batch(100).map(model.location_model)))
)

# Get recommendations.
user_id = "0"
_, titles = index(tf.constant([user_id]))
print(f"Recommendations for New User : {titles}")

Recommendations for New User : [[b'26' b'30' b'38' b'39' b'55' b'28' b'64' b'90' b'81' b'6']]


In [76]:
MAX_LIM = 30

dfRecommendation = []

for user_id in unique_user_ids:
  _, recommendation = index(tf.constant([user_id]))
  recommendation = recommendation[0, :MAX_LIM].numpy().tolist()
  # print(str(user_id) + ' : ' + str(recommendation))
  recommendation = [ x.decode("utf-8") for x in recommendation ]
  dfRecommendation.append(recommendation)

In [77]:
dfRecommendation = pd.DataFrame(dfRecommendation)
dfRecommendation.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,26,30,38,39,55,28,64,90,81,6
1,26,30,62,54,38,22,101,39,55,28
2,27,71,63,53,118,66,17,91,78,22
3,39,55,28,64,90,81,6,50,47,33
4,26,30,62,38,54,22,101,39,55,28


In [78]:
dfRecommendation.reset_index(inplace=True)

In [79]:
dfRecommendation.rename(columns={'index':'user_id'}, inplace=True)

In [87]:
path = os.path.join(os.getcwd(),'recommendation.csv')
print(path)

/content/drive/MyDrive/Bangkit/Capstone/ML/ML-GH/Notogo-ML/model_v2.2/recommendation.csv


In [88]:
dfRecommendation.to_csv(path,index=False)