## General steps

### **Import TFRS**

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [None]:
# Additional for basic retrieval
!pip install -q scann

In [None]:
# Additional for basic retrieval
import pprint
import tempfile

### **The datasets**

In [None]:
# Ratings data ("train" split of the MovieLens 100k ratings dataset).
ratings = tfds.load(name="movielens/100k-ratings", split="train")
# Features of all the available movies ("train" split of the movies dataset).
movies = tfds.load(name="movielens/100k-movies", split="train")

In [None]:
# Keep only the fields the retrieval model reads.
def _rating_basics(record):
  """Return just the movie title and user id of one rating record."""
  return {
      "movie_title": record["movie_title"],
      "user_id": record["user_id"],
  }


ratings = ratings.map(_rating_basics)
# The movie dataset is reduced to a stream of bare titles.
movies = movies.map(lambda record: record["movie_title"])

In [None]:
# Print the feature schema of the ratings dataset.
dataset_name = "movielens/100k-ratings"
dataset, info = tfds.load(name=dataset_name, split="train", with_info=True)
print(info.features)

In [None]:
# Print the feature schema of the movies dataset.
dataset_name = "movielens/100k-movies"
dataset, info = tfds.load(name=dataset_name, split="train", with_info=True)
print(info.features)

## Quickstart
[TensorFlow Recommenders: Quickstart](https://www.tensorflow.org/recommenders/examples/quickstart)

Build vocabularies to convert user ids and movie titles into integer indices for embedding layers:

In [None]:
# String -> integer index lookup for user ids, learned from the ratings.
# mask_token=None: no mask token is configured for the lookup.
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(ratings.map(lambda rating: rating["user_id"]))

# String -> integer index lookup for movie titles, learned from the movie list.
movie_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
movie_titles_vocabulary.adapt(movies)

### **Define a model**

We can define a TFRS model by inheriting from `tfrs.Model` and implementing the `compute_loss` method:

In [None]:
class MovieLensModel(tfrs.Model):
  """Retrieval model that pairs a user model with a movie model.

  `tfrs.Model` is a custom base class used to cut down on boilerplate; under
  the hood the result is still a plain Keras model. Subclasses only need to
  provide `compute_loss`.
  """

  def __init__(
      self,
      user_model: tf.keras.Model,
      movie_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    """Store the two representation models and the retrieval task.

    Args:
      user_model: model applied to `features["user_id"]`.
      movie_model: model applied to `features["movie_title"]`.
      task: retrieval task called with both embeddings to produce the loss.
    """
    super().__init__()

    # Representation models for the query (user) and candidate (movie) sides.
    self.user_model = user_model
    self.movie_model = movie_model

    # Objective that turns a pair of embedding batches into a loss.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embed the users and movies of a batch and return the task loss.

    Args:
      features: batch dictionary; the "user_id" and "movie_title" entries are
        read.
      training: accepted for interface compatibility; not used here.

    Returns:
      Whatever `self.task` returns for the two embedding batches.
    """
    return self.task(
        self.user_model(features["user_id"]),
        self.movie_model(features["movie_title"]))



Define the two models and the retrieval task.

In [None]:
# User side: raw user id -> vocabulary index -> 64-dimensional embedding.
user_embedding = tf.keras.layers.Embedding(
    user_ids_vocabulary.vocabulary_size(), 64)
user_model = tf.keras.Sequential([user_ids_vocabulary, user_embedding])

# Movie side: raw title -> vocabulary index -> 64-dimensional embedding.
movie_embedding = tf.keras.layers.Embedding(
    movie_titles_vocabulary.vocabulary_size(), 64)
movie_model = tf.keras.Sequential([movie_titles_vocabulary, movie_embedding])

# Objective: a retrieval task whose top-K metric is computed against the
# embeddings of all movies, produced in batches of 128 titles.
candidate_embeddings = movies.batch(128).map(movie_model)
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidate_embeddings))

### **Fit and evaluate it**

Create the model, train it, and generate predictions:

In [None]:
# Assemble the retrieval model and attach an Adagrad optimizer.
model = MovieLensModel(user_model, movie_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5))

# Three epochs over the ratings, 4096 ratings per batch.
model.fit(ratings.batch(4096), epochs=3)

# Brute-force retrieval index: queries are passed through the trained user
# model, candidates are (title, movie embedding) pairs.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
movie_candidates = movies.batch(100).map(
    lambda title: (title, model.movie_model(title)))
index.index_from_dataset(movie_candidates)

# Query the index for user "42" and show the first three returned titles.
query = np.array(["42"])
_, titles = index(query)
print(f"Top 3 recommendations for user 42: {titles[0, :3]}")

## **Applying the quickstart to our own dataset**

### Import module and connect to Google Drive

In [None]:
# Mount Google Drive at /content/gdrive/ so the dataset files can be read
# from the Colab runtime. force_remount=True remounts even if a mount exists.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [None]:
# Make the dataset folder on the mounted Drive the working directory, so the
# relative CSV path used when loading the data resolves against it.
%cd /content/gdrive/My Drive/Colab_Notebooks/Bangkit_Capstone/Dataset

/content/gdrive/My Drive/Colab_Notebooks/Bangkit_Capstone/Dataset


In [None]:
# Load the merged mountain/user table. The path is relative, so it is
# resolved against the current working directory.
df = pd.read_csv("last_merged_data.csv")

In [None]:
# Report how many rows and columns the loaded frame has.
num_rows, num_columns = len(df.index), len(df.columns)
print("Number of rows:", num_rows)
print("Number of columns:", num_columns)

Number of rows: 6013
Number of columns: 10


In [None]:
# Preview the first rows of the merged table.
df.head()

Unnamed: 0,gunungID,nama_gunung,ketinggian,kota,provinsi_gunung,provinsi_user,userID,name,keahlian,tingkat_kesulitan
0,1,Arjuno,3339,Malang,Jawa Timur,Jawa Timur,43,Everleigh Cain,Pemula,Sulit
1,1,Arjuno,3339,Malang,Jawa Timur,Jawa Timur,133,Elise Winters,Pemula,Sulit
2,1,Arjuno,3339,Malang,Jawa Timur,Jawa Timur,464,Thomas Farley,Sedang,Sulit
3,1,Arjuno,3339,Malang,Jawa Timur,Jawa Timur,483,Milana Wu,Sedang,Sulit
4,1,Arjuno,3339,Malang,Jawa Timur,Jawa Timur,495,Genevieve Gillespie,Pemula,Sulit


In [None]:
# Cast every identifier and text column to the pandas "string" dtype and keep
# the elevation as a 64-bit integer, in a single astype call.
df = df.astype({
    "gunungID": "string",
    "nama_gunung": "string",
    "ketinggian": "int64",
    "kota": "string",
    "provinsi_gunung": "string",
    "tingkat_kesulitan": "string",
    "userID": "string",
    "name": "string",
    "keahlian": "string",
    "provinsi_user": "string",
})

In [None]:
# mountain_data = pd.read_csv("Gunung_fix.csv")
# user_data = pd.read_csv("user_sample.csv")

In [None]:
# Converting separated data to tfds
# mountain_tf_dataset = tf.data.Dataset.from_tensor_slices(dict(mountain_data))
# user_tf_dataset = tf.data.Dataset.from_tensor_slices(dict(user_data))

In [None]:
# Convert the merged frame into a tf.data.Dataset: dict(df) maps each column
# name to its column, and from_tensor_slices slices those columns row by row,
# so every dataset element is a dict with one entry per column.
df_tf_dataset = tf.data.Dataset.from_tensor_slices(dict(df))

In [None]:
# Collect the TensorFlow dtype of every column in the dataset's element spec,
# then print one "column dtype" line per column.
merged_data_types = {
    column: spec.dtype for column, spec in df_tf_dataset.element_spec.items()
}
for column, dtype in merged_data_types.items():
    print(column, dtype)

gunungID <dtype: 'string'>
nama_gunung <dtype: 'string'>
ketinggian <dtype: 'int64'>
kota <dtype: 'string'>
provinsi_gunung <dtype: 'string'>
provinsi_user <dtype: 'string'>
userID <dtype: 'string'>
name <dtype: 'string'>
keahlian <dtype: 'string'>
tingkat_kesulitan <dtype: 'string'>


In [None]:
# Get column data types
# mountain_data_types = {key: value.dtype for key, value in mountain_tf_dataset.element_spec.items()}
# print(mountain_data_types)

# Get column data types
# user_data_types = {key: value.dtype for key, value in user_tf_dataset.element_spec.items()}
# print(user_data_types)

In [None]:
# Select the basic features: one (mountain name, user id) pair per row of the
# merged interaction table. These pairs are the training examples.
climbable = df_tf_dataset.map(lambda x: {
    "nama_gunung": x["nama_gunung"],
    "userID": x["userID"]
})

# Candidate set: every mountain name exactly once.
# The merged table repeats a mountain on each of its interaction rows, so
# mapping it directly yields the same name many times. Feeding those
# duplicates to the top-K metric and to the retrieval index makes the index
# hold many copies of one candidate, and the top-k results collapse to a
# single repeated name. De-duplicating here fixes every downstream consumer.
mount = df_tf_dataset.map(lambda x: x["nama_gunung"]).unique()

In [None]:
# String -> integer index lookup for user ids, learned from the interactions.
# mask_token=None: no mask token is configured for the lookup.
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(climbable.map(lambda pair: pair["userID"]))

# String -> integer index lookup for mountain names.
mount_names_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
mount_names_vocabulary.adapt(mount)

In [None]:
class MountainModel(tfrs.Model):
  """Retrieval model that pairs a user model with a mountain model.

  `tfrs.Model` is a custom base class used to cut down on boilerplate; under
  the hood the result is still a plain Keras model. Subclasses only need to
  provide `compute_loss`.
  """

  def __init__(
      self,
      user_model: tf.keras.Model,
      mountain_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    """Store the two representation models and the retrieval task.

    Args:
      user_model: model applied to `features["userID"]`.
      mountain_model: model applied to `features["nama_gunung"]`.
      task: retrieval task called with both embeddings to produce the loss.
    """
    super().__init__()

    # Representation models for the query (user) and candidate (mountain) sides.
    self.user_model = user_model
    self.mountain_model = mountain_model

    # Objective that turns a pair of embedding batches into a loss.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embed the users and mountains of a batch and return the task loss.

    Args:
      features: batch dictionary; the "userID" and "nama_gunung" entries are
        read.
      training: accepted for interface compatibility; not used here.

    Returns:
      Whatever `self.task` returns for the two embedding batches.
    """
    return self.task(
        self.user_model(features["userID"]),
        self.mountain_model(features["nama_gunung"]))

In [None]:
# User side: raw user id -> vocabulary index -> 64-dimensional embedding.
user_embedding = tf.keras.layers.Embedding(
    user_ids_vocabulary.vocabulary_size(), 64)
user_model = tf.keras.Sequential([user_ids_vocabulary, user_embedding])

# Mountain side: raw name -> vocabulary index -> 64-dimensional embedding.
mountain_embedding = tf.keras.layers.Embedding(
    mount_names_vocabulary.vocabulary_size(), 64)
mountain_model = tf.keras.Sequential(
    [mount_names_vocabulary, mountain_embedding])

# Objective: a retrieval task whose top-K metric is computed against the
# embeddings of the `mount` candidates, produced in batches of 128 names.
candidate_embeddings = mount.batch(128).map(mountain_model)
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidate_embeddings))

In [None]:
# Assemble the retrieval model and attach an Adagrad optimizer.
model = MountainModel(user_model, mountain_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5))

# Three epochs with a batch size of 8192. The table has fewer rows than that,
# so each epoch is a single batch containing every interaction.
model.fit(climbable.batch(8192), epochs=3)

# Brute-force retrieval index: queries are passed through the trained user
# model, candidates are (mountain name, mountain embedding) pairs.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
mountain_candidates = mount.batch(100).map(
    lambda title: (title, model.mountain_model(title)))
index.index_from_dataset(mountain_candidates)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f003cde48b0>

In [None]:
# Query the index for user "43" and show the first three returned names.
query = np.array(["43"])
_, titles = index(query)
print(f"Top 3 recommendations for user 43: {titles[0, :3]}")

Top 3 recommendations for user 43: [b'Kawi' b'Kawi' b'Kawi']


In [None]:
# Query the index for user "43" again and show only the first returned name.
query = np.array(["43"])
_, titles = index(query)
print(f"Top recommendations for user 43: {titles[0, :1]}")

Top recommendations for user 43: [b'Kawi']


In [None]:
# Rows whose `name` value already occurred in an earlier row (the first
# occurrence of each name is not flagged).
repeated_name = df["name"].duplicated()
duplicates = df[repeated_name]
duplicates

Unnamed: 0,gunungID,nama_gunung,ketinggian,kota,provinsi_gunung,provinsi_user,userID,name,keahlian,tingkat_kesulitan
63,8,Kawi,2551,Malang,Jawa Timur,Jawa Timur,43,Everleigh Cain,Pemula,Sulit
64,8,Kawi,2551,Malang,Jawa Timur,Jawa Timur,133,Elise Winters,Pemula,Sulit
65,8,Kawi,2551,Malang,Jawa Timur,Jawa Timur,464,Thomas Farley,Sedang,Sulit
66,8,Kawi,2551,Malang,Jawa Timur,Jawa Timur,483,Milana Wu,Sedang,Sulit
67,8,Kawi,2551,Malang,Jawa Timur,Jawa Timur,495,Genevieve Gillespie,Pemula,Sulit
...,...,...,...,...,...,...,...,...,...,...
6008,248,Pantai Siombak,3700,Tolikara,Papua,Papua,4542,Jovanni Case,Pemula,Sulit
6009,248,Pantai Siombak,3700,Tolikara,Papua,Papua,4583,Mavis Reyna,Mahir,Sulit
6010,248,Pantai Siombak,3700,Tolikara,Papua,Papua,4743,Bridget Prince,Mahir,Sulit
6011,248,Pantai Siombak,3700,Tolikara,Papua,Papua,4817,Maryam Sweeney,Mahir,Sulit


### Trying some configurations

In [None]:
# Rebuild both sides with an L2 penalty (factor 0.01) on the embedding tables.
# Each embedding layer gets its own regularizer instance.
user_embedding = tf.keras.layers.Embedding(
    user_ids_vocabulary.vocabulary_size(),
    64,
    embeddings_regularizer=tf.keras.regularizers.l2(0.01),
)
user_model = tf.keras.Sequential([user_ids_vocabulary, user_embedding])

mountain_embedding = tf.keras.layers.Embedding(
    mount_names_vocabulary.vocabulary_size(),
    64,
    embeddings_regularizer=tf.keras.regularizers.l2(0.01),
)
mountain_model = tf.keras.Sequential(
    [mount_names_vocabulary, mountain_embedding])
