# Import library

In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Read dataset

Dataset obtained https://www.kaggle.com/datasets/utarasetyaw/data-set-destination-in-bali/

In [22]:
df_combination = pd.read_csv('./dataset/combination.csv')
df_combination.head()

Unnamed: 0,Place 1,Place 2,Place 3,Place 4,Place 5
0,Pantai Pasih Uug,Pantai Angel's Billabong,Pantai Kelingking,Pantai Crystal Bay,
1,Pantai Diamond,Pantai Atuh,Bukit Teletubies,,
2,Tanjung Benoa,Pantai Waterblow,Pura Uluwatu,Pantai Pandawa,Pantai Jimbaran
3,Tari Barong dan Keris,Sari Merta,Pura Tirta Empul,Pantai Jimbaran,
4,Tari Barong dan Keris,Sari Merta,Objek Wisata Desa Panglipuran,Pantai Jimbaran,


## Dataset rating

In [23]:
df_rating = pd.read_csv('./dataset/rating.csv')
df_rating.head()

Unnamed: 0,User_Id,Place_Id,Nama Tempat,Place_Rating
0,1,2,Agung Bali Oleh-Oleh,3
1,1,7,Aloha Ubud Swing,5
2,1,8,Bali Bird Park,5
3,1,12,Big Garden Corner,2
4,1,13,Wanagiri Hidden Hills,5


In [24]:
df_rating.dtypes

User_Id          int64
Place_Id         int64
Nama Tempat     object
Place_Rating     int64
dtype: object

Get user count

In [25]:
user_count = df_rating['User_Id'].nunique()
user_count

100

In [26]:
df_rating['User_Id'].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100], dtype=int64)

## Dataset tourism

In [27]:
df_tourism = pd.read_csv('./dataset/tourism.csv')
df_tourism.head()

Unnamed: 0,Place_Id,Place_Name,Description,Weekend Holiday Price,Weekday Price,Category,City,Rating,Alamat,Coordinate,Lat,Long,Gambar
0,1,Agrowisata Satria,Agrowisata Satria menawarkan 'rasa' Bali pada ...,50000,50000,Agrowisata,Gianyar,3.0,"Manukaya, Kec. Tampaksiring, Kabupaten Gianya...","-8.402228546131536, 115.32296693687739",-8.402229,115.322967,V
1,2,Agung Bali,Dapatkan berbagai produk oleh-oleh khas Bali b...,0,0,Belanja,Badung,4.0,"Jln. Dewi Sri No.18XX, Kuta, Badung, Bali","-8.700234336021559, 115.176534407375",-8.700234,115.176534,V
2,3,Wisata Air Panas Toya Bungkah,Pemandian air panas di dekat gunung Batur (Pem...,70000,70000,Alam,Bangli,4.0,"Batur Tengah, Kec. Kintamani, Kabupaten Bangli...","-8.251298580809106, 115.39981444195475",-8.251299,115.399814,V
3,4,Air Terjun Aling-Aling,Keberadaan air terjun ini semakin mempercantik...,20000,20000,Alam,Buleleng,4.5,"Jl. Raya Desa Sambangan, Banjar, Sambangan, Ke...","-8.173073590748519, 115.10513024047802",-8.173074,115.10513,V
4,5,Air Terjun Tegenungan,Bosan dengan wisata alam pantai dan ingin meni...,20000,20000,Alam,Gianyar,4.0,"Jl. Ir. Sutami, Kemenuh, Kec. Sukawati, Kabupa...","-8.575191460385602, 115.28870189592831",-8.575191,115.288702,V


In [28]:
df_tourism.dtypes

Place_Id                   int64
Place_Name                object
Description               object
Weekend Holiday Price      int64
Weekday Price              int64
Category                  object
City                      object
Rating                   float64
Alamat                    object
Coordinate                object
Lat                      float64
Long                     float64
Gambar                    object
dtype: object

Get POI count

In [29]:
poi_count = df_rating['Place_Id'].nunique()
poi_count

75

In [30]:
np.sort(df_rating['Place_Id'].unique())

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75], dtype=int64)

# Membuat matriks user-rating

In [31]:
matrix_user_rating = np.zeros((poi_count, user_count))

In [32]:
for i in range(0, poi_count):
    for j in range(0, user_count):
        try:
            matrix_user_rating[i][j] = df_rating[(df_rating['User_Id'] == i + 1 )& (df_rating['Place_Id'] == j + 1)].iloc[0]['Place_Rating']
        except Exception:
            continue

In [33]:
matrix_user_rating

array([[0., 3., 0., ..., 0., 0., 0.],
       [0., 3., 5., ..., 0., 0., 0.],
       [4., 5., 0., ..., 0., 0., 0.],
       ...,
       [0., 3., 0., ..., 0., 0., 0.],
       [4., 4., 5., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [34]:
df_user_rating = pd.DataFrame(matrix_user_rating,
                              index=range(1, poi_count + 1),
                              columns=range(1, user_count + 1))
df_user_rating.head()


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
1,0.0,3.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,3.0,5.0,0.0,2.0,3.0,5.0,5.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,5.0,0.0,5.0,0.0,1.0,0.0,5.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,3.0,5.0,0.0,0.0,0.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
df_user_rating.to_csv('./dataset/user_rating.csv', index=False)

# Prediksi rating

## User-based Collaborative Filtering

In [36]:
def predict_ratings_user_based(rating_matrix):
    # Create a copy of the rating matrix
    predictions = rating_matrix.copy()

    # Calculate user similarity matrix using cosine similarity
    # Transpose the matrix because we want user similarity (users are columns)
    user_similarity = cosine_similarity(rating_matrix.T, rating_matrix.T)

    # For each item-user pair with zero rating
    for item in range(rating_matrix.shape[0]):
        for user in range(rating_matrix.shape[1]):
            if rating_matrix[item, user] == 0:
                # Get similar users (exclude the current user)
                similar_users = user_similarity[user]
                similar_users[user] = 0  # exclude self-similarity

                # Get ratings for the current item from other users
                item_ratings = rating_matrix[item]

                # Consider only non-zero ratings
                mask = item_ratings != 0

                # Calculate weighted average of ratings from similar users
                numerator = np.sum(similar_users * item_ratings * mask)
                denominator = np.sum(np.abs(similar_users) * mask)

                if denominator != 0:
                    predictions[item, user] = numerator / denominator
                else:
                    # If no similar users rated this item, use mean of non-zero ratings
                    non_zero_ratings = item_ratings[item_ratings != 0]
                    predictions[item, user] = np.mean(non_zero_ratings) if len(non_zero_ratings) > 0 else 0

    return predictions

# Apply the prediction
predicted_ratings = predict_ratings_user_based(matrix_user_rating)

# Create DataFrame for better visualization
df_user_based_predicted_ratings = pd.DataFrame(predicted_ratings,
                                  index=range(1, matrix_user_rating.shape[0] + 1),
                                  columns=range(1, matrix_user_rating.shape[1] + 1))

predicted_ub_cf_matrix = df_user_based_predicted_ratings.to_numpy()
predicted_ub_cf_matrix

array([[3.92380705, 3.        , 3.97565102, ..., 3.93333333, 3.93333333,
        3.93333333],
       [4.25528985, 3.        , 5.        , ..., 4.16666667, 4.16666667,
        4.16666667],
       [4.        , 5.        , 4.52479864, ..., 4.53333333, 4.53333333,
        4.53333333],
       ...,
       [3.88842673, 3.        , 3.84884551, ..., 3.93333333, 3.93333333,
        3.93333333],
       [4.        , 4.        , 5.        , ..., 3.76666667, 3.76666667,
        3.76666667],
       [1.        , 3.39816685, 3.42051977, ..., 3.5       , 3.5       ,
        3.5       ]])

## Item-based Collaborative Filtering

In [37]:
def predict_ratings_item_based(rating_matrix):
    # Create a copy of the rating matrix
    predictions = rating_matrix.copy()

    # Calculate item similarity matrix using cosine similarity
    # No need to transpose since rows are already items
    item_similarity = cosine_similarity(rating_matrix, rating_matrix)

    # For each item-user pair with zero rating
    for item in range(rating_matrix.shape[0]):
        for user in range(rating_matrix.shape[1]):
            if rating_matrix[item, user] == 0:
                # Get similar items (exclude the current item)
                similar_items = item_similarity[item]
                similar_items[item] = 0  # exclude self-similarity

                # Get user's ratings for all items
                user_ratings = rating_matrix[:, user]

                # Consider only non-zero ratings
                mask = user_ratings != 0

                # Calculate weighted average of ratings from similar items
                numerator = np.sum(similar_items * user_ratings * mask)
                denominator = np.sum(np.abs(similar_items) * mask)

                if denominator != 0:
                    predictions[item, user] = numerator / denominator
                else:
                    # If no similar items were rated by this user, use mean of non-zero ratings for the item
                    non_zero_ratings = rating_matrix[item][rating_matrix[item] != 0]
                    predictions[item, user] = np.mean(non_zero_ratings) if len(non_zero_ratings) > 0 else 0

    return predictions

# Apply the prediction
predicted_ratings = predict_ratings_item_based(matrix_user_rating)

# Create DataFrame for better visualization
df_item_based_predicted_ratings = pd.DataFrame(predicted_ratings,
                                  index=range(1, matrix_user_rating.shape[0] + 1),
                                  columns=range(1, matrix_user_rating.shape[1] + 1))
predicted_ib_cf_matrix = df_item_based_predicted_ratings.to_numpy()
predicted_ib_cf_matrix


array([[2.77027174, 3.        , 3.95796556, ..., 3.93333333, 3.93333333,
        3.93333333],
       [2.73873318, 3.        , 5.        , ..., 4.16666667, 4.16666667,
        4.16666667],
       [4.        , 5.        , 3.91862689, ..., 4.53333333, 4.53333333,
        4.53333333],
       ...,
       [2.78127436, 3.        , 3.93475525, ..., 3.93333333, 3.93333333,
        3.93333333],
       [4.        , 4.        , 5.        , ..., 3.76666667, 3.76666667,
        3.76666667],
       [1.        , 3.85808676, 3.99479862, ..., 3.5       , 3.5       ,
        3.5       ]])

## SVD

In [38]:
class MatrixFactorization:
    def __init__(self, R, K, alpha, beta, iterations):
        """
        Perform matrix factorization to predict missing ratings.

        Parameters:
        - R (np.array): User-item rating matrix with 0 for missing values
        - K (int): Number of latent factors
        - alpha (float): Learning rate
        - beta (float): Regularization parameter
        - iterations (int): Number of iterations for SGD
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        # Initialize user and item latent factor matrices with small random values
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Create a mask for observed ratings (non-zero entries)
        self.observed = self.R > 0

        # SGD optimization
        for it in range(self.iterations):
            for i in range(self.num_users):
                for j in range(self.num_items):
                    if self.observed[i, j]:
                        # Compute error of prediction
                        prediction = self.predict_single(i, j)
                        e_ij = self.R[i, j] - prediction

                        # Update latent factors with gradient descent
                        self.P[i, :] += self.alpha * (e_ij * self.Q[j, :] - self.beta * self.P[i, :])
                        self.Q[j, :] += self.alpha * (e_ij * self.P[i, :] - self.beta * self.Q[j, :])

            # Optionally, compute total error to monitor convergence
            error = self.compute_error()
            if (it + 1) % 10 == 0 or it == 0:
                print(f"Iteration {it+1}/{self.iterations}, error: {error:.4f}")

    def predict_single(self, i, j):
        """Predict rating of user i for item j."""
        return np.dot(self.P[i, :], self.Q[j, :].T)

    def full_matrix(self):
        """Reconstruct the full rating matrix with predicted values."""
        return np.dot(self.P, self.Q.T)

    def compute_error(self):
        """Compute total squared error on observed ratings with regularization."""
        prediction = self.full_matrix()
        error = 0
        for i in range(self.num_users):
            for j in range(self.num_items):
                if self.observed[i, j]:
                    error += (self.R[i, j] - prediction[i, j]) ** 2
        # Add regularization terms
        error += self.beta * (np.sum(np.square(self.P)) + np.sum(np.square(self.Q)))
        return error

mf = MatrixFactorization(matrix_user_rating, K=3, alpha=0.01, beta=0.01, iterations=100)
mf.train()

predicted_svd_matrix = mf.full_matrix()
predicted_svd_matrix = np.clip(predicted_svd_matrix, 0, 5)  # Clip ratings to [0, 5]

predicted_svd_matrix

Iteration 1/100, error: 40568.7605
Iteration 10/100, error: 1874.1312
Iteration 20/100, error: 1732.1384
Iteration 30/100, error: 1622.4198
Iteration 40/100, error: 1550.3043
Iteration 50/100, error: 1506.6547
Iteration 60/100, error: 1479.0579
Iteration 70/100, error: 1459.7956
Iteration 80/100, error: 1445.0474
Iteration 90/100, error: 1433.1699
Iteration 100/100, error: 1423.4077


array([[1.60108518, 3.62667085, 3.96512096, ..., 0.        , 1.5897935 ,
        0.        ],
       [2.86297886, 4.11585294, 4.03814137, ..., 0.        , 1.79486702,
        0.        ],
       [2.56385231, 4.52015085, 4.69050345, ..., 0.        , 1.56779407,
        0.        ],
       ...,
       [2.90580925, 3.15127375, 2.77740209, ..., 0.        , 1.95604629,
        0.        ],
       [2.22744308, 3.43163806, 3.43729966, ..., 0.        , 1.48824432,
        0.        ],
       [1.87515669, 2.37810474, 2.2373208 , ..., 0.02155369, 2.28309735,
        0.04680358]])

## Alternating Least Square (ALS)

In [39]:
class ALS:
    def __init__(self, R, K=10, lambda_reg=0.1, iterations=20):
        """
        ALS matrix factorization for explicit feedback.

        Parameters:
        - R (np.array): User-item rating matrix with 0 for missing ratings
        - K (int): Number of latent factors
        - lambda_reg (float): Regularization parameter
        - iterations (int): Number of ALS iterations
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.lambda_reg = lambda_reg
        self.iterations = iterations

        # Initialize user and item latent factor matrices with random values
        self.U = np.random.rand(self.num_users, K)
        self.V = np.random.rand(self.num_items, K)

        # Mask for observed ratings
        self.observed = R > 0

    def train(self):
        for it in range(self.iterations):
            # Fix V, update U
            for i in range(self.num_users):
                V_i = self.V[self.observed[i, :], :]
                R_i = self.R[i, self.observed[i, :]]
                if V_i.shape[0] > 0:
                    A = V_i.T @ V_i + self.lambda_reg * np.eye(self.K)
                    b = V_i.T @ R_i
                    self.U[i, :] = np.linalg.solve(A, b)

            # Fix U, update V
            for j in range(self.num_items):
                U_j = self.U[self.observed[:, j], :]
                R_j = self.R[self.observed[:, j], j]
                if U_j.shape[0] > 0:
                    A = U_j.T @ U_j + self.lambda_reg * np.eye(self.K)
                    b = U_j.T @ R_j
                    self.V[j, :] = np.linalg.solve(A, b)

            error = self.compute_error()
            print(f"Iteration {it+1}/{self.iterations}, error: {error:.4f}")

    def predict(self):
        """Reconstruct the full rating matrix."""
        return self.U @ self.V.T

    def compute_error(self):
        """Compute squared error on observed ratings with regularization."""
        prediction = self.predict()
        error = 0
        for i in range(self.num_users):
            for j in range(self.num_items):
                if self.observed[i, j]:
                    error += (self.R[i, j] - prediction[i, j]) ** 2
        # Add regularization terms
        error += self.lambda_reg * (np.sum(np.square(self.U)) + np.sum(np.square(self.V)))
        return error

# mf = ALS(matrix_user_rating, K=3, alpha=0.01, beta=0.01, iterations=100)
als = ALS(matrix_user_rating, K=3, lambda_reg=0.1, iterations=30)
als.train()

predicted_als_matrix = als.predict()
predicted_als_matrix = np.clip(predicted_als_matrix, 0, 5)

np.round(predicted_als_matrix, 2)


Iteration 1/30, error: 1949.9581
Iteration 2/30, error: 1686.8226
Iteration 3/30, error: 1621.4926
Iteration 4/30, error: 1588.9073
Iteration 5/30, error: 1564.6337
Iteration 6/30, error: 1545.3384
Iteration 7/30, error: 1530.9953
Iteration 8/30, error: 1521.1557
Iteration 9/30, error: 1515.0180
Iteration 10/30, error: 1511.1723
Iteration 11/30, error: 1508.4752
Iteration 12/30, error: 1506.3226
Iteration 13/30, error: 1504.4314
Iteration 14/30, error: 1502.6573
Iteration 15/30, error: 1500.9067
Iteration 16/30, error: 1499.0953
Iteration 17/30, error: 1497.1256
Iteration 18/30, error: 1494.8734
Iteration 19/30, error: 1492.1912
Iteration 20/30, error: 1488.9431
Iteration 21/30, error: 1485.0673
Iteration 22/30, error: 1480.6276
Iteration 23/30, error: 1475.8375
Iteration 24/30, error: 1471.0838
Iteration 25/30, error: 1466.8481
Iteration 26/30, error: 1463.4628
Iteration 27/30, error: 1460.9448
Iteration 28/30, error: 1459.0906
Iteration 29/30, error: 1457.6643
Iteration 30/30, error:

array([[1.63, 3.  , 4.64, ..., 1.48, 2.22, 4.92],
       [4.71, 3.03, 4.72, ..., 1.6 , 3.04, 5.  ],
       [3.24, 4.48, 3.88, ..., 1.44, 2.98, 5.  ],
       ...,
       [4.83, 3.14, 3.44, ..., 1.28, 2.88, 4.4 ],
       [4.25, 2.72, 4.18, ..., 1.42, 2.72, 4.7 ],
       [1.32, 0.  , 5.  , ..., 2.07, 1.31, 5.  ]])

## PMF

In [40]:
class PMF:
    def __init__(self, R, K=10, lambda_u=0.1, lambda_v=0.1, learning_rate=0.005, iterations=100):
        """
        Probabilistic Matrix Factorization using gradient descent.

        Parameters:
        - R (np.array): User-item rating matrix with 0 for missing ratings
        - K (int): Number of latent factors
        - lambda_u (float): Regularization for user factors
        - lambda_v (float): Regularization for item factors
        - learning_rate (float): Learning rate for gradient descent
        - iterations (int): Number of iterations for training
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.lambda_u = lambda_u
        self.lambda_v = lambda_v
        self.learning_rate = learning_rate
        self.iterations = iterations

        # Initialize latent factors with small random values
        self.U = 0.1 * np.random.randn(self.num_users, K)
        self.V = 0.1 * np.random.randn(self.num_items, K)

        # Mask for observed ratings
        self.observed = R > 0

    def train(self):
        for it in range(self.iterations):
            self.gradient_descent_step()
            error = self.compute_error()
            if (it + 1) % 10 == 0 or it == 0:
                print(f"Iteration {it+1}/{self.iterations}, error: {error:.4f}")

    def gradient_descent_step(self):
        for i in range(self.num_users):
            for j in range(self.num_items):
                if self.observed[i, j]:
                    prediction = self.U[i, :].dot(self.V[j, :].T)
                    e_ij = self.R[i, j] - prediction

                    # Update user and item latent factors
                    self.U[i, :] += self.learning_rate * (e_ij * self.V[j, :] - self.lambda_u * self.U[i, :])
                    self.V[j, :] += self.learning_rate * (e_ij * self.U[i, :] - self.lambda_v * self.V[j, :])

    def predict(self):
        """Reconstruct the full rating matrix."""
        return self.U.dot(self.V.T)

    def compute_error(self):
        """Compute regularized squared error on observed ratings."""
        prediction = self.predict()
        error = 0
        for i in range(self.num_users):
            for j in range(self.num_items):
                if self.observed[i, j]:
                    error += (self.R[i, j] - prediction[i, j]) ** 2
        # Add regularization terms
        error += self.lambda_u * np.sum(self.U ** 2) + self.lambda_v * np.sum(self.V ** 2)
        return error

# Example user-item matrix (0 means missing rating)
R = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])

pmf = PMF(matrix_user_rating, K=3, lambda_u=0.1, lambda_v=0.1, learning_rate=0.005, iterations=100)
pmf.train()

predicted_pmf_matrix = pmf.predict()
# Clip predicted ratings to [0, 5]
predicted_pmf_matrix = np.clip(predicted_pmf_matrix, 0, 5)
np.round(predicted_pmf_matrix, 2)

Iteration 1/100, error: 40963.4331
Iteration 10/100, error: 4339.9030
Iteration 20/100, error: 1998.1888
Iteration 30/100, error: 1973.4348
Iteration 40/100, error: 1936.6719
Iteration 50/100, error: 1884.7631
Iteration 60/100, error: 1821.0017
Iteration 70/100, error: 1755.6128
Iteration 80/100, error: 1699.2747
Iteration 90/100, error: 1656.4120
Iteration 100/100, error: 1625.7262


array([[3.13, 3.59, 3.75, ..., 0.28, 0.  , 0.04],
       [2.26, 3.94, 3.98, ..., 0.24, 0.  , 0.1 ],
       [2.58, 4.26, 4.26, ..., 0.26, 0.  , 0.09],
       ...,
       [2.78, 3.51, 3.57, ..., 0.25, 0.  , 0.04],
       [2.95, 3.41, 3.68, ..., 0.28, 0.  , 0.06],
       [0.53, 3.01, 3.  , ..., 0.11, 0.  , 0.15]])