In [1]:
# Import libraries used for collaborative filtering

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Dense, Input, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split


2024-06-20 21:20:14.310508: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-20 21:20:14.807275: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-20 21:20:14.807476: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-20 21:20:14.899451: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-20 21:20:15.075907: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Load the data
df = pd.read_csv('combined-dataset/final_reviews_dataV2.csv')
df.head(10)

Unnamed: 0,id,types,review_number,review,user_id,sentiment
0,ChIJYcGr7GSb0S0RckePBrCWikw,"hotel, lodging",review 1,"It has quite small room, and the hallway is qu...",user_5061,0.0125
1,ChIJZbWX6Aia0S0R0tM3h1RZ1h8,"indonesian_restaurant, restaurant, food",review 1,"Surprisingly, a really good warung that’s hidd...",user_17031,0.2125
2,ChIJYyHbhgia0S0RzdjNXLmcf54,"tourist_attraction, restaurant, food",review 1,"Only had a fleeting visit here, came by coach,...",user_13759,-0.016667
3,ChIJ6zf9LJCb0S0RFv3BdLl61ZY,"coffee_shop, cafe, food, store",review 1,"One word, underrated! How come place like this...",user_5570,0.344676
4,ChIJxaITmQia0S0RyrbukE8vsJU,"tourist_attraction, place_of_worship",review 1,"This temple is located in Singaraja, located i...",user_20325,0.139524
5,ChIJ63FmGgaa0S0RWD5dfwhjGHQ,"indonesian_restaurant, restaurant, food",review 1,"We came here for dinner, food was good, i like...",user_3693,0.186111
6,ChIJAQAA5Aia0S0RBL27x0I5sHk,"indonesian_restaurant, restaurant, food",review 1,"They have 3 menus. Chicken, beef, or mixed. Th...",user_38971,-0.13263
7,ChIJA7v-3Qia0S0RdP8U7AQZBHE,"indonesian_restaurant, restaurant, food",review 1,"Did not meet my expectation...dirty table, not...",user_19070,-0.2
8,ChIJnSg1VAia0S0R3r6ej3XRQ50,"breakfast_restaurant, indonesian_restaurant, r...",review 1,Super yum rawon (beef stew with spices)!\nDefi...,user_80954,0.470833
9,ChIJL_GkVA-a0S0R6UJc7dfTH4A,"restaurant, coffee_shop, cafe, bakery, wholesa...",review 1,A Singaraga Gem...\nHighly recommended...\nA f...,user_7798,0.308378


In [3]:
import random
import string
import hashlib
import time

def generate_cuid():
    c = 'c'
    timestamp_str = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))  # Random 8 char string for timestamp
    counter = random.randint(0, 9999)  # Random counter
    client_fingerprint = ''.join(random.choices(string.ascii_lowercase + string.digits, k=4))  # Random 4 char string
    random_string = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))  # Random 8 char string
    
    return f'{c}{timestamp_str}{counter}{client_fingerprint}{random_string}'

unique_user_ids = df['user_id'].unique()
user_id_to_cuid = {user_id: generate_cuid() for user_id in unique_user_ids}

# Replace original user IDs with CUIDs
df['user_cuid'] = df['user_id'].map(user_id_to_cuid)


In [4]:
df.head()

Unnamed: 0,id,types,review_number,review,user_id,sentiment,user_cuid
0,ChIJYcGr7GSb0S0RckePBrCWikw,"hotel, lodging",review 1,"It has quite small room, and the hallway is qu...",user_5061,0.0125,cy2b6vk5n120hrwu2eu55l3k
1,ChIJZbWX6Aia0S0R0tM3h1RZ1h8,"indonesian_restaurant, restaurant, food",review 1,"Surprisingly, a really good warung that’s hidd...",user_17031,0.2125,cyu8yc77m8991w81jele5he4z
2,ChIJYyHbhgia0S0RzdjNXLmcf54,"tourist_attraction, restaurant, food",review 1,"Only had a fleeting visit here, came by coach,...",user_13759,-0.016667,cdmnbu1268093imcrbhl9wsu9
3,ChIJ6zf9LJCb0S0RFv3BdLl61ZY,"coffee_shop, cafe, food, store",review 1,"One word, underrated! How come place like this...",user_5570,0.344676,cn0ei2w863674yqmeh3mqj5hj
4,ChIJxaITmQia0S0RyrbukE8vsJU,"tourist_attraction, place_of_worship",review 1,"This temple is located in Singaraja, located i...",user_20325,0.139524,c2lp5vvf96582n3jpyw1uet02


In [5]:
# Encode the place types and place ids
type_encoder = LabelEncoder()
df['types_encoded'] = type_encoder.fit_transform(df['types'])

place_encoder = LabelEncoder()
df['place_id_encoded'] = place_encoder.fit_transform(df['id'])

# Prepare the dataset
X = df[['types_encoded', 'review']].copy()
y = df['place_id_encoded'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model
class RecommenderModel(tf.keras.Model):
    def __init__(self, num_places, num_types, embedding_dim):
        super().__init__()
        self.type_embedding = tf.keras.layers.Embedding(num_types, embedding_dim)
        self.review_vectorizer = tf.keras.layers.TextVectorization(max_tokens=10000, output_mode='tf_idf')
        self.dense = tf.keras.layers.Dense(128, activation='relu')
        self.output_layer = tf.keras.layers.Dense(num_places, activation='softmax')
        self.review_vectorizer.adapt(X_train['review'])

    def call(self, inputs):
        type_embedding = self.type_embedding(inputs['types_encoded'])
        review_embedding = self.review_vectorizer(inputs['review'])
        x = tf.concat([type_embedding, review_embedding], axis=1)
        x = self.dense(x)
        return self.output_layer(x)

# Initialize the model
num_places = df['place_id_encoded'].nunique()
num_types = df['types_encoded'].nunique()
embedding_dim = 50

model = RecommenderModel(num_places, num_types, embedding_dim)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Create a custom data generator
def data_generator(X, y, batch_size=2):
    num_samples = len(X)
    while True:
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            batch_indices = indices[start:end]
            batch_X = {
                'types_encoded': np.array(X.iloc[batch_indices]['types_encoded']),
                'review': np.array(X.iloc[batch_indices]['review'])
            }
            batch_y = np.array(y[batch_indices])
            yield batch_X, batch_y

# Train the model
train_gen = data_generator(X_train, y_train)
test_gen = data_generator(X_test, y_test)

model.fit(train_gen, steps_per_epoch=len(X_train) // 2, epochs=10, validation_data=test_gen, validation_steps=len(X_test) // 2)

2024-06-20 21:20:21.244455: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-20 21:20:21.622037: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-20 21:20:21.622102: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-20 21:20:21.627589: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-20 21:20:21.627801: I external/local_xla/xla/stream_executor

Epoch 1/10


2024-06-20 21:21:07.488040: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fb840033140 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-06-20 21:21:07.488117: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2024-06-20 21:21:07.518800: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-06-20 21:21:07.575669: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1718893267.670746   10201 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fb9a938eb50>

In [184]:

# Make recommendations
def recommend(user_cuid, model, df, place_encoder, type_encoder):
    user_reviews = df[df['user_cuid'] == user_cuid]['review'].values
    user_types = df[df['user_cuid'] == user_cuid]['types'].values
    
    recommendations = []
    for review, place_type in zip(user_reviews, user_types):
        type_encoded = type_encoder.transform([place_type])
        user_input = {
            'types_encoded': np.array(type_encoded),
            'review': np.array([review])
        }
        predictions = model.predict(user_input)
        recommended_place_ids = np.argsort(predictions[0])[-5:][::-1]
        recommendations.extend(place_encoder.inverse_transform(recommended_place_ids))
        # drop duplicates
        recommendations = list(set(recommendations))
    
    return recommendations

# Example of making recommendations
selected_cuid = "cdmnbu1268093imcrbhl9wsu9"
user_cuid = df.sample(1)['user_cuid'].values[0]

df_place = pd.read_csv('final-dataset/main_dataset.csv')

recommendations = recommend(user_cuid, model, df, place_encoder, type_encoder)
print(f"Recommended places for {user_cuid}:")
# print that user reviews
print(df[df['user_cuid'] == user_cuid]['review'].values)
# print name based on place_id
for place_id in recommendations:
    print(df_place[df_place['id'] == place_id]['name'].values)
    
    
    

Recommended places for c0s9bsiw55816xsqzxh3pa02x:
['Nice']
['Batur Water Park']
[]
['X-ecutive Karaoke']
["Jay's Villas Umalas"]
['Pepe Bocelli Family Karaoke']


In [186]:
model.save('collab_modelVwhatever.keras')