In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
import pickle as pkl

In [2]:
df_tourism_rating = pd.read_csv('../../../Dataset/Tourism Rating/raw/tourism_rating.csv')
df_tourism  = pd.read_csv('../../../Dataset/Tourism/tourism.csv')
df_user = pd.read_csv('../../../Dataset/User/raw/user.csv')

In [3]:
df_tourism.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Lat,Long,Rating_Count
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Sejarah Edukasi,Jakarta,20000,4.6,15.0,-6.175392,106.827153,18
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...","Sejarah Edukasi,Seni Budaya,Kuliner",Jakarta,0,4.6,90.0,-6.137645,106.817125,25
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,"Rekreasi,Hiburan",Jakarta,270000,4.6,360.0,-6.125312,106.833538,19
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,"Sejarah Edukasi,Seni Budaya,Rekreasi",Jakarta,10000,4.5,,-6.302446,106.895156,21
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,"Rekreasi,Olahraga",Jakarta,94000,4.5,60.0,-6.12419,106.839134,24


In [4]:
df_tourism['Category'] = df_tourism['Category'].apply(lambda x: x.replace(',', ' '))
df_tourism['Tags'] = df_tourism['Description'] + ' ' + df_tourism['Category'] + ' ' + df_tourism['City']

In [5]:
df_tourism.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Lat,Long,Rating_Count,Tags
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Sejarah Edukasi,Jakarta,20000,4.6,15.0,-6.175392,106.827153,18,Monumen Nasional atau yang populer disingkat d...
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Sejarah Edukasi Seni Budaya Kuliner,Jakarta,0,4.6,90.0,-6.137645,106.817125,25,"Kota tua di Jakarta, yang juga bernama Kota Tu..."
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Rekreasi Hiburan,Jakarta,270000,4.6,360.0,-6.125312,106.833538,19,Dunia Fantasi atau disebut juga Dufan adalah t...
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Sejarah Edukasi Seni Budaya Rekreasi,Jakarta,10000,4.5,,-6.302446,106.895156,21,Taman Mini Indonesia Indah merupakan suatu kaw...
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Rekreasi Olahraga,Jakarta,94000,4.5,60.0,-6.12419,106.839134,24,Atlantis Water Adventure atau dikenal dengan A...


In [6]:
new_df_tourism = df_tourism[['Place_Id', 'Place_Name', 'Tags']]
new_df_tourism.head()

Unnamed: 0,Place_Id,Place_Name,Tags
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu..."
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...


In [7]:
stopwords = pd.read_csv('../../../Dataset/Stop Words/stopwordbahasa.csv')
stopwords_list = stopwords.iloc[:,0].tolist()

In [8]:
cv = CountVectorizer(max_features = 5000, lowercase=True, stop_words=stopwords_list)
vectors = cv.fit_transform(new_df_tourism['Tags']).toarray()
vectors.shape



(437, 5000)

In [9]:
print(vectors[0,1000:2000])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [10]:
cv.get_feature_names_out()

array(['00', '000', '01', ..., 'ꦭꦮ', 'ꦱꦔ', 'ꦱꦩꦱ'], dtype=object)

In [11]:
similarity = cosine_similarity(vectors)
similarity.shape

(437, 437)

In [12]:
pkl.dump(similarity, open('pickles/similarity.pkl', 'wb'))
pkl.dump(new_df_tourism, open('pickles/new_df_tourism.pkl', 'wb'))

In [13]:
def similarity_tourism(tourism_name):
    index = new_df_tourism[new_df_tourism['Place_Name'] == tourism_name].index[0]
    similarity_score = similarity[index]
    similarity_place = sorted(list(enumerate(similarity_score)),key=lambda x: x[1],reverse=True)[1:11]
    similarity_list = []    
    for i in similarity_place:
        similarity_list.append([df_tourism.iloc[i[0], 1]] + [i[1]])
    return similarity_list

In [14]:
similarity_place = similarity_tourism('Masjid Istiqlal')
similarity_place

[['Masjid Raya Bandung', 0.5941924364659286],
 ['Masjid Pusdai', 0.5839673225796117],
 ['Masjid Nasional Al-Akbar', 0.5619082831750656],
 ['Masjid Muhammad Cheng Hoo', 0.5055458603111107],
 ['Masjid Salman ITB', 0.48997929952961095],
 ['Masjid Daarut Tauhiid Bandung', 0.4816832620448062],
 ['Masjid Agung Ungaran', 0.45893200206502166],
 ['Masjid Agung Trans Studio Bandung', 0.43399360204375065],
 ['Masjid Kapal Semarang', 0.40064042750948253],
 ['Masjid Al-Imtizaj', 0.3853185038734673]]

In [15]:
df_collaborative = df_tourism_rating.copy()
df_collaborative

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4
...,...,...,...
9995,300,425,2
9996,300,64,4
9997,300,311,3
9998,300,279,4


In [16]:
user_id = df_collaborative['User_Id'].unique().tolist()
user_id_encoded = {value: index for index, value in enumerate(user_id)}
print(user_id_encoded)

place_id = df_collaborative['Place_Id'].unique().tolist()
place_id_encoded = {value: index for index, value in enumerate(place_id)}
print(place_id_encoded)


{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 32: 31, 33: 32, 34: 33, 35: 34, 36: 35, 37: 36, 38: 37, 39: 38, 40: 39, 41: 40, 42: 41, 43: 42, 44: 43, 45: 44, 46: 45, 47: 46, 48: 47, 49: 48, 50: 49, 51: 50, 52: 51, 53: 52, 54: 53, 55: 54, 56: 55, 57: 56, 58: 57, 59: 58, 60: 59, 61: 60, 62: 61, 63: 62, 64: 63, 65: 64, 66: 65, 67: 66, 68: 67, 69: 68, 70: 69, 71: 70, 72: 71, 73: 72, 74: 73, 75: 74, 76: 75, 77: 76, 78: 77, 79: 78, 80: 79, 81: 80, 82: 81, 83: 82, 84: 83, 85: 84, 86: 85, 87: 86, 88: 87, 89: 88, 90: 89, 91: 90, 92: 91, 93: 92, 94: 93, 95: 94, 96: 95, 97: 96, 98: 97, 99: 98, 100: 99, 101: 100, 102: 101, 103: 102, 104: 103, 105: 104, 106: 105, 107: 106, 108: 107, 109: 108, 110: 109, 111: 110, 112: 111, 113: 112, 114: 113, 115: 114, 116: 115, 117: 116, 118: 117, 119: 118, 120: 119, 121: 120, 122: 12

In [17]:
scaler = MinMaxScaler()
place_ratings = df_collaborative['Place_Ratings'].tolist()
place_ratings_scaled = scaler.fit_transform(np.array(place_ratings).reshape(-1, 1))
df_collaborative['Place_Ratings'] = place_ratings_scaled


In [18]:
df_collaborative['User'] = df_collaborative['User_Id'].map(user_id_encoded)
df_collaborative['Place'] = df_collaborative['Place_Id'].map(place_id_encoded)
df_collaborative

Unnamed: 0,User_Id,Place_Id,Place_Ratings,User,Place
0,1,179,0.50,0,0
1,1,344,0.25,0,1
2,1,5,1.00,0,2
3,1,373,0.50,0,3
4,1,101,0.75,0,4
...,...,...,...,...,...
9995,300,425,0.25,299,324
9996,300,64,0.75,299,132
9997,300,311,0.50,299,348
9998,300,279,0.75,299,290


In [19]:
x_train, x_test, y_train, y_test = train_test_split(df_collaborative[['User', 'Place']].values, df_collaborative['Place_Ratings'].values, test_size=0.2, random_state=42,shuffle=True)

In [20]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((8000, 2), (2000, 2), (8000,), (2000,))

In [21]:
def build_recommender_net(num_users, num_place, embedding_size):
    input_layer = tf.keras.layers.Input(shape=(2,), name='input_layer')

    user_input = input_layer[:,:1]
    place_input = input_layer[:,1:]

    user_embedding = tf.keras.layers.Embedding(
        num_users,
        embedding_size,
        embeddings_initializer='he_normal',
        embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
    )(user_input)
    user_bias = tf.keras.layers.Embedding(num_users, 1)(user_input)

    place_embedding = tf.keras.layers.Embedding(
        num_place,
        embedding_size,
        embeddings_initializer='he_normal',
        embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
    )(place_input)
    place_bias = tf.keras.layers.Embedding(num_place, 1)(place_input)

    dot_user_place = tf.keras.layers.Dot(axes=2)([user_embedding, place_embedding])

    x = tf.keras.layers.Add()([dot_user_place, user_bias, place_bias])
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=input_layer, outputs=output)
    return model

In [22]:
model = build_recommender_net(num_users=len(user_id_encoded), num_place=len(place_id_encoded), embedding_size=128)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [24]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_layer (InputLayer)       [(None, 2)]          0           []                               
                                                                                                  
 tf.__operators__.getitem (Slic  (None, 1)           0           ['input_layer[0][0]']            
 ingOpLambda)                                                                                     
                                                                                                  
 tf.__operators__.getitem_1 (Sl  (None, 1)           0           ['input_layer[0][0]']            
 icingOpLambda)                                                                                   
                                                                                              

In [24]:
model.fit(x_train, y_train, batch_size=128,epochs=100, verbose=2, validation_data=(x_test, y_test))

Epoch 1/100
63/63 - 3s - loss: 0.6931 - root_mean_squared_error: 0.3454 - val_loss: 0.6941 - val_root_mean_squared_error: 0.3437 - 3s/epoch - 55ms/step
Epoch 2/100
63/63 - 1s - loss: 0.6797 - root_mean_squared_error: 0.3358 - val_loss: 0.7046 - val_root_mean_squared_error: 0.3507 - 690ms/epoch - 11ms/step
Epoch 3/100
63/63 - 1s - loss: 0.5947 - root_mean_squared_error: 0.2724 - val_loss: 0.7746 - val_root_mean_squared_error: 0.3859 - 709ms/epoch - 11ms/step
Epoch 4/100
63/63 - 1s - loss: 0.5360 - root_mean_squared_error: 0.2267 - val_loss: 0.7828 - val_root_mean_squared_error: 0.3905 - 633ms/epoch - 10ms/step
Epoch 5/100
63/63 - 1s - loss: 0.5027 - root_mean_squared_error: 0.1965 - val_loss: 0.8212 - val_root_mean_squared_error: 0.4042 - 616ms/epoch - 10ms/step
Epoch 6/100
63/63 - 1s - loss: 0.4836 - root_mean_squared_error: 0.1782 - val_loss: 0.8373 - val_root_mean_squared_error: 0.4083 - 622ms/epoch - 10ms/step
Epoch 7/100
63/63 - 1s - loss: 0.4630 - root_mean_squared_error: 0.1560 -

<keras.callbacks.History at 0x27fbc27d040>

In [25]:
user_id = 1
place_visited = df_tourism_rating[df_tourism_rating['User_Id'] == user_id]
place_visited.head()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4


In [26]:
place_not_visited = df_tourism[~df_tourism['Place_Id'].isin(place_visited['Place_Id'])]['Place_Id']
place_not_visited = list(set(place_not_visited).intersection(set(place_id_encoded.keys())))
place_not_visited = [[place_id_encoded.get(x)] for x in place_not_visited]
user_encoder = user_id_encoded.get(user_id)
user_place_array = np.hstack(([[user_encoder]] * len(place_not_visited), place_not_visited))

In [27]:
ratings = model.predict(user_place_array).flatten()



In [28]:
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_place_ids = [place_id_encoded.get(place_not_visited[x][0]) for x in top_ratings_indices]


In [29]:
print(f"\nRecommended places for user id {user_id}")
recommended_place = df_tourism[df_tourism['Place_Id'].isin(recommended_place_ids)]
for row in recommended_place.itertuples():
  print("-",row.Place_Id, ":", row.Place_Name)



Recommended places for user id 1
- 3 : Dunia Fantasi
- 27 : Sea World
- 87 : Sindu Kusuma Edupark (SKE)
- 159 : Hutan Pinus Asri
- 198 : Pantai Greweng
- 270 : Bukit Bintang
- 301 : Lereng Anteng Panoramic Coffee Place
- 358 : Masjid Kapal Semarang
- 375 : Kota Lama Semarang
- 421 : Museum Kesehatan Dr. Adhyatma


In [31]:
pkl.dump(df_tourism_rating, open('pickles/df_tourism_rating.pkl', 'wb'))
pkl.dump(df_tourism, open('pickles/df_tourism.pkl', 'wb'))
pkl.dump(place_id_encoded, open('pickles/place_id_encoded.pkl', 'wb'))
pkl.dump(user_id_encoded, open('pickles/user_id_encoded.pkl', 'wb'))

In [30]:
model.save('model/model.h5')