In [177]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
import pickle as pkl

In [178]:
df_tourism_rating = pd.read_csv('../../Dataset/Tourism Rating/raw/tourism_rating.csv')
df_tourism  = pd.read_csv('../../Dataset/Tourism/tourism_encoded.csv')
df_user = pd.read_csv('../../Dataset/User/raw/user.csv')

In [179]:
df_tourism.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,...,_1,Rating_Count,Alam,Hiburan,Kuliner,Olahraga,Rekreasi,Religius,Sejarah Edukasi,Seni Budaya
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Sejarah Edukasi,Jakarta,20000,46,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-61753924,...,1,18,0,0,0,0,0,0,1,0
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...","Sejarah Edukasi,Seni Budaya,Kuliner",Jakarta,0,46,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-61376448,...,2,25,0,0,1,0,0,0,1,1
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,"Rekreasi,Hiburan",Jakarta,270000,46,360.0,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-61253124,...,3,19,0,1,0,0,1,0,0,0
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,"Sejarah Edukasi,Seni Budaya,Rekreasi",Jakarta,10000,45,,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-63024459,...,4,21,0,0,0,0,1,0,1,1
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,"Rekreasi,Olahraga",Jakarta,94000,45,60.0,"{'lat': -6.12419, 'lng': 106.839134}",-612419,...,5,24,0,0,0,1,1,0,0,0


In [180]:
df_tourism['Category'] = df_tourism['Category'].apply(lambda x: x.replace(',', ' '))
df_tourism['Tags'] = df_tourism['Description'] + ' ' + df_tourism['Category'] + ' ' + df_tourism['City']

In [181]:
df_tourism.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,...,Rating_Count,Alam,Hiburan,Kuliner,Olahraga,Rekreasi,Religius,Sejarah Edukasi,Seni Budaya,Tags
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Sejarah Edukasi,Jakarta,20000,46,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-61753924,...,18,0,0,0,0,0,0,1,0,Monumen Nasional atau yang populer disingkat d...
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Sejarah Edukasi Seni Budaya Kuliner,Jakarta,0,46,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-61376448,...,25,0,0,1,0,0,0,1,1,"Kota tua di Jakarta, yang juga bernama Kota Tu..."
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Rekreasi Hiburan,Jakarta,270000,46,360.0,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-61253124,...,19,0,1,0,0,1,0,0,0,Dunia Fantasi atau disebut juga Dufan adalah t...
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Sejarah Edukasi Seni Budaya Rekreasi,Jakarta,10000,45,,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-63024459,...,21,0,0,0,0,1,0,1,1,Taman Mini Indonesia Indah merupakan suatu kaw...
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Rekreasi Olahraga,Jakarta,94000,45,60.0,"{'lat': -6.12419, 'lng': 106.839134}",-612419,...,24,0,0,0,1,1,0,0,0,Atlantis Water Adventure atau dikenal dengan A...


In [182]:
new_df_tourism = df_tourism[['Place_Id', 'Place_Name', 'Tags']]
new_df_tourism.head()

Unnamed: 0,Place_Id,Place_Name,Tags
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu..."
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...


In [183]:
stopwords = pd.read_csv('../../Dataset/Stop Words/stopwordbahasa.csv')
stopwords_list = stopwords.iloc[:,0].tolist()

In [184]:
cv = CountVectorizer(max_features = 1000, lowercase=True, stop_words=stopwords_list)
vectors = cv.fit_transform(new_df_tourism['Tags']).toarray()
vectors.shape



(437, 1000)

In [185]:
cv.get_feature_names_out()

array(['00', '000', '10', '11', '12', '13', '14', '15', '150', '16', '17',
       '18', '19', '1945', '1973', '1980', '20', '200', '2000', '2001',
       '2002', '2003', '2004', '2006', '2010', '2014', '2015', '2016',
       '2017', '2018', '2019', '23', '25', '27', '29', '30', '40', '45',
       '50', '500', '60', '600', '70', '_x000d_', 'abad', 'abang',
       'acara', 'ada', 'administrasi', 'administratif', 'afrika', 'agama',
       'agro', 'agung', 'agustus', 'ahmad', 'air', 'airnya', 'akibat',
       'akses', 'aktivitas', 'al', 'alam', 'alami', 'alamnya', 'alat',
       'aliran', 'alternatif', 'alun', 'ambarawa', 'an', 'anak', 'ancol',
       'aneka', 'anggrek', 'angin', 'angkatan', 'angklung', 'api',
       'april', 'arah', 'ardilla', 'area', 'areal', 'arena', 'arsitek',
       'arsitektur', 'art', 'asia', 'asing', 'asisten', 'asli', 'asri',
       'atap', 'atasnya', 'atlantis', 'atraksi', 'babakan', 'badak',
       'bagus', 'bahan', 'bahasa', 'balai', 'bali', 'bambu', 'bandung',

In [186]:
similarity = cosine_similarity(vectors)
similarity.shape

(437, 437)

In [187]:
pkl.dump(similarity, open('similarity.pkl', 'wb'))
pkl.dump(new_df_tourism, open('new_df_tourism.pkl', 'wb'))

In [188]:
def similarity_tourism(tourism_name):
    index = new_df_tourism[new_df_tourism['Place_Name'] == tourism_name].index[0]
    similarity_score = similarity[index]
    similarity_place = sorted(list(enumerate(similarity_score)),key=lambda x: x[1],reverse=True)[1:11]
    similarity_list = []    
    for i in similarity_place:
        similarity_list.append([df_tourism.iloc[i[0], 1]] + [i[1]])
    return similarity_list

In [189]:
similarity_place = similarity_tourism('Dunia Fantasi')
similarity_place

[['Taman Impian Jaya Ancol', 0.6504436355879908],
 ['Grand Indonesia Mall', 0.4387634544762783],
 ['Skyrink - Mall Taman Anggrek', 0.420599782339669],
 ['Taman Mini Indonesia Indah (TMII)', 0.408248290463863],
 ['Taman Legenda Keong Emas', 0.379941632827523],
 ['Pelabuhan Marina', 0.3783937343321347],
 ['Kidzania', 0.37688918072220445],
 ['The Escape Hunt', 0.3763089045031908],
 ['Margasatwa Muara Angke', 0.3662599370166148],
 ['Pulau Pari', 0.35176323534072423]]

In [190]:
df_collaborative = df_tourism_rating.copy()
df_collaborative

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4
...,...,...,...
9995,300,425,2
9996,300,64,4
9997,300,311,3
9998,300,279,4


In [191]:
user_id = df_collaborative['User_Id'].unique().tolist()
user_id_encoded = {value: index for index, value in enumerate(user_id)}
print(user_id_encoded)

place_id = df_collaborative['Place_Id'].unique().tolist()
place_id_encoded = {value: index for index, value in enumerate(place_id)}
print(place_id_encoded)


{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 32: 31, 33: 32, 34: 33, 35: 34, 36: 35, 37: 36, 38: 37, 39: 38, 40: 39, 41: 40, 42: 41, 43: 42, 44: 43, 45: 44, 46: 45, 47: 46, 48: 47, 49: 48, 50: 49, 51: 50, 52: 51, 53: 52, 54: 53, 55: 54, 56: 55, 57: 56, 58: 57, 59: 58, 60: 59, 61: 60, 62: 61, 63: 62, 64: 63, 65: 64, 66: 65, 67: 66, 68: 67, 69: 68, 70: 69, 71: 70, 72: 71, 73: 72, 74: 73, 75: 74, 76: 75, 77: 76, 78: 77, 79: 78, 80: 79, 81: 80, 82: 81, 83: 82, 84: 83, 85: 84, 86: 85, 87: 86, 88: 87, 89: 88, 90: 89, 91: 90, 92: 91, 93: 92, 94: 93, 95: 94, 96: 95, 97: 96, 98: 97, 99: 98, 100: 99, 101: 100, 102: 101, 103: 102, 104: 103, 105: 104, 106: 105, 107: 106, 108: 107, 109: 108, 110: 109, 111: 110, 112: 111, 113: 112, 114: 113, 115: 114, 116: 115, 117: 116, 118: 117, 119: 118, 120: 119, 121: 120, 122: 12

In [192]:
scaler = MinMaxScaler()
place_ratings = df_collaborative['Place_Ratings'].tolist()
place_ratings_scaled = scaler.fit_transform(np.array(place_ratings).reshape(-1, 1))
df_collaborative['Place_Ratings'] = place_ratings_scaled


In [193]:
df_collaborative['User'] = df_collaborative['User_Id'].map(user_id_encoded)
df_collaborative['Place'] = df_collaborative['Place_Id'].map(place_id_encoded)
df_collaborative

Unnamed: 0,User_Id,Place_Id,Place_Ratings,User,Place
0,1,179,0.50,0,0
1,1,344,0.25,0,1
2,1,5,1.00,0,2
3,1,373,0.50,0,3
4,1,101,0.75,0,4
...,...,...,...,...,...
9995,300,425,0.25,299,324
9996,300,64,0.75,299,132
9997,300,311,0.50,299,348
9998,300,279,0.75,299,290


In [194]:
x_train, x_test, y_train, y_test = train_test_split(df_collaborative[['User', 'Place']].values, df_collaborative['Place_Ratings'].values, test_size=0.2, random_state=42,shuffle=True)

In [195]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((8000, 2), (2000, 2), (8000,), (2000,))

In [196]:
def build_recommender_net(num_users, num_place, embedding_size):
    input_layer = tf.keras.layers.Input(shape=(2,), name='input_layer')

    user_input = input_layer[:,:1]
    place_input = input_layer[:,1:]

    user_embedding = tf.keras.layers.Embedding(
        num_users,
        embedding_size,
        embeddings_initializer='he_normal',
        embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
    )(user_input)
    user_bias = tf.keras.layers.Embedding(num_users, 1)(user_input)

    place_embedding = tf.keras.layers.Embedding(
        num_place,
        embedding_size,
        embeddings_initializer='he_normal',
        embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
    )(place_input)
    place_bias = tf.keras.layers.Embedding(num_place, 1)(place_input)

    dot_user_place = tf.keras.layers.Dot(axes=2)([user_embedding, place_embedding])

    x = tf.keras.layers.Add()([dot_user_place, user_bias, place_bias])
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=input_layer, outputs=output)
    return model

In [197]:
model = build_recommender_net(num_users=len(user_id_encoded), num_place=len(place_id_encoded), embedding_size=128)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [198]:
model.fit(x_train, y_train, batch_size=128,epochs=100, verbose=2, validation_data=(x_test, y_test))

Epoch 1/100


63/63 - 2s - loss: 0.6931 - root_mean_squared_error: 0.3454 - val_loss: 0.6939 - val_root_mean_squared_error: 0.3435 - 2s/epoch - 30ms/step
Epoch 2/100
63/63 - 0s - loss: 0.6633 - root_mean_squared_error: 0.3236 - val_loss: 0.7307 - val_root_mean_squared_error: 0.3670 - 405ms/epoch - 6ms/step
Epoch 3/100
63/63 - 0s - loss: 0.5318 - root_mean_squared_error: 0.2146 - val_loss: 0.7969 - val_root_mean_squared_error: 0.3972 - 373ms/epoch - 6ms/step
Epoch 4/100
63/63 - 0s - loss: 0.4507 - root_mean_squared_error: 0.1282 - val_loss: 0.8472 - val_root_mean_squared_error: 0.4135 - 360ms/epoch - 6ms/step
Epoch 5/100
63/63 - 0s - loss: 0.4218 - root_mean_squared_error: 0.0990 - val_loss: 0.8829 - val_root_mean_squared_error: 0.4218 - 396ms/epoch - 6ms/step
Epoch 6/100
63/63 - 0s - loss: 0.4120 - root_mean_squared_error: 0.0937 - val_loss: 0.9221 - val_root_mean_squared_error: 0.4289 - 438ms/epoch - 7ms/step
Epoch 7/100
63/63 - 0s - loss: 0.4102 - root_mean_squared_error: 0.0962 - val_loss: 0.9269

<keras.callbacks.History at 0x1db56253b20>

In [199]:
user_id = df_tourism_rating['User_Id'].sample(1).iloc[0]
place_visited = df_tourism_rating[df_tourism_rating['User_Id'] == user_id]
place_visited.head()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
2263,70,196,5
2264,70,161,5
2265,70,245,1
2266,70,227,5
2267,70,437,4


In [200]:
place_not_visited = df_tourism[~df_tourism['Place_Id'].isin(place_visited['Place_Id'])]['Place_Id']
place_not_visited = list(set(place_not_visited).intersection(set(place_id_encoded.keys())))
place_not_visited = [[place_id_encoded.get(x)] for x in place_not_visited]
user_encoder = user_id_encoded.get(user_id)
user_place_array = np.hstack(([[user_encoder]] * len(place_not_visited), place_not_visited))

In [201]:
ratings = model.predict(user_place_array).flatten()



In [202]:
top_ratings_indices = ratings.argsort()[-10:][::-1]
recommended_place_ids = [place_id_encoded.get(place_not_visited[x][0]) for x in top_ratings_indices]


In [203]:
print(f"Top 5 places that user id {user_id} has visited")
top_ratings_place = (place_visited.sort_values(by='Place_Ratings', ascending=False).head(5)['Place_Id'].values)
df_tourism_rows = df_tourism[df_tourism['Place_Id'].isin(top_ratings_place)]
for row in df_tourism_rows.itertuples():
  print("-",row.Place_Name)

print(f"\nRecommended places for user id {user_id}")
recommended_place = df_tourism[df_tourism['Place_Id'].isin(recommended_place_ids)]
for row in recommended_place.itertuples():
  print("-",row.Place_Name)


Top 5 places that user id 70 has visited
- Sindu Kusuma Edupark (SKE)
- Watu Lumbung
- Bukit Paralayang, Watugupit
- Pantai Ngrenehan
- NuArt Sculpture Park

Recommended places for user id 70
- Pulau Tidung
- Museum Sumpah Pemuda
- Jurang Tembelan Kanigoro
- Goa Pindul
- Museum Geologi Bandung
- Kampung Korea Bandung
- Tektona Waterpark
- Hutan Pinus Kayon
- Taman Buah Surabaya
- Kebun Bibit Wonorejo


In [204]:
model.save('model.h5')