In [1]:
# import library
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from text_processing.base import LowerCasing
from text_processing.cleaning import StopWords
from text_processing.normalization import Lemmatization, Stemming
from text_processing.representation import Tfidf, CosineSimilarity

[nltk_data] Downloading package punkt to C:\Users\Afan
[nltk_data]     Ramadhan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Afan
[nltk_data]     Ramadhan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Afan
[nltk_data]     Ramadhan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load the tourist-destination dataset from the local CSV file
df = pd.read_csv("Dataset/tourism_with_id.csv")

## **Exploratory Data Analysis**

### Check Basic Dataset Information

In [3]:
# preview the first 5 rows of the dataset
df.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long,Unnamed: 11,Unnamed: 12
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-6.175392,106.827153,,1
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-6.137645,106.817125,,2
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,360.0,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-6.125312,106.833538,,3
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-6.302446,106.895156,,4
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,60.0,"{'lat': -6.12419, 'lng': 106.839134}",-6.12419,106.839134,,5


In [4]:
# list the column names present in the dataset
df.columns

Index(['Place_Id', 'Place_Name', 'Description', 'Category', 'City', 'Price',
       'Rating', 'Time_Minutes', 'Coordinate', 'Lat', 'Long', 'Unnamed: 11',
       'Unnamed: 12'],
      dtype='object')

In [5]:
# show basic dataset information (column dtypes, non-null counts, memory usage)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Place_Id      437 non-null    int64  
 1   Place_Name    437 non-null    object 
 2   Description   437 non-null    object 
 3   Category      437 non-null    object 
 4   City          437 non-null    object 
 5   Price         437 non-null    int64  
 6   Rating        437 non-null    float64
 7   Time_Minutes  205 non-null    float64
 8   Coordinate    437 non-null    object 
 9   Lat           437 non-null    float64
 10  Long          437 non-null    float64
 11  Unnamed: 11   0 non-null      float64
 12  Unnamed: 12   437 non-null    int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 44.5+ KB


In [6]:
# count the number of unique values in each column
df.nunique()

Place_Id        437
Place_Name      437
Description     437
Category          6
City              5
Price            50
Rating           14
Time_Minutes     15
Coordinate      437
Lat             437
Long            437
Unnamed: 11       0
Unnamed: 12     437
dtype: int64

### Check Missing Data

In [7]:
# count the missing values in each column
df.isna().sum()

Place_Id          0
Place_Name        0
Description       0
Category          0
City              0
Price             0
Rating            0
Time_Minutes    232
Coordinate        0
Lat               0
Long              0
Unnamed: 11     437
Unnamed: 12       0
dtype: int64

### Check Duplicate Data

In [8]:
# show fully duplicated rows (the boolean mask from duplicated() is used directly)
df[df.duplicated()]

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long,Unnamed: 11,Unnamed: 12


### Check Descriptive Statistics

In [9]:
# descriptive statistics per column, transposed so that each column becomes a row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Place_Id,437.0,219.0,126.295289,1.0,110.0,219.0,328.0,437.0
Price,437.0,24652.173913,66446.374709,0.0,0.0,5000.0,20000.0,900000.0
Rating,437.0,4.442792,0.208587,3.4,4.3,4.5,4.6,5.0
Time_Minutes,205.0,82.609756,52.872339,10.0,45.0,60.0,120.0,360.0
Lat,437.0,-7.095438,0.727241,-8.197894,-7.74959,-7.020524,-6.829411,1.07888
Long,437.0,109.160142,1.962848,103.931398,107.578369,110.237468,110.431869,112.821662
Unnamed: 11,0.0,,,,,,,
Unnamed: 12,437.0,219.0,126.295289,1.0,110.0,219.0,328.0,437.0


In [10]:
# remove the columns that the recommender does not use
drop_cols = [
    'Price', 'Rating', 'Time_Minutes', 'Coordinate',
    'Lat', 'Long', 'Unnamed: 11', 'Unnamed: 12',
]

# NOTE: the drop is in place, so re-running this cell on the same frame raises KeyError
df.drop(columns=drop_cols, inplace=True)
df.head()

Unnamed: 0,Place_Id,Place_Name,Description,Category,City
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta


## **Data Preprocessing**

### Stopword, Stemming, and Lemmatization

In [11]:
# text-preprocessing function
def TextPreprocessing(text):
    """Run ``text`` through the project's preprocessing helpers and return the result.

    The helpers are applied in this fixed order, each receiving the previous
    step's output (descriptions follow the original author's notes):

    1. ``LowerCasing``   - lowercase the whole text
    2. ``StopWords``     - remove unimportant words such as conjunctions
    3. ``Stemming``      - strip affixes (e.g. mem..., ...kan, ber...)
    4. ``Lemmatization`` - convert words to their standard form
    """
    processed = text
    for step in (LowerCasing, StopWords, Stemming, Lemmatization):
        processed = step(processed)
    return processed

In [12]:
if not os.path.exists("content_based_filtering_data.csv"):
    # no cached file yet: build the 'Tags' column from the preprocessed descriptions
    df['Tags'] = df['Description'].apply(TextPreprocessing)
else:
    # reuse the dataset that was produced by a previous run
    df = pd.read_csv("content_based_filtering_data.csv")

### TF-IDF (Term Frequency-Inverse Document Frequency)

In [13]:
# weight the important words of each 'Tags' entry with TF-IDF via the project's Tfidf helper;
# the second return value is used later for .transform() and .get_feature_names_out()
tags_matrix, tags_tfidf = Tfidf(df['Tags'])
tags_matrix.shape

(437, 4529)

In [14]:
# TF-IDF result as a dense table (rows labelled by place name, columns by term); show 5 random rows
pd.DataFrame(
    tags_matrix.todense(),
    columns= tags_tfidf.get_feature_names_out(),
    index=df['Place_Name']
).sample(5)

Unnamed: 0_level_0,01,02,04,07,10,100,1000,102,106,1063,...,yunani,yustinus,zaman,zeeland,zeven,zheng,ziarah,zona,zoo,zuider
Place_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kawasan Malioboro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kebun Binatang Ragunan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Museum Nike Ardilla,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Taman Sejarah Bandung,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Perkebunan Teh Malabar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cosine Similarity

In [15]:
# measure the similarity between the text entries using the project's CosineSimilarity helper
cosim_tags = CosineSimilarity(tags_matrix)
cosim_tags

array([[1.        , 0.04610392, 0.07565753, ..., 0.02176716, 0.00763884,
        0.01081589],
       [0.04610392, 1.        , 0.02036696, ..., 0.02634686, 0.0171539 ,
        0.06215193],
       [0.07565753, 0.02036696, 1.        , ..., 0.02693263, 0.03989713,
        0.00962143],
       ...,
       [0.02176716, 0.02634686, 0.02693263, ..., 1.        , 0.02817126,
        0.04235235],
       [0.00763884, 0.0171539 , 0.03989713, ..., 0.02817126, 1.        ,
        0.10957445],
       [0.01081589, 0.06215193, 0.00962143, ..., 0.04235235, 0.10957445,
        1.        ]])

In [16]:
# cosine-similarity result as a place-by-place table; show 5 random rows
pd.DataFrame(
    cosim_tags,
    index=df['Place_Name'],
    columns=df['Place_Name'] 
).sample(5)

Place_Name,Monumen Nasional,Kota Tua,Dunia Fantasi,Taman Mini Indonesia Indah (TMII),Atlantis Water Adventure,Taman Impian Jaya Ancol,Kebun Binatang Ragunan,Ocean Ecopark,Pelabuhan Marina,Pulau Tidung,...,Monumen Bambu Runcing Surabaya,House of Sampoerna,Atlantis Land Surabaya,Taman Hiburan Rakyat,Taman Mundu,Museum Mpu Tantular,Taman Bungkul,Taman Air Mancur Menari Kenjeran,Taman Flora Bratang Surabaya,Gereja Perawan Maria Tak Berdosa Surabaya
Place_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Museum TNI AL Loka Jala Crana,0.042505,0.080333,0.020263,0.0,0.006528,0.011542,0.035996,0.0,0.0,0.0,...,0.013702,0.047514,0.031155,0.011732,0.019499,0.153974,0.006444,0.039501,0.004315,0.0
Situs Warungboto,0.009444,0.015735,0.006948,0.034209,0.005888,0.0,0.011124,0.003757,0.011068,0.058313,...,0.036747,0.059577,0.028323,0.018094,0.022408,0.028987,0.023725,0.0,0.017071,0.036168
Monumen Nasional,1.0,0.046104,0.075658,0.033091,0.005045,0.017806,0.020535,0.004715,0.017602,0.017497,...,0.210815,0.04877,0.010892,0.026666,0.036399,0.039517,0.040869,0.021767,0.007639,0.010816
Gunung Lalakon,0.003955,0.021552,0.009462,0.017346,0.010517,0.0,0.027971,0.018546,0.012716,0.018567,...,0.03997,0.012811,0.028097,0.007158,0.011843,0.00762,0.019193,0.007993,0.002069,0.004534
Museum Benteng Vredeburg Yogyakarta,0.013507,0.062474,0.01531,0.031656,0.0,0.0,0.008284,0.0,0.0,0.010756,...,0.009235,0.043991,0.018322,0.020319,0.012238,0.107036,0.00705,0.0,0.0,0.040182


In [17]:
# save the processed dataset
# The raw 'Description' column is dropped first; errors="ignore" keeps the cell
# re-runnable and also covers the case where df was loaded from the cached CSV,
# which no longer contains that column.
df = df.drop(columns=['Description'], errors='ignore')

# Previously the CSV was only written from inside a bare `except:` (i.e. only when
# the drop failed), so a first run never produced the cache file. Write it whenever
# it is missing so the next run can load it instead of re-preprocessing.
if not os.path.exists("content_based_filtering_data.csv"):
    df.to_csv("content_based_filtering_data.csv", index=False)
df.head()

Unnamed: 0,Place_Id,Place_Name,Category,City,Tags
0,1,Monumen Nasional,Budaya,Jakarta,monumen nasional populer singkat mona tugu mon...
1,2,Kota Tua,Budaya,Jakarta,kota tua jakarta nama kota tua pusat fatahilla...
2,3,Dunia Fantasi,Taman Hiburan,Jakarta,dunia fantasi dufan hibur letak kawasan taman ...
3,4,Taman Mini Indonesia Indah (TMII),Taman Hiburan,Jakarta,taman mini indonesia indah kawasan taman wisat...
4,5,Atlantis Water Adventure,Taman Hiburan,Jakarta,atlantis water adventure kenal atlantis ancol ...


## **Demo of Recommended Tourist Destinations**

In [18]:
# tourist-destination recommender based on content-based filtering
def travel_recommendations(inputDescription, inputCity, inputCategory, top_n=3):
    """Recommend destinations whose text best matches a free-text description.

    The description is preprocessed with ``TextPreprocessing``, vectorised with
    the fitted ``tags_tfidf`` and compared against every row of ``tags_matrix``
    using cosine similarity. Only rows of ``df`` whose ``City`` and ``Category``
    equal the given values are eligible.

    Args:
        inputDescription: Free-text description of the desired destination.
        inputCity: Value compared for equality against ``df['City']``.
        inputCategory: Value compared for equality against ``df['Category']``.
        top_n: Maximum number of recommendations to return.

    Returns:
        An empty list when no row matches the city/category filter. Otherwise a
        dict with the keys ``"Name"``, ``"City"``, ``"Category"`` and
        ``"Probability"``, each holding a list ordered from most to least
        similar. ``"Probability"`` is a destination's similarity expressed as a
        percentage of the similarity summed over all destinations, rounded to
        two decimals.
    """
    # preprocess and vectorise the input description, then score it against every destination
    input_pre_desc = TextPreprocessing(inputDescription)
    input_tfidf_desc = tags_tfidf.transform([input_pre_desc])
    cosim = cosine_similarity(input_tfidf_desc, tags_matrix)[0]

    # Row positions matching both city and category. Positions (not index labels)
    # are used because `cosim` is positional; assumes tags_matrix rows follow the
    # row order of df.
    matches = (df['Category'] == inputCategory) & (df['City'] == inputCity)
    filtered_indices = np.flatnonzero(matches.to_numpy())

    # nothing in the requested city/category
    if filtered_indices.size == 0:
        return []

    # similarity of the eligible rows only, highest first, truncated to top_n
    filtered_similarity = cosim[filtered_indices]
    sorted_order = np.argsort(filtered_similarity)[::-1][:top_n]
    recommendation = filtered_indices[sorted_order]

    # Share of the total similarity, in percent. When the description shares no
    # vocabulary with any destination the total is 0; report 0.0 instead of
    # dividing by zero and returning NaN.
    total = np.sum(cosim)
    prob = (cosim / total) * 100 if total != 0 else np.zeros_like(cosim)

    # collect name, city, category and probability of the recommended rows
    picked = df.iloc[recommendation]
    return {
        "Name": picked['Place_Name'].tolist(),
        "City": picked['City'].tolist(),
        "Category": picked['Category'].tolist(),
        "Probability": [round(p, 2) for p in prob[recommendation]],
    }

In [19]:
# lookup tables mapping a 0-based position to each sorted unique category / city name
category_dict = dict(enumerate(sorted(df['Category'].unique().tolist())))
city_dict = dict(enumerate(sorted(df['City'].unique().tolist())))

# decode a category index into its name
def decode_category(idx):
    """Return the name stored in ``category_dict`` under a key equal to ``idx``.

    Raises:
        IndexError: if no key of ``category_dict`` equals ``idx``.
    """
    matches = []
    for key, name in category_dict.items():
        if idx == key:
            matches.append(name)
    return matches[0]

# decode a city index into its name
def decode_city(idx):
    """Return the name stored in ``city_dict`` under a key equal to ``idx``.

    Raises:
        IndexError: if no key of ``city_dict`` equals ``idx``.
    """
    matches = []
    for key, name in city_dict.items():
        if idx == key:
            matches.append(name)
    return matches[0]

In [20]:
# text interface for the tourist-destination recommender
def view_result():
    """Run one interactive recommendation round on the console.

    Prints the numbered city and category lists, reads the user's choices
    (1-based list numbers) and a free-text description from standard input,
    then prints the destinations returned by ``travel_recommendations``.

    Raises:
        ValueError: if the city or category input is not an integer.
        IndexError: if the number entered is not in the printed list
            (raised by ``decode_city`` / ``decode_category``).
    """
    # show the available cities and categories
    print("="*50)
    print(" Tourist Destination Reccomendation ".center(50, " "))
    print("="*50)
    print("List City:")
    for i, cit in city_dict.items():
        print(f"{i+1}.".ljust(3, " ")+f"{cit}")
    print("-"*50)
    print("List Category:")
    for i, cat in category_dict.items():
        print(f"{i+1}.".ljust(3, " ")+f"{cat}")
    print("_"*50)

    # user input
    city = int(input("Input City name   : "))  # list number of the city
    catg = int(input("Input Category    : "))  # list number of the category
    desc = input("Input Description : ")
    input_city = decode_city(city - 1)
    input_catg = decode_category(catg - 1)
    print("="*50)

    # results
    results = travel_recommendations(inputDescription=desc, inputCategory=input_catg, inputCity=input_city)
    print("Result:\n")
    if not results:
        # travel_recommendations returns an empty list when nothing matches the
        # selected city/category; indexing it by key would raise TypeError
        print("No destination matches the selected city and category.\n")
    else:
        # single quotes inside the replacement fields keep these f-strings valid
        # on Python versions before 3.12
        for i in range(len(results['Name'])):
            print(f"{i+1}.".ljust(3, " ") + f"Name        : {results['Name'][i]}")
            print(" "*3 + f"City        : {results['City'][i]}")
            print(" "*3 + f"Category    : {results['Category'][i]}")
            print(" "*3 + f"Probability : {results['Probability'][i]}")
            print()
    print("="*50)

# run the interactive demo (prompts for city, category and description on stdin)
view_result()

        Tourist Destination Reccomendation        
List City:
1. Bandung
2. Jakarta
3. Semarang
4. Surabaya
5. Yogyakarta
--------------------------------------------------
List Category:
1. Bahari
2. Budaya
3. Cagar Alam
4. Pusat Perbelanjaan
5. Taman Hiburan
6. Tempat Ibadah
__________________________________________________
Result:

1. Name        : Pantai Drini
   City        : Yogyakarta
   Category    : Bahari
   Probability : 2.92

2. Name        : Pantai Kukup
   City        : Yogyakarta
   Category    : Bahari
   Probability : 2.86

3. Name        : Pantai Ngrawe (Mesra)
   City        : Yogyakarta
   Category    : Bahari
   Probability : 2.85

