In [1]:
import pandas as pd
import numpy as np
import langid
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# Data Loading

In [2]:
review_url = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-Alaska.json.gz'
metadata_url = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/meta-Alaska.json.gz'

In [3]:
review = pd.read_json('alaska dataset/review-Alaska.json', lines=True)
meta = pd.read_json('alaska dataset/meta-Alaska.json', lines=True)

# Data Understanding

## Exploratory Data Analysis

In [4]:
review.head()

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,1.091298e+20,Nicki Gore,1566331951619,5.0,We always stay here when in Valdez for silver ...,,,0x56b646ed2220b77f:0xd8975e316de80952
1,1.082339e+20,Mitch Eichman,1503373018846,5.0,This was an amazing RV camping experience with...,,,0x56b646ed2220b77f:0xd8975e316de80952
2,1.127191e+20,Johnnie Anderson,1410062370985,5.0,Spent the summer of 2011. Had a wonderful time...,,,0x56b646ed2220b77f:0xd8975e316de80952
3,1.114239e+20,Eric Fox,1495241580499,5.0,My Wife and I have stayed at Bear Creek severa...,,,0x56b646ed2220b77f:0xd8975e316de80952
4,1.132409e+20,Allen Ratliff,1504917982385,5.0,Great campground for the price. Nice hot unlim...,,,0x56b646ed2220b77f:0xd8975e316de80952


In [5]:
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1051246 entries, 0 to 1051245
Data columns (total 8 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   user_id  1043116 non-null  float64
 1   name     1051246 non-null  object 
 2   time     1051246 non-null  int64  
 3   rating   1043116 non-null  float64
 4   text     639252 non-null   object 
 5   pics     44311 non-null    object 
 6   resp     119832 non-null   object 
 7   gmap_id  1051246 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 64.2+ MB


Pada data review ini, kita hanya menggunakan user_id, rating, dan gmap_id untuk model sistem rekomendasi kita. Data-data tersebut memiliki nilai null pada user_id dan rating

In [6]:
review[review['user_id'].isna()]

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
10784,,Hotels.com reviewer,1593475200000,,The cabin is quite new and very well appointed...,,,0x56cd250c30f1b975:0x9dd674c84d86e371
15270,,Expedia reviewer,1567468800000,,"This place was a little out of the way, but th...",,,0x56c68f06160d842f:0x5754eb340f3f4a89
15272,,Gunnar,1564012800000,,"Positiv: Kathy war eine sehr gute Gastgeberin,...",,,0x56c68f06160d842f:0x5754eb340f3f4a89
15273,,Jürgen,1533945600000,,Positiv: großes und gut ausgestattetes Zimmer;...,,,0x56c68f06160d842f:0x5754eb340f3f4a89
21695,,Hotels.com reviewer,1623628800000,,The bed and breakfast area is the lower level ...,,,0x56c897d182582593:0xbf0eb6a246c9b74d
...,...,...,...,...,...,...,...,...
1039559,,Expedia reviewer,1565136000000,,Outstanding location! Clean rooms that you can...,,,0x56cebe2660b2b109:0xbcc457abac27499
1039606,,Expedia reviewer,1472947200000,,Views from the pricey Mountainside are spectac...,,,0x56cebe2660b2b109:0xbcc457abac27499
1039685,,Travelocity reviewer,1437955200000,,I booked two rooms - even responded to an e-ma...,,,0x56cebe2660b2b109:0xbcc457abac27499
1039744,,Expedia reviewer,1474156800000,,We completely loved the lodge. Not just gorgeo...,,,0x56cebe2660b2b109:0xbcc457abac27499


Setelah melakukan analisis, sepertinya ada kesalahan dalam input data. Terlihat kita memiliki nomor berformat user_id pada kolom pertama dan untuk namanya kemungkinan besar review ini berasal dari third party app untuk memberikan review kepada google local

In [7]:
review[review['rating'].isna()]

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
10784,,Hotels.com reviewer,1593475200000,,The cabin is quite new and very well appointed...,,,0x56cd250c30f1b975:0x9dd674c84d86e371
15270,,Expedia reviewer,1567468800000,,"This place was a little out of the way, but th...",,,0x56c68f06160d842f:0x5754eb340f3f4a89
15272,,Gunnar,1564012800000,,"Positiv: Kathy war eine sehr gute Gastgeberin,...",,,0x56c68f06160d842f:0x5754eb340f3f4a89
15273,,Jürgen,1533945600000,,Positiv: großes und gut ausgestattetes Zimmer;...,,,0x56c68f06160d842f:0x5754eb340f3f4a89
21695,,Hotels.com reviewer,1623628800000,,The bed and breakfast area is the lower level ...,,,0x56c897d182582593:0xbf0eb6a246c9b74d
...,...,...,...,...,...,...,...,...
1039559,,Expedia reviewer,1565136000000,,Outstanding location! Clean rooms that you can...,,,0x56cebe2660b2b109:0xbcc457abac27499
1039606,,Expedia reviewer,1472947200000,,Views from the pricey Mountainside are spectac...,,,0x56cebe2660b2b109:0xbcc457abac27499
1039685,,Travelocity reviewer,1437955200000,,I booked two rooms - even responded to an e-ma...,,,0x56cebe2660b2b109:0xbcc457abac27499
1039744,,Expedia reviewer,1474156800000,,We completely loved the lodge. Not just gorgeo...,,,0x56cebe2660b2b109:0xbcc457abac27499


Selanjutnya mengalisis pada rating. Terlihat rating tidak memiliki nilai, tetapi kita memiliki kolom text review yang dapat digunakan untuk menganalisis sentimen dari text tersebut. Namun kita juga memilik tantangan lain yaitu selain bahasa inggris kita juga memiliki bahasa jerman.

Namun, jika diperhatikan kedua output dari kode pengecekan data user_id dan dan rating yang memiliki nilai null mengembalikan jumlah data yang sama selanjutnya kita akan memeriksa apakah kedua dataframe tersebut sama

In [8]:
review[review['user_id'].isna()].equals(review[review['rating'].isna()])

True

Terlihat dari output bahwa kedua dataframe tersebut sama

Dari hasil analisis ini kita memiliki kesimpulan kalau kolom text memiliki informasi yang penting oleh karena itu kita harus mengecek apakah kita memiliki text yang memiliki nilai null

In [9]:
review[review['user_id'].isna()]['text'].isna().sum()

0

Setelah dilakukan analisis, kolom text tidak memiliki nilai null

In [10]:
meta.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,Bear Creek Cabins & RV Park,"Bear Creek Cabins & RV Park, 3181 Richardson H...",0x56b646ed2220b77f:0xd8975e316de80952,,61.100644,-146.214552,"[RV park, Cabin rental agency, Campground]",4.5,18,,,,,"[0x56b6445fd9f9e387:0x6dd3d374ef56431a, 0x56b6...",https://www.google.com/maps/place//data=!4m2!3...
1,Anchorage Market,"Anchorage Market, 88th Ave, Anchorage, AK 99515",0x56c8992b5dee7225:0x9f7f4bf151868cf7,,61.141435,-149.868482,[Farmers' market],4.2,18,,"[[Thursday, Closed], [Friday, 10AM–5PM], [Satu...","{'Service options': ['In-store shopping'], 'Ac...",Closed ⋅ Opens 10AM Fri,,https://www.google.com/maps/place//data=!4m2!3...
2,Happy Camper RV,"Happy Camper RV, 1151 N Shenandoah Dr # 4, Pal...",0x56c8e0455225be87:0xf24828df75e2f8ae,,61.591855,-149.290657,[RV repair shop],4.4,28,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x56c8e104d9929a1d:0x2070ad63defadbf, 0x56c91...",https://www.google.com/maps/place//data=!4m2!3...
3,Cajun Corner,"Cajun Corner, 302 G St, Anchorage, AK 99501",0x56c8bdb5d91017cd:0xca19fd9afceed343,,61.219378,-149.895852,[American restaurant],4.5,24,,"[[Wednesday, 11AM–2PM], [Thursday, 11AM–2PM], ...","{'Service options': ['Takeout', 'Dine-in', 'De...",Closed ⋅ Opens 11AM Thu,,https://www.google.com/maps/place//data=!4m2!3...
4,Alaska General Seafoods,"Alaska General Seafoods, 980 Stedman St, Ketch...",0x540c251956395673:0x16f5a4fe26c18931,,55.336119,-131.630669,"[Seafood wholesaler, Food]",4.7,8,,"[[Wednesday, 7AM–11PM], [Thursday, 7AM–11PM], ...",,Open ⋅ Closes 11PM,"[0x540c25a882a72685:0xac5663d19d0a1893, 0x540c...",https://www.google.com/maps/place//data=!4m2!3...


In [11]:
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12774 entries, 0 to 12773
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              12774 non-null  object 
 1   address           12606 non-null  object 
 2   gmap_id           12774 non-null  object 
 3   description       1516 non-null   object 
 4   latitude          12774 non-null  float64
 5   longitude         12774 non-null  float64
 6   category          12712 non-null  object 
 7   avg_rating        12774 non-null  float64
 8   num_of_reviews    12774 non-null  int64  
 9   price             1585 non-null   object 
 10  hours             8684 non-null   object 
 11  MISC              9387 non-null   object 
 12  state             8062 non-null   object 
 13  relative_results  10966 non-null  object 
 14  url               12774 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 1.5+ MB


Pada data meta ini, kita hanya menggunakan name, gmap_id, category, dan avg_rating untuk model sistem rekomendasi kita. Data-data tersebut sudah bersih dari Null kecuali data category

Namun, setelah melakukan analisis terhadap avg_rating, kita melihat kolom num_of_review. Tentu menjadi sangat tidak adil jika mengurutkan tempat terbaik dari avg_rating saja tanpa ada bobot num_of_review

# Data Preparation

## Dataframe Review

In [12]:
# masukkan user_id yang null pada varibel website_reviewer ke csv untuk memudahkan persiapan pada data
website_reviewer = review[review['user_id'].isna()]
website_reviewer.to_csv('website_reviewer.csv')

Selanjutnya kita hanya perlu memperbaiki csv yang memiliki kesalahan input dengan menghapus koma di depan dan menambahkan koma setelah user_id dan ubah nama filenya menjadi _website_reviewer.csv

In [13]:
# memasukkan data csv ke variabel website_reviewer
website_reviewer = pd.read_csv('_website_reviewer.csv')
website_reviewer

Unnamed: 0,user_id,Unnamed: 1,name,time,rating,text,pics,resp,gmap_id
0,10784,,Hotels.com reviewer,1593475200000,,The cabin is quite new and very well appointed...,,,0x56cd250c30f1b975:0x9dd674c84d86e371
1,15270,,Expedia reviewer,1567468800000,,"This place was a little out of the way, but th...",,,0x56c68f06160d842f:0x5754eb340f3f4a89
2,15272,,Gunnar,1564012800000,,"Positiv: Kathy war eine sehr gute Gastgeberin,...",,,0x56c68f06160d842f:0x5754eb340f3f4a89
3,15273,,Jürgen,1533945600000,,Positiv: großes und gut ausgestattetes Zimmer;...,,,0x56c68f06160d842f:0x5754eb340f3f4a89
4,21695,,Hotels.com reviewer,1623628800000,,The bed and breakfast area is the lower level ...,,,0x56c897d182582593:0xbf0eb6a246c9b74d
...,...,...,...,...,...,...,...,...,...
8125,1039559,,Expedia reviewer,1565136000000,,Outstanding location! Clean rooms that you can...,,,0x56cebe2660b2b109:0xbcc457abac27499
8126,1039606,,Expedia reviewer,1472947200000,,Views from the pricey Mountainside are spectac...,,,0x56cebe2660b2b109:0xbcc457abac27499
8127,1039685,,Travelocity reviewer,1437955200000,,I booked two rooms - even responded to an e-ma...,,,0x56cebe2660b2b109:0xbcc457abac27499
8128,1039744,,Expedia reviewer,1474156800000,,We completely loved the lodge. Not just gorgeo...,,,0x56cebe2660b2b109:0xbcc457abac27499


Terdapat kolom baru yaitu Unnamed: 1, kita dapat menghapusnya

In [14]:
# menghapus kolom Unnamed: 1
website_reviewer = website_reviewer.drop(columns=['Unnamed: 1'])
website_reviewer

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,10784,Hotels.com reviewer,1593475200000,,The cabin is quite new and very well appointed...,,,0x56cd250c30f1b975:0x9dd674c84d86e371
1,15270,Expedia reviewer,1567468800000,,"This place was a little out of the way, but th...",,,0x56c68f06160d842f:0x5754eb340f3f4a89
2,15272,Gunnar,1564012800000,,"Positiv: Kathy war eine sehr gute Gastgeberin,...",,,0x56c68f06160d842f:0x5754eb340f3f4a89
3,15273,Jürgen,1533945600000,,Positiv: großes und gut ausgestattetes Zimmer;...,,,0x56c68f06160d842f:0x5754eb340f3f4a89
4,21695,Hotels.com reviewer,1623628800000,,The bed and breakfast area is the lower level ...,,,0x56c897d182582593:0xbf0eb6a246c9b74d
...,...,...,...,...,...,...,...,...
8125,1039559,Expedia reviewer,1565136000000,,Outstanding location! Clean rooms that you can...,,,0x56cebe2660b2b109:0xbcc457abac27499
8126,1039606,Expedia reviewer,1472947200000,,Views from the pricey Mountainside are spectac...,,,0x56cebe2660b2b109:0xbcc457abac27499
8127,1039685,Travelocity reviewer,1437955200000,,I booked two rooms - even responded to an e-ma...,,,0x56cebe2660b2b109:0xbcc457abac27499
8128,1039744,Expedia reviewer,1474156800000,,We completely loved the lodge. Not just gorgeo...,,,0x56cebe2660b2b109:0xbcc457abac27499


In [15]:
# fungsi untuk mengklasifikasikan bahasa
def classify_language(text):
    lang, _ = langid.classify(text)
    return lang

Kita dapat menggunakan model atau library yang telah dibuat, pada kasus ini kita menggunakan langid untuk mengklasifikasikan bahasa

In [16]:
# menerapkan fungsi pada kolom 'text' dan simpan hasilnya di kolom baru 'language'
website_reviewer['language'] = website_reviewer['text'].apply(classify_language)

In [17]:
website_reviewer

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id,language
0,10784,Hotels.com reviewer,1593475200000,,The cabin is quite new and very well appointed...,,,0x56cd250c30f1b975:0x9dd674c84d86e371,en
1,15270,Expedia reviewer,1567468800000,,"This place was a little out of the way, but th...",,,0x56c68f06160d842f:0x5754eb340f3f4a89,en
2,15272,Gunnar,1564012800000,,"Positiv: Kathy war eine sehr gute Gastgeberin,...",,,0x56c68f06160d842f:0x5754eb340f3f4a89,de
3,15273,Jürgen,1533945600000,,Positiv: großes und gut ausgestattetes Zimmer;...,,,0x56c68f06160d842f:0x5754eb340f3f4a89,de
4,21695,Hotels.com reviewer,1623628800000,,The bed and breakfast area is the lower level ...,,,0x56c897d182582593:0xbf0eb6a246c9b74d,en
...,...,...,...,...,...,...,...,...,...
8125,1039559,Expedia reviewer,1565136000000,,Outstanding location! Clean rooms that you can...,,,0x56cebe2660b2b109:0xbcc457abac27499,en
8126,1039606,Expedia reviewer,1472947200000,,Views from the pricey Mountainside are spectac...,,,0x56cebe2660b2b109:0xbcc457abac27499,en
8127,1039685,Travelocity reviewer,1437955200000,,I booked two rooms - even responded to an e-ma...,,,0x56cebe2660b2b109:0xbcc457abac27499,en
8128,1039744,Expedia reviewer,1474156800000,,We completely loved the lodge. Not just gorgeo...,,,0x56cebe2660b2b109:0xbcc457abac27499,en


Dataframe telah memiliki kolom baru yaitu bahasa sebagai klasifikasinya, tahap selanjutnya yaitu membuang data selain yang berbahasa inggris

In [18]:
# drop data selain bahasa inggris
website_reviewer = website_reviewer[website_reviewer['language'] == 'en']

In [19]:
website_reviewer

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id,language
0,10784,Hotels.com reviewer,1593475200000,,The cabin is quite new and very well appointed...,,,0x56cd250c30f1b975:0x9dd674c84d86e371,en
1,15270,Expedia reviewer,1567468800000,,"This place was a little out of the way, but th...",,,0x56c68f06160d842f:0x5754eb340f3f4a89,en
4,21695,Hotels.com reviewer,1623628800000,,The bed and breakfast area is the lower level ...,,,0x56c897d182582593:0xbf0eb6a246c9b74d,en
5,21697,Hotels.com reviewer,1533513600000,,We were in Anchorage as a base to visit four n...,,,0x56c897d182582593:0xbf0eb6a246c9b74d,en
6,23814,Hotels.com reviewer,1625616000000,,The cabin was perfect for our stay in Talkeent...,,,0x56ce958d49d601d3:0xfcb6dade60bfb7ee,en
...,...,...,...,...,...,...,...,...,...
8124,1039526,Orbitz reviewer,1584316800000,,"wait staff most great, but one waiter was trul...",,,0x56cebe2660b2b109:0xbcc457abac27499,en
8125,1039559,Expedia reviewer,1565136000000,,Outstanding location! Clean rooms that you can...,,,0x56cebe2660b2b109:0xbcc457abac27499,en
8126,1039606,Expedia reviewer,1472947200000,,Views from the pricey Mountainside are spectac...,,,0x56cebe2660b2b109:0xbcc457abac27499,en
8127,1039685,Travelocity reviewer,1437955200000,,I booked two rooms - even responded to an e-ma...,,,0x56cebe2660b2b109:0xbcc457abac27499,en


Selanjutnya kita akan mengklasifikasikan sentimen dari text dan konversi sentimen tersebut menjadi sebuah rating dengan ketentuan positif 5.0, netral 3.0, dan negatif 1.0

In [20]:
# inisiasi analyzer untuk analisis sentimen
analyzer = SentimentIntensityAnalyzer()

In [21]:
# fungsi untuk mengklasifikasi sentimen
def sentiment_analysis(text):
    vs = analyzer.polarity_scores(text)
    score = vs['compound']
    
    if score >= 0.05:
        return 5.0
    elif -0.05 < score < 0.05:
        return 3.0
    else:
        return 1.0

In [22]:
# menerapkan fungsi pada kolom 'text' dan simpan hasilnya di kolom 'rating'
website_reviewer['rating'] = website_reviewer['text'].apply(sentiment_analysis)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  website_reviewer['rating'] = website_reviewer['text'].apply(sentiment_analysis)


In [23]:
website_reviewer = website_reviewer.drop(columns=['language'])

In [24]:
website_reviewer

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,10784,Hotels.com reviewer,1593475200000,5.0,The cabin is quite new and very well appointed...,,,0x56cd250c30f1b975:0x9dd674c84d86e371
1,15270,Expedia reviewer,1567468800000,5.0,"This place was a little out of the way, but th...",,,0x56c68f06160d842f:0x5754eb340f3f4a89
4,21695,Hotels.com reviewer,1623628800000,5.0,The bed and breakfast area is the lower level ...,,,0x56c897d182582593:0xbf0eb6a246c9b74d
5,21697,Hotels.com reviewer,1533513600000,5.0,We were in Anchorage as a base to visit four n...,,,0x56c897d182582593:0xbf0eb6a246c9b74d
6,23814,Hotels.com reviewer,1625616000000,5.0,The cabin was perfect for our stay in Talkeent...,,,0x56ce958d49d601d3:0xfcb6dade60bfb7ee
...,...,...,...,...,...,...,...,...
8124,1039526,Orbitz reviewer,1584316800000,5.0,"wait staff most great, but one waiter was trul...",,,0x56cebe2660b2b109:0xbcc457abac27499
8125,1039559,Expedia reviewer,1565136000000,5.0,Outstanding location! Clean rooms that you can...,,,0x56cebe2660b2b109:0xbcc457abac27499
8126,1039606,Expedia reviewer,1472947200000,5.0,Views from the pricey Mountainside are spectac...,,,0x56cebe2660b2b109:0xbcc457abac27499
8127,1039685,Travelocity reviewer,1437955200000,5.0,I booked two rooms - even responded to an e-ma...,,,0x56cebe2660b2b109:0xbcc457abac27499


In [25]:
website_reviewer.rating.unique()

array([5., 3., 1.])

Sekarang, kita telah memiliki rating dan user_id pada data yang sebelumnya hilang

Kita berhasil mempertahankan 93.7% data dari seluruh informasi data yang hilang, teknik ini lebih baik dibanding kita melakukan drop pada semua data yang hilang

Tahap selanjutnya yaitu melakukan penggabungan data ini pada data review

In [26]:
# menghapus dataframe review yang tidak memiliki nilai
review = review.dropna(subset=['user_id'])

In [27]:
# menambahkan dataframe yang sebelumnya sudah dipersiapkan
review = pd.concat([review, website_reviewer], ignore_index=True)

In [28]:
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050734 entries, 0 to 1050733
Data columns (total 8 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   user_id  1050734 non-null  float64
 1   name     1050731 non-null  object 
 2   time     1050734 non-null  int64  
 3   rating   1050734 non-null  float64
 4   text     638740 non-null   object 
 5   pics     44311 non-null    object 
 6   resp     119832 non-null   object 
 7   gmap_id  1050734 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 64.1+ MB


In [29]:
# Mengubah user_id menjadi list tanpa nilai yang sama
user_ids = review['user_id'].unique().tolist()

In [30]:
# Melakukan encoding userID
user_to_user_encoded = {x: i for i, x in enumerate(user_ids)}

In [31]:
# Melakukan proses encoding angka ke ke userID
user_encoded_to_user = {i: x for i, x in enumerate(user_ids)}

Selanjutnya, lakukan hal yang sama pada kolom gmap_id

In [32]:
# Mengubah gmap_id menjadi list tanpa nilai yang sama
place_ids = review['gmap_id'].unique().tolist()
 
# Melakukan proses encoding gmap_id
gmap_to_gmap_encoded = {x: i for i, x in enumerate(place_ids)}
 
# Melakukan proses encoding angka ke gmap_id
gmap_encoded_to_gmap = {i: x for i, x in enumerate(place_ids)}

In [33]:
# Mapping userID ke dataframe user
review['user'] = review['user_id'].map(user_to_user_encoded)
 
# Mapping place ke dataframe place
review['gmap'] = review['gmap_id'].map(gmap_to_gmap_encoded)

In [34]:
# Mendapatkan jumlah user
num_users = len(user_to_user_encoded)
print(num_users)
 
# Mendapatkan jumlah places
num_places = len(gmap_encoded_to_gmap)
print(num_places)
 
# Mengubah rating menjadi nilai float
review['rating'] = review['rating'].values.astype(np.float32)
 
# Nilai minimum rating
min_rating = min(review['rating'])
 
# Nilai maksimal rating
max_rating = max(review['rating'])
 
print('Number of User: {}, Number of places: {}, Min Rating: {}, Max Rating: {}'.format(
    num_users, num_places, min_rating, max_rating
))


286313
12689
Number of User: 286313, Number of places: 12689, Min Rating: 1.0, Max Rating: 5.0


In [35]:
review = review.sample(frac=1, random_state=42)
review

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id,user,gmap
824236,1.086098e+20,Ramirez Jeep,1533061362453,5.0,,[{'url': ['https://lh5.googleusercontent.com/p...,,0x56c7b612942f1da1:0x42e6f5327929a9ef,146868,12189
70292,1.104279e+20,Brad Wuerer,1619113654480,5.0,They are amazing. Great customer service and ...,,"{'time': 1619200892421, 'text': 'Thank you for...",0x56c89797b9267fab:0xee32c926feb1b48e,8715,3970
949861,1.037296e+20,ohmyheck31,1542086620517,5.0,The Dog Sled Demo is a must-experience!,,,0x56cd209e8e116f63:0xe153a8daf0f05240,2804,12515
77559,1.115868e+20,Marissa Wood,1602733667884,5.0,I went into due to a impacted lower wisdom too...,,,0x51325ab1515b2ca1:0x3c1faae5b3e4ca0a,12260,4286
825280,1.102832e+20,Lt. Colonel David K. Swendiman,1598116354646,5.0,Peaceful and beautiful- even though right in t...,,,0x56c79c7cbc43a02b:0xe0a8e540ac8c61bb,54051,12199
...,...,...,...,...,...,...,...,...,...,...
110268,1.070980e+20,Lewis Sunnyboy,1540860599729,5.0,,,,0x51325ad73b924d65:0x3dffeb1b71c07b20,74274,5390
259178,1.032595e+20,Josh Gogus,1566679702518,4.0,She groomed our yorkie once. She was great wi...,,"{'time': 1566679289055, 'text': 'I am so sorry...",0x51324d59f5abb0ed:0xd3778e39d733ff8e,114072,8714
131932,1.085170e+20,Chelsey M.,1622024063183,5.0,Yummy food,,,0x56c91db84151adf7:0x483f4c47115c8d6f,3711,6041
671155,1.045853e+20,chelbie garcia,1602384204953,5.0,Food and service were both great. Will definit...,[{'url': ['https://lh5.googleusercontent.com/p...,"{'time': 1602637342860, 'text': 'We strive to ...",0x56c8bd877c06ac0f:0xebf1abe87d2435f5,36645,11686


In [36]:
# Membuat variabel x untuk mencocokkan data user dan gmap menjadi satu value
x = review[['user', 'gmap']].values
 
# Membuat variabel y untuk membuat rating dari hasil 
y = review['rating'].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
 
# Membagi menjadi 80% data train dan 20% data validasi
train_indices = int(0.8 * review.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:]
)
 
print(x, y)

[[146868  12189]
 [  8715   3970]
 [  2804  12515]
 ...
 [  3711   6041]
 [ 36645  11686]
 [ 44672   5745]] [1. 1. 1. ... 1. 1. 0.]


In [37]:
x_train.shape

(840587, 2)

Sekarang, data review sudah siap digunakan! selanjutnya kita akan menyiapkan data meta

## Dataframe Metadata

In [38]:
# Mengganti NaN dengan list kosong
meta['category'] = meta['category'].apply(lambda x: x if isinstance(x, list) else [])

Pada data category di Meta, kita mengganti data yang kosong menjadi list [] yang artinya tidak memiliki kategori apapun

In [39]:
mlb = MultiLabelBinarizer()
categories_encoded = mlb.fit_transform(meta['category'])

In [40]:
categories_df = pd.DataFrame(categories_encoded, columns=mlb.classes_)

Melakukan vektorisasi data category terhadap dataframe meta dengan Multi Label Binarizer dan menyimpannya di categories_df

In [41]:
categories_df

Unnamed: 0,ATM,ATV dealer,ATV rental service,ATV repair shop,Abortion clinic,Abrasives supplier,Accountant,Acrylic store,Acupuncture clinic,Acupuncturist,...,Yarn store,Yoga studio,Youth center,Youth clothing store,Youth club,Youth group,Youth hostel,Youth organization,Youth social services organization,Zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12769,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12770,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12772,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
meta = meta.join(categories_df)

menggabungkan categories_df ke dataframe meta dan melakukan drop kolom category

In [43]:
# similiarity matrix category dengan cosine similiarity
similarity_matrix = cosine_similarity(categories_df)
print(similarity_matrix)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


melakukan similiarity matrix dengan cosine similiarity

# Model Development

## Non-Personalized User

In [44]:
def wilson_score_interval(avg_rating, num_of_reviews):
    p = avg_rating / 5.0
    n = num_of_reviews
    z = 1.96  # Z-score for 95% confidence interval
    denominator = 1 + (z ** 2) / n
    centre_adjusted_probability = p + (z ** 2) / (2 * n)
    adjusted_probability = centre_adjusted_probability / denominator
    return  adjusted_probability

Kita dapat menggunakan Wilson Score Interval untuk memberikan bobot pada rating supaya lebih adil

In [45]:
# Hitung batas atas interval Wilson Score dan tambahkan ke dataframe
meta['wilson_score'] = meta.apply(lambda row: wilson_score_interval(row['avg_rating'], row['num_of_reviews']), axis=1)

In [46]:
meta.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,...,Yoga studio,Youth center,Youth clothing store,Youth club,Youth group,Youth hostel,Youth organization,Youth social services organization,Zoo,wilson_score
0,Bear Creek Cabins & RV Park,"Bear Creek Cabins & RV Park, 3181 Richardson H...",0x56b646ed2220b77f:0xd8975e316de80952,,61.100644,-146.214552,"[RV park, Cabin rental agency, Campground]",4.5,18,,...,0,0,0,0,0,0,0,0,0,0.829646
1,Anchorage Market,"Anchorage Market, 88th Ave, Anchorage, AK 99515",0x56c8992b5dee7225:0x9f7f4bf151868cf7,,61.141435,-149.868482,[Farmers' market],4.2,18,,...,0,0,0,0,0,0,0,0,0,0.780199
2,Happy Camper RV,"Happy Camper RV, 1151 N Shenandoah Dr # 4, Pal...",0x56c8e0455225be87:0xf24828df75e2f8ae,,61.591855,-149.290657,[RV repair shop],4.4,28,,...,0,0,0,0,0,0,0,0,0,0.834154
3,Cajun Corner,"Cajun Corner, 302 G St, Anchorage, AK 99501",0x56c8bdb5d91017cd:0xca19fd9afceed343,,61.219378,-149.895852,[American restaurant],4.5,24,,...,0,0,0,0,0,0,0,0,0,0.844808
4,Alaska General Seafoods,"Alaska General Seafoods, 980 Stedman St, Ketch...",0x540c251956395673:0x16f5a4fe26c18931,,55.336119,-131.630669,"[Seafood wholesaler, Food]",4.7,8,,...,0,0,0,0,0,0,0,0,0,0.797257


Dengan begini, dataframe meta telah memiliki skor wilson untuk memberikan rekomendasi tempat terbaik berdasarkan rating kepada Non-personalized user

In [47]:
# menampilkan tempat terbaik berdasarkan rating
best_places = meta.sort_values(by='wilson_score', ascending=False).head(10)
best_places

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,...,Yoga studio,Youth center,Youth clothing store,Youth club,Youth group,Youth hostel,Youth organization,Youth social services organization,Zoo,wilson_score
4097,A Clean Slate Credit Consultants,,0x88d9b7b7c1662903:0xa5fbc84566909fbb,,42.756389,-140.301319,"[Credit counseling service, Service establishm...",5.0,518,,...,0,0,0,0,0,0,0,0,0,0.996319
1903,"The Dar Walden Team, Keller Williams Realty, A...","The Dar Walden Team, Keller Williams Realty, A...",0x56c897c5f9655555:0x965bf44512428041,,61.194767,-149.884594,[Real estate agency],5.0,467,,...,0,0,0,0,0,0,0,0,0,0.99592
7682,1-800-GOT-JUNK? Anchorage,,0x56c89797b8875ab3:0xe8b5f4ed9b7d0024,,60.983961,-150.057399,"[Garbage dump service, Business to business se...",5.0,398,,...,0,0,0,0,0,0,0,0,0,0.99522
5463,"Meridian Dental, LLC","Meridian Dental, LLC, 3465 E Meridian Park Loo...",0x56c8e081a93a060b:0x19618d52ec735e72,,61.592692,-149.361735,"[Dentist, Cosmetic dentist, Dental clinic, Den...",5.0,386,,...,0,0,0,0,0,0,0,0,0,0.995073
5294,Allen Rapid Dry Carpet Cleaning (Pet Odor Expe...,Allen Rapid Dry Carpet Cleaning (Pet Odor Expe...,0x56c8975a317a92b7:0xc71ca09335bc7821,,61.132375,-149.788825,"[Carpet cleaning service, Upholstery cleaning ...",5.0,348,,...,0,0,0,0,0,0,0,0,0,0.994541
8043,Muffy's Flowers & Gifts,"Muffy's Flowers & Gifts, 333 W 4th Ave #218, A...",0x56c896e5f87d10ed:0xb2e2e0e6354e3a60,,61.218665,-149.888525,"[Florist, Balloon store, Flower delivery, Flow...",5.0,336,,...,0,0,0,0,0,0,0,0,0,0.994348
9139,"Unity Home Group Alaska - eXp Realty, LLC","Unity Home Group Alaska - eXp Realty, LLC, 725...",0x56c897c5f93f0e37:0xcc56a3db4e06fe17,,61.198689,-149.869466,"[Real estate agency, Real estate agents, Real ...",5.0,328,,...,0,0,0,0,0,0,0,0,0,0.994212
842,True Life Chiropractic,"True Life Chiropractic, 1142 North Muldoon Roa...",0x56c897bd92cd5bbd:0x63be17908a90f182,,61.228746,-149.742432,[Chiropractor],5.0,328,,...,0,0,0,0,0,0,0,0,0,0.994212
2408,Luff Orthodontics,"Luff Orthodontics, 3708 Rhone Cir, Anchorage, ...",0x56c897ba5bb7b90b:0x1cb0a140c127cac2,,61.187259,-149.861872,"[Dental clinic, Orthodontist]",5.0,304,,...,0,0,0,0,0,0,0,0,0,0.99376
5404,Home Inspections Plus+ LLC,,0x56c8eb7a5206ecc7:0xbbad10a0c9c2fb65,,62.108352,-149.715168,"[Home inspector, Commercial real estate inspec...",5.0,265,,...,0,0,0,0,0,0,0,0,0,0.992855


Tampilan rekomendasi tempat terbaik untuk Non-personalized user sebanyak 10

## Content-Based Filter

In [48]:
# fungsi untuk mendapatkan rekomendasi content-based
def get_content_based(gmap_id, similarity_matrix, data, top_n=10):
    item_index = data[data['gmap_id'] == gmap_id].index[0]
    
    similarity_scores = list(enumerate(similarity_matrix[item_index]))
    
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    
    top_items = data.iloc[top_indices][['gmap_id', 'name', 'category']]
    
    return top_items

In [49]:
# Contoh mendapatkan rekomendasi
recommendations = get_content_based('0x56b646ed2220b77f:0xd8975e316de80952', data=meta, similarity_matrix=similarity_matrix)
recommendations

Unnamed: 0,gmap_id,name,category
85,0x56b646ed2220b77f:0xd8975e316de80952,Bear Creek Cabins & RV Park,"[RV park, Cabin rental agency, Campground]"
3771,0x56cedaacb2e94d57:0x7575332e5c393696,Mat-Su RV Park & Campground,"[Campground, RV park]"
5466,0x56ced6cd63117f0b:0x4b4d20be83e76ed4,Willow Creek Resort,"[Campground, RV park]"
5583,0x56c6960687b49259:0xa388514405ec2393,Scenic View RV Park,"[RV park, Campground]"
5678,0x513246fc8f1d16ef:0x8fc0ef9acfa78dbe,Northern Moosed RV Park & Campground,"[RV park, Campground]"
6583,0x5400e2167988f493:0xe6dc010757430f56,Spruce Meadow RV Park,"[RV park, Campground]"
7033,0x56b57910c22c95f3:0x9089f865e578e0eb,Gakona Alaska RV Park,"[RV park, Campground]"
9792,0x56c8e11c08a33563:0xc438304cb20e41c,Big Bear Campground & RV Park,"[RV park, Campground]"
10241,0x56cd25f55cafc42d:0xbde2ea92fcda8a87,Midnight Sun RV & Campground,"[Campground, RV park]"
10473,0x51325b3c9fc8574f:0x8c4efbdd040f2a35,Rivers Edge RV Park & Campground,"[RV park, Campground]"


## Colaborative-Based Filter

In [50]:
# membuat model KNN
knn = KNeighborsRegressor(n_neighbors=5)

In [51]:
# melatih model
knn.fit(x_train, y_train)

In [52]:
def get_collaborative_based(user_id, review, meta, knn, gmap_to_gmap_encoded, user_to_user_encoded, gmap_encoded_to_gmap):
    # Mendapatkan tempat yang sudah dikunjungi oleh pengguna
    place_visited_by_user = review[review.user_id == user_id]

    # Menemukan tempat yang belum dikunjungi oleh pengguna
    place_not_visited = meta[~meta['gmap_id'].isin(place_visited_by_user.gmap_id.values)]['gmap_id'] 
    place_not_visited = list(
        set(place_not_visited)
        .intersection(set(gmap_to_gmap_encoded.keys()))
    )
    place_not_visited = [[gmap_to_gmap_encoded.get(x)] for x in place_not_visited]

    # Encode pengguna dan buat array untuk prediksi
    user_encoder = user_to_user_encoded.get(user_id)
    user_place_array = np.hstack(
        ([[user_encoder]] * len(place_not_visited), place_not_visited)
    )

    # Mendapatkan rating prediksi untuk data tempat pengguna
    ratings = knn.predict(user_place_array).flatten()

    # Mendapatkan indeks dari 10 rating tertinggi
    top_ratings_indices = ratings.argsort()[-10:][::-1]

    # Mempetakan ID tempat yang direkomendasikan
    recommended_place_ids = [
        gmap_encoded_to_gmap.get(place_not_visited[x][0]) for x in top_ratings_indices
    ]

    # Menampilkan rekomendasi untuk pengguna
    print(f'Showing recommendations for user: {user_id}')

    # Menampilkan tempat yang telah dikunjungi oleh pengguna dengan rating tertinggi
    top_place_user = (
        place_visited_by_user.sort_values(by='rating', ascending=False)
        .head(5)
        .gmap_id.values
    )

    # Filter DataFrame untuk tempat yang telah dikunjungi
    place_df_rows = meta[meta['gmap_id'].isin(top_place_user)]

    # Filter DataFrame untuk tempat yang direkomendasikan
    recommended_place = meta[meta['gmap_id'].isin(recommended_place_ids)]

    # Mengembalikan 10 rekomendasi tempat teratas
    return place_visited_by_user, recommended_place

In [53]:
user_id = review.user_id.sample(1).iloc[0]
visited, top_10_places = get_collaborative_based(user_id, review, meta, knn, gmap_to_gmap_encoded, user_to_user_encoded, gmap_encoded_to_gmap)

Showing recommendations for user: 1.157050115529756e+20


In [54]:
visited['gmap_id']

878507    0x56c89427df8203bf:0xd978c612a604bd27
599384    0x56c8bcd7fa147091:0xdf45a73cf0e05ac0
54432     0x56c897c6d611594b:0xb13b7f4ed0247baa
775014    0x56c8979536128307:0x4a5e418075e45572
197460    0x56c897db8634dc07:0x686d08ed7b099f3c
613110    0x56c8965ee2fb87a1:0x559736347bd48842
883262    0x56c897b7b3795b1b:0x2ca60b12e944b837
599395    0x56c8bcd7fa147091:0xdf45a73cf0e05ac0
856542    0x56c899b83a9d0ce1:0xe85390405ae6bbf9
Name: gmap_id, dtype: object

In [55]:
top_10_places[['gmap_id', 'name']]

Unnamed: 0,gmap_id,name
907,0x540467a3802bb6b7:0xd895c7f5818a3f51,Sitka Sound Science Center
1387,0x56c89644557df92b:0x587b142b879b65fd,Denali Emergency Medicine
1642,0x51348b307278da67:0x58f262dc27fe9fb,Big Delta Brewing Co.
1704,0x56c12d896cdb6b63:0x6683c0fc7406f043,Homer Electric Association
1744,0x56c79bc7ed4b4f11:0x39da29a762031829,Brewed Awakenings
1930,0x56c8966638376403:0x59679d68416a0847,Gardens at Bragaw
1939,0x540c2523eb5e9503:0x3bbe7f9327611e01,Misty Fjords Air & Outfitting Inc
7043,0x56c8963355e1366d:0xcd9d72bab11a2fa5,Anchorage RNC Tree Service
9445,0x56c897dbe294c7f1:0x3af6a90b65fd3549,The Bead Shack
11611,0x56c89994493f3431:0xaed81c90bd2a135a,South Anchorage Farmers Market (O'Malley)


# Evaluation

In [169]:
# mengambil 1 sample gmap_id
sample_gmap_id = meta.gmap_id.sample(1).iloc[0]
sample_user_id = review.user_id.sample(1).iloc[0]

## Content-Based Filter

In [170]:
# input sample
meta[meta['gmap_id'] == sample_gmap_id][['gmap_id', 'name', 'category']]

Unnamed: 0,gmap_id,name,category
7034,0x56c661612cd6c8e9:0xab51061bd1fce2c6,Nick’s Auto Glass,"[Auto glass shop, Glass repair service]"


In [177]:
pd.set_option('display.max_rows', None)  
pd.set_option('display.max_columns', None) 
pd.set_option('display.max_colwidth', None)

In [171]:
recommendations_sample_content_based = get_content_based(sample_gmap_id, data=meta, similarity_matrix=similarity_matrix)
recommendations_sample_content_based

Unnamed: 0,gmap_id,name,category
5266,0x51325a962c628aed:0xd38974136565bde3,Speedy Glass,"[Auto glass shop, Glass repair service]"
6504,0x56c8eb75763e06c5:0x513b63e443562949,Splashes Autospa,"[Auto glass shop, Glass repair service]"
6784,0x56c8eb737157fa45:0x58de944de7fecb91,Speedy Glass,"[Auto glass shop, Glass repair service]"
7034,0x56c661612cd6c8e9:0xab51061bd1fce2c6,Nick’s Auto Glass,"[Auto glass shop, Glass repair service]"
7260,0x56c899d930e5e3f7:0xcf74f00d1e177986,Novus Glass,"[Auto glass shop, Glass repair service]"
7569,0x56c8999ae1910e87:0x233ea2130e9cdd71,Speedy Glass,"[Auto glass shop, Glass repair service]"
7614,0x56c8de7bbab952fd:0xb9a12991f0413b17,Speedy Glass,"[Auto glass shop, Glass repair service]"
9058,0x56c8e094818233c1:0xc4e1cecb77a2944c,Acme Auto Glass,"[Auto glass shop, Glass repair service]"
12547,0x56c897da8047ac59:0xf668867469ce395d,Speedy Glass,"[Auto glass shop, Glass repair service]"
1321,0x56c91e024450d3c7:0x94dc4cc3493b44f0,Basin Street Auto Glass,[Auto glass shop]


Terlihat bahwa semua data category juga memiliki persamaan dari category yang ada pada input sample

## Collaborative-Based Filter

In [182]:
x_val.shape

(210147, 2)

In [172]:
# membuat prediksi
y_pred = knn.predict(x_val)
 
# hitung MSE
mse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'MSE: {mse}')

MSE: 0.2945686290028723
