# Yelp Recommender Systems
[참고 노트북 | Yelp Dataset: SurpriseMe Recommendation System](https://www.kaggle.com/code/fahd09/yelp-dataset-surpriseme-recommendation-system)
### 사용 라이브러리

In [2]:
import os
import re
import string

import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer

### Data Load

In [3]:
df_yelp_business = pd.read_json('./data//yelp_academic_dataset_business.json', lines=True)
df_yelp_business.fillna('NA', inplace=True)
df_yelp_business = df_yelp_business[df_yelp_business['categories'].str.contains('Restaurants')]
print('Final Shape: ',df_yelp_business.shape)

Final Shape:  (52268, 14)


In [4]:
df_yelp_review_iter = pd.read_json("./data/yelp_academic_dataset_review.json", chunksize=100000, lines=True)

df_yelp_review = pd.DataFrame()
i=0
for df in df_yelp_review_iter:
    df = df[df['business_id'].isin(df_yelp_business['business_id'])]
    df_yelp_review = pd.concat([df_yelp_review, df])
    i=i+1
    print(i)
    if i==4: break

1
2
3
4


In [5]:
df_yelp_business = df_yelp_business[df_yelp_business['business_id'].isin(df_yelp_review['business_id'])]

print('Final businesses shape: ', df_yelp_business.shape)
print('Final review shape: ', df_yelp_review.shape)

Final businesses shape:  (4937, 14)
Final review shape:  (283029, 9)


### Preprocessing

In [6]:
def clean_text(text):
    ## 구두점 제거
    text = text.translate(string.punctuation)
    
    ## 소문자 변경 후 분리
    text = text.lower().split()
    
    ## 불용어 제거
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)    
    return text

In [7]:
%%time
df_yelp_review['text'] = df_yelp_review['text'].apply(clean_text)

CPU times: total: 1min 9s
Wall time: 1min 9s


### Top 100 Vocabulariesb

In [8]:
vectorizer_reviews = CountVectorizer(min_df = .01,max_df = .99, tokenizer = WordPunctTokenizer().tokenize)
vectorized_reviews = vectorizer_reviews.fit_transform(df_yelp_review['text'])

vectorized_reviews.shape

(283029, 886)

In [10]:
' | '.join(vectorizer_reviews.get_feature_names_out()[:100])

'! | + | - | 00 | 1 | 10 | 12 | 15 | 2 | 20 | 3 | 30 | 4 | 5 | 50 | 6 | 7 | 8 | : | ; | a | able | about | absolutely | accommodating | across | actually | add | added | addition | afternoon | again | ago | all | almost | along | already | also | although | always | am | amazing | ambiance | american | amount | and | another | anyone | anything | anyway | anywhere | appetizer | appetizers | are | area | around | arrived | as | ask | asked | ate | atmosphere | attention | attentive | authentic | available | average | avocado | away | awesome | awful | back | bacon | bad | baked | bar | bartender | based | basically | bbq | be | beans | beautiful | beef | beer | beers | before | behind | believe | best | better | beyond | big | bill | birthday | bit | bite | black | bland | blue'

### Top 100 Categoreis

In [12]:
vectorizer_categories = CountVectorizer(min_df = 1, max_df = 1., tokenizer = lambda x: x.split(', '))
vectorized_categories = vectorizer_categories.fit_transform(df_yelp_business['categories'])

vectorized_categories.shape

(4937, 387)

In [13]:
' | '.join(vectorizer_categories.get_feature_names_out()[:100])

"acai bowls | accessories | active life | adult entertainment | afghan | african | american (new) | american (traditional) | amusement parks | appliances & repair | arabic | arcades | argentine | armenian | art galleries | arts & crafts | arts & entertainment | asian fusion | austrian | auto detailing | auto glass services | auto repair | automotive | bagels | bakeries | banks & credit unions | bar crawl | barbeque | barbers | bars | bartenders | basque | battery stores | batting cages | beaches | beauty & spas | bed & breakfast | beer | beer bar | beer gardens | beer tours | beverage store | bistros | boat charters | boat tours | boating | body shops | books | bookstores | bowling | brasseries | brazilian | breakfast & brunch | breweries | brewpubs | british | bubble tea | buffets | building supplies | burgers | burmese | business consulting | butcher | cabaret | cafes | cafeteria | cajun/creole | calabrian | cambodian | canadian (new) | candy stores | cannabis dispensaries | cantones

### 희소 행렬 생성

In [14]:
%%time
from scipy import sparse
businessxreview = sparse.csr_matrix(pd.get_dummies(df_yelp_review['business_id']).values)

CPU times: total: 14.6 s
Wall time: 14.7 s


In [15]:
print('restuarants x categories: \t', vectorized_categories.shape) 
print('restuarants x reviews: \t\t' , businessxreview.shape) 
print('reviews x words: \t\t', vectorized_reviews.shape)

restuarants x categories: 	 (4937, 387)
restuarants x reviews: 		 (283029, 4937)
reviews x words: 		 (283029, 886)


### 리뷰와 평점이 좋은 다른 식당 추천

In [19]:
df_yelp_business.sample(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
6273,DptW6vZmrd7ttS0RCaWx2w,Xwrecks Restaurant & Lounge,9303 50th Street NW,Edmonton,AB,T6B 2L5,53.530919,-113.417837,2.0,7,0,"{'Alcohol': 'u'full_bar'', 'RestaurantsPriceRa...","Restaurants, Bars, Nightlife, American (Tradit...","{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ..."
12352,4w6Z5v0uVt08oSBaA3342A,Wawa,600 Cinnaminson Ave,Palmyra,NJ,08065,39.998409,-75.035118,3.5,5,1,"{'RestaurantsPriceRange2': '4', 'BusinessAccep...","Convenience Stores, Automotive, Coffee & Tea, ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
3136,N44roXfLNkBdpINQDjEFOQ,Carisilo's Mexican Restaurant,1978 Vandalia St,Collinsville,IL,62234,38.695337,-89.966691,4.0,65,1,"{'RestaurantsDelivery': 'False', 'Alcohol': ''...","Mexican, Restaurants","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
9834,-SFSt3FkjGfavnyMpHsZPA,Enjoi Sweets & Company,"4707 W Gandy Blvd, Ste 7",Tampa,FL,33611,27.89376,-82.525167,4.5,9,0,"{'NoiseLevel': 'u'quiet'', 'BusinessAcceptsBit...","Desserts, Food, Cafes, Restaurants, Food Truck...","{'Thursday': '12:0-21:0', 'Friday': '12:0-21:0..."
1427,jLaPtjlLfRSaoBWIcHcSQg,The Mad Crab,8080 Olive Blvd,University City,MO,63130,38.672734,-90.345018,3.5,156,1,"{'Caters': 'False', 'Alcohol': 'u'beer_and_win...","Seafood, Cajun/Creole, American (New), Restaur...","{'Monday': '15:0-22:0', 'Tuesday': '15:0-22:0'..."


In [20]:
business_choose = '-SFSt3FkjGfavnyMpHsZPA' # Desserts, Food, Cafes, Restaurants ...

In [21]:
new_reviews = df_yelp_review.loc[df_yelp_review['business_id'] == business_choose, 'text']
print('\n'.join([r[:100] for r in new_reviews.tolist()]))

wow probably best cupcakes i have since moved tampa + + i stopped guys came flicks food trucks heard
pleasure experiencing enjoi sweets recent food truck rally work later day dessert truck best place e
delicious cupcakes review say much liked place went tried red velvet chocolate chip brownie fresh yu
one word delectable ! + + stumbled upon food truck which also storefront flicks food trucks past mon
tried cupcakes food truck family ordered following : + + chocolate chocolate delicious moist cake ch
unable contact month left facebook review told anything nice say keep myself understand things come 
used enjoi sweets company event fantastic ! everything setting event food itself joi jon pleasure wo
tried italian mango drink super delicious got get enough ! 
enjoi sweets one favorite food trucks love design course delicious cupcakes catered events say serve


In [22]:
new_categories = df_yelp_business.loc[df_yelp_business['business_id'] == business_choose, 'categories']
new_categories.tolist()

['Desserts, Food, Cafes, Restaurants, Food Trucks, American (Traditional)']

### 유사도 계산

In [23]:
from scipy.spatial.distance import cdist
# find most similar reviews
dists1 = cdist(vectorizer_reviews.transform(new_reviews).todense().mean(axis=0), 
              vectorized_reviews.T.dot(businessxreview).T.todense(), 
               metric='correlation')
# find most similar categories
dists2 = cdist(vectorizer_categories.transform(new_categories).todense().mean(axis=0), 
              vectorized_categories.todense(), 
               metric='correlation')

In [25]:
dists_together = np.vstack([dists1.ravel(), dists2.ravel()]).T

dists = dists_together.mean(axis=1)
dists

array([0.54952985, 0.50191353, 0.56616524, ..., 0.69466944, 0.64917578,
       0.4334572 ])

In [26]:
# 가장 유사한 10개의 레스토랑
closest = dists.argsort().ravel()[:10]

#### 기준 레스토랑

In [27]:
df_yelp_business.loc[df_yelp_business['business_id']== business_choose, ['business_id', 'categories', 'name', 'stars']]

Unnamed: 0,business_id,categories,name,stars
9834,-SFSt3FkjGfavnyMpHsZPA,"Desserts, Food, Cafes, Restaurants, Food Truck...",Enjoi Sweets & Company,4.5


#### 추천된 레스토랑 목록

In [28]:
df_yelp_business.loc[df_yelp_business['business_id'].isin(df_yelp_business['business_id'].iloc[closest]), ['business_id', 'categories', 'name', 'stars']]

Unnamed: 0,business_id,categories,name,stars
742,dD2p903p8lU0IgXT3OFluA,"Breakfast & Brunch, Restaurants, Food, Cafes, ...",Edgehill Cafe,3.5
2548,dcpWZ6Yk_S0HqTlNBi8jiA,"Food, Coffee & Tea, Restaurants, Desserts, Cafes",The Woodrack Cafe,4.0
4710,qLrTiIPDlnNX6FYTs29rmg,"Restaurants, American (Traditional)",Buddy's Grill,3.5
6720,jVdYRED2iztNaNCoTAhVMA,"Restaurants, Salad, Food, Desserts",Have A Greener Day,5.0
8244,iHTL6BPlaPK6xvOa5MIKaQ,"American (Traditional), Restaurants, Food, Ame...",Essentially Fries,4.0
9834,-SFSt3FkjGfavnyMpHsZPA,"Desserts, Food, Cafes, Restaurants, Food Truck...",Enjoi Sweets & Company,4.5
10337,hQcAPRwuYFPAbhbpeNPEgA,"Bakeries, American (Traditional), Food, Restau...",Apple Farm Diner and Bakery,2.5
11701,tYCok-NtWvg8_k7woeB83w,"Desserts, American (Traditional), Cafes, Resta...",Grand Lux Cafe,3.5
11748,newkruvn1rhEvueEc9y1Mw,"Food, Restaurants, Desserts, Ice Cream & Froze...",Moo Moo Milk Bar,3.5
12506,9dW3CVyvnTXdkXg2AOyBfw,"Desserts, Coffee & Tea, Cafes, Donuts, Food, S...",Birds Nest Cafe,4.5
