### Recommendation using reviews - NLP (TF-IDF)

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import dateutil.parser as parser
from datetime import datetime, date, timedelta
import torch
import skorch
import scipy
import torch.nn as nn
import torch
import torch.nn.functional as F
import sys
from skorch.helper import DataFrameTransformer
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
from sklearn import metrics
from sklearn.preprocessing import FunctionTransformer
from skorch.callbacks import EarlyStopping
from sklearn.pipeline import Pipeline
from skorch import NeuralNetRegressor
import pickle
import emoji
import requests
from bs4 import BeautifulSoup
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.tokenize.treebank import TreebankWordDetokenizer

In [18]:
# loading merged dataset
initial_df = pd.read_csv('Data/Merged_TA_Gmaps_Dataset_2.0.csv')



In [19]:
df = initial_df.drop(['Unnamed: 0'],axis=1)

# Converting to Lowercase
df["review_full"] = df["review_full"].str.lower()

# df['review_full'] = df['review_full'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))


# Removing punctuation and replacing with a space - this is to take care of any events where
# a reviewer has not left a space between their punctuation and words
import string
df['review_full'] = df['review_full'].apply(lambda x: ''.join([' ' if char in string.punctuation else char for char in x]))

# Removing all double spaces that may be created by the above, or occur naturally in the dataset
df['review_full'] = df['review_full'].str.replace('  ', ' ')


# Removing stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
df["review_full"] = df["review_full"].apply(lambda text: stopwords(text))

# Removing URLS
def urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

df["review_full"] = df["review_full"].apply(lambda text: urls(text))

# Implementing Lemmatisation to get the base form of each word

# lemmatizer = WordNetLemmatizer()
# def lemming(text):
#     a = word_tokenize(text)
#     answer = list(map(lambda x: lemmatizer.lemmatize(x), a))
#     return answer

# df["review_full"] = df["review_full"].apply(lambda text: lemming(text))

df[['review_full']].sample(5)

Unnamed: 0,review_full
352116,amazing service fantastic food brilliant drink...
12139,fish n chips pro level go ahead check big guys...
72505,searching good fish chip found place tripadvis...
468606,excellent piece cod ages really delicious also...
684660,looking forward bit let staff friendly food ok...


In [20]:
# summary of the number of reviews for each type of restaurant, as defined by Google Maps
df.groupby('primaryType').agg({'primaryType': 'count'})

Unnamed: 0_level_0,primaryType
primaryType,Unnamed: 1_level_1
american_restaurant,15443
art_gallery,271
bakery,1835
bar,104554
barbecue_restaurant,4110
beauty_salon,298
brazilian_restaurant,1849
breakfast_restaurant,3439
brunch_restaurant,847
cafe,15107


In [23]:
restaurant_names = list(df['restaurant_name_clean'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

df_percent = df.loc[df['primaryType']  == 'japanese_restaurant'] 
# df_percent = df_percent.sample(frac=0.01)

df_percent.set_index('restaurant_name_clean', inplace=True)
indices = pd.Series(df_percent.index)

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['review_full'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)



In [26]:
df_percent['restaurant_name'].unique()

array(['Kanada_Ya_Upper_Street', 'Tanakatsu', '3AKE',
       'Ginza_Onodera_Japanese_Restaurant', 'Robata',
       'Fushan_Japanese_Restaurant', 'Tokyo_Sukiyaki_Tei', 'Zaibatsu',
       'Oka_Restaurant', 'Zuma_London', 'Abeno',
       'Yo_Sushi_Harvey_Nichols', 'Chisou_Japanese_Restaurant',
       'ROKA_Charlotte_Street', 'Umami', 'ROKA_Mayfair', 'ROKA_Aldwych',
       'Mai_Sushi', 'Dinings', 'Tokyo_Diner', 'Hotaru', 'Jin_Kichi',
       'Ichi_Riki', 'Kappa', 'Misato', 'Asakusa', 'Umu', 'Kiku',
       'Shackfuyu', 'SUSHISAMBA_Heron_Tower', 'Chotto_Matte', 'Tomoe',
       'Flesh_Buns_Covent_Garden', 'Yama_Momo', 'Sake_no_Hana', 'Taro',
       'Itsu'], dtype=object)

In [27]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['price_level', 'rating','user_ratings_total','overview'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['price_level', 'rating','user_ratings_total','overview'], keep=False)
    df_new = df_new.sort_values(by='user_ratings_total', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new
recommend('ROKA Mayfair')

TOP 5 RESTAURANTS LIKE ROKA Mayfair WITH SIMILAR REVIEWS: 


  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == ea

Unnamed: 0,price_level,rating,user_ratings_total,overview
Tokyo Diner,2.0,4.5,2758.0,"Simple Japanese diner serving sushi, bento, no..."
Shackfuyu,2.0,4.3,1735.0,Izakaya-style dining for Japanese comfort food...
Zaibatsu,1.0,4.6,1392.0,Functional venue serving Japanese classics suc...
Ginza Onodera Japanese Restaurant,4.0,4.2,599.0,"Slick, stone-clad space showcasing sushi, tepp..."
Umami,,4.2,95.0,


In [13]:
# running a recommendation on the overviews instead

df = initial_df.drop(['Unnamed: 0'],axis=1)

df = df.drop_duplicates(subset=['overview'], keep='first')

# Converting to Lowercase
df["overview"] = df["overview"].str.lower()

# df['review_full'] = df['review_full'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))


# Removing punctuation and replacing with a space - this is to take care of any events where
# a reviewer has not left a space between their punctuation and words
import string
# df['overview'] = df['overview'].apply(lambda x: ''.join([' ' if char in string.punctuation else char for char in x]))

# Removing all double spaces that may be created by the above, or occur naturally in the dataset
df['overview'] = df['overview'].str.replace('  ', ' ')


# Removing stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
df["overview"] = df["overview"].apply(lambda text: stopwords(text))

# Removing URLS
def urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

df["overview"] = df["overview"].apply(lambda text: urls(text))

# Implementing Lemmatisation to get the base form of each word

# lemmatizer = WordNetLemmatizer()
# def lemming(text):
#     a = word_tokenize(text)
#     answer = list(map(lambda x: lemmatizer.lemmatize(x), a))
#     return answer

# df["review_full"] = df["review_full"].apply(lambda text: lemming(text))

df[['overview']].sample(5)

Unnamed: 0,overview
719499,"american plates cocktails offered chill, indus..."
75876,"filipino cuisine, cocktails & tasting menus se..."
421967,menu featuring traditional ethiopian dishes se...
220029,"stylish contemporary restaurant framed art, im..."
767987,artisan cheesemonger serving modern european c...


In [16]:
restaurant_names = list(df['restaurant_name_clean'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

df_percent = df.sample(frac=0.9)

df_percent.set_index('restaurant_name_clean', inplace=True)
indices = pd.Series(df_percent.index)

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['overview'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)



In [17]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['price_level', 'rating','user_ratings_total','overview'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['price_level', 'rating','user_ratings_total','overview'], keep=False)
    df_new = df_new.sort_values(by='user_ratings_total', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new
recommend('Cocotte Notting Hill')

TOP 10 RESTAURANTS LIKE Cocotte Notting Hill WITH SIMILAR REVIEWS: 


  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['price_level', 'rating','user_ratings_total','overview']][df_percent.index == ea

Unnamed: 0,price_level,rating,user_ratings_total,overview
Gokyuzu Restaurant,2.0,4.5,5421.0,informal family-run turkish spot serving class...
Kanada Ya Ramen Bar,2.0,4.5,3987.0,lively venue serving range traditional japanes...
Elvet Steakhouse,3.0,4.4,3492.0,"relaxed establishment serving steaks, burgers ..."
Coq d Argent,3.0,4.3,2199.0,mellow restaurant & bar serving french cuisine...
Blacklock City,2.0,4.7,2180.0,atmospheric subterranean spot serving meat-foc...
Troubadour London,2.0,4.4,1823.0,long-standing restaurant & music venue serving...
Boro Bistro,2.0,4.4,1607.0,cozy french spot garden terrace serving cheese...
The Walrus and The Carpenter,2.0,4.3,1327.0,traditional corner pub upstairs dining room ce...
Shah Tandoori,2.0,4.6,1289.0,indian cuisine street-corner contemporary rest...
Byron Waterloo,2.0,4.2,1279.0,american-inspired chain diner serving posh ham...
