A Restaurant recommendation system uses content-based filtering.The dataset I’ll be using here consists of restaurants in Bangalore, India, collected from Zomato. You can download the dataset from https://www.kaggle.com/chanakyavivekkapoor/bangalore-restaurants-analysis

I will try to create a content-based recommendation system where when I enter the name of a restaurant, the Restaurant recommendation system will look at reviews from other restaurants, and System will recommend us to the other restaurants with similar reviews and address sort them from the top-rated. 

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
#import Python Libraries
import numpy as np
import pandas as pd
import warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [5]:
#load and read the dataset:
df = pd.read_csv("/content/drive/MyDrive/Datasets/Restaurant Recommendation System/zomato.csv")
df.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   url                          51717 non-null  object
 1   address                      51717 non-null  object
 2   name                         51717 non-null  object
 3   online_order                 51717 non-null  object
 4   book_table                   51717 non-null  object
 5   rate                         43942 non-null  object
 6   votes                        51717 non-null  int64 
 7   phone                        50509 non-null  object
 8   location                     51696 non-null  object
 9   rest_type                    51490 non-null  object
 10  dish_liked                   23639 non-null  object
 11  cuisines                     51672 non-null  object
 12  approx_cost(for two people)  51371 non-null  object
 13  reviews_list                 51

In [7]:
df.columns

Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)'],
      dtype='object')

In [8]:
#Deleting Unnnecessary Columns
cleaning_df=df.drop(['url','dish_liked','phone','rest_type', 'listed_in(type)', 'menu_item', 'votes','approx_cost(for two people)'],axis=1) 

In [9]:
#Removing the Duplicates
cleaning_df.duplicated().sum()
cleaning_df.drop_duplicates(inplace=True)

In [10]:
#Remove the NaN values from the dataset
cleaning_df.isnull().sum()
cleaning_df.dropna(how='any',inplace=True)

In [11]:
#Changing the column names
cleaning_df = cleaning_df.rename(columns={'reviews_list':'review','listed_in(city)':'city'})


In [12]:
#Removing '/5' from Rates
cleaning_df = cleaning_df.loc[cleaning_df.rate !='NEW']
cleaning_df = cleaning_df.loc[cleaning_df.rate !='-'].reset_index(drop=True)
remove_slash = lambda x: x.replace('/5', '') if type(x) == np.str else x
cleaning_df.rate = cleaning_df.rate.apply(remove_slash).str.strip().astype('float')


In [13]:
## Lower Casing
cleaning_df["review"] = cleaning_df["review"].str.lower()

In [14]:
## Removal of Puctuations
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

cleaning_df["review"] = cleaning_df["review"].apply(lambda text: remove_punctuation(text))




In [None]:
## Removal of Stopwords
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

cleaning_df["review"] = cleaning_df["review"].apply(lambda text: remove_stopwords(text))

In [16]:
## Removal of URLS
import re

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

cleaning_df["review"] = cleaning_df["review"].apply(lambda text: remove_urls(text))

cleaning_df[['review', 'cuisines']].sample(5)

Unnamed: 0,review,cuisines
20340,rated 30 ratedn ordered gobi little spicy but...,Kerala
24254,rated 50 ratedn bar is quiet gud fr pork stat...,"North Indian, Chinese"
4075,rated 40 ratedn to all those tea lovers this ...,"Continental, Cafe, Desserts, Salad, Momos"
30164,rated 40 ratedn we order hariyani chicken and...,"North Indian, Mughlai, Chinese, Rolls"
20250,rated 40 ratedn my favorite chai place was op...,"Cafe, Tea, North Indian"


In [17]:
# RESTAURANT NAMES:
print("Total number of restaurants in Bengaluru are: ", len(df['name'].unique()))

Total number of restaurants in Bengaluru are:  8792


In [18]:
# RESTAURANT NAMES:
restaurant_names = list(cleaning_df['name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

In [19]:
# Randomly sample 50% of your dataframe
final_df = cleaning_df.sample(frac=0.5)
final_df.set_index('name', inplace=True)
indices = pd.Series(final_df.index)


In [20]:
# Creating tf-idf matrix
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(final_df['review'])

from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [32]:

def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 10 restaurant indexes with a similar cosine-sim value
    top10_indexes = list(score_series.iloc[0:11].index)
    
    # Names of the top 10 restaurants
    for each in top10_indexes:
        recommend_restaurant.append(list(final_df.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'rate', 'address'])
    
    # Create the top 10 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(final_df[['cuisines','rate', 'address']][final_df.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 5 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','rate', 'address'], keep=False)
    df_new = df_new.sort_values(by='rate', ascending=False).head(5)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new
recommend('Pizza Stop')

TOP 5 RESTAURANTS LIKE Pizza Stop WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,rate,address
Pizza Stop,"Pizza, Italian",3.9,"#4, Madras Bank Road, In Airlines Hotel Campus..."
Pizza Stop,"Pizza, Italian",3.8,"1090, 10th Main Road, 100 Feet Road, Banaswadi..."
Midnight Pizza Slurpp,"Italian, Pizza",3.7,"104, Sadananda Buliding, Buchappa Layout, Kora..."
Sbarro,Pizza,3.3,"Prestige Shantiniketan, B Towers, Food Court, ..."
Pizza Stop,"Pizza, Italian",3.2,"20, KR Colony, Near Haiku Honda Service Center..."
