# Restaurant Recommender

## Project Overview




In this project, I use the restaurant reviews dataset provided by Zomato to recommend similar restaurants. The customer reviews are vectorized with `TF-IDF`. The model selects the restaurants whose reviews have the highest cosine similarity to the reviews of the chosen restaurant and then ranks them by restaurant rating.

These are the major components to this project:

* removing unnecessary columns and cleaning the remaining columns so that values are uniform across all rows
* removing punctuation and stopwords from the reviews column and vectorizing it using `TF-IDF`
* calculating the cosine similarities between the reviews, selecting the restaurants with the most similar reviews, and outputting them ranked by rating


## 1. Data Cleaning and Preprocessing

### `1.1` Importing all libraries

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

### `1.2` Reading in the dataset

In [3]:
# The dataset is about 500 MB, so it is pulled in 2000 rows at a time
# (the author's guard against memory issues) and the pieces are then
# stacked back into a single frame.
reader = pd.read_csv('zomato.csv', chunksize=2000)
zomato_real = pd.concat(list(reader), axis=0)
del reader

### `1.3` Cleaning the columns and removing unnecessary ones

In [None]:
# Drop columns that are not used by the recommender.
zomato = zomato_real.drop(['url', 'dish_liked', 'phone'], axis=1)

# Drop exact duplicate rows, then every row that has a missing value.
zomato.drop_duplicates(inplace=True)
zomato.dropna(inplace=True)

# Shorter column names.
zomato = zomato.rename(columns={'approx_cost(for two people)': 'cost',
                                'listed_in(type)': 'type',
                                'listed_in(city)': 'city'})

# The cost is stored as text and uses a comma as thousands separator
# (e.g. "1,200"). The comma has to be removed before casting: replacing it
# with "." would silently turn 1,200 into 1.2.
zomato['cost'] = (zomato['cost'].astype(str)
                                .str.replace(',', '', regex=False)
                                .astype(float))

# Rows whose rating is 'NEW' or '-' carry no usable rating; drop them.
zomato = zomato.loc[~zomato['rate'].isin(['NEW', '-'])].reset_index(drop=True)

# Ratings look like "4.1/5"; strip the "/5" suffix and surrounding blanks,
# then convert to float.
zomato['rate'] = (zomato['rate'].str.replace('/5', '', regex=False)
                                .str.strip()
                                .astype('float'))

# Standardise restaurant names: first letter of each word upper-case,
# the rest lower-case. Bracket access is used because attribute-style
# assignment (zomato.name = ...) only works while the column already exists.
zomato['name'] = zomato['name'].str.title()

# Make the online_order column boolean; any value other than 'Yes'/'No'
# is left as it is.
zomato['online_order'] = zomato['online_order'].replace({'Yes': True, 'No': False})


### `1.4` Feature scaling and Engineering

In [None]:
# Unique restaurant names (kept as a module-level name in case other cells use it).
restaurants = list(zomato['name'].unique())

# Mean rating of each restaurant over all of its rows, written back to every
# row of that restaurant.
# groupby/transform replaces a per-restaurant loop that used chained
# assignment (zomato['mean_ratings'][mask] = ...). Chained assignment may
# write to a temporary copy (it never updates the frame under pandas
# Copy-on-Write) and rescans the whole frame once per restaurant.
zomato['mean_ratings'] = zomato.groupby('name')['rate'].transform('mean')

# Rescale the mean ratings linearly so the lowest becomes 1 and the highest
# becomes 5, rounded to two decimals.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(1, 5))
zomato[['mean_ratings']] = scaler.fit_transform(zomato[['mean_ratings']]).round(2)

## `2.` Tokenizing the reviews column 

Cleaning the reviews column and using `TF-IDF` to vectorize it

### `2.1` Cleaning the column

In [None]:
# Cleaning the reviews column before it is vectorized.
#
# Order of the steps matters:
#   1. lower-case   (the URL pattern below only matches lower-case text)
#   2. remove URLs  (must happen BEFORE punctuation is stripped, otherwise
#                    "://" and "." are already gone and nothing matches)
#   3. remove punctuation
#   4. remove stopwords

import string
# Imported under an alias so that binding the name `stopwords` to a set
# below does not break re-running this cell.
from nltk.corpus import stopwords as nltk_stopwords

punc_to_remove = string.punctuation

# Built once instead of once per review.
PUNCT_TABLE = str.maketrans('', '', punc_to_remove)
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

# `stopwords` keeps its original meaning after this cell: a set of English
# stopwords.
stopwords = set(nltk_stopwords.words('english'))


def remove_urls(text):
    '''
    removing the urls from reviews

    input : a single review string
    output : the string with every http(s):// or www. token deleted
    '''
    return URL_PATTERN.sub(r'', text)


def remove_punctuation(text):
    '''
    removing punctuation from reviews

    input : a single review string
    output : the string without any character of string.punctuation
    '''
    return text.translate(PUNCT_TABLE)


def remove_stopwords(text):
    '''
    removing stopwords which don't add much meaning to a sentence . Stopwords include a, and, the..

    input : a single review (converted with str() before splitting)
    output : the remaining whitespace-separated words joined by single spaces
    '''
    return " ".join(word for word in str(text).split() if word not in stopwords)


zomato["reviews_list"] = zomato["reviews_list"].str.lower()
zomato['reviews_list'] = zomato['reviews_list'].apply(remove_urls)
zomato['reviews_list'] = zomato['reviews_list'].apply(remove_punctuation)
zomato['reviews_list'] = zomato['reviews_list'].apply(remove_stopwords)

In [None]:
# restaurant_names = list(zomato['name'].unique())
# def get_top_words(column, top_nu_of_words, nu_of_word):
    
#     '''
    
#     '''
#     vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
#     bag_of_words = vec.fit_transform(column)
#     sum_words = bag_of_words.sum(axis=0)
#     words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
#     words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
#     return words_freq[:top_nu_of_words]

### `2.2` Vectorization

In [None]:
# Drop the columns that are not needed for the recommendation step.
zomato = zomato.drop(['address', 'rest_type', 'type', 'menu_item', 'votes'], axis=1)

# Shuffle all rows (frac=1 keeps every row; no seed is set, so the order
# differs between runs) and use the restaurant name as index.
df_percent = zomato.sample(frac=1)
df_percent.set_index('name', inplace=True)

# Series of restaurant names with a 0..n-1 index: position -> name.
# recommend() uses it to turn a name into a row position of the matrix below.
indices = pd.Series(df_percent.index)

# TF-IDF on single words of the cleaned reviews.
# min_df=1 (the default) keeps every term that occurs in at least one
# document. The previous min_df=0 is rejected by scikit-learn's parameter
# validation (an integer min_df must be >= 1) and kept the same vocabulary
# on versions that accepted it.
tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', min_df=1)
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

# TfidfVectorizer L2-normalises every row by default, so the plain dot
# product computed by linear_kernel equals the cosine similarity.
# NOTE: the result is a dense (rows x rows) array, so memory grows
# quadratically with the number of rows in df_percent.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

## `3` Recommendation based on cosine similarities

In [None]:
def recommend(name, cosine_similarities=cosine_similarities, top_n=30):
    '''
        Recommend restaurants whose reviews are most similar to the reviews
        of the given restaurant.

        The similarity row of the first entry matching `name` is sorted in
        descending order, the names behind the best-scoring rows are looked
        up, one row of details is drawn per name, and the result is sorted
        by mean rating.

        input :
            name : restaurant name exactly as stored in the index of
                   df_percent
            cosine_similarities : square similarity matrix aligned with the
                   rows of df_percent (defaults to the matrix that exists
                   when this function is defined)
            top_n : the top_n + 1 best-scoring rows are considered
                   (default 30, i.e. the original window of 31 rows)
        output : dataframe indexed by restaurant name with the columns
                 cuisines, mean_ratings, cost and location, sorted by
                 mean_ratings in descending order
        raises : IndexError if `name` is not in the dataset

        NOTE(review): the row of `name` itself is not excluded from the
        candidates, and drop_duplicates(keep=False) removes EVERY copy of a
        repeated detail row, so a restaurant that is hit more than once
        disappears from the result. Both behaviours are unchanged from the
        original - confirm they are intended.
    '''

    detail_columns = ['cuisines', 'mean_ratings', 'cost', 'location']

    # position of the first row whose name matches
    matches = indices[indices == name]
    if matches.empty:
        raise IndexError('restaurant %r not found in the dataset' % (name,))
    idx = matches.index[0]

    # similarities of that row to every row, best first
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)

    # positions of the best-scoring rows
    top_indexes = list(score_series.iloc[0:top_n + 1].index)

    # names behind those positions (index fetched once, not once per lookup)
    all_names = df_percent.index
    recommend_restaurant = [all_names[each] for each in top_indexes]

    # one randomly drawn detail row per recommended name; the rows are
    # collected and concatenated once because DataFrame.append no longer
    # exists in pandas 2.x
    details = df_percent[detail_columns]
    picked = [details[details.index == each].sample() for each in recommend_restaurant]
    if picked:
        df_new = pd.concat(picked)
    else:
        df_new = pd.DataFrame(columns=detail_columns)

    df_new = df_new.drop_duplicates(subset=detail_columns, keep=False)
    df_new = df_new.sort_values(by='mean_ratings', ascending=False)

    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))

    return df_new