In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Parse the CSV in 2000-row pieces and collect every piece in a list.
mylist = list(pd.read_csv('zomato.csv', chunksize=2000))

# Stack the pieces row-wise into one frame, then release the temporary list.
zomato_real = pd.concat(mylist, axis=0)
del mylist

In [None]:
# Load the CSV in 1000-row chunks and concatenate them in a single call.
# Calling pd.concat inside the loop would copy all rows gathered so far on
# every iteration, so the cost grows quadratically with the file size.
zomato_real = pd.concat(
    list(pd.read_csv('zomato.csv', chunksize=1000)),
    ignore_index=True,
)

In [None]:
zomato_real.head()

In [None]:
zomato=zomato_real.drop(['url','dish_liked','phone'],axis=1)

In [None]:
zomato.duplicated().sum()

In [None]:
zomato.drop_duplicates(inplace=True)

In [None]:
zomato.head()

In [None]:
zomato.isnull().sum()

In [None]:
zomato.dropna(inplace=True)

In [None]:
zomato = zomato.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type', 'listed_in(city)':'city'})

In [None]:
zomato.head()

In [None]:
zomato['cost'] = zomato['cost'].astype(str)

In [None]:
zomato['cost'] = zomato['cost'].apply(lambda x: x.replace(',','.'))

In [None]:
zomato['cost'] = zomato['cost'].astype(float)

In [None]:
zomato = zomato.loc[zomato.rate !='NEW']
zomato = zomato.loc[zomato.rate !='-'].reset_index(drop=True)

In [None]:
zomato.head()

In [None]:
remove_slash = lambda x: x.replace('/5','') 
zomato.rate = zomato.rate.apply(remove_slash).str.strip().astype('float')

In [None]:
zomato.name = zomato.name.apply(lambda x:x.title())

In [None]:
zomato.head()

In [None]:
zomato.online_order = zomato.online_order.replace('Yes',True)
zomato.online_order = zomato.online_order.replace('No',False)

In [None]:
zomato.info()

In [None]:
list(zomato.name.unique())

In [None]:
zomato.shape

In [None]:
zomato.name.value_counts()

In [None]:
zomato.columns

In [None]:
restaurants = list(zomato['name'].unique())
zomato['mean_ratings'] = 0


In [None]:
for i in range(len(restaurants)):
    zomato['mean_ratings'][zomato['name']==restaurants[i]] = zomato['rate'][zomato['name']==restaurants[i]].mean()

In [None]:
zomato.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max rescale the per-restaurant mean rating onto the range 1..5 (the
# smallest observed mean maps to 1, the largest to 5), rounded to 2 decimals.
scaler = MinMaxScaler(feature_range=(1,5))
zomato[['mean_ratings']] = scaler.fit_transform(zomato[['mean_ratings']]).round(2)

In [None]:
zomato.head()

In [None]:
# Lower-case the review text so later token comparisons are case-insensitive.
zomato["reviews_list"] = zomato["reviews_list"].str.lower()

In [None]:
import string
# Characters that remove_punctuation() deletes: the stdlib punctuation set.
punc_to_remove = string.punctuation

In [None]:
def remove_punctuation(text):
    """Return *text* with every character listed in ``punc_to_remove`` deleted."""
    # A translation table with only a "delete" set: no character is remapped.
    deletion_table = str.maketrans('', '', punc_to_remove)
    stripped = text.translate(deletion_table)
    return stripped

In [None]:
# Strip punctuation characters from every review string.
zomato['reviews_list'] = zomato['reviews_list'].apply(remove_punctuation)

In [None]:
zomato.head()

In [None]:
from nltk.corpus import stopwords
import nltk
# Download the NLTK stopword corpus, then rebind `stopwords` from the corpus
# reader to a plain set of English stopwords.  remove_stopwords() reads this
# set; the import at the top of this cell restores the reader on a re-run.
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

In [None]:
# Number of distinct English stopwords in the set.
len(stopwords)

In [None]:
def remove_stopwords(text):
    """Return *text* without the whitespace-separated tokens found in ``stopwords``.

    *text* is converted with ``str()`` first; surviving tokens are re-joined
    with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token in stopwords:
            continue
        kept.append(token)
    return " ".join(kept)
    

In [None]:
# Drop English stopwords from every review string.
zomato['reviews_list'] = zomato['reviews_list'].apply(remove_stopwords)

In [None]:
zomato.head()

In [None]:
def remove_urls(text):
    """Return *text* with http(s):// links and www. addresses deleted.

    Each match runs up to the next whitespace character; the whitespace
    around a removed URL is kept.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [None]:
# Delete URLs from every review string.
# NOTE(review): remove_punctuation was applied to reviews_list earlier in this
# notebook, which already deleted ':', '/' and '.', so the URL pattern can no
# longer match anything here - consider moving this step before punctuation
# removal.
zomato['reviews_list'] = zomato['reviews_list'].apply(remove_urls)

In [None]:
restaurant_names = zomato['name'].unique().tolist()


def get_top_words(column, top_nu_of_words, nu_of_word):
    """Return the most frequent n-grams found in *column*.

    Parameters
    ----------
    column : iterable of str
        The documents handed to ``CountVectorizer.fit_transform``.
    top_nu_of_words : int
        How many (term, count) pairs to return.
    nu_of_word : tuple
        Passed through as the vectorizer's ``ngram_range``.

    Returns
    -------
    list of (term, count) tuples, highest count first.  English stop words
    are excluded by the vectorizer.
    """
    vectorizer = CountVectorizer(ngram_range=nu_of_word, stop_words='english')
    counts = vectorizer.fit_transform(column)
    # Column sums: total occurrences of each vocabulary term over all documents.
    totals = counts.sum(axis=0)

    def by_count(pair):
        return pair[1]

    ranked = sorted(
        ((term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=by_count,
        reverse=True,
    )
    return ranked[:top_nu_of_words]

In [None]:
zomato=zomato.drop(['address','rest_type', 'type', 'menu_item', 'votes'],axis=1)

In [None]:
df_percent = zomato.sample(frac=1)

In [None]:
df_percent

In [None]:
df_percent.set_index('name', inplace=True)

In [None]:
indices = pd.Series(df_percent.index)

In [None]:
indices

In [None]:
# Unigram TF-IDF over the cleaned review text, one row per row of df_percent.
# min_df is 1 rather than 0: an integer min_df must be at least 1, and recent
# scikit-learn releases reject 0 during parameter validation.  The vocabulary
# is unchanged, because every term occurs in at least one document anyway.
tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', min_df=1)
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

In [None]:
tfidf_matrix

In [None]:
# NOTE(review): exploratory cell - the result is neither assigned nor used by
# any later cell shown here; confirm whether it is still needed.
np.multiply(tfidf_matrix, np.transpose(tfidf_matrix))

In [None]:
# Pairwise dot products between all rows of the TF-IDF matrix; recommend()
# looks up one row of this matrix per query.
# NOTE(review): presumably equal to cosine similarity because TfidfVectorizer
# L2-normalises rows by default - confirm.  The result has one entry per pair
# of rows, so memory grows with the square of the row count.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
tfidf_matrix.shape

In [None]:

def recommend(name, cosine_similarities = cosine_similarities):
    """Return restaurants whose reviews score highest against those of *name*.

    Parameters
    ----------
    name : str
        Restaurant name exactly as stored in ``indices``.
    cosine_similarities : 2-D array-like, optional
        Pairwise similarity scores whose row order matches ``df_percent``.
        Defaults to the module-level matrix as bound when this function was
        defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``cuisines``, ``mean_ratings``, ``cost`` and ``location``,
        indexed by restaurant name and sorted by ``mean_ratings`` descending.
        One row is drawn at random per candidate, so repeated calls can differ.

    Raises
    ------
    IndexError
        If *name* does not occur in ``indices``.
    """
    columns = ['cuisines', 'mean_ratings', 'cost', 'location']

    # Position of the first row carrying this restaurant name.
    matches = indices[indices == name].index
    if len(matches) == 0:
        raise IndexError('restaurant %r not found in indices' % (name,))
    idx = matches[0]

    # Similarity of that row against every row, best first.
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)

    # The 31 highest-scoring row positions (this normally includes the query
    # row itself, which is why one more than 30 is taken).
    top30_indexes = list(score_series.iloc[0:31].index)

    # Map positions back to restaurant names; the index is looked up once
    # instead of being copied into a list for every position.
    labels = df_percent.index
    recommend_restaurant = [labels[each] for each in top30_indexes]

    # One randomly chosen listing per candidate name (a name can own several rows).
    picks = [
        df_percent.loc[df_percent.index == each, columns].sample()
        for each in recommend_restaurant
    ]
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    df_new = pd.concat(picks) if picks else pd.DataFrame(columns=columns)

    # keep=False discards every copy of a repeated row rather than keeping one.
    df_new = df_new.drop_duplicates(subset=columns, keep=False)
    df_new = df_new.sort_values(by='mean_ratings', ascending=False)

    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))

    return df_new

In [None]:
# Example query; the name must match an entry of `indices` exactly.
recommend('Shawarma Inc')

In [None]:
# All restaurant names in df_percent row order (repeats included).
list(indices)