# Zomato Restaurant Clustering And Sentiment Analysis project
**By : Md Ismail Quraishi**

### Problem Statement :
The company wants to know what kind of products should be recommended to different types of customers so that it can target them for better profits.

### Objectives :
1. First, I will perform EDA on both datasets to understand them and gather insights.
2. Preprocessing the data
3. Data transformation from words to tags
4. Calculation of cosine similarity
5. Build a recommendation system.

## Let's Begin

### Know your data

In [1]:
# Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import commonFunctions as cf
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the preprocessed restaurant table and the reviewer-interaction table from disk
path_names = './preprocessed_data/restaurant.csv'
path_reviews ='./preprocessed_data/enough_interaction.csv'
names, interaction = (pd.read_csv(csv_path) for csv_path in (path_names, path_reviews))

In [3]:
# Preview the first three rows of the restaurant table
names.head(3)

Unnamed: 0.1,Unnamed: 0,Name,Cost,Cuisines
0,0,Beyond Flavours,800,"Chinese, Continental, Kebab, European, South I..."
1,1,Paradise,800,"Biryani, North Indian, Chinese"
2,2,Flechazo,1300,"Asian, Mediterranean, North Indian, Desserts"


In [4]:
# Preview the first three rows of the reviewer/restaurant rating table
interaction.head(3)

Unnamed: 0.1,Unnamed: 0,Restaurant,Reviewer,Rating
0,73,Beyond Flavours,Saswati Kar Choudhury,3.0
1,99,Beyond Flavours,Aditya,4.0
2,160,Paradise,Sravani,5.0


In [5]:
# Drop the leftover CSV index column from both tables.
# Non-mutating drop with errors='ignore' keeps this cell safe to re-run:
# the original in-place drop raised KeyError on a second execution because
# the column was already gone.
names = names.drop(columns='Unnamed: 0', errors='ignore')
interaction = interaction.drop(columns='Unnamed: 0', errors='ignore')

# Rename the column so that both tables share the 'Restaurant' merge key
names = names.rename(columns={'Name': 'Restaurant'})

# Attach the restaurant attributes to every rating row (default inner join on 'Restaurant')
merged_df = pd.merge(interaction, names, on='Restaurant')

In [6]:
# Report how many fully duplicated rows the merge produced, then discard them
duplicate_count = merged_df.duplicated().sum()
print('# of duplicated rows : ',duplicate_count)
merged_df = merged_df.drop_duplicates()

# of duplicated rows :  5


In [7]:
# Summarise the merged table; see the output below for what the helper reports
cf.showInfo(merged_df) # This function is defined inside the commonFunctions module

shape :  (935, 5)

# of duplicated rows :  0

# of unique values in each column : 
Restaurant    100
Reviewer      147
Rating          8
Cost           28
Cuisines       89
dtype: int64

# of missing/nan values in each column : 
Restaurant    0
Reviewer      0
Rating        0
Cost          0
Cuisines      0
dtype: int64

Calling info method : 
<class 'pandas.core.frame.DataFrame'>
Index: 935 entries, 0 to 939
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Restaurant  935 non-null    object 
 1   Reviewer    935 non-null    object 
 2   Rating      935 non-null    float64
 3   Cost        935 non-null    object 
 4   Cuisines    935 non-null    object 
dtypes: float64(1), object(4)
memory usage: 43.8+ KB
None


### Preprocessing and building the user-based recommendation system

In [8]:
# User-based calculation
# Reviewer x Restaurant matrix of ratings; missing reviewer/restaurant pairs are filled with 0
user_based_pivot = interaction.pivot_table(
    values='Rating',
    index='Reviewer',
    columns='Restaurant',
)
user_based_pivot = user_based_pivot.fillna(0)

# Pairwise cosine similarity between the reviewers' rating rows
similarity_score_for_people = cosine_similarity(user_based_pivot)
print('user similarity shape',similarity_score_for_people.shape)

user similarity shape (147, 147)


In [9]:
# Preview the first three reviewer rows of the rating matrix
user_based_pivot.head(3)

Restaurant,10 Downing Street,13 Dhaba,"3B's - Buddies, Bar & Barbecue",AB's - Absolute Barbecues,Absolute Sizzlers,Al Saba Restaurant,American Wild Wings,Amul,Arena Eleven,Aromas@11SIX,...,The Tilt Bar Republic,Tiki Shack,Triptify,Udipi's Upahar,Ulavacharu,Urban Asia - Kitchen & Bar,Yum Yum Tree - The Arabian Food Court,Zega - Sheraton Hyderabad Hotel,Zing's Northeast Kitchen,eat.fit
Reviewer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
Aamir Nawaz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
Aanchal Khemka,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Building the user-based recommendation system
def recommend_similar_user(user_name, top_n, pivot=None, similarity=None):
    """Return the names of the ``top_n`` reviewers most similar to ``user_name``.

    Parameters
    ----------
    user_name : label
        A label present in ``pivot.index``.
    top_n : int
        Maximum number of reviewers to return; values <= 0 give an empty list.
    pivot : pandas.DataFrame, optional
        Frame whose index holds the reviewer labels. Defaults to the
        notebook-level ``user_based_pivot``.
    similarity : 2-D array-like, optional
        Square similarity matrix whose rows/columns follow ``pivot.index``.
        Defaults to the notebook-level ``similarity_score_for_people``.

    Returns
    -------
    list
        Reviewer labels ordered from most to least similar, never including
        ``user_name`` itself.

    Raises
    ------
    KeyError
        If ``user_name`` is not in ``pivot.index``.
    """
    # Resolve the defaults at call time so re-running the cells that rebuild
    # the globals is picked up without redefining this function.
    if pivot is None:
        pivot = user_based_pivot
    if similarity is None:
        similarity = similarity_score_for_people

    position = pivot.index.get_loc(user_name)
    scores = np.asarray(similarity[position])

    # Stable descending order: ties keep their original index order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query user by position instead of assuming it sorts first:
    # another reviewer with an identical similarity score can outrank it.
    ranked = ranked[ranked != position][:max(top_n, 0)]
    return [pivot.index[i] for i in ranked]

### Preprocessing and building the restaurant-based recommendation system

In [11]:
def str_merge(string_):
    """Lower-case ``string_`` and glue its whitespace-separated words together.

    The string is split on whitespace, every piece is lower-cased, and the
    pieces are concatenated with no separator.
    """
    return ''.join(map(str.lower, string_.split()))
    
# Collapse each restaurant name into a single token, then prefix it to the cuisine list
names['name'] = names['Restaurant'].map(str_merge)
names['tags'] = names['name'].str.cat(names['Cuisines'], sep=' ')

In [12]:
# Textual data preprocessing
def textualt_preprocessing(df, col):
    """Clean the text column ``col`` of ``df`` and return ``df``.

    The helpers from the ``commonFunctions`` module are applied element-wise,
    in this order: remove_punctuations, lower_casing, stemming, remove_numeric.
    The column is overwritten on the frame that was passed in.
    """
    cleaning_steps = (
        cf.remove_punctuations,
        cf.lower_casing,
        cf.stemming,
        cf.remove_numeric,
    )
    for step in cleaning_steps:
        df[col] = df[col].apply(step)
    return df

# Work on a copy so `names` keeps its raw tags; only the columns needed downstream are kept
preprocessed_restaurant_df = textualt_preprocessing(names.copy(), 'tags')[['Restaurant', 'tags']]

In [13]:
# Word vectorization
# TF-IDF over unigrams and bigrams of the cleaned tags, with English stop words removed
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words='english')
vectors = (
    tfidf
    .fit_transform(preprocessed_restaurant_df['tags'])
    .toarray()
)
vocabulary = tfidf.vocabulary_

In [14]:
# Label each TF-IDF row with its restaurant and each column with its term
vector_df = pd.DataFrame(
    data=vectors,
    index=names['Restaurant'],
    columns=tfidf.get_feature_names_out(),
)
# Pairwise cosine similarity between the restaurants' TF-IDF rows
restaurant_similarity_scores = cosine_similarity(vectors)
restaurant_similarity_scores.shape

(105, 105)

In [15]:
# Restaurant-based recommendation system
def recommend_similar_restaurants(restaurant_name, topn, vectors_frame=None, similarity=None):
    """Return the names of the ``topn`` restaurants most similar to ``restaurant_name``.

    Parameters
    ----------
    restaurant_name : label
        A label present in ``vectors_frame.index``.
    topn : int
        Maximum number of restaurants to return; values <= 0 give an empty list.
    vectors_frame : pandas.DataFrame, optional
        Frame whose index holds the restaurant labels. Defaults to the
        notebook-level ``vector_df``.
    similarity : 2-D array-like, optional
        Square similarity matrix whose rows/columns follow
        ``vectors_frame.index``. Defaults to the notebook-level
        ``restaurant_similarity_scores``.

    Returns
    -------
    list
        Restaurant labels ordered from most to least similar, never including
        the row looked up for ``restaurant_name``.

    Raises
    ------
    KeyError
        If ``restaurant_name`` is not in ``vectors_frame.index``.
    """
    # Resolve the defaults at call time so rebuilt globals are picked up.
    if vectors_frame is None:
        vectors_frame = vector_df
    if similarity is None:
        similarity = restaurant_similarity_scores

    # Positional index of the requested restaurant
    position = vectors_frame.index.get_loc(restaurant_name)
    scores = np.asarray(similarity[position])

    # Stable descending order of similarity scores
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row by position rather than assuming it holds the
    # strictly highest score: a tie or float rounding can rank another row above it.
    ranked = ranked[ranked != position][:max(topn, 0)]
    return [vectors_frame.index[i] for i in ranked]

In [16]:
# Example: ten restaurants whose tags are closest to "Udipi's Upahar"
recommend_similar_restaurants("Udipi's Upahar",10)

['Mathura Vilas',
 'Banana Leaf Multicuisine Restaurant',
 'Pot Pourri',
 'Beyond Flavours',
 'Royal Spicy Restaurant',
 'Mazzo - Marriott Executive Apartments',
 'Hyper Local',
 'eat.fit',
 'Delhi-39',
 'Desi Bytes']

In [17]:
# Example: ten reviewers whose rating rows are closest to those of 'Vedant Killa'
recommend_similar_user('Vedant Killa',10)

['Aditi Gupta',
 'Ashish',
 'Chirag Gupta ?',
 'Shreyoshi Mandal',
 'Neha Aggarwal',
 'Rishabh Wahi',
 'Abhishek Mandal',
 'Foodie On Wheels...',
 'Kiran',
 'Puja Agarwal']