# Zomato Restaurants EDA

**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
!pip install openpyxl
df = pd.read_csv('../input/zomato-restaurants-data/zomato.csv', encoding = 'latin-1')
sheet = pd.read_excel('../input/zomato-restaurants-data/Country-Code.xlsx')

In [None]:
# Preview the first two rows of the raw restaurant data.
df.head(2)

In [None]:
# (rows, columns) of the raw restaurant data.
df.shape

In [None]:
# Column labels of the raw restaurant data.
df.columns

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count missing values in each column of the raw data.
df.isna().sum()

In [None]:
# Number of rows whose Restaurant ID repeats the ID of an earlier row.
df.duplicated(subset='Restaurant ID').sum()

In [None]:
# Preview the country-code lookup sheet.
sheet.head()

In [None]:
# (rows, columns) of the country-code lookup sheet.
sheet.shape

In [None]:
# Count missing values in each column of the lookup sheet.
sheet.isna().sum()

In [None]:
# Attach the lookup-sheet columns to every restaurant row via 'Country Code'.
# A left join keeps restaurants even if their code is absent from the sheet.
final_df = df.merge(sheet, how='left', on='Country Code')
final_df.head(2)

In [None]:
# The join key is no longer needed after the merge; remove it in place.
final_df.drop(columns='Country Code', inplace=True)

In [None]:
# Column labels of the merged frame after dropping 'Country Code'.
final_df.columns

### Currencies used in Countries

In [None]:
# One row per distinct (Country, Currency) pair; the group-size column
# (labelled 0 after reset_index) is only a by-product, so it is dropped.
country_currency_pairs = final_df.groupby(['Country', 'Currency']).size().reset_index()
country_currency_pairs.drop(columns=0)

### Top 3 Countries by Number of Restaurants

In [None]:
# Restaurants per country, most frequent first; show the three leading names.
country_counts = final_df['Country'].value_counts()
country_counts.head(3).index

In [None]:
# Pie chart of the three countries with the most restaurants.
# Reuses `country_counts` instead of recomputing value_counts(), so the wedge
# sizes and labels are guaranteed to come from the same Series.
# The percentages are shares among these three countries only.
top_countries = country_counts.head(3)
plt.pie(top_countries, labels=top_countries.index, autopct='%1.2f%%')
plt.show()  # render the figure (plt.plot() with no data only added an empty artist)

In [None]:
# Restrict the merged data to restaurants whose Country is 'India'.
is_india = final_df['Country'].eq('India')
Indian_res = final_df[is_india]

In [None]:
# Restaurants per Indian city, most frequent first; show the leading three.
city_counts = Indian_res['City'].value_counts()
city_counts.head(3)

In [None]:
# Pie chart of the three Indian cities with the most restaurants.
# The percentages are shares among these three cities only.
top_cities = city_counts.head(3)
plt.pie(top_cities.values, labels=top_cities.index, autopct='%1.2f%%')
plt.show()  # render the figure (plt.plot() with no data only added an empty artist)

In [None]:
# Three localities of New Delhi with the most listed restaurants.
new_delhi_res = Indian_res[Indian_res['City'] == 'New Delhi']
new_delhi_res['Locality'].value_counts().head(3)

> **Observations:**
> * India has the largest number of Zomato-associated restaurants, as Zomato is an India-based company.
> * India is followed by the United States and the United Kingdom.
> * Among Indian cities, New Delhi has the most restaurants associated with Zomato.
> * It is followed by Gurgaon and Noida.
> * In New Delhi, Connaught Place (122), Rajouri Garden (99) and Shahdara (87) are the top 3 localities where Zomato operates.

### Ratings 

In [None]:
# Number of restaurants for each (rating value, colour, text) combination.
rating_keys = ['Aggregate rating', 'Rating color', 'Rating text']
ratings = final_df.groupby(rating_keys).size().reset_index(name='Rating Count')
ratings

> **Observations:**
> 
> * A rating from 4.5 to 4.9 indicates that the restaurant is Excellent.
> * A rating from 4.0 to 4.4 indicates that the restaurant is Very Good.
> * A rating from 3.5 to 3.9 indicates that the restaurant is Good.
> * A rating from 2.5 to 3.4 indicates that the restaurant is Average.
> * A rating from 1.8 to 2.4 indicates that the restaurant is Poor.

In [None]:
from numpy import median

In [None]:
# Bar chart of how many restaurants hold each aggregate rating, coloured by
# the rating colour band.
plt.figure(figsize=(15, 5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional argument is `data`).
# Palette colours are assigned to the hue levels in the order they appear in
# `ratings` — presumably lowest rating band first; verify against the table.
sns.barplot(
    x='Aggregate rating',
    y='Rating Count',
    hue='Rating color',
    data=ratings,
    estimator=median,
    palette=['white', 'red', 'orange', 'yellow', 'seagreen', 'green'],
)
plt.title('Rating Text')
plt.show()  # render the figure (plt.plot() with no data only added an empty artist)

> **Observations:**
> * Among rated restaurants, the most common rating is 3.2, which falls in the Average category (color = Orange).
> * In the Good category, 3.5 is the rating held by the most restaurants.
> 

In [None]:
# Distinct rating colours, in order of first appearance in the data.
final_df['Rating color'].unique()

In [None]:
# Number of restaurants per rating colour.
# The column is passed as `x=`: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the variable to count.
# NOTE(review): palette entries are matched to categories in the order seaborn
# encounters them — confirm against the unique() output of 'Rating color'.
sns.countplot(
    x=final_df['Rating color'],
    palette=['green', 'seagreen', 'yellow', 'orange', 'gray', 'red'],
)
plt.title('Color Count')
plt.show()  # render the figure (plt.plot() with no data only added an empty artist)

### Table Bookings

In [None]:
# Number of restaurants with and without table booking.
# The column is passed as `x=`: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the variable to count.
# NOTE(review): 'green'/'red' are assigned in the order the categories are
# encountered — confirm which value each colour ends up on.
sns.countplot(x=final_df['Has Table booking'], palette=['green', 'red'])
plt.title('Availability of Table Booking')
plt.show()  # render the figure (plt.plot() with no data only added an empty artist)

**Observation:**
* The majority of restaurants do not offer table booking.

### Average Cost for Two

In [None]:
# Mean 'Average Cost for two' for each value of 'Has Table booking'.
# NOTE(review): final_df spans several countries, so this mean presumably
# mixes different currencies — confirm before comparing the two figures.
cost_for_two = final_df['Average Cost for two']
avg_cost_table_booking = cost_for_two.groupby(final_df['Has Table booking']).mean()
avg_cost_table_booking

> **Observation:**
> * Where table booking is **available**, the average cost for two is **1153**.
> * Where table booking is **unavailable**, the average cost for two is **1536**.

In [None]:
# Mean cost for two per country, highest first.
cost_by_country = final_df.groupby('Country')['Average Cost for two'].mean()
cost_by_country.sort_values(ascending=False)

*Note: these figures are expressed in each country's own currency, so they must be interpreted per currency and are not directly comparable across countries.*

In [None]:
# Mean cost for two per Indian city, most expensive first.
city_avg_cost = (
    Indian_res.groupby('City')['Average Cost for two']
    .mean()
    .sort_values(ascending=False)
)
# The observations under this cell cite both ends of the ranking, so the top 3
# are printed as well; the cell's own output is still the bottom 3, as before.
print(city_avg_cost.head(3))
city_avg_cost.tail(3)

> **Observations:**
> * The top 3 cities with the highest average cost for two are **Panchkula, Hyderabad and Pune**.
> * The bottom 3 cities, with the lowest average cost for two, are **Faridabad, Amritsar and Varanasi**.

In [None]:
# Keep the merged frame's column labels in `columns` for quick reference in
# later cells, and display them.
columns = final_df.columns
columns

### Online Delivery

In [None]:
# Number of restaurants for each value of 'Has Online delivery'.
online_delivery = final_df['Has Online delivery'].value_counts()
online_delivery

In [None]:
# Share of restaurants for each value of 'Has Online delivery'.
plt.pie(online_delivery.values, labels=online_delivery.index, autopct='%1.2f%%')
plt.title('Online Delivery')
plt.show()  # render the figure (plt.plot() with no data only added an empty artist)

**74% of Restaurants are providing online Delivery**

In [None]:
# Restaurants per combination of 'Has Online delivery' and 'Is delivering now'.
# The size column is given an explicit name instead of the default label 0,
# matching how the ratings table names its count column.
delivery_keys = ['Has Online delivery', 'Is delivering now']
final_df.groupby(delivery_keys).size().reset_index(name='Count')

### Average Votes

In [None]:
# Total votes per country, largest first.
total_votes = final_df.groupby('Country')['Votes'].sum()
total_votes.sort_values(ascending=False)

In [None]:
# Mean votes per restaurant in each country, largest first.
mean_votes = final_df.groupby('Country')['Votes'].mean()
mean_votes.sort_values(ascending=False)

> **Observations:**
> * India has the highest total number of votes.
> * Indonesia has the highest average number of votes per restaurant.

In [None]:
# Re-display the column labels saved earlier in `columns`.
columns

### Price Range

In [None]:
# Distinct values of the 'Price range' column.
final_df['Price range'].unique()

> **Assumptions:**
> 
> 1 - Low rates
> 
> 2 - Average rates
> 
> 3- High rates
> 
> 4- very high rates

In [None]:
# Number of restaurants in each price range, across all countries.
# The column is passed as `x=`: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=final_df['Price range'], palette='rocket_r')
plt.title('Restaurants with Different Price Range')
plt.show()

In [None]:
# Number of Indian restaurants in each price range.
# The column is passed as `x=`: seaborn >= 0.12 treats the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=Indian_res['Price range'], palette='rocket_r')
plt.title('Restaurants with Different Price Range in India')
plt.show()

### Cuisines

In [None]:
# Three most frequent values of the Cuisines column across all countries.
cuisine_counts = final_df['Cuisines'].value_counts()
cuisine_counts.head(3)

In [None]:
# Three most frequent values of the Cuisines column among Indian restaurants.
indian_cuisine_counts = Indian_res['Cuisines'].value_counts()
indian_cuisine_counts.head(3)

In [None]:
# Count distinct values of the Cuisines column.
# nunique() skips missing values, whereas len(unique()) counted NaN as one
# more "cuisine" whenever the column had gaps.
# NOTE(review): every distinct cell value counts once; if a cell lists several
# cuisines together, that combination is counted as a single entry — confirm
# this is the intended meaning of "number of cuisines".
print('Number of Cuisines present Internationally:', final_df['Cuisines'].nunique())
print('Number of Cuisines present in India:', Indian_res['Cuisines'].nunique())

In [None]:
# Count distinct values of the Cuisines column for restaurants in Mumbai.
# nunique() skips missing values, whereas len(unique()) counted NaN as one
# more "cuisine" whenever the column had gaps.
mumbai_res = Indian_res[Indian_res['City'] == 'Mumbai']
print('Cuisines Present in Mumbai :', mumbai_res['Cuisines'].nunique())