# Additional Zomato Dataset Analysis

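This notebook extends the analysis of the Zomato restaurant dataset, focusing on business-relevant metrics and relationships: cost by location and restaurant type, cuisine popularity, rating distribution and its relation to votes, the effect of online ordering and table booking on ratings, and how restaurants are distributed across locations.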

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

In [None]:
# Read the dataset
df = pd.read_csv('zomato.csv')

# Basic data cleaning: keep only rows with no missing value in any column.
# Reassigning (rather than inplace=True) avoids mutating a frame in place;
# .copy() makes the result an independent frame before columns are rewritten.
# NOTE(review): this drops a row if *any* column is missing, including columns
# this notebook never reads - consider dropna(subset=[...]) if too many rows
# are lost.
df = df.dropna().copy()

# Convert rate to numeric. Ratings are text such as '4.1/5', so keep the part
# before the slash. Anything that is not a number (presumably placeholders
# for unrated restaurants - verify against the data) becomes NaN instead of
# raising ValueError, which float() would do.
df['rate'] = pd.to_numeric(
    df['rate'].astype(str).str.split('/').str[0].str.strip(),
    errors='coerce',
)

# Convert cost to numeric. astype(str) keeps this working even if pandas has
# already parsed the column as numbers (no thousands separators in the file).
df['approx_cost(for two people)'] = pd.to_numeric(
    df['approx_cost(for two people)'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

# Rows whose rating could not be parsed are removed so every later rating
# statistic (means, box plots, t-test) works on real numbers only.
df = df.dropna(subset=['rate'])

## 1. Cost Analysis by Location and Restaurant Type

In [None]:
# Mean cost for two per location, keeping only the 15 most expensive areas.
COST_COLUMN = 'approx_cost(for two people)'

location_cost = (
    df.groupby('location')[COST_COLUMN]
    .agg(['mean', 'count'])
    .reset_index()
    .sort_values('mean', ascending=False)
    .head(15)
)

# Bar chart: one bar per location, height = mean cost for two.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=location_cost, x='location', y='mean', ax=ax)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Average Cost for Two by Location (Top 15 Most Expensive Areas)')
ax.set_xlabel('Location')
ax.set_ylabel('Average Cost for Two')
plt.show()

In [None]:
# Box plot of cost for two, restricted to the 10 restaurant types with the
# highest median cost (shown in descending order of that median).
COST_COLUMN = 'approx_cost(for two people)'

median_cost_by_type = df.groupby('rest_type')[COST_COLUMN].median()
priciest_types = median_cost_by_type.sort_values(ascending=False).head(10).index

fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df, x='rest_type', y=COST_COLUMN, order=priciest_types, ax=ax)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Cost Distribution by Restaurant Type (Top 10)')
ax.set_xlabel('Restaurant Type')
ax.set_ylabel('Cost for Two')
plt.show()

## 2. Cuisine Popularity Analysis

In [None]:
# Count how many restaurants list each individual cuisine. Every row's
# comma-separated cuisine string is split and each name is trimmed.
cuisine_labels = []
for cuisine_list in df['cuisines'].str.split(','):
    cuisine_labels.extend(label.strip() for label in cuisine_list)
cuisine_counts = pd.Series(cuisine_labels).value_counts()

# Horizontal bars for the 15 most frequently listed cuisines.
top_cuisines = cuisine_counts.head(15)
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=top_cuisines.values, y=top_cuisines.index, ax=ax)
ax.set_title('Most Popular Cuisines (Top 15)')
ax.set_xlabel('Number of Restaurants')
plt.show()

In [None]:
# Average rating by cuisine.
# 'cuisines' holds one comma-separated string per restaurant. DataFrame.explode
# only expands list-like cells, so the string must be split first; otherwise
# the groupby aggregates by the whole combination string instead of by
# individual cuisine.
cuisine_long = (
    df[['cuisines', 'rate']]
    .assign(cuisines=lambda frame: frame['cuisines'].str.split(','))
    .explode('cuisines')
    .reset_index(drop=True)  # explode repeats index labels; make them unique again
)
# Names after the first carry the space that followed the comma.
cuisine_long['cuisines'] = cuisine_long['cuisines'].str.strip()

cuisine_ratings = (
    cuisine_long.groupby('cuisines')['rate']
    .agg(['mean', 'count'])
    .reset_index()
)
# Keep cuisines with more than 50 rated listings so a handful of restaurants
# cannot dominate the ranking, then order best-rated first.
cuisine_ratings = cuisine_ratings[cuisine_ratings['count'] > 50].sort_values('mean', ascending=False)

plt.figure(figsize=(15, 8))
sns.barplot(data=cuisine_ratings.head(15), x='cuisines', y='mean')
plt.xticks(rotation=45)
plt.title('Average Rating by Cuisine (Top 15, min 50 restaurants)')
plt.xlabel('Cuisine')
plt.ylabel('Average Rating')
plt.show()

## 3. Rating Distribution and Correlation Analysis

In [None]:
# Histogram of restaurant ratings (30 bins).
hist_fig, hist_ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='rate', bins=30, ax=hist_ax)
hist_ax.set_title('Distribution of Restaurant Ratings')
hist_ax.set_xlabel('Rating')
hist_ax.set_ylabel('Count')
plt.show()

# Scatter of vote count against rating, one point per restaurant.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='votes', y='rate', ax=scatter_ax)
scatter_ax.set_title('Correlation between Votes and Rating')
scatter_ax.set_xlabel('Number of Votes')
scatter_ax.set_ylabel('Rating')
plt.show()

## 4. Online Order and Table Booking Impact Analysis

In [None]:
# One rating box plot per service flag: first online ordering, then table
# booking. Each figure splits restaurants by the flag's values.
service_flags = (
    ('online_order', 'Rating Distribution by Online Order Availability'),
    ('book_table', 'Rating Distribution by Table Booking Availability'),
)

for flag_column, chart_title in service_flags:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x=flag_column, y='rate', ax=ax)
    ax.set_title(chart_title)
    plt.show()

In [None]:
# Statistical test for online ordering impact: independent two-sample t-test
# comparing ratings of restaurants that offer online ordering with those that
# do not.
# Missing ratings are dropped first because ttest_ind propagates NaN by
# default: a single NaN in either group would make both outputs NaN.
online_yes = df.loc[df['online_order'] == 'Yes', 'rate'].dropna()
online_no = df.loc[df['online_order'] == 'No', 'rate'].dropna()
t_stat, p_val = stats.ttest_ind(online_yes, online_no)
print('T-test results for online ordering impact on ratings:')
print(f't-statistic: {t_stat:.4f}')
# General format keeps very small p-values readable (e.g. 1.2e-40) instead of
# rounding them to 0.0000.
print(f'p-value: {p_val:.4g}')

## 5. Location-based Restaurant Distribution

In [None]:
# Number of restaurants per location, most common first.
location_counts = df['location'].value_counts()

# Horizontal bars for the 15 locations with the most restaurants.
busiest_locations = location_counts.head(15)
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=busiest_locations.values, y=busiest_locations.index, ax=ax)
ax.set_title('Number of Restaurants by Location (Top 15)')
ax.set_xlabel('Number of Restaurants')
plt.show()

In [None]:
# Mean rating and number of ratings per location, restricted to locations
# with more than 50 ratings and ordered from best to worst mean.
location_ratings = (
    df.groupby('location')['rate']
    .agg(['mean', 'count'])
    .reset_index()
    .loc[lambda summary: summary['count'] > 50]
    .sort_values('mean', ascending=False)
)

# Bar chart of the 15 best-rated locations.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=location_ratings.head(15), x='location', y='mean', ax=ax)
ax.tick_params(axis='x', rotation=45)
ax.set_title('Average Rating by Location (Top 15, min 50 restaurants)')
ax.set_xlabel('Location')
ax.set_ylabel('Average Rating')
plt.show()