In [None]:
import ast
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Location of the Zomato CSV export. Set the ZOMATO_CSV_PATH environment
# variable to read a copy stored elsewhere; when it is unset the original
# download location is used, so existing setups keep working unchanged.
file_path = os.environ.get(
    'ZOMATO_CSV_PATH',
    'C:/Users/Mahad/Downloads/zomato_restaurants_in_India.csv',
)
data = pd.read_csv(file_path)

In [None]:
# Print a structural summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Show the first rows as a sanity check that the CSV was parsed as expected.
data.head()

# Data Cleaning and Preparation Steps:

In [None]:
# --- Missing values ---------------------------------------------------------
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection: the in-place form is chained assignment, which warns in
# pandas 2.x and leaves `data` untouched once Copy-on-Write is enabled.
data['address'] = data['address'].fillna('Unknown')
data['timings'] = data['timings'].fillna('Not available')
data['cuisines'] = data['cuisines'].fillna(data['cuisines'].mode()[0])

# --- Votes --------------------------------------------------------------------
# Floor negative vote counts at zero (vectorised equivalent of max(x, 0)).
data['votes'] = data['votes'].clip(lower=0)


# --- String-encoded literals ----------------------------------------------------
def _parse_list_literal(value):
    """Parse a string holding a Python literal; return any other value unchanged.

    Passing non-strings through keeps this cell re-runnable: after the first
    run the columns already hold parsed objects, and ast.literal_eval raises
    on anything that is not a string.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


data['establishment'] = data['establishment'].apply(_parse_list_literal)
data['highlights'] = data['highlights'].apply(_parse_list_literal)

# Number of entries in each restaurant's parsed highlights value.
data['number_of_highlights'] = data['highlights'].apply(len)

# --- Summary statistics ---------------------------------------------------------
desc_stats = data.describe()

# --- Quick look at ratings and price range --------------------------------------
sns.set(style="whitegrid")

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

sns.histplot(data['aggregate_rating'], kde=True, ax=axes[0])
axes[0].set_title('Distribution of Aggregate Ratings')

sns.countplot(x='price_range', data=data, ax=axes[1])
axes[1].set_title('Distribution of Price Range')

plt.show()

# --- Pairwise correlations of the numeric measures --------------------------------
correlation_matrix = data[['aggregate_rating', 'average_cost_for_two', 'votes', 'number_of_highlights']].corr()

# Cell output: the describe() table and the correlation matrix.
desc_stats, correlation_matrix

# Detailed Exploratory Data Analysis (EDA) Code

In [None]:
# Columns summarised by the descriptive statistics below.
summary_columns = ['aggregate_rating', 'average_cost_for_two', 'number_of_highlights']
summary_frame = data[summary_columns]

# Location, spread and distribution-shape statistics for the same three columns.
central_tendency = summary_frame.agg(['mean', 'median'])
dispersion = summary_frame.agg(['std', 'min', 'max'])
shape = summary_frame.agg(['skew', 'kurt'])

# Three stacked panels: rating histogram, price-range counts, top cuisines.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))
rating_ax, price_ax, cuisine_ax = axes

sns.histplot(data['aggregate_rating'], bins=20, kde=True, ax=rating_ax)
rating_ax.set_title('Distribution of Aggregate Ratings')

sns.countplot(x='price_range', data=data, ax=price_ax)
price_ax.set_title('Distribution of Price Range')

# The ten most frequent values of the 'cuisines' column, counted as-is.
top_cuisines = data['cuisines'].value_counts().head(10)
sns.barplot(x=top_cuisines.values, y=top_cuisines.index, ax=cuisine_ax)
cuisine_ax.set_title('Top 10 Most Common Cuisines')

plt.tight_layout()

# Correlation heatmap, drawn on its own figure.
correlation_columns = ['aggregate_rating', 'average_cost_for_two', 'votes', 'number_of_highlights']
correlation_matrix = data[correlation_columns].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')

plt.show()

# Cell output: the three statistics tables.
central_tendency, dispersion, shape

# Regional Analysis: EDA Code

In [None]:
# Per-city summary: restaurant count plus mean cost, rating and highlight count,
# ordered from the city with the most rows to the one with the fewest.
city_group = (
    data.groupby('city')
    .agg({
        'res_id': 'count',
        'average_cost_for_two': 'mean',
        'aggregate_rating': 'mean',
        'number_of_highlights': 'mean',
    })
    .rename(columns={'res_id': 'number_of_restaurants'})
    .sort_values(by='number_of_restaurants', ascending=False)
)

# Top 10 cities by number of restaurants.
top_cities = city_group.head(10)

# One horizontal bar chart per metric for those cities.
fig, ax = plt.subplots(3, 1, figsize=(10, 15))

sns.barplot(x=top_cities['number_of_restaurants'], y=top_cities.index, ax=ax[0])
ax[0].set_title('Number of Restaurants per City')

sns.barplot(x=top_cities['average_cost_for_two'], y=top_cities.index, ax=ax[1])
ax[1].set_title('Average Cost for Two per City')

sns.barplot(x=top_cities['aggregate_rating'], y=top_cities.index, ax=ax[2])
ax[2].set_title('Average Aggregate Rating per City')

plt.tight_layout()


def _most_common(values):
    """Return the first mode of ``values``, or None when there is no mode.

    List entries are converted to tuples first: Series.mode hashes its values,
    and lists are unhashable, so a column of parsed lists would otherwise raise
    TypeError. Values that are not lists are left untouched.
    """
    hashable = values.map(lambda v: tuple(v) if isinstance(v, list) else v)
    modes = hashable.mode()
    return modes.iloc[0] if not modes.empty else None


# Most frequent 'cuisines' value and most frequent 'highlights' value per city.
city_characteristics = data.groupby('city').agg({
    'cuisines': _most_common,
    'highlights': _most_common,
})

city_characteristics.head(10)

# Customer Preference Analysis:


In [None]:
# Count how often each 'cuisines' value occurs in each city.
city_cuisine_counts = data.groupby(['city', 'cuisines']).size().reset_index(name='count')

# Keep, for every city, the row with the highest count (first one on ties).
# Selecting rows through groupby().idxmax() avoids groupby.apply, which relies
# on the grouping column being handed to the applied function (deprecated in
# recent pandas). The frame is indexed by city and keeps the 'city' column, so
# it has the same index and columns as the apply-based result.
top_row_labels = city_cuisine_counts.groupby('city')['count'].idxmax()
most_popular_cuisine_per_city = (
    city_cuisine_counts.loc[top_row_labels].set_index('city', drop=False)
)

# Rating against cost, coloured by price range.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='average_cost_for_two', y='aggregate_rating', hue='price_range', data=data, palette='viridis')
plt.title('Relationship between Restaurant Ratings, Price Range, and Popularity')
plt.xlabel('Average Cost for Two')
plt.ylabel('Aggregate Rating')
plt.legend(title='Price Range')
plt.show()

# Rating against vote count.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='votes', y='aggregate_rating', data=data, color='blue')
plt.title('Relationship between Restaurant Ratings and Popularity (Votes)')
plt.xlabel('Votes')
plt.ylabel('Aggregate Rating')
plt.show()

# Pairwise correlations between rating, cost and votes.
correlation_matrix = data[['aggregate_rating', 'average_cost_for_two', 'votes']].corr()
print(correlation_matrix)

# Display the most popular cuisine per city
print(most_popular_cuisine_per_city.head())

# Competitive Analysis:


In [None]:
# Mean rating and total votes for every (city, cuisines, price_range) group.
competitive_groups = data.groupby(['city', 'cuisines', 'price_range']).agg({
    'aggregate_rating': 'mean',
    'votes': 'sum'  # Sum of votes to gauge popularity
}).reset_index()

# Top group per (city, price_range): highest mean rating, ties broken by votes.
# Sorting and taking the first row of each group replaces
# groupby.apply(nlargest) + reset_index(drop=True); that form only keeps the
# 'city' and 'price_range' columns while apply still passes grouping columns to
# the function (deprecated in recent pandas), and the merge below needs 'city'.
top_competitors_per_city = (
    competitive_groups
    .sort_values(['aggregate_rating', 'votes'], ascending=False)
    .groupby(['city', 'price_range'])
    .head(1)
    .sort_values(['city', 'price_range'])
    .reset_index(drop=True)
)

# City-wide averages used as the comparison baseline.
average_metrics_city = data.groupby('city').agg({
    'aggregate_rating': 'mean',
    'average_cost_for_two': 'mean',
    'votes': 'mean'
}).reset_index()

# Attach the city baseline to each top group; columns present on both sides get
# the '_comp' (top group) and '_avg' (city baseline) suffixes.
competitive_analysis = pd.merge(top_competitors_per_city, average_metrics_city, on='city', suffixes=('_comp', '_avg'))

# Differences between the top group and its city baseline.
competitive_analysis['rating_diff'] = competitive_analysis['aggregate_rating_comp'] - competitive_analysis['aggregate_rating_avg']
# NOTE(review): votes_comp is a sum over the group's restaurants while votes_avg
# is a per-restaurant mean for the city, so votes_diff compares a total with an
# average. Confirm this is the intended measure.
competitive_analysis['votes_diff'] = competitive_analysis['votes_comp'] - competitive_analysis['votes_avg']

# Illustrate the result for one city; change sample_city to inspect another.
sample_city = 'Bangalore'
sample_city_data = competitive_analysis[competitive_analysis['city'] == sample_city]
plt.figure(figsize=(12, 6))
sns.barplot(x='cuisines', y='rating_diff', hue='price_range', data=sample_city_data)
plt.title(f'Competitive Strengths and Weaknesses in {sample_city}')
plt.xlabel('Cuisine')
plt.ylabel('Rating Difference From City Average')
plt.legend(title='Price Range')
plt.xticks(rotation=45)
plt.show()

# Market Gap Analysis:

In [None]:
def _share_within_city(frame, column, total_label):
    """Count rows per (city, column) and express each count as a share of its city.

    Returns a 3-tuple: the per-(city, column) counts, the per-city totals
    (stored under ``total_label``), and the merged frame with an added
    'market_share' column equal to count / city total.
    """
    counts = frame.groupby(['city', column]).size().reset_index(name='count')
    totals = counts.groupby('city')['count'].sum().reset_index(name=total_label)
    shares = pd.merge(counts, totals, on='city')
    shares['market_share'] = shares['count'] / shares[total_label]
    return counts, totals, shares


def _plot_share_by_city(frame, x_column, title, x_label):
    """Draw a bar chart of 'market_share' against ``x_column``, one hue per city."""
    plt.figure(figsize=(12, 6))
    sns.barplot(x=x_column, y='market_share', hue='city', data=frame)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Market Share')
    plt.xticks(rotation=45)
    plt.legend(title='City')
    plt.tight_layout()
    plt.show()


# Share of each 'cuisines' value within its city.
cuisine_distribution, city_cuisine_totals, cuisine_market_share = _share_within_city(
    data, 'cuisines', 'total_cuisines'
)

# (city, cuisines) rows holding less than 1% of the city's rows.
potential_gaps = cuisine_market_share[cuisine_market_share['market_share'] < 0.01]

# Share of each price range within its city.
price_range_distribution, city_price_totals, price_market_share = _share_within_city(
    data, 'price_range', 'total_price_counts'
)

# (city, price_range) rows holding less than 5% of the city's rows.
underrepresented_prices = price_market_share[price_market_share['market_share'] < 0.05]

# Visualise both sets of candidate gaps.
_plot_share_by_city(potential_gaps, 'cuisines', 'Underrepresented Cuisines by City', 'Cuisine')
_plot_share_by_city(underrepresented_prices, 'price_range', 'Underrepresented Price Ranges by City', 'Price Range')