Task 1 - Data Immersion & Wrangling

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("swiggy.csv") 
df.head()

Unnamed: 0,ID,Area,City,Restaurant,Price,Avg ratings,Total ratings,Food type,Address,Delivery time
0,211,Koramangala,Bangalore,Tandoor Hut,300.0,4.4,100,"Biryani,Chinese,North Indian,South Indian",5Th Block,59
1,221,Koramangala,Bangalore,Tunday Kababi,300.0,4.1,100,"Mughlai,Lucknowi",5Th Block,56
2,246,Jogupalya,Bangalore,Kim Lee,650.0,4.4,100,Chinese,Double Road,50
3,248,Indiranagar,Bangalore,New Punjabi Hotel,250.0,3.9,500,"North Indian,Punjabi,Tandoor,Chinese",80 Feet Road,57
4,249,Indiranagar,Bangalore,Nh8,350.0,4.0,50,"Rajasthani,Gujarati,North Indian,Snacks,Desser...",80 Feet Road,63


Basic Dataset Overview

In [3]:
df.shape
df.columns
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8680 entries, 0 to 8679
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             8680 non-null   int64  
 1   Area           8680 non-null   object 
 2   City           8680 non-null   object 
 3   Restaurant     8680 non-null   object 
 4   Price          8680 non-null   float64
 5   Avg ratings    8680 non-null   float64
 6   Total ratings  8680 non-null   int64  
 7   Food type      8680 non-null   object 
 8   Address        8680 non-null   object 
 9   Delivery time  8680 non-null   int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 678.3+ KB


Unnamed: 0,ID,Price,Avg ratings,Total ratings,Delivery time
count,8680.0,8680.0,8680.0,8680.0,8680.0
mean,244812.071429,348.44447,3.655104,156.634793,53.967051
std,158671.617188,230.940074,0.647629,391.448014,14.292335
min,211.0,0.0,2.0,20.0,20.0
25%,72664.0,200.0,2.9,50.0,44.0
50%,283442.0,300.0,3.9,80.0,53.0
75%,393425.25,400.0,4.2,100.0,64.0
max,466928.0,2500.0,5.0,10000.0,109.0


In [5]:
# Check Missing Values
df.duplicated().sum()

# Remove duplicates if needed
df = df.drop_duplicates()

In [8]:
# Check Unique Categories
df["City"].unique()
df["Area"].nunique()
df["Food type"].head()

0            Biryani,Chinese,North Indian,South Indian
1                                     Mughlai,Lucknowi
2                                              Chinese
3                 North Indian,Punjabi,Tandoor,Chinese
4    Rajasthani,Gujarati,North Indian,Snacks,Desser...
Name: Food type, dtype: object

In [7]:
# Detect Outliers
numeric_cols = df.select_dtypes(include=np.number).columns

for col in numeric_cols:
    print(col, ":", df[col].describe(), "\n")

ID : count      8680.000000
mean     244812.071429
std      158671.617188
min         211.000000
25%       72664.000000
50%      283442.000000
75%      393425.250000
max      466928.000000
Name: ID, dtype: float64 

Price : count    8680.000000
mean      348.444470
std       230.940074
min         0.000000
25%       200.000000
50%       300.000000
75%       400.000000
max      2500.000000
Name: Price, dtype: float64 

Avg ratings : count    8680.000000
mean        3.655104
std         0.647629
min         2.000000
25%         2.900000
50%         3.900000
75%         4.200000
max         5.000000
Name: Avg ratings, dtype: float64 

Total ratings : count     8680.000000
mean       156.634793
std        391.448014
min         20.000000
25%         50.000000
50%         80.000000
75%        100.000000
max      10000.000000
Name: Total ratings, dtype: float64 

Delivery time : count    8680.000000
mean       53.967051
std        14.292335
min        20.000000
25%        44.000000
50%      

Data Cleaning & Standardization

In [9]:
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

In [10]:
df = df.apply(lambda x: x.str.strip() if x.dtype=="object" else x)

In [11]:
df["city"] = df["city"].str.title()
df["restaurant"] = df["restaurant"].str.title()

Create Price Category

In [12]:
df["price_category"] = pd.cut(
    df["price"],
    bins=[0,200,400,700,2000],
    labels=["Low","Medium","High","Premium"]
)

Extract Main Food Type

In [13]:
df["primary_food"] = df["food_type"].str.split(",").str[0]

 Final Quality Check

In [14]:
df.info()
df.describe()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8680 entries, 0 to 8679
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   id              8680 non-null   int64   
 1   area            8680 non-null   object  
 2   city            8680 non-null   object  
 3   restaurant      8680 non-null   object  
 4   price           8680 non-null   float64 
 5   avg_ratings     8680 non-null   float64 
 6   total_ratings   8680 non-null   int64   
 7   food_type       8680 non-null   object  
 8   address         8680 non-null   object  
 9   delivery_time   8680 non-null   int64   
 10  price_category  8672 non-null   category
 11  primary_food    8680 non-null   object  
dtypes: category(1), float64(2), int64(3), object(6)
memory usage: 754.7+ KB


Unnamed: 0,id,area,city,restaurant,price,avg_ratings,total_ratings,food_type,address,delivery_time,price_category,primary_food
0,211,Koramangala,Bangalore,Tandoor Hut,300.0,4.4,100,"Biryani,Chinese,North Indian,South Indian",5Th Block,59,Medium,Biryani
1,221,Koramangala,Bangalore,Tunday Kababi,300.0,4.1,100,"Mughlai,Lucknowi",5Th Block,56,Medium,Mughlai
2,246,Jogupalya,Bangalore,Kim Lee,650.0,4.4,100,Chinese,Double Road,50,High,Chinese
3,248,Indiranagar,Bangalore,New Punjabi Hotel,250.0,3.9,500,"North Indian,Punjabi,Tandoor,Chinese",80 Feet Road,57,Medium,North Indian
4,249,Indiranagar,Bangalore,Nh8,350.0,4.0,50,"Rajasthani,Gujarati,North Indian,Snacks,Desser...",80 Feet Road,63,Medium,Rajasthani


Save Cleaned Dataset

In [15]:
df.to_csv("cleaned_restaurant_data.csv", index=False)