# Loading libraries

In [1]:
import pandas as pd

# Load data

In [2]:
# Load the sample restaurant menus dataset (update the file path as needed)
file_path = "../data/raw/restaurant-menus.csv"
df = pd.read_csv(file_path)


display(df.head())  # Show the first few rows
display(df.info())  # Check for missing values and data types


Unnamed: 0,restaurant_id,category,name,description,price
0,1,Extra Large Pizza,Extra Large Meat Lovers,Whole pie.,15.99 USD
1,1,Extra Large Pizza,Extra Large Supreme,Whole pie.,15.99 USD
2,1,Extra Large Pizza,Extra Large Pepperoni,Whole pie.,14.99 USD
3,1,Extra Large Pizza,Extra Large BBQ Chicken &amp; Bacon,Whole Pie,15.99 USD
4,1,Extra Large Pizza,Extra Large 5 Cheese,Whole pie.,14.99 USD


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5117217 entries, 0 to 5117216
Data columns (total 5 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   restaurant_id  int64 
 1   category       object
 2   name           object
 3   description    object
 4   price          object
dtypes: int64(1), object(4)
memory usage: 195.2+ MB


None

# Data cleaning

In [3]:
# Data Cleaning
df = df.drop_duplicates()  # Remove duplicate rows
df = df.dropna(subset=["restaurant_id", "name", "price", "category"])  # Drop rows with missing values

# Ensure price is a string before replacing "USD"
if df["price"].dtype == "object":
    df["price"] = df["price"].str.replace("USD", "", regex=False).str.strip()
    
# Convert price column to numeric
df["price"] = pd.to_numeric(df["price"], errors="coerce")

# Drop rows where price couldn't be converted
df = df.dropna(subset=["price"])

# ✅ Define a whitelist of **food-related** categories
food_categories = [
    "Appetizers", "Beverages", "Breakfast", "Burgers", "Desserts", "Drinks", "Fast Food",
    "Italian", "Japanese", "Mexican", "Pizza", "Salads", "Sandwiches", "Seafood", "Sides",
    "Soups", "Steaks", "Sushi", "Vegan", "Vegetarian"
]

# Keep only food-related categories
df = df[df["category"].isin(food_categories)]

# Exploratory Data Analysis (EDA)
print("Summary Statistics:")
display(df.describe())

print("\nTop 10 Menu Categories:")
print(df["category"].value_counts().head(10))

print("\nTop 10 Most Expensive Menu Items:")
display(df.sort_values(by="price", ascending=False).head(10))


Summary Statistics:


Unnamed: 0,restaurant_id,price
count,966780.0,966780.0
mean,30974.553889,6.86488
std,18578.475867,5.657743
min,2.0,-2.5
25%,15592.0,2.98
50%,29469.0,5.5
75%,46946.0,9.69
max,63469.0,1049.0



Top 10 Menu Categories:
category
Beverages     156993
Sides         149116
Sandwiches    146753
Drinks        138209
Appetizers     98707
Salads         79665
Desserts       52330
Breakfast      50160
Burgers        30216
Seafood        21631
Name: count, dtype: int64

Top 10 Most Expensive Menu Items:


Unnamed: 0,restaurant_id,category,name,description,price
3451775,41586,Seafood,Mojarra Frita,"Whole fried tilapia fish served with fries, ri...",1049.0
3076629,36135,Sides,Greek Bread,"Basil pesto, tomatoes, olive oil, feta, and mo...",599.0
4188725,51495,Breakfast,Sunfolk Whole Peeled Straw Mushroom,,488.75
3262613,38625,Desserts,Pudding,,450.0
233128,3031,Appetizers,Wings Smoked (100),Smoked Wings (100),224.94
3026626,35368,Breakfast,Breakfast Taco Family Packages,100 Taco Packages \nChoose up to 5 different t...,205.0
4289193,52957,Seafood,Full Order King Crab Legs,drawn butter | shell split,204.0
4861817,60357,Breakfast,Tacos,,200.0
1226132,15252,Drinks,Coke,,200.0
3971269,48382,Beverages,3 Day Cleanse,Start your day off right. Green veggies detoxi...,195.0


In [4]:
# Exploratory Data Analysis (EDA)
print("Summary Statistics:")
display(df.describe())

print("\nTop 10 Menu Categories:")
print(df["category"].value_counts().head(10))

print("\nTop 10 Most Expensive Menu Items:")
display(df.sort_values(by="price", ascending=False).head(10))

Summary Statistics:


Unnamed: 0,restaurant_id,price
count,966780.0,966780.0
mean,30974.553889,6.86488
std,18578.475867,5.657743
min,2.0,-2.5
25%,15592.0,2.98
50%,29469.0,5.5
75%,46946.0,9.69
max,63469.0,1049.0



Top 10 Menu Categories:
category
Beverages     156993
Sides         149116
Sandwiches    146753
Drinks        138209
Appetizers     98707
Salads         79665
Desserts       52330
Breakfast      50160
Burgers        30216
Seafood        21631
Name: count, dtype: int64

Top 10 Most Expensive Menu Items:


Unnamed: 0,restaurant_id,category,name,description,price
3451775,41586,Seafood,Mojarra Frita,"Whole fried tilapia fish served with fries, ri...",1049.0
3076629,36135,Sides,Greek Bread,"Basil pesto, tomatoes, olive oil, feta, and mo...",599.0
4188725,51495,Breakfast,Sunfolk Whole Peeled Straw Mushroom,,488.75
3262613,38625,Desserts,Pudding,,450.0
233128,3031,Appetizers,Wings Smoked (100),Smoked Wings (100),224.94
3026626,35368,Breakfast,Breakfast Taco Family Packages,100 Taco Packages \nChoose up to 5 different t...,205.0
4289193,52957,Seafood,Full Order King Crab Legs,drawn butter | shell split,204.0
4861817,60357,Breakfast,Tacos,,200.0
1226132,15252,Drinks,Coke,,200.0
3971269,48382,Beverages,3 Day Cleanse,Start your day off right. Green veggies detoxi...,195.0


In [5]:
# ✅ Step 1: Identify the **Top 20 Most Expensive Menu Items**
top_expensive_items = df.sort_values(by="price", ascending=False).head(20)

# ✅ Step 2: Check for **Unusual Price Formats** (e.g., commas, misplaced decimals)
df["price_str"] = df["price"].astype(str)  # Convert to string for checking unusual patterns
unusual_prices = df[df["price_str"].str.contains(",", regex=False)]  # Check for commas in prices

# ✅ Display results
print("\nTop 20 Most Expensive Menu Items:")
display(top_expensive_items)

print("\nItems with Unusual Price Formats:")
display(unusual_prices)


Top 20 Most Expensive Menu Items:


Unnamed: 0,restaurant_id,category,name,description,price
3451775,41586,Seafood,Mojarra Frita,"Whole fried tilapia fish served with fries, ri...",1049.0
3076629,36135,Sides,Greek Bread,"Basil pesto, tomatoes, olive oil, feta, and mo...",599.0
4188725,51495,Breakfast,Sunfolk Whole Peeled Straw Mushroom,,488.75
3262613,38625,Desserts,Pudding,,450.0
233128,3031,Appetizers,Wings Smoked (100),Smoked Wings (100),224.94
3026626,35368,Breakfast,Breakfast Taco Family Packages,100 Taco Packages \nChoose up to 5 different t...,205.0
4289193,52957,Seafood,Full Order King Crab Legs,drawn butter | shell split,204.0
4861817,60357,Breakfast,Tacos,,200.0
1226132,15252,Drinks,Coke,,200.0
3971269,48382,Beverages,3 Day Cleanse,Start your day off right. Green veggies detoxi...,195.0



Items with Unusual Price Formats:


Unnamed: 0,restaurant_id,category,name,description,price,price_str


In [6]:
# Define the file path to save the cleaned dataset
cleaned_file_path = "../data/raw/cleaned_restaurant_menus.csv"
# Save the cleaned dataset
df.to_csv(cleaned_file_path, index=False)

print(f"Cleaned dataset saved successfully as: {cleaned_file_path}")

Cleaned dataset saved successfully as: ../data/raw/cleaned_restaurant_menus.csv
