In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the Zomato restaurant export into `df` and preview the first rows.
# NOTE: the path below is an absolute location on one specific Windows machine;
# adjust it when running the notebook elsewhere.
zomato_csv_path = r"C:\Users\Admin\OneDrive\Documents\Desktop\ip bonus\archive (1)\zomato.csv"
df = pd.read_csv(
    zomato_csv_path,
    encoding='latin-1',
)
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [3]:
# Clean the loaded frame: de-duplicate, drop incomplete rows, drop zero-rated
# rows, then coerce the rating / cost columns to float and lower-case cuisines.
required_columns = ['Aggregate rating', 'City', 'Cuisines', 'Average Cost for two']

# Remove exact duplicate rows, then rows missing any value the recommender needs.
df = df.drop_duplicates().dropna(subset=required_columns)

# Remove rows with zero rating. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not write into a slice
# of the previous frame (which triggers SettingWithCopyWarning).
df = df[df['Aggregate rating'] != 0].copy()

# Convert ratings to float
df['Aggregate rating'] = df['Aggregate rating'].astype(float)

# Convert cost column to float after removing thousands separators.
# regex=False: the comma is a literal character, independent of the pandas
# version's default for `regex`.
df['Average Cost for two'] = (
    df['Average Cost for two']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Normalize cuisines to lowercase so later matching can be case-insensitive.
df['Cuisines'] = df['Cuisines'].astype(str).str.lower()

In [4]:
def filter_restaurants(df, preferred_cuisine=None, preferred_cost=None, preferred_city=None):
    """
    Filters the restaurant DataFrame based on the user's preferences.

    Every preference is optional; a falsy value (None or "") skips that filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Restaurant table. The columns 'Primary Cuisine', 'Cost Category' and
        'City' are read for whichever of the corresponding filters is used.
    preferred_cuisine : str, optional
        Lower-cased and matched as a literal substring of 'Primary Cuisine'.
    preferred_cost : str, optional
        Lower-cased and compared for equality with 'Cost Category'.
    preferred_city : str, optional
        Compared case-insensitively for equality with 'City'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the matching rows; the input frame is not modified.
    """
    filtered_df = df.copy()

    # Filter by primary cuisine.
    # regex=False: the user's text is matched literally, so characters such as
    # '+', '.', '(' cannot change the match or raise a regex error.
    # na=False: rows with a missing cuisine simply do not match instead of
    # producing an NA mask that cannot be used for boolean indexing.
    if preferred_cuisine:
        cuisine_mask = filtered_df['Primary Cuisine'].str.contains(
            preferred_cuisine.lower(), regex=False, na=False
        )
        filtered_df = filtered_df[cuisine_mask]

    # Filter by cost category
    if preferred_cost:
        filtered_df = filtered_df[filtered_df['Cost Category'] == preferred_cost.lower()]

    # Filter by city (case-insensitive exact match)
    if preferred_city:
        filtered_df = filtered_df[filtered_df['City'].str.lower() == preferred_city.lower()]

    return filtered_df


In [5]:
def _min_max_scale(values):
    """Scale a numeric Series to [0, 1]; a constant Series maps to 0.0 everywhere."""
    lowest = values.min()
    span = values.max() - lowest
    # `not span > 0` is also true for a NaN span, so no division by zero / NaN occurs.
    if not span > 0:
        return pd.Series(0.0, index=values.index)
    return (values - lowest) / span


def rank_restaurants(filtered_df, top_n=10, rating_weight=0.7, votes_weight=0.3):
    """
    Ranks restaurants by a weighted blend of min-max normalized rating and votes.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Candidate restaurants; the 'Rating' and 'Votes' columns are read.
        The frame is not modified.
    top_n : int, default 10
        Number of highest-scoring rows to return.
    rating_weight : float, default 0.7
        Weight applied to the normalized rating.
    votes_weight : float, default 0.3
        Weight applied to the normalized vote count.

    Returns
    -------
    pandas.DataFrame
        Copy of the top rows, sorted by descending 'Score', with the extra
        columns 'Rating Norm', 'Votes Norm' and 'Score'. For empty input an
        empty frame with the input's columns plus those three is returned, so
        callers can still select columns by name.
    """
    score_columns = ('Rating Norm', 'Votes Norm', 'Score')

    if filtered_df.empty:
        return filtered_df.assign(**{name: pd.Series(dtype=float) for name in score_columns})

    # Work on a copy so the caller's frame does not gain the score columns.
    scored_df = filtered_df.copy()

    # Normalize rating and votes for scoring. When every candidate shares the
    # same value (e.g. a single match) the column carries no ranking
    # information and is scored as 0.0 instead of NaN.
    scored_df['Rating Norm'] = _min_max_scale(scored_df['Rating'])
    scored_df['Votes Norm'] = _min_max_scale(scored_df['Votes'])

    # Compute final score (weighted)
    scored_df['Score'] = (
        scored_df['Rating Norm'] * rating_weight
        + scored_df['Votes Norm'] * votes_weight
    )

    # Stable sort so equally scored restaurants keep their input order.
    ranked_df = scored_df.sort_values(by='Score', ascending=False, kind='mergesort').head(top_n)

    return ranked_df


In [8]:
# Categorize cost into buckets
def cost_category(cost, medium_min=300, high_min=700):
    """
    Map a numeric cost to the bucket label 'low', 'medium' or 'high'.

    Parameters
    ----------
    cost : number
        Cost value to classify.
    medium_min : number, default 300
        Costs below this are 'low'.
    high_min : number, default 700
        Costs from `medium_min` up to (but excluding) this are 'medium';
        anything at or above it is 'high'.

    Returns
    -------
    str or None
        The bucket label, or None when `cost` is missing (None / NaN). A missing
        cost would otherwise fail both comparisons and be labelled 'high'.
    """
    if pd.isna(cost):
        return None
    if cost < medium_min:
        return 'low'
    if cost < high_min:
        return 'medium'
    return 'high'

# Label every restaurant with its cost bucket.
cost_for_two = df['Average Cost for two']
df['Cost Category'] = cost_for_two.map(cost_category)

# Primary cuisine = text before the first comma of 'Cuisines', whitespace-trimmed.
cuisine_lists = df['Cuisines'].str.split(',')
df['Primary Cuisine'] = cuisine_lists.str[0].str.strip()

# Use the shorter column name 'Rating' in place of 'Aggregate rating'.
column_renames = {'Aggregate rating': 'Rating'}
df.rename(columns=column_renames, inplace=True)

# Min-max scale the vote counts over the whole frame.
votes = df['Votes']
fewest_votes, most_votes = votes.min(), votes.max()
df['Votes Normalized'] = (votes - fewest_votes) / (most_votes - fewest_votes)


In [9]:
# Sample preferences used to demonstrate the recommender.
cuisine_input, cost_input, city_input = "italian", "medium", "new delhi"

# Narrow the table to matching restaurants, then score and rank them.
filtered = filter_restaurants(
    df,
    preferred_cuisine=cuisine_input,
    preferred_cost=cost_input,
    preferred_city=city_input,
)
top_results = rank_restaurants(filtered)

# Display only the columns relevant to the recommendation.
top_results.loc[:, [
    'Restaurant Name',
    'City',
    'Primary Cuisine',
    'Average Cost for two',
    'Rating',
    'Votes',
    'Score',
]]


Unnamed: 0,Restaurant Name,City,Primary Cuisine,Average Cost for two,Rating,Votes,Score
4945,Play Pizza,New Delhi,italian,600.0,3.8,270,0.766667
3705,Sinyora's,New Delhi,italian,500.0,4.0,51,0.72395
4339,Starvin' Marvin,New Delhi,italian,600.0,3.8,170,0.640616
5170,FreshMenu,New Delhi,italian,600.0,3.7,132,0.47605
5174,Hunger Must Die,New Delhi,italian,500.0,3.7,99,0.434454
4221,The Tangy Tomatoes,New Delhi,italian,400.0,3.7,32,0.35
4446,FreshMenu,New Delhi,italian,600.0,3.6,36,0.238375
6607,Spooky Sky,New Delhi,italian,650.0,3.5,38,0.12423
6554,Flashback Midnight Hunger,New Delhi,italian,600.0,3.4,50,0.022689
4591,Spooky Sky,New Delhi,italian,650.0,3.4,44,0.015126
