In [17]:
import pandas as pd

# Step 1: Read the raw CSV, decoding it as latin1
df = pd.read_csv('zomato.csv', encoding='latin1')

# Step 2: Canonicalise headers: trim whitespace, lowercase, spaces -> underscores
trimmed_headers = df.columns.str.strip()
df.columns = trimmed_headers.str.lower().str.replace(' ', '_')

# Step 3: Remove rows that are exact duplicates of an earlier row
df = df.drop_duplicates()

# Step 4: Remove columns treated as irrelevant (only the ones actually present,
# so a missing column never raises)
columns_to_drop = [
    'restaurant_id', 'country_code', 'address', 'locality_verbose',
    'longitude', 'latitude', 'has_table_booking', 'has_online_delivery',
    'is_delivering_now', 'switch_to_order_menu', 'rating_color', 'rating_text'
]
df = df.drop(columns=[name for name in columns_to_drop if name in df.columns])

# Step 5: Give the two key numeric columns shorter names
short_names = {
    'average_cost_for_two': 'cost',
    'aggregate_rating': 'rating'
}
df = df.rename(columns=short_names)

# Step 6: Force cost and rating to numeric; unparseable entries become NaN
for numeric_col in ('cost', 'rating'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Step 7: Replace missing numeric values with the mean of their own column
numeric_means = df.mean(numeric_only=True)
df = df.fillna(numeric_means)

# Step 8: Rows without a cuisine list or a city cannot be filtered on, so drop them
df = df.dropna(subset=['cuisines', 'city'])

# Step 9: Normalise text columns (cuisines/city lowercased and trimmed,
# restaurant names only trimmed so their original casing is kept)
for text_col in ('cuisines', 'city'):
    df[text_col] = df[text_col].str.lower().str.strip()
df['restaurant_name'] = df['restaurant_name'].str.strip()

# Step 10: Feature engineering

# Cost bucket
def cost_bucket(cost):
    """Map a numeric cost to a coarse price tier.

    Returns 'Low' for costs below 300, 'Medium' for costs from 300 up to
    (but not including) 700, and 'High' for everything else. A value that
    compares False against both limits (such as NaN) also lands in 'High'.
    """
    # Exclusive upper limits, checked in ascending order.
    tier_limits = ((300, 'Low'), (700, 'Medium'))
    for upper_limit, tier in tier_limits:
        if cost < upper_limit:
            return tier
    return 'High'

# Label every row with its price tier ('Low' / 'Medium' / 'High')
df['cost_bucket'] = df['cost'].apply(cost_bucket)

# Extract primary cuisine: the first entry of the comma-separated cuisine list,
# trimmed so it can be compared by exact match later on
df['primary_cuisine'] = df['cuisines'].str.split(',').str[0].str.strip()

# Round rating to one decimal place
df['rating'] = df['rating'].round(1)

# Save cleaned data (index=False keeps the row index out of the file)
df.to_csv('zomato_cleaned.csv', index=False)

# Check result.
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
df.info()
print(df.head())
 

<class 'pandas.core.frame.DataFrame'>
Index: 9542 entries, 0 to 9550
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   restaurant_name  9542 non-null   object 
 1   city             9542 non-null   object 
 2   locality         9542 non-null   object 
 3   cuisines         9542 non-null   object 
 4   cost             9542 non-null   int64  
 5   currency         9542 non-null   object 
 6   price_range      9542 non-null   int64  
 7   rating           9542 non-null   float64
 8   votes            9542 non-null   int64  
 9   cost_bucket      9542 non-null   object 
 10  primary_cuisine  9542 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 894.6+ KB
None
          restaurant_name              city  \
0        Le Petit Souffle       makati city   
1        Izakaya Kikufuji       makati city   
2  Heat - Edsa Shangri-La  mandaluyong city   
3                    Ooma  mandaluyong city

In [10]:
def filter_and_rank(df, selected_cuisines, budget_range, selected_location, top_n=10):
    """
    Filters and ranks restaurants based on user preferences.

    Parameters:
    - df (DataFrame): Cleaned restaurant dataset. Must provide the columns
      'restaurant_name', 'primary_cuisine', 'cost', 'rating', 'votes', 'city'
      and 'cost_bucket'.
    - selected_cuisines (list of str): User's preferred cuisines. Each entry is
      trimmed and lower-cased, then compared by exact match to 'primary_cuisine'.
    - budget_range (str): One of ['Low', 'Medium', 'High']; compared by exact
      match to 'cost_bucket'.
    - selected_location (str): Preferred city or location. Trimmed, lower-cased
      and matched as a literal substring of 'city' (never as a regex).
    - top_n (int): Number of top results to return.

    Returns:
    - DataFrame with the top N recommendations, best score first, holding the
      columns 'restaurant_name', 'primary_cuisine', 'cost', 'rating', 'votes',
      'city' and 'explanation'. When nothing matches (or top_n selects no rows)
      an empty DataFrame with exactly those columns is returned, so callers see
      the same schema in both cases.
    """
    output_columns = [
        'restaurant_name', 'primary_cuisine', 'cost', 'rating', 'votes', 'city', 'explanation'
    ]

    # Normalise user input the same way the dataset's text columns are normalised.
    wanted_cuisines = [cuisine.strip().lower() for cuisine in selected_cuisines]
    wanted_location = selected_location.strip().lower()

    # --- Filtering ---
    # regex=False: the location is user text, so characters such as '(' or '+'
    # must be matched literally instead of being parsed as a pattern.
    matches = (
        (df['cost_bucket'] == budget_range) &
        df['city'].str.contains(wanted_location, na=False, regex=False) &
        df['primary_cuisine'].isin(wanted_cuisines)
    )
    # Copy so the helper columns added below never touch the caller's frame.
    filtered_df = df[matches].copy()

    if filtered_df.empty:
        return pd.DataFrame(columns=output_columns)

    # --- Ranking ---
    # Bring 'rating' and 'votes' to a common 0-1 scale. Ratings are divided by
    # 5.0; votes are divided by the largest vote count among the matches
    # (falling back to 1 to avoid dividing by zero).
    max_votes = filtered_df['votes'].max()
    filtered_df['norm_rating'] = filtered_df['rating'] / 5.0
    filtered_df['norm_votes'] = filtered_df['votes'] / (max_votes if max_votes != 0 else 1)

    # Weighted score (60% rating, 40% votes)
    filtered_df['score'] = (0.6 * filtered_df['norm_rating']) + (0.4 * filtered_df['norm_votes'])

    # Sort by score, best first, and keep the requested number of rows
    ranked_df = filtered_df.sort_values(by='score', ascending=False).head(top_n).copy()
    if ranked_df.empty:
        # e.g. top_n == 0: nothing to explain, return the empty schema
        return pd.DataFrame(columns=output_columns)

    # --- Explainability ---
    ranked_df['explanation'] = ranked_df.apply(
        lambda row: f"Matched on {row['primary_cuisine'].title()} cuisine, {budget_range} budget, and rating {row['rating']}",
        axis=1
    )

    # Select relevant columns
    return ranked_df[output_columns].reset_index(drop=True)
 

In [9]:
# Sample preferences for one recommendation request.
# filter_and_rank lower-cases the cuisines and compares them by exact match to
# 'primary_cuisine'; the location is lower-cased and matched as a substring of 'city'.
user_cuisines = ['Indian', 'Chinese']
user_budget = 'Medium'
user_location = 'Delhi'

recommendations = filter_and_rank(
    df,
    selected_cuisines=user_cuisines,
    budget_range=user_budget,
    selected_location=user_location,
)

print(recommendations)
 

              restaurant_name primary_cuisine  cost  rating  votes       city  \
0                Munch Nation         chinese   350     3.8    727  new delhi   
1                   Kennedy's         chinese   400     4.1    491  new delhi   
2                   Wow! Momo         chinese   350     3.4    491  new delhi   
3  Jughead's Fast Food Corner         chinese   500     3.6    438  new delhi   
4                Scorpio Cafe         chinese   400     3.7    410  new delhi   
5                    Nikashee         chinese   600     3.7    408  new delhi   
6                 Happy Hakka         chinese   650     3.8    312  new delhi   
7                     Hawkers         chinese   600     3.4    398  new delhi   
8                   Casa Asia         chinese   650     3.8    306  new delhi   
9                 Happy Hakka         chinese   650     3.7    270  new delhi   

                                         explanation  
0  Matched on Chinese cuisine, Medium budget, and... 

In [25]:
import pandas as pd
import numpy as np

class RestaurantRecommender:
    """Filter, score, rank and explain restaurant recommendations.

    The recommender wraps a preprocessed DataFrame and offers a pipeline of
    small steps (filter -> score -> rank -> explain) plus `filter_and_rank`,
    which runs all of them. None of the methods modify the wrapped DataFrame
    or the frames passed to them; every step returns a new frame.
    """

    def __init__(self, df):
        """
        Initialize the recommender with a preprocessed dataframe
        Args:
            df: Pandas DataFrame with cleaned restaurant data from your Step 1.
                Must contain 'restaurant_name', 'cuisines', 'cost', 'rating',
                'votes', 'city', 'cost_bucket' and 'primary_cuisine'.
                'locality' is optional; a warning is printed when it is absent.
        Raises:
            ValueError: if any required column is missing.
        """
        self.df = df
        required_cols = ['restaurant_name', 'cuisines', 'cost', 'rating',
                         'votes', 'city', 'cost_bucket', 'primary_cuisine']

        # 'locality' is often used for location, check if it's in your df
        if 'locality' not in df.columns:
            print("Warning: 'locality' column not found in the input DataFrame. Location filtering will only use 'city'.")

        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Input DataFrame is missing required columns for the recommender: {missing_cols}. "
                             "Please ensure your Step 1 preprocessing produces these columns.")

    def filter_restaurants(self, cuisines=None, location=None, budget=None):
        """Return the rows of the wrapped frame that satisfy every given filter.

        Args:
            cuisines: A cuisine name or a list of names. A row matches when any
                name occurs as a substring of its 'cuisines' text
                (case-insensitive). Blank names are ignored.
            location: Text matched as a literal, case-insensitive substring of
                'city' or, when the column exists, 'locality'.
            budget: Either a cost-bucket label (compared case-insensitively to
                'cost_bucket') or a (min_cost, max_cost) pair applied
                inclusively to 'cost'. Any other value is ignored.

        Returns:
            A new DataFrame (a copy, safe to modify). Filters that are None or
            empty are skipped, so calling with no arguments returns all rows.
        """
        filtered_df = self.df

        # Filter by cuisine
        if isinstance(cuisines, str):
            # A bare string would otherwise be iterated character by character.
            cuisines = [cuisines]
        cuisines_lower = [cuisine.lower().strip() for cuisine in (cuisines or [])]
        # An empty term is a substring of every text and would match every row.
        cuisines_lower = [cuisine for cuisine in cuisines_lower if cuisine]
        if cuisines_lower:
            # Lower-case the data side too, so matching does not depend on how
            # the 'cuisines' column happens to be cased.
            cuisine_text = filtered_df['cuisines'].astype(str).str.lower()
            cuisine_mask = cuisine_text.apply(
                lambda text: any(cuisine in text for cuisine in cuisines_lower)
            ).astype(bool)
            filtered_df = filtered_df[cuisine_mask]

        # Filter by location
        if location:
            location_lower = location.lower().strip()
            # regex=False: the location is user text; characters such as '('
            # must be matched literally rather than parsed as a pattern.
            location_mask = filtered_df['city'].astype(str).str.contains(
                location_lower, na=False, case=False, regex=False
            ).astype(bool)
            if 'locality' in filtered_df.columns:
                locality_mask = filtered_df['locality'].astype(str).str.contains(
                    location_lower, na=False, case=False, regex=False
                ).astype(bool)
                location_mask = location_mask | locality_mask
            filtered_df = filtered_df[location_mask]

        # Filter by budget
        if budget:
            if isinstance(budget, str):
                budget_lower = budget.lower().strip()
                bucket_text = filtered_df['cost_bucket'].astype(str).str.lower()
                filtered_df = filtered_df[bucket_text == budget_lower]
            elif isinstance(budget, (list, tuple)) and len(budget) == 2:
                min_cost, max_cost = budget
                # Coerce once; non-numeric costs become NaN and never match.
                cost_numeric = pd.to_numeric(filtered_df['cost'], errors='coerce')
                filtered_df = filtered_df[cost_numeric.between(min_cost, max_cost)]

        # Always hand back an independent frame, never a view of self.df.
        return filtered_df.copy()

    def calculate_scores(self, filtered_df, rating_weight=0.7, votes_weight=0.3):
        """Return a copy of `filtered_df` with a 'score' column added.

        The score is `rating_weight * rating/5 + votes_weight * v`, where the
        rating term is clipped to [0, 1] and `v` is log1p(votes) divided by the
        largest log1p(votes) in the frame (0 when no row has votes or the
        'votes' column is missing). Non-numeric or missing ratings and votes
        count as 0.

        The input frame is left untouched: it is frequently a slice of a larger
        frame, and writing a column into it would both leak into the caller's
        data and trigger pandas' SettingWithCopyWarning.
        """
        scored_df = filtered_df.copy()
        if scored_df.empty:
            scored_df['score'] = pd.Series(dtype='float64')  # Ensure 'score' column exists
            return scored_df

        # Ensure 'rating' is numeric and handle potential NaNs
        ratings_numeric = pd.to_numeric(scored_df['rating'], errors='coerce').fillna(0)
        max_rating = 5.0  # Assuming a 0-5 scale for Zomato ratings
        normalized_ratings = (ratings_numeric / max_rating).clip(0, 1)

        # Ensure 'votes' is numeric and handle potential NaNs
        if 'votes' in scored_df.columns:
            votes_numeric = pd.to_numeric(scored_df['votes'], errors='coerce').fillna(0)
            # log1p dampens the influence of restaurants with huge vote counts.
            log_votes = np.log1p(votes_numeric)
            max_log_votes = log_votes.max()
            if max_log_votes > 0:
                normalized_votes = log_votes / max_log_votes
            else:
                normalized_votes = pd.Series(0.0, index=scored_df.index)
        else:
            print("Warning: 'votes' column not found for scoring. Votes weight will effectively be 0.")
            normalized_votes = pd.Series(0.0, index=scored_df.index)

        scored_df['score'] = (
            normalized_ratings * rating_weight +
            normalized_votes * votes_weight
        )
        return scored_df

    def rank_restaurants(self, filtered_df, top_n=10):
        """Return the `top_n` rows with the highest 'score', best first.

        A frame that is empty or has no 'score' column is returned unchanged.
        """
        if filtered_df.empty or 'score' not in filtered_df.columns:
            return filtered_df

        sorted_df = filtered_df.sort_values(by='score', ascending=False)
        # Copy so callers can add columns to the result without warnings.
        return sorted_df.head(top_n).copy()

    def generate_explanation(self, restaurant_row):
        """Build a one-sentence explanation for a single recommended restaurant.

        Args:
            restaurant_row: A row (e.g. a pandas Series) offering `.get()` for
                'primary_cuisine', 'cost_bucket', 'cost', 'rating' and 'votes'.

        Returns:
            The explanation text. If the row's values cannot be formatted, the
            problem is printed and a generic fallback sentence is returned.
        """
        try:
            # Use .get() for safer access, provide defaults if key might be missing
            cuisine_val = restaurant_row.get('primary_cuisine', 'N/A')
            cost_bucket_val = restaurant_row.get('cost_bucket', 'N/A')
            cost_val = restaurant_row.get('cost', 0)
            rating_val = restaurant_row.get('rating', 0)
            votes_val = restaurant_row.get('votes', 0)

            explanation = f"Matched on {str(cuisine_val).title()} cuisine"
            # NOTE(review): the rupee sign is hard-coded; if the data contains
            # costs in other currencies this label is wrong for them -- confirm.
            explanation += f" and {str(cost_bucket_val).lower()} budget (₹{float(cost_val):.0f} for two)"
            explanation += f" with {float(rating_val):.1f}/5 rating"
            if votes_val > 0:
                explanation += f" based on {int(votes_val)} votes."
            else:
                explanation += "."
        except Exception as e:
            # Best effort: one malformed row must not abort the whole ranking.
            explanation = "Could not generate detailed explanation due to an issue with restaurant data."
            print(f"Error generating explanation for a row: {e}. Row data: {restaurant_row.to_dict()}")
        return explanation

    def filter_and_rank(self, cuisines=None, location=None, budget=None, top_n=10,
                        rating_weight=0.7, votes_weight=0.3):
        """Run the full pipeline: filter, score, rank and explain.

        Args:
            cuisines, location, budget: See `filter_restaurants`.
            top_n: Maximum number of recommendations to return.
            rating_weight, votes_weight: Forwarded to `calculate_scores`.

        Returns:
            A DataFrame with the wrapped frame's columns plus 'score' and
            'explanation', best score first. When nothing matches (a message is
            printed) or `top_n` selects no rows, an empty DataFrame with those
            same columns is returned.
        """
        filtered_df = self.filter_restaurants(cuisines, location, budget)

        # Column layout of the result; used to give empty results the same
        # schema as populated ones.
        final_output_columns = list(self.df.columns)
        for extra_col in ('score', 'explanation'):
            if extra_col not in final_output_columns:
                final_output_columns.append(extra_col)

        if filtered_df.empty:
            print("No restaurants matched the filter criteria.")
            return pd.DataFrame(columns=final_output_columns)

        scored_df = self.calculate_scores(filtered_df, rating_weight, votes_weight)
        ranked_df = self.rank_restaurants(scored_df, top_n)

        if ranked_df.empty:
            # e.g. top_n == 0: nothing to explain, return the empty schema
            return pd.DataFrame(columns=final_output_columns)

        explanations = ranked_df.apply(self.generate_explanation, axis=1)
        return ranked_df.assign(explanation=explanations)


# ---------------------------------------------------------------------------
# EXAMPLE USAGE (Assuming 'df' is your preprocessed DataFrame from Step 1)
# ---------------------------------------------------------------------------

# --- YOU NEED TO LOAD/CREATE YOUR 'df' HERE using your Step 1 code ---
# Example:
# df = pd.read_csv('zomato_cleaned.csv') # Load your cleaned data
# Or, if your Step 1 function is `my_step1_preprocess()`:
# df = my_step1_preprocess('zomato.csv')

# For demonstration, let's create a minimal dummy 'df' that your Step 1 might produce
# This dummy DF must have the columns checked in `RestaurantRecommender.__init__`
# and columns used by the filtering/scoring logic.
_data_for_dummy_df = {
    'restaurant_name': ['Pizza Heaven', 'Curry King', 'Burger Barn', 'Sushi World', 'Taco Town'],
    'cuisines': ['italian, pizza', 'north indian, mughlai', 'american, burger', 'japanese, sushi', 'mexican, fast food, spicy'],
    'cost': [700, 1200, 500, 1500, 350],
    'rating': [4.1, 4.5, 3.9, 4.2, 3.8],
    'votes': [550, 1100, 320, 780, 210],
    'city': ['mumbai', 'mumbai', 'delhi', 'mumbai', 'delhi'],
    'locality': ['bandra', 'andheri', 'connaught place', 'juhu', 'saket'],  # Optional column
    'cost_bucket': ['medium', 'high', 'medium', 'high', 'medium'],  # Must match the Step 1 output
    'primary_cuisine': ['italian', 'north indian', 'american', 'japanese', 'mexican']
}
# NOTE: this rebinds the notebook-wide name `df` to the small demo frame.
df = pd.DataFrame(_data_for_dummy_df)
# --- End of dummy df creation ---

# Columns shown for each test case (only those present in the result are printed).
cols_to_print = ['restaurant_name', 'cuisines', 'cost', 'rating', 'score', 'explanation']


def _show_recommendations(recs, empty_message):
    """Print the display columns of `recs`, or `empty_message` when it has no rows."""
    if recs.empty:
        print(empty_message)
        return
    existing_cols_to_print = [col for col in cols_to_print if col in recs.columns]
    print(recs[existing_cols_to_print])


if df.empty:
    print("The input DataFrame 'df' is empty. Cannot proceed with recommender.")
else:
    print("\n--- Input DataFrame for Recommender (Head) ---")
    print(df.head())
    print("\nInput DataFrame info:")
    df.info()

    recommender = RestaurantRecommender(df)

    print("\n--- Test Case 1: Specific Search (Expected Match) ---")
    recommendations1 = recommender.filter_and_rank(
        cuisines=['north indian', 'mughlai'],
        location='mumbai',
        budget='High',  # Matches 'high' in cost_bucket
        top_n=5
    )
    _show_recommendations(recommendations1, "No recommendations found for Test Case 1.")

    print("\n--- Test Case 2: Cuisine with 'spicy' (Substring Match Test) ---")
    recommendations2 = recommender.filter_and_rank(
        cuisines=['spicy'],  # Should match 'mexican, fast food, spicy'
        location='delhi',
        top_n=3
    )
    _show_recommendations(recommendations2, "No recommendations found for Test Case 2.")

    print("\n--- Test Case 3: No Matches Expected ---")
    recommendations3 = recommender.filter_and_rank(
        cuisines=['ethiopian'],
        location='moonbase alpha',
        budget='low',
        top_n=5
    )
    _show_recommendations(recommendations3, "No recommendations found for Test Case 3 (as expected).")
    if recommendations3.empty:
        # Even an empty result should still expose the 'explanation' column.
        if 'explanation' in recommendations3.columns:
            print("Empty recommendations DataFrame correctly includes 'explanation' column.")
        else:
            print("Error: Empty recommendations DataFrame MISSING 'explanation' column.")


--- Input DataFrame for Recommender (Head) ---
  restaurant_name                   cuisines  cost  rating  votes    city  \
0    Pizza Heaven             italian, pizza   700     4.1    550  mumbai   
1      Curry King      north indian, mughlai  1200     4.5   1100  mumbai   
2     Burger Barn           american, burger   500     3.9    320   delhi   
3     Sushi World            japanese, sushi  1500     4.2    780  mumbai   
4       Taco Town  mexican, fast food, spicy   350     3.8    210   delhi   

          locality cost_bucket primary_cuisine  
0           bandra      medium         italian  
1          andheri        high    north indian  
2  connaught place      medium        american  
3             juhu        high        japanese  
4            saket      medium         mexican  

Input DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           ----