In [1]:
import pandas as pd
import numpy as np
import re

def clean_swiggy_data(file_path, output_path="cleaned_swiggy.csv"):
    """
    Clean the Swiggy restaurant dataset:
    1. Remove invalid ratings ('--', 'Too Few Ratings')
    2. Convert rating to float
    3. Clean and convert rating_count to int
    4. Clean and convert cost to float
    5. Fill missing license numbers
    6. Trim and standardize text columns
    7. Remove duplicates
    8. Reset index
    """

    # Load dataset
    df = pd.read_csv(file_path)
    df_clean = df.copy()

    # 1Ô∏è‚É£ Remove rows with invalid ratings
    df_clean = df_clean[~df_clean['rating'].isin(['--', 'Too Few Ratings'])]

    # 2Ô∏è‚É£ Convert rating to numeric (float)
    df_clean['rating'] = pd.to_numeric(df_clean['rating'], errors='coerce')

    # 3Ô∏è‚É£ Clean and convert rating_count
    def clean_rating_count(val):
        if isinstance(val, str):
            val = re.sub(r'[^0-9]', '', val)
            return int(val) if val.isdigit() else np.nan
        return val

    df_clean['rating_count'] = df_clean['rating_count'].apply(clean_rating_count)
    df_clean = df_clean.dropna(subset=['rating_count'])

    # 4Ô∏è‚É£ Clean and convert cost
    df_clean['cost'] = (
        df_clean['cost']
        .astype(str)
        .str.replace('‚Çπ', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.strip()
        .replace('nan', np.nan)
    )
    df_clean['cost'] = pd.to_numeric(df_clean['cost'], errors='coerce')

    # 5Ô∏è‚É£ Handle missing license numbers
    df_clean['lic_no'] = df_clean['lic_no'].fillna('Unknown')

    # 6Ô∏è‚É£ Trim spaces and standardize text columns
    text_cols = ['name', 'city', 'cuisine', 'address']
    for col in text_cols:
        df_clean[col] = df_clean[col].astype(str).str.strip()

    # 7Ô∏è‚É£ Drop duplicates
    df_clean = df_clean.drop_duplicates()

    # 8Ô∏è‚É£ Reset index
    df_clean = df_clean.reset_index(drop=True)

    # ‚úÖ Save cleaned file
    df_clean.to_csv(output_path, index=False)
    print(f"‚úÖ Cleaning complete! Saved to: {output_path}")
    print(f"Rows before: {len(df)}, after cleaning: {len(df_clean)}")

    return df_clean


# Example usage:
if __name__ == "__main__":
    cleaned_df = clean_swiggy_data("swiggy.csv")
    print(cleaned_df.head())


‚úÖ Cleaning complete! Saved to: cleaned_swiggy.csv
Rows before: 14998, after cleaning: 6760
       id               name    city  rating  rating_count   cost  \
0  531342  Janta Sweet House  Abohar     4.4            50  200.0   
1  158203  theka coffee desi  Abohar     3.8           100  100.0   
2  187912          Singh Hut  Abohar     3.7            20  250.0   
3  158204          Sam Uncle  Abohar     3.6            20  200.0   
4  156588   shere punjab veg  Abohar     4.0           100  150.0   

            cuisine    lic_no  \
0     Sweets,Bakery  1.21E+13   
1         Beverages  2.21E+13   
2  Fast Food,Indian  2.21E+13   
3       Continental  2.21E+13   
4      North Indian  2.21E+13   

                                                link  \
0  https://www.swiggy.com/restaurants/janta-sweet...   
1  https://www.swiggy.com/restaurants/theka-coffe...   
2  https://www.swiggy.com/restaurants/singh-hut-n...   
3  https://www.swiggy.com/restaurants/sam-uncle-c...   
4  https://ww

In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import pickle
import sklearn


def preprocess_swiggy_data(cleaned_file, encoded_output="encoded_data.csv", encoder_output="encoder.pkl"):
    """
    Data Preprocessing Pipeline for Swiggy Dataset

    Steps:
    1. Load cleaned data
    2. Apply One-Hot Encoding to categorical features (name, city, cuisine)
    3. Save encoder as pickle file (encoder.pkl)
    4. Save preprocessed encoded dataset (encoded_data.csv)
    5. Ensure indices match with cleaned_data.csv
    """

    # 1Ô∏è‚É£ Load the cleaned dataset
    cleaned_data = pd.read_csv('cleaned_data.csv')

    # 2Ô∏è‚É£ Select categorical columns for encoding
    categorical_cols = ['name', 'city', 'cuisine']

    # 3Ô∏è‚É£ Initialize OneHotEncoder (version-safe)
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  # for sklearn >=1.4
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')  # for older versions

    # 4Ô∏è‚É£ Fit and transform categorical features
    encoded_array = encoder.fit_transform(cleaned_data[categorical_cols])

    # 5Ô∏è‚É£ Convert encoded data to DataFrame
    encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_cols))

    # 6Ô∏è‚É£ Drop original categorical columns and merge encoded columns
    encoded_data = pd.concat(
        [cleaned_data.drop(columns=categorical_cols).reset_index(drop=True),
         encoded_df.reset_index(drop=True)],
        axis=1
    )

    # 7Ô∏è‚É£ Ensure all features are numeric
    encoded_data = encoded_data.apply(pd.to_numeric, errors='ignore')

    # 8Ô∏è‚É£ Save the encoded data and encoder
    encoded_data.to_csv(encoded_output, index=False)
    with open(encoder_output, "wb") as f:
        pickle.dump(encoder, f)

    # 9Ô∏è‚É£ Verify index alignment
    index_match = cleaned_data.reset_index(drop=True).index.equals(encoded_data.reset_index(drop=True).index)

    # ‚úÖ Print summary
    print("‚úÖ Data Preprocessing Complete!")
    print(f"Rows: {len(encoded_data)} | Columns: {encoded_data.shape[1]}")
    print(f"Encoded data saved as: {encoded_output}")
    print(f"Encoder saved as: {encoder_output}")
    print(f"Index alignment with cleaned data: {index_match}")
    print(f"scikit-learn version: {sklearn.__version__}")

    return encoded_data, encoder


# Example usage:
if __name__ == "__main__":
    cleaned_file = "cleaned_swiggy.csv"
    encoded_data, encoder = preprocess_swiggy_data(cleaned_file)


  encoded_data = encoded_data.apply(pd.to_numeric, errors='ignore')


‚úÖ Data Preprocessing Complete!
Rows: 6760 | Columns: 5667
Encoded data saved as: encoded_data.csv
Encoder saved as: encoder.pkl
Index alignment with cleaned data: True
scikit-learn version: 1.7.2


In [9]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer


def restaurant_recommendation_system(cleaned_file, encoded_file, method="kmeans", n_clusters=10):
    """
    Build a Restaurant Recommendation System using either K-Means or Cosine Similarity.

    Parameters:
        cleaned_file: str -> path to cleaned_data.csv
        encoded_file: str -> path to encoded_data.csv
        method: str -> "kmeans" or "cosine"
        n_clusters: int -> number of clusters for K-Means

    Returns:
        cleaned_data (with cluster labels if KMeans used)
        recommend_function
    """

    # Load datasets
    cleaned_data = pd.read_csv('cleaned_data.csv')
    encoded_data = pd.read_csv('encoded_data.csv')

    # Ensure both datasets have aligned indices
    cleaned_data = cleaned_data.reset_index(drop=True)
    encoded_data = encoded_data.reset_index(drop=True)

    # Keep only numeric columns
    numeric_data = encoded_data.select_dtypes(include=[np.number])

    # Handle missing values
    imputer = SimpleImputer(strategy='mean')
    numeric_data_imputed = pd.DataFrame(imputer.fit_transform(numeric_data),
                                        columns=numeric_data.columns)

    print(f"‚úÖ Using {numeric_data_imputed.shape[1]} numeric features for {method.upper()} method.")

    # ------------------ K-Means Method ------------------ #
    if method.lower() == "kmeans":
        print("üîπ Using K-Means Clustering...")

        # Apply K-Means clustering
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cleaned_data["cluster"] = kmeans.fit_predict(numeric_data_imputed)

        print(f"‚úÖ K-Means clustering completed ‚Äî created {n_clusters} clusters.")

        # Define recommendation function
        def recommend_kmeans(restaurant_name, top_n=5):
            if restaurant_name not in cleaned_data["name"].values:
                return f"‚ùå Restaurant '{restaurant_name}' not found in dataset."

            # Get cluster of the input restaurant
            target_cluster = cleaned_data.loc[
                cleaned_data["name"] == restaurant_name, "cluster"
            ].values[0]

            # Get other restaurants in same cluster
            cluster_members = cleaned_data[cleaned_data["cluster"] == target_cluster]
            recommendations = cluster_members[cluster_members["name"] != restaurant_name].head(top_n)

            return recommendations[["name", "city", "cuisine", "rating", "cost"]]

        return cleaned_data, recommend_kmeans

    # ------------------ Cosine Similarity Method ------------------ #
    elif method.lower() == "cosine":
        print("üîπ Using Cosine Similarity...")

        # Compute cosine similarity matrix
        similarity_matrix = cosine_similarity(numeric_data_imputed)

        def recommend_cosine(restaurant_name, top_n=5):
            if restaurant_name not in cleaned_data["name"].values:
                return f"‚ùå Restaurant '{restaurant_name}' not found in dataset."

            idx = cleaned_data[cleaned_data["name"] == restaurant_name].index[0]
            scores = list(enumerate(similarity_matrix[idx]))
            scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:top_n + 1]
            indices = [i[0] for i in scores]

            return cleaned_data.loc[indices, ["name", "city", "cuisine", "rating", "cost"]].reset_index(drop=True)

        return cleaned_data, recommend_cosine

    else:
        raise ValueError("Invalid method. Choose either 'kmeans' or 'cosine'.")


# ------------------ Example Usage ------------------ #
if __name__ == "__main__":
    cleaned_file = "cleaned_data.csv"
    encoded_file = "encoded_data.csv"

    # Choose method: "kmeans" or "cosine"
    cleaned_data, recommend = restaurant_recommendation_system(cleaned_file, encoded_file, method="cosine", n_clusters=10)

    # Example recommendation
    restaurant_name = "Singh Hut"
    print(f"\nüçΩÔ∏è Recommendations similar to '{restaurant_name}':")
    print(recommend(restaurant_name, top_n=5))


‚úÖ Using 5663 numeric features for COSINE method.
üîπ Using Cosine Similarity...

üçΩÔ∏è Recommendations similar to 'Singh Hut':
               name                    city             cuisine  rating   cost
0  The Biryani Life           HSR,Bangalore  Biryani,Hyderabadi     3.8  250.0
1  The Biryani Life   Indiranagar,Bangalore  Biryani,Hyderabadi     3.5  250.0
2  The Biryani Life      JP Nagar,Bangalore  Biryani,Hyderabadi     3.5  250.0
3  The Biryani Life  Geddalahalli,Bangalore  Biryani,Hyderabadi     3.2  250.0
4  The Biryani Life   Mahadevpura,Bangalore  Biryani,Hyderabadi     3.3  250.0
