In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D

# --- 1. Load and preprocess dataset ---
df = pd.read_csv("/content/restaurants(in).csv")
df['category'] = df['category'].astype(str).str.split(',')

# Save original metadata
metadata_cols = ['name', 'score', 'ratings', 'full_address', 'lat', 'lng']
metadata = df[metadata_cols]

# Encode categories
mlb = MultiLabelBinarizer()
category_encoded = pd.DataFrame(mlb.fit_transform(df['category']), columns=mlb.classes_)

# Combine numerical + category features
df_features = pd.concat([df[['score', 'ratings', 'lat', 'lng']], category_encoded], axis=1)
df_features = df_features.dropna()

# Align metadata with clean feature set
metadata = metadata.loc[df_features.index].reset_index(drop=True)

# --- 2. Scale and PCA ---
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_features)

pca = PCA(n_components=0.95)
df_pca = pca.fit_transform(scaled_features)

# --- 3. Clustering ---
num_clusters = 7
random_seed = 30
kmeans = KMeans(n_clusters=num_clusters, random_state=random_seed)
clusters = kmeans.fit_predict(df_pca)

# Final combined DataFrame
df_with_categories = pd.concat([metadata, df_features.reset_index(drop=True), pd.DataFrame({'cluster': clusters})], axis=1)

# --- 4. Cluster Summary ---
category_columns = mlb.classes_
cluster_summary = df_with_categories.groupby('cluster')[category_columns].mean()
#print(  cluster_summary)

# Get top categories per cluster (for display
top_categories_per_cluster = cluster_summary.apply(lambda x: x.sort_values(ascending=False).index[:7], axis=1)
print("Top categories per cluster:\n", top_categories_per_cluster)

# --- 5. 3D Cluster Plot ---
"""fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(df_pca[:, 0], df_pca[:, 1], df_pca[:, 2], c=clusters, cmap='viridis', s=50, alpha=0.7, edgecolors='k')
fig.colorbar(scatter, label="Cluster Labels")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.set_zlabel("PCA Component 3")
ax.set_title("3D Clusters of Food Preferences")
plt.show()"""



Top categories per cluster:
 cluster
0    Index([' Burritos', 'Mexican', ' Tacos', ' Sal...
1    Index([' Comfort Food', ' Dinner', ' salad', '...
2    Index(['American', ' American', ' Family Meals...
3    Index([' Sports Bar', ' Family Friendly', 'Ame...
4    Index([' Portuguese', ' Halal', 'Chicken', ' W...
5    Index([' BBQ', ' Family Meals', 'Appetizers', ...
6    Index([' Asian', ' Noodles', ' Asian Fusion', ...
dtype: object


'fig = plt.figure(figsize=(10, 7))\nax = fig.add_subplot(111, projection=\'3d\')\nscatter = ax.scatter(df_pca[:, 0], df_pca[:, 1], df_pca[:, 2], c=clusters, cmap=\'viridis\', s=50, alpha=0.7, edgecolors=\'k\')\nfig.colorbar(scatter, label="Cluster Labels")\nax.set_xlabel("PCA Component 1")\nax.set_ylabel("PCA Component 2")\nax.set_zlabel("PCA Component 3")\nax.set_title("3D Clusters of Food Preferences")\nplt.show()'

In [7]:
def recommend_restaurants(input_categories, cluster_summary, df_with_categories, n=5):
    # Normalize column names for case-insensitive matching
    category_map = {col.lower(): col for col in cluster_summary.columns}

    # Map input categories to their actual case-sensitive names in the data
    valid_categories = []
    ignored_categories = []

    for cat in input_categories:
        key = cat.lower()
        if key in category_map:
            valid_categories.append(category_map[key])
        else:
            ignored_categories.append(cat)

    if not valid_categories:
        print(f"No matching categories found for: {input_categories}")
        return pd.DataFrame()

    if ignored_categories:
        print(f"Ignored categories (not found): {ignored_categories}")

    # Score clusters
    cluster_scores = cluster_summary[valid_categories].sum(axis=1)
    top_clusters = cluster_scores.sort_values(ascending=False).index.tolist()

    # Recommend from top clusters
    recommendations = []
    for cluster in top_clusters:
        cluster_data = df_with_categories[df_with_categories['cluster'] == cluster]
        match_filter = cluster_data[valid_categories].sum(axis=1) > 0
        filtered = cluster_data[match_filter]
        recommendations.append(filtered)
        if len(pd.concat(recommendations)) >= n:
            break

    return pd.concat(recommendations).head(n)
recommend = ['african']
print(recommend_restaurants(recommend, cluster_summary, df_with_categories, n=50))

                                                    name  score  ratings  \
1106                              Alem Ethiopian Village    4.8     67.0   
3549                                     Dijah's Kitchen    5.0     27.0   
5358                      Sengatera Ethiopian Restaurant    4.3     49.0   
5359                                        Enat Kitchen    4.8     25.0   
5660                         Galos Flame Grilled Chicken    4.4     19.0   
6362                                  Massawa restaurant    4.6     43.0   
6363                      Blue Nile Ethiopian Restaurant    4.4     79.0   
9368           Nyame Ye African and Caribbean Restaurant    4.5    129.0   
9369                           Rahama African Restaurant    4.2    138.0   
10397                                        BETHEL CAFE    4.6     30.0   
10398           Appioo African Bar &amp; Grill (Cardozo)    4.3     16.0   
10400                    Swahili Village - The Consulate    4.5    118.0   
10401       

35238 386
