In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('mumbai_all_cafes.csv')
df.sample(5)

Unnamed: 0,Area,Name,Rating,Total Ratings,Address,Latitude,Longitude
299,Bandra,Madras Diaries,3.9,3179,"Shop No. 7, Ground Floor, Muzaffar Manor, 28th...",19.059939,72.833992
1002,Powai,Third Wave Coffee,4.8,786,"R City Mall, G 73A, Lal Bahadur Shastri Marg, ...",19.099698,72.916669
846,Kharghar,Hugs And Mugs,4.1,1273,"Bhoomi Tower, Simran Residency Service Road, S...",19.033795,73.063767
35,Colaba,Koyla - Ethnic Cuisine,3.9,4866,"Kamal Mansion, 3/23, N.A. Azmi Road, next to R...",18.918294,72.830874
730,Nerul,EIGHTEEN PLUS CAFE,4.6,124,"91, Jagatguru Aadi Shankracharya Marg, Nerul E...",19.024753,73.022224


### Data Cleaning & Preprocessing

In [4]:
# missing values
df.isnull().sum()

Area             0
Name             0
Rating           0
Total Ratings    0
Address          0
Latitude         0
Longitude        0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Area           1200 non-null   object 
 1   Name           1200 non-null   object 
 2   Rating         1200 non-null   float64
 3   Total Ratings  1200 non-null   int64  
 4   Address        1200 non-null   object 
 5   Latitude       1200 non-null   float64
 6   Longitude      1200 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 65.8+ KB


In [6]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1195    False
1196    False
1197    False
1198    False
1199    False
Length: 1200, dtype: bool

In [7]:
df['Area'].unique()

array(['Colaba', 'Churchgate', 'Dadar', 'Worli', 'Bandra', 'Andheri',
       'Goregaon', 'Borivali', 'Kurla', 'Ghatkopar', 'Mulund', 'Vashi',
       'Nerul', 'Panvel', 'Kharghar', 'Dharavi', 'Powai', 'Chembur',
       'Sion', 'Thane'], dtype=object)

In [15]:
df['Address'].map(df['Address'].value_counts())

0       1
1       2
2       2
3       2
4       1
       ..
1195    1
1196    1
1197    1
1198    1
1199    1
Name: Address, Length: 1200, dtype: int64

### Content-Based Filtering (Cosine Similarity)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a combined feature (Address + Rating + Total Ratings)
df["features"] = df["Address"] + " " + df["Rating"].astype(str) + " " + df["Total Ratings"].astype(str)

# Convert text data into numerical feature vectors
vectorizer = TfidfVectorizer(stop_words="english")
feature_matrix = vectorizer.fit_transform(df["features"])

print("✅ Features vectorized!")

✅ Features vectorized!


In [17]:
# Create a function to recommend similar cafes:
def recommend_similar_cafes(cafe_name, top_n=5):
    if cafe_name not in df["Name"].values:
        return "❌ Cafe not found in the dataset!"

    # Find the index of the cafe
    index = df[df["Name"] == cafe_name].index[0]
    
    # Compute cosine similarity
    cosine_sim = cosine_similarity(feature_matrix, feature_matrix)
    
    # Get similar cafes
    similar_cafes = list(enumerate(cosine_sim[index]))
    similar_cafes = sorted(similar_cafes, key=lambda x: x[1], reverse=True)[1:top_n+1]
    
    # Get cafe names
    cafe_indices = [i[0] for i in similar_cafes]
    return df.iloc[cafe_indices][["Name", "Rating", "Total Ratings", "Address"]]

# Example usage
print(recommend_similar_cafes("Starbucks"))

                                 Name  Rating  Total Ratings  \
62                          Starbucks     4.5            937   
78            Earth Cafe @ Churchgate     4.3             29   
106  Relish International Veg Cuisine     4.4           5385   
73                              The J     4.2           4235   
75                      The Beer Café     3.9           1897   

                                               Address  
62   Block No 1, Ram Mahal, Building No 8, Ground F...  
78   Ground Floor, Ram Mahal, Dinshaw Vacha Rd, nea...  
106  1, Dinshaw Vacha Rd, Churchgate, Mumbai, Mahar...  
73   Shop 3, Ground Floor, Vaswani Mansion, Dinshaw...  
75   Shop No 6,7, Ground Floor, Cambata Building, C...  


###  Collaborative Filtering (KNN Model)

In [20]:
# Convert the data into a user-cafe matrix:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Create a pivot table (User x Cafe rating matrix)
cafe_matrix = df.pivot_table(index="Name", values="Rating", aggfunc=np.mean).fillna(0)

# Convert to NumPy array
cafe_matrix_array = cafe_matrix.values

  cafe_matrix = df.pivot_table(index="Name", values="Rating", aggfunc=np.mean).fillna(0)


In [21]:
# Train a KNN model
knn = NearestNeighbors(metric="cosine", algorithm="brute")
knn.fit(cafe_matrix_array)

print("✅ KNN model trained!")

✅ KNN model trained!


In [22]:
# Create a function to recommend cafes using KNN:
def recommend_cafes_knn(cafe_name, top_n=5):
    if cafe_name not in cafe_matrix.index:
        return "❌ Cafe not found in the dataset!"

    # Find the index of the cafe
    index = cafe_matrix.index.get_loc(cafe_name)

    # Find nearest cafes
    distances, indices = knn.kneighbors(cafe_matrix_array[index].reshape(1, -1), n_neighbors=top_n+1)

    # Get cafe names
    cafe_names = cafe_matrix.index[indices.flatten()][1:]
    return df[df["Name"].isin(cafe_names)][["Name", "Rating", "Total Ratings", "Address"]]

# Example usage
print(recommend_cafes_knn("Starbucks"))

                               Name  Rating  Total Ratings  \
326                 Ohh My Dog Pefe     4.6           3035   
726   OCD-Obsessive Coffee Disorder     4.8            237   
815                    Offline Cafe     4.6             38   
853                   Odós Kaffeina     4.7            112   
1115                      Oh My Tea     4.9             20   

                                                Address  
326   Pratisthan Bunglow,Yamuna Nagar, Back Road, op...  
726   Shop No.1, Sahara CHS, Plot 17, Seawoods West,...  
815   Pratik Harmony CHS Shop no. 6, Plot no 35 Sec ...  
853   Shop No. 32, GROWMORE TOWER, Sector 2, Khargha...  
1115  Shop No. 12, Rameshwar Bhavan, Mahatma Gandhi ...  
