In [153]:
import pandas as pd
import warnings
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
warnings.filterwarnings("ignore")

import seaborn as sns
import joblib
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pickle

In [155]:
data = pd.read_csv("swiggy.csv")

In [157]:
data.head()

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [159]:
data.shape

(148541, 11)

In [161]:
data.columns

Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'lic_no', 'link', 'address', 'menu'],
      dtype='object')

In [163]:
data.dtypes

id               int64
name            object
city            object
rating          object
rating_count    object
cost            object
cuisine         object
lic_no          object
link            object
address         object
menu            object
dtype: object

In [165]:
data.drop(['lic_no', 'link', 'menu'], axis= 1, inplace= True)

In [167]:
data['name'].value_counts()

name
Domino's Pizza                                     442
Pizza Hut                                          319
KFC                                                309
Kwality Walls Frozen Dessert and Ice Cream Shop    300
Baskin Robbins                                     274
                                                  ... 
Kathmandu Kitchen                                    1
Pizza deewane                                        1
Shree Mangalam                                       1
PUNJABI CHASKA                                       1
Lazeez kitchen                                       1
Name: count, Length: 112818, dtype: int64

In [169]:
data['city'].value_counts()

city
Bikaner                 1666
Noida-1                 1428
Indirapuram,Delhi       1279
BTM,Bangalore           1161
Rohini,Delhi            1136
                        ... 
Alwarpet,Chennai           1
Naharlagun                 1
Mahim Dadar,Mumbai         1
Starbucks_BKC,Mumbai       1
Rangpo                     1
Name: count, Length: 821, dtype: int64

In [171]:
data['name'].unique()

array(['AB FOODS POINT', 'Janta Sweet House', 'theka coffee desi', ...,
       'Cafe Bella Ciao', 'GRILL ZILLA', 'Lazeez kitchen'], dtype=object)

In [173]:
data['city'].unique()

array(['Abohar', 'Adilabad', 'Adityapur', 'Adoni', 'Agartala', 'Agra',
       'Vastrapur,Ahmedabad', 'GOTA,Ahmedabad',
       'Paldi & Ambawadi,Ahmedabad', 'Ghatlodia,Ahmedabad',
       'Bopal,Ahmedabad', 'Gandhinagar,Ahmedabad', 'LalDarwaja,Ahmedabad',
       'Naranpura,Ahmedabad', 'Navrangpura,Ahmedabad',
       'Science City,Ahmedabad', 'Maninagar,Ahmedabad',
       'Chandkheda,Ahmedabad', 'Ahmednagar', 'Aizawl', 'Ajmer', 'Akola',
       'Alappuzha', 'Aligarh', 'Alipurduar', 'Allahabad', 'Alwar',
       'Ambala', 'Ambikapur', 'Ambur', 'Amravati', 'Amreli', 'Amritsar',
       'Anand', 'Anantapur', 'Ankleshwar', 'Arakkonam', 'Arambagh',
       'Arrah', 'Aruppukottai', 'Asansol', 'Aurangabad',
       'Aurangabad_bihar', 'Azamgarh', 'Baddi', 'Bagalkot', 'Bagdogra',
       'Bahadurgarh', 'Bahraich', 'Balaghat', 'Balangir', 'Balasore',
       'Ballari', 'Balrampur', 'Balurghat', 'Banda',
       'Yeshwanthpur,Bangalore', 'Geddalahalli,Bangalore',
       'Koramangala,Bangalore', 'JP Nagar,B

In [175]:
df_split = data['city'].str.split(',', expand=True)
df_split.columns = ['Location', 'Location1', 'City']

In [177]:
df_split['Location1'] = df_split['Location1'].replace(' New BEL Road', 'Bangalore')
df_split['Location1'] = df_split['Location1'].replace(' Nacharam & Malkajigiri', 'Hyderabad')

In [179]:
df_split[df_split['Location'] == 'Sanjay Nagar']

Unnamed: 0,Location,Location1,City
17917,Sanjay Nagar,Bangalore,Bangalore
17918,Sanjay Nagar,Bangalore,Bangalore
17919,Sanjay Nagar,Bangalore,Bangalore
17920,Sanjay Nagar,Bangalore,Bangalore
17921,Sanjay Nagar,Bangalore,Bangalore
...,...,...,...
18439,Sanjay Nagar,Bangalore,Bangalore
18440,Sanjay Nagar,Bangalore,Bangalore
18441,Sanjay Nagar,Bangalore,Bangalore
18442,Sanjay Nagar,Bangalore,Bangalore


In [181]:
df_split.drop(['City'], axis= 1, inplace= True)

In [183]:
df_split

Unnamed: 0,Location,Location1
0,Abohar,
1,Abohar,
2,Abohar,
3,Abohar,
4,Abohar,
...,...,...
148536,Yavatmal,
148537,Yavatmal,
148538,Yavatmal,
148539,Yavatmal,


In [185]:
df_split.rename(columns={'Location1': 'City'}, inplace=True)

In [187]:
df_split

Unnamed: 0,Location,City
0,Abohar,
1,Abohar,
2,Abohar,
3,Abohar,
4,Abohar,
...,...,...
148536,Yavatmal,
148537,Yavatmal,
148538,Yavatmal,
148539,Yavatmal,


In [189]:
df_split['City'].value_counts()

City
Bangalore     14943
Delhi         14081
Pune          12441
Hyderabad     10015
Chennai        9957
Kolkata        8286
Mumbai         6076
Ahmedabad      3517
Chandigarh     3324
Lucknow        2853
Gurgaon        2768
Nagpur         1919
Indore         1650
Surat          1328
Coimbatore     1275
Vijayawada     1129
Vizag          1077
Noida           852
Guwahati        787
Dehradun        782
Mysore          656
Vadodara        630
Kochi           606
Ludhiana        552
Jaipur          208
Name: count, dtype: int64

In [191]:
df_split.isnull().sum() 

Location        0
City        46829
dtype: int64

In [193]:
df_split['City'] = df_split['City'].fillna(df_split['Location'])

In [195]:
df_split

Unnamed: 0,Location,City
0,Abohar,Abohar
1,Abohar,Abohar
2,Abohar,Abohar
3,Abohar,Abohar
4,Abohar,Abohar
...,...,...
148536,Yavatmal,Yavatmal
148537,Yavatmal,Yavatmal
148538,Yavatmal,Yavatmal
148539,Yavatmal,Yavatmal


In [197]:
df_split['City'].value_counts()

City
Bangalore     14943
Delhi         14081
Pune          12441
Hyderabad     10015
Chennai        9957
              ...  
Hampi             1
Manali            1
Kohima            1
Naharlagun        1
Rangpo            1
Name: count, Length: 552, dtype: int64

In [199]:
df_split.isnull().sum() 

Location    0
City        0
dtype: int64

In [201]:
data.columns

Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'address'],
      dtype='object')

In [203]:
df_split.columns

Index(['Location', 'City'], dtype='object')

In [205]:
df = pd.concat([data, df_split], axis=1)

In [207]:
df

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,address,Location,City
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas","AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Abohar,Abohar
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery","Janta Sweet House, Bazar No.9, Circullar Road,...",Abohar,Abohar
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,"theka coffee desi, sahtiya sadan road city",Abohar,Abohar
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian","Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Abohar,Abohar
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food","GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Abohar,Abohar
...,...,...,...,...,...,...,...,...,...,...
148536,553122,The Food Delight,Yavatmal,--,Too Few Ratings,₹ 200,"Fast Food,Snacks","The Food Delight, 94MC+X35, New Singhania Naga...",Yavatmal,Yavatmal
148537,562647,MAITRI FOODS & BEVERAGES,Yavatmal,--,Too Few Ratings,₹ 300,Pizzas,"MAITRI FOODS & BEVERAGES, POLIC MITRYA SOCIETY...",Yavatmal,Yavatmal
148538,559435,Cafe Bella Ciao,Yavatmal,--,Too Few Ratings,₹ 300,"Fast Food,Snacks","Cafe Bella Ciao, SHOP NO 2 NEMANI MARKET SBI S...",Yavatmal,Yavatmal
148539,418989,GRILL ZILLA,Yavatmal,--,Too Few Ratings,₹ 250,Continental,"GRILL ZILLA, SHO NO 2/6, POSTEL GROUND CHOWPAT...",Yavatmal,Yavatmal


In [209]:
df = df[['id', 'name', 'City', 'Location', 'rating', 'rating_count', 'cost', 'cuisine',
       'address']]


In [211]:
df

Unnamed: 0,id,name,City,Location,rating,rating_count,cost,cuisine,address
0,567335,AB FOODS POINT,Abohar,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas","AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI..."
1,531342,Janta Sweet House,Abohar,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery","Janta Sweet House, Bazar No.9, Circullar Road,..."
2,158203,theka coffee desi,Abohar,Abohar,3.8,100+ ratings,₹ 100,Beverages,"theka coffee desi, sahtiya sadan road city"
3,187912,Singh Hut,Abohar,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian","Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR"
4,543530,GRILL MASTERS,Abohar,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food","GRILL MASTERS, ADA Heights, Abohar - Hanumanga..."
...,...,...,...,...,...,...,...,...,...
148536,553122,The Food Delight,Yavatmal,Yavatmal,--,Too Few Ratings,₹ 200,"Fast Food,Snacks","The Food Delight, 94MC+X35, New Singhania Naga..."
148537,562647,MAITRI FOODS & BEVERAGES,Yavatmal,Yavatmal,--,Too Few Ratings,₹ 300,Pizzas,"MAITRI FOODS & BEVERAGES, POLIC MITRYA SOCIETY..."
148538,559435,Cafe Bella Ciao,Yavatmal,Yavatmal,--,Too Few Ratings,₹ 300,"Fast Food,Snacks","Cafe Bella Ciao, SHOP NO 2 NEMANI MARKET SBI S..."
148539,418989,GRILL ZILLA,Yavatmal,Yavatmal,--,Too Few Ratings,₹ 250,Continental,"GRILL ZILLA, SHO NO 2/6, POSTEL GROUND CHOWPAT..."


In [213]:
df['rating_count'].unique()

array(['Too Few Ratings', '50+ ratings', '100+ ratings', '20+ ratings',
       '500+ ratings', '1K+ ratings', '5K+ ratings', nan, '10K+ ratings'],
      dtype=object)

In [215]:
df['rating_count'].value_counts()

rating_count
Too Few Ratings    87014
20+ ratings        21636
100+ ratings       20548
50+ ratings        12009
500+ ratings        4396
1K+ ratings         2739
5K+ ratings           98
10K+ ratings          15
Name: count, dtype: int64

In [217]:
def map_ratings(value):
    if isinstance(value, str):
        if '50+ ratings' in value:
            return 55
        elif '100+' in value:
            return 150
        elif '20+' in value:
            return 25
        elif '500+' in value:
            return 750  
        elif '1K+' in value:
            return 1500
        elif '5K+' in value:
            return 7500
        elif '10K+' in value:
            return 15000
        elif 'Too Few' in value:
            return 5
        else:
            return None


df['rating_count_numeric'] = df['rating_count'].apply(map_ratings)

In [219]:
df

Unnamed: 0,id,name,City,Location,rating,rating_count,cost,cuisine,address,rating_count_numeric
0,567335,AB FOODS POINT,Abohar,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas","AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",5.0
1,531342,Janta Sweet House,Abohar,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery","Janta Sweet House, Bazar No.9, Circullar Road,...",55.0
2,158203,theka coffee desi,Abohar,Abohar,3.8,100+ ratings,₹ 100,Beverages,"theka coffee desi, sahtiya sadan road city",150.0
3,187912,Singh Hut,Abohar,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian","Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",25.0
4,543530,GRILL MASTERS,Abohar,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food","GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",5.0
...,...,...,...,...,...,...,...,...,...,...
148536,553122,The Food Delight,Yavatmal,Yavatmal,--,Too Few Ratings,₹ 200,"Fast Food,Snacks","The Food Delight, 94MC+X35, New Singhania Naga...",5.0
148537,562647,MAITRI FOODS & BEVERAGES,Yavatmal,Yavatmal,--,Too Few Ratings,₹ 300,Pizzas,"MAITRI FOODS & BEVERAGES, POLIC MITRYA SOCIETY...",5.0
148538,559435,Cafe Bella Ciao,Yavatmal,Yavatmal,--,Too Few Ratings,₹ 300,"Fast Food,Snacks","Cafe Bella Ciao, SHOP NO 2 NEMANI MARKET SBI S...",5.0
148539,418989,GRILL ZILLA,Yavatmal,Yavatmal,--,Too Few Ratings,₹ 250,Continental,"GRILL ZILLA, SHO NO 2/6, POSTEL GROUND CHOWPAT...",5.0


In [221]:
df['rating'].unique()

array(['--', '4.4', '3.8', '3.7', '3.6', '4.0', '4.2', '4.7', '4.1',
       '3.5', '3.1', '3.4', '3.3', '4.8', '3.9', '2.7', '4.3', '2.9',
       '4.5', '2.5', '3.2', '2.4', '4.6', '3.0', '2.8', '2.3', '5.0',
       '2.6', '2.2', '1.4', '1.9', '4.9', '2.1', '1.3', '2.0', '1.8',
       '1.6', '1.1', '1.5', nan, '1.0', '1.2', '1.7'], dtype=object)

In [223]:
df['rating'].value_counts()

rating
--     87014
4.0     6532
4.1     6296
4.2     5821
3.8     5736
3.9     5435
4.3     5011
3.7     4253
4.4     3149
3.5     2963
3.6     2925
3.4     1879
3.3     1801
4.5     1778
4.6     1334
3.2     1202
3.0      859
3.1      791
4.7      648
2.8      473
2.9      472
4.8      338
2.7      325
2.5      221
5.0      209
2.6      208
4.9      174
2.3      128
2.4      118
2.2       89
2.0       72
2.1       61
1.9       46
1.8       27
1.5       18
1.6       17
1.7       13
1.4       10
1.2        4
1.3        3
1.1        1
1.0        1
Name: count, dtype: int64

In [225]:
df['rating'] = pd.to_numeric(df['rating'], errors='coerce').round(2)

In [227]:
df['rating'].fillna(df['rating'].mean(), inplace= True)

In [229]:
df['rating'].value_counts()

rating
3.894461    87100
4.000000     6532
4.100000     6296
4.200000     5821
3.800000     5736
3.900000     5435
4.300000     5011
3.700000     4253
4.400000     3149
3.500000     2963
3.600000     2925
3.400000     1879
3.300000     1801
4.500000     1778
4.600000     1334
3.200000     1202
3.000000      859
3.100000      791
4.700000      648
2.800000      473
2.900000      472
4.800000      338
2.700000      325
2.500000      221
5.000000      209
2.600000      208
4.900000      174
2.300000      128
2.400000      118
2.200000       89
2.000000       72
2.100000       61
1.900000       46
1.800000       27
1.500000       18
1.600000       17
1.700000       13
1.400000       10
1.200000        4
1.300000        3
1.100000        1
1.000000        1
Name: count, dtype: int64

In [231]:
df.drop(['rating_count'], axis= 1, inplace= True)
df.rename(columns={'rating_count_numeric': 'rating_count'}, inplace=True)

In [233]:
df['rating_count'].value_counts()

rating_count
5.0        87014
25.0       21636
150.0      20548
55.0       12009
750.0       4396
1500.0      2739
7500.0        98
15000.0       15
Name: count, dtype: int64

In [235]:
df['rating'].value_counts()

rating
3.894461    87100
4.000000     6532
4.100000     6296
4.200000     5821
3.800000     5736
3.900000     5435
4.300000     5011
3.700000     4253
4.400000     3149
3.500000     2963
3.600000     2925
3.400000     1879
3.300000     1801
4.500000     1778
4.600000     1334
3.200000     1202
3.000000      859
3.100000      791
4.700000      648
2.800000      473
2.900000      472
4.800000      338
2.700000      325
2.500000      221
5.000000      209
2.600000      208
4.900000      174
2.300000      128
2.400000      118
2.200000       89
2.000000       72
2.100000       61
1.900000       46
1.800000       27
1.500000       18
1.600000       17
1.700000       13
1.400000       10
1.200000        4
1.300000        3
1.100000        1
1.000000        1
Name: count, dtype: int64

In [237]:
df['rating'] = df['rating'].round(2)

In [239]:
df['rating'].value_counts()

rating
3.89    87100
4.00     6532
4.10     6296
4.20     5821
3.80     5736
3.90     5435
4.30     5011
3.70     4253
4.40     3149
3.50     2963
3.60     2925
3.40     1879
3.30     1801
4.50     1778
4.60     1334
3.20     1202
3.00      859
3.10      791
4.70      648
2.80      473
2.90      472
4.80      338
2.70      325
2.50      221
5.00      209
2.60      208
4.90      174
2.30      128
2.40      118
2.20       89
2.00       72
2.10       61
1.90       46
1.80       27
1.50       18
1.60       17
1.70       13
1.40       10
1.20        4
1.30        3
1.10        1
1.00        1
Name: count, dtype: int64

In [241]:
df.isnull().sum()

id                0
name             86
City              0
Location          0
rating            0
cost            131
cuisine          99
address          86
rating_count     86
dtype: int64

In [243]:
df['cost'].value_counts()

cost
₹ 200    38635
₹ 300    29701
₹ 250    19745
₹ 150    12096
₹ 400    11711
         ...  
₹ 132        1
₹ 396        1
₹ 102        1
₹ 38         1
₹ 64         1
Name: count, Length: 363, dtype: int64

In [245]:
df.dtypes

id                int64
name             object
City             object
Location         object
rating          float64
cost             object
cuisine          object
address          object
rating_count    float64
dtype: object

In [247]:
df['cost'] = df['cost'].str.replace('₹', '', regex=False)
#df['cost'] = pd.to_numeric(df['cost'], errors='coerce').astype('Int64')

In [249]:
df['cost'] = df['cost'].astype('Int64')

In [251]:
df['cost'].value_counts()

cost
200    38635
300    29701
250    19745
150    12096
400    11711
       ...  
132        1
396        1
102        1
38         1
64         1
Name: count, Length: 363, dtype: Int64

In [253]:
df.isnull().sum()

id                0
name             86
City              0
Location          0
rating            0
cost            131
cuisine          99
address          86
rating_count     86
dtype: int64

In [255]:
df['cost'].fillna(df['cost'].mean().round(), inplace= True)

In [257]:
df.isnull().sum()

id               0
name            86
City             0
Location         0
rating           0
cost             0
cuisine         99
address         86
rating_count    86
dtype: int64

In [316]:
df.shape

(148541, 9)

In [318]:
df.duplicated().sum() 

0

In [320]:
df1 = df.dropna()

In [322]:
df1['cuisine'] = df1['cuisine'].replace('8:15 To 11:30 Pm', 'Sweets,Bakery')

In [324]:
df1

Unnamed: 0,id,name,City,Location,rating,cost,cuisine,address,rating_count
0,567335,AB FOODS POINT,Abohar,Abohar,3.89,200,"Beverages,Pizzas","AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",5.0
1,531342,Janta Sweet House,Abohar,Abohar,4.40,200,"Sweets,Bakery","Janta Sweet House, Bazar No.9, Circullar Road,...",55.0
2,158203,theka coffee desi,Abohar,Abohar,3.80,100,Beverages,"theka coffee desi, sahtiya sadan road city",150.0
3,187912,Singh Hut,Abohar,Abohar,3.70,250,"Fast Food,Indian","Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",25.0
4,543530,GRILL MASTERS,Abohar,Abohar,3.89,250,"Italian-American,Fast Food","GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",5.0
...,...,...,...,...,...,...,...,...,...
148536,553122,The Food Delight,Yavatmal,Yavatmal,3.89,200,"Fast Food,Snacks","The Food Delight, 94MC+X35, New Singhania Naga...",5.0
148537,562647,MAITRI FOODS & BEVERAGES,Yavatmal,Yavatmal,3.89,300,Pizzas,"MAITRI FOODS & BEVERAGES, POLIC MITRYA SOCIETY...",5.0
148538,559435,Cafe Bella Ciao,Yavatmal,Yavatmal,3.89,300,"Fast Food,Snacks","Cafe Bella Ciao, SHOP NO 2 NEMANI MARKET SBI S...",5.0
148539,418989,GRILL ZILLA,Yavatmal,Yavatmal,3.89,250,Continental,"GRILL ZILLA, SHO NO 2/6, POSTEL GROUND CHOWPAT...",5.0


In [326]:
df1.shape

(148442, 9)

In [328]:
df1.isnull().sum()

id              0
name            0
City            0
Location        0
rating          0
cost            0
cuisine         0
address         0
rating_count    0
dtype: int64

In [330]:
df1.duplicated().sum() 

0

In [332]:
df1.describe(include='object') 

Unnamed: 0,name,City,Location,cuisine,address
count,148442,148442,148442,148442,148442
unique,112808,552,820,2131,148388
top,Domino's Pizza,Bangalore,Bikaner,"North Indian,Chinese","Gold Star Biryani, RS Road Dindigul - 624001"
freq,442,14943,1666,6471,3


In [334]:
df1.to_csv("cleaned_data.csv")

In [336]:
df1.dtypes

id                int64
name             object
City             object
Location         object
rating          float64
cost              Int64
cuisine          object
address          object
rating_count    float64
dtype: object

In [338]:
        multi_label_cols = ['City', 'Location', 'cuisine']
        
        # Convert comma-separated strings to lists (if needed)
        for col in multi_label_cols:
            df1[col] = df1[col].apply(lambda x: x.split(',') if isinstance(x, str) else [])
        
        # Dictionary to store encoders and encoded DataFrames
        mlb_dict = {}
        encoded_parts = []
        
        # Apply MultiLabelBinarizer to each column
        for col in multi_label_cols:
            mlb = MultiLabelBinarizer()
            encoded = mlb.fit_transform(df1[col])
            encoded_df = pd.DataFrame(encoded, columns=[f"{col}_{cls}" for cls in mlb.classes_]).reset_index(drop=True)
            mlb_dict[col] = mlb
            encoded_parts.append(encoded_df)
        
        # Combine all encoded parts with original DataFrame (excluding original multi-label columns)
        df1_nolabel = df1.drop(columns=multi_label_cols).reset_index(drop=True)
        df_final = pd.concat([df1_nolabel] + encoded_parts, axis=1)
        
        df_final.drop(['name', 'address'], axis= 1, inplace= True)
        
        # Save final encoded DataFrame
        df_final.to_csv('encoded_data.csv', index=False)
        
        # Save all encoders
        joblib.dump(mlb_dict, 'encoder.pkl')

['encoder.pkl']

In [342]:
df_final

Unnamed: 0,id,rating,cost,rating_count,City_Abohar,City_Adilabad,City_Adityapur,City_Adoni,City_Agartala,City_Agra,...,cuisine_Thai,cuisine_Thalis,cuisine_Tibetan,cuisine_Tribal,cuisine_Turkish,cuisine_Use Code JUMBO30 to avail,cuisine_Use code XPRESS121 to avail.,cuisine_Vietnamese,cuisine_Waffle,cuisine_indian
0,567335,3.89,200,5.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,531342,4.40,200,55.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,158203,3.80,100,150.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,187912,3.70,250,25.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,543530,3.89,250,5.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148437,553122,3.89,200,5.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
148438,562647,3.89,300,5.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
148439,559435,3.89,300,5.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
148440,418989,3.89,250,5.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [344]:
df_final.isnull().sum()

id                                      0
rating                                  0
cost                                    0
rating_count                            0
City_Abohar                             0
                                       ..
cuisine_Use Code JUMBO30 to avail       0
cuisine_Use code XPRESS121 to avail.    0
cuisine_Vietnamese                      0
cuisine_Waffle                          0
cuisine_indian                          0
Length: 1502, dtype: int64

In [346]:
# Fit KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(df_final)

# Add cluster labels to cleaned data
df_final['cluster'] = clusters

# Recommend restaurants in the same cluster
def recommend_by_cluster(restaurant_id):
    cluster_id = df_final.loc[df_final['id'] == restaurant_id, 'cluster'].values[0]
    return df_final[df_final['cluster'] == cluster_id]


In [350]:
df_final

Unnamed: 0,id,rating,cost,rating_count,City_Abohar,City_Adilabad,City_Adityapur,City_Adoni,City_Agartala,City_Agra,...,cuisine_Thalis,cuisine_Tibetan,cuisine_Tribal,cuisine_Turkish,cuisine_Use Code JUMBO30 to avail,cuisine_Use code XPRESS121 to avail.,cuisine_Vietnamese,cuisine_Waffle,cuisine_indian,cluster
0,567335,3.89,200,5.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,531342,4.40,200,55.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,158203,3.80,100,150.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,187912,3.70,250,25.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,543530,3.89,250,5.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148437,553122,3.89,200,5.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
148438,562647,3.89,300,5.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
148439,559435,3.89,300,5.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
148440,418989,3.89,250,5.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4


In [352]:
with open('/Users/muthu/OneDrive/Desktop/Python/Swiggy Recommendation system/kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)