## __Data Cleaning the Restaurant Data__

In [52]:
# import key libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### __Loading data from zomato.csv__

In [53]:
# Read the restaurant data from 'zomato.csv' (path is relative to the notebook's working directory)
df = pd.read_csv('zomato.csv')
# Preview the first five rows to see the available columns and raw value formats
df.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


### Get the shape of the DataFrame: Rows and columns

In [54]:
df.shape

(51717, 17)

### Check columns

In [55]:
df.columns

Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)'],
      dtype='object')

### Drop redundant/unnecessary columns

In [56]:
# Drop the columns treated as redundant/unnecessary here:
# url, address, phone, dish_liked, reviews_list and menu_item
df = df.drop(
    columns=[
        'url', 'address', 'phone',
        'dish_liked', 'reviews_list', 'menu_item',
    ]
)

### View DataFrame once again

In [57]:
df.head()

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),listed_in(type),listed_in(city)
0,Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,Buffet,Banashankari
1,Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,Buffet,Banashankari
2,San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,Buffet,Banashankari
3,Addhuri Udupi Bhojana,No,No,3.7/5,88,Banashankari,Quick Bites,"South Indian, North Indian",300,Buffet,Banashankari
4,Grand Village,No,No,3.8/5,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600,Buffet,Banashankari


### Get summary info on the DataFrame: 
- After dropping columns, see remaining columns
- View non-null values in each column + data type

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   name                         51717 non-null  object
 1   online_order                 51717 non-null  object
 2   book_table                   51717 non-null  object
 3   rate                         43942 non-null  object
 4   votes                        51717 non-null  int64 
 5   location                     51696 non-null  object
 6   rest_type                    51490 non-null  object
 7   cuisines                     51672 non-null  object
 8   approx_cost(for two people)  51371 non-null  object
 9   listed_in(type)              51717 non-null  object
 10  listed_in(city)              51717 non-null  object
dtypes: int64(1), object(10)
memory usage: 4.3+ MB


### Drop any duplicate rows

In [59]:
df.shape

(51717, 11)

In [60]:
# Remove fully duplicated rows. Rebind the result instead of using
# inplace=True (no performance benefit, and it prevents method chaining).
# The surviving rows keep their original labels; the index is not reset.
df = df.drop_duplicates()

In [61]:
df.shape

(51609, 11)

## METHODS OF FINDING NULL VALUES IN COLUMNS:

### Method 1: Using inbuilt info() method and deducing:
__Run df.info() and compare each column's non-null count with the total number of entries. The column with the most null values is the one:__
 - Whose non-null count is furthest below the total number of entries shown in the index summary
  - Running the command below shows that the 'rate' column has the most null values since...
     - ...its non-null count is 43854 while the total number of entries is 51609  

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51609 entries, 0 to 51716
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   name                         51609 non-null  object
 1   online_order                 51609 non-null  object
 2   book_table                   51609 non-null  object
 3   rate                         43854 non-null  object
 4   votes                        51609 non-null  int64 
 5   location                     51588 non-null  object
 6   rest_type                    51382 non-null  object
 7   cuisines                     51564 non-null  object
 8   approx_cost(for two people)  51265 non-null  object
 9   listed_in(type)              51609 non-null  object
 10  listed_in(city)              51609 non-null  object
dtypes: int64(1), object(10)
memory usage: 4.7+ MB


### Method 2: Using single line isnull().sum().idxmax() to return most nulls column:

In [63]:
# isnull().sum() yields one null count per column; idxmax() then returns the
# label (column name) that holds the largest of those counts.
most_nulls_column = df.isnull().sum().idxmax()
print(f"Column with most null values: {most_nulls_column}")

Column with most null values: rate


### Method 3: Custom code to get both the column with the most nulls and its null count:

In [64]:
def find_column_with_most_nulls(frame):
    """Return the column of ``frame`` with the most null values and that count.

    Parameters
    ----------
    frame : pandas.DataFrame
        The frame whose columns are inspected.

    Returns
    -------
    tuple
        ``(column_name, null_count)``. When no column contains a null the
        result is ``(None, 0)``. On ties the first column (in column order)
        wins, because only a strictly larger count replaces the current best.
    """
    best_column = None
    best_count = 0
    # One pass over the per-column null counts, in column order.
    for column_name, null_count in frame.isnull().sum().items():
        if null_count > best_count:
            best_column, best_count = column_name, null_count
    return best_column, best_count


# Keep the original result names so any later cell using them still works.
column_with_most_nulls, max_null = find_column_with_most_nulls(df)

print(f"Column with most null values is '{column_with_most_nulls}' and has {max_null} null values")

Column with most null values is 'rate' and has 7755 null values


### Confirming null value count in a column

In [65]:
print(f"The number of null values in rate column: {df['rate'].isnull().sum()}")

The number of null values in rate column: 7755


__After dropping the duplicates, the number of rows has reduced by about 100__

## DATA CLEANING SECTION
#### Now start cleaning the columns

In [66]:
df.head()

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),listed_in(type),listed_in(city)
0,Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,Buffet,Banashankari
1,Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,Buffet,Banashankari
2,San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,Buffet,Banashankari
3,Addhuri Udupi Bhojana,No,No,3.7/5,88,Banashankari,Quick Bites,"South Indian, North Indian",300,Buffet,Banashankari
4,Grand Village,No,No,3.8/5,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600,Buffet,Banashankari


### Clean the 'rate' column

__Step 1: Check the data type__

In [67]:
# Use positional access: after drop_duplicates() the index is no longer a
# gap-free range, so df['rate'][0] is a *label* lookup that only works while
# the row labelled 0 still exists. .iloc[0] always returns the first row.
type(df['rate'].iloc[0])

str

__Step 2: Check for unique values__
- To find any non-numeric placeholders or inconsistently formatted values

In [68]:
df['rate'].unique().tolist()

['4.1/5',
 '3.8/5',
 '3.7/5',
 '3.6/5',
 '4.6/5',
 '4.0/5',
 '4.2/5',
 '3.9/5',
 '3.1/5',
 '3.0/5',
 '3.2/5',
 '3.3/5',
 '2.8/5',
 '4.4/5',
 '4.3/5',
 'NEW',
 '2.9/5',
 '3.5/5',
 nan,
 '2.6/5',
 '3.8 /5',
 '3.4/5',
 '4.5/5',
 '2.5/5',
 '2.7/5',
 '4.7/5',
 '2.4/5',
 '2.2/5',
 '2.3/5',
 '3.4 /5',
 '-',
 '3.6 /5',
 '4.8/5',
 '3.9 /5',
 '4.2 /5',
 '4.0 /5',
 '4.1 /5',
 '3.7 /5',
 '3.1 /5',
 '2.9 /5',
 '3.3 /5',
 '2.8 /5',
 '3.5 /5',
 '2.7 /5',
 '2.5 /5',
 '3.2 /5',
 '2.6 /5',
 '4.5 /5',
 '4.3 /5',
 '4.4 /5',
 '4.9/5',
 '2.1/5',
 '2.0/5',
 '1.8/5',
 '4.6 /5',
 '4.9 /5',
 '3.0 /5',
 '4.8 /5',
 '2.3 /5',
 '4.7 /5',
 '2.4 /5',
 '2.1 /5',
 '2.2 /5',
 '2.0 /5',
 '1.8 /5']

__Step 3: Replace "NEW" & "-" with NaN, then remove "/5" portion and return all values as floats__

In [69]:
def handle_rate(value):
    """Convert a raw rating such as '4.1/5' or '3.8 /5' to a float.

    Parameters
    ----------
    value : str, float or None
        A raw cell from the 'rate' column.

    Returns
    -------
    float
        The number in front of the '/'. ``np.nan`` is returned when the cell
        is missing (None / NaN) or holds one of the non-numeric placeholders
        'NEW' or '-'.

    Raises
    ------
    ValueError
        If the text in front of the '/' cannot be parsed as a number.
    """
    # Handle missing cells explicitly rather than relying on the
    # str(nan) -> 'nan' -> float('nan') round trip, which fails for None.
    if pd.isna(value):
        return np.nan

    # Strip surrounding whitespace so padded placeholders are recognised too.
    text = str(value).strip()
    # Non-numeric placeholders seen in the column's unique values.
    if text in ('-', 'NEW'):
        return np.nan

    # Keep only the part before '/', e.g. '3.8 /5' -> '3.8 ' -> 3.8
    return float(text.split('/')[0])

# Replace the raw rating strings with their cleaned float values,
# then preview the first few converted entries
df['rate'] = df['rate'].map(handle_rate)
df['rate'].head()

0    4.1
1    4.1
2    3.8
3    3.7
4    3.8
Name: rate, dtype: float64

__Check null values in rate column:__

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51609 entries, 0 to 51716
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   name                         51609 non-null  object 
 1   online_order                 51609 non-null  object 
 2   book_table                   51609 non-null  object 
 3   rate                         41590 non-null  float64
 4   votes                        51609 non-null  int64  
 5   location                     51588 non-null  object 
 6   rest_type                    51382 non-null  object 
 7   cuisines                     51564 non-null  object 
 8   approx_cost(for two people)  51265 non-null  object 
 9   listed_in(type)              51609 non-null  object 
 10  listed_in(city)              51609 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 6.7+ MB


__Step 4: Fill null values (NaN) with the mean of the 'rate' column__

In [71]:
# Impute the missing ratings with the column mean, rounded to 4 decimals.
# The result is assigned back to the column instead of calling
# df['rate'].fillna(..., inplace=True): that in-place call acts on the
# intermediate object returned by df['rate'], and pandas warns that from
# version 3.0 it no longer updates df.
# NOTE(review): plain reassignment, as done below, is the form pandas
# recommends; .infer_objects(copy=False) is not needed because 'rate' is
# already float64 at this point.

df['rate'] = df['rate'].fillna(df['rate'].mean().round(4)) 
df['rate'].unique().tolist()

[4.1,
 3.8,
 3.7,
 3.6,
 4.6,
 4.0,
 4.2,
 3.9,
 3.1,
 3.0,
 3.2,
 3.3,
 2.8,
 4.4,
 4.3,
 3.7001,
 2.9,
 3.5,
 2.6,
 3.4,
 4.5,
 2.5,
 2.7,
 4.7,
 2.4,
 2.2,
 2.3,
 4.8,
 4.9,
 2.1,
 2.0,
 1.8]

__Step 5: Confirm all null values 'nan' have been filled with the mean__

In [72]:
# If the sum = 0, then all null values have been eliminated
df['rate'].isnull().sum()

0

### Now check DataFrame info
__- The null values have now been filled as shown in summary below__

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51609 entries, 0 to 51716
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   name                         51609 non-null  object 
 1   online_order                 51609 non-null  object 
 2   book_table                   51609 non-null  object 
 3   rate                         51609 non-null  float64
 4   votes                        51609 non-null  int64  
 5   location                     51588 non-null  object 
 6   rest_type                    51382 non-null  object 
 7   cuisines                     51564 non-null  object 
 8   approx_cost(for two people)  51265 non-null  object 
 9   listed_in(type)              51609 non-null  object 
 10  listed_in(city)              51609 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 6.7+ MB


## Drop all rows with null values
#### dropna command should be run after replacing null values in numeric columns


In [74]:
# Drop every row that still has a null in any column. Rebind the result
# instead of using inplace=True (no performance benefit, and it prevents
# method chaining). Row labels are kept; the index is not reset.
df = df.dropna()

#### Check DataFrame info:
- After running the drop command above, about 570 rows are dropped (51609 rows before vs. 51042 after, comparing the info() output below with the one before the dropna command)
- Hypothesis: fill the null values in the 'approx_cost(for two people)' column with its average first, then run dropna once more to see whether fewer rows are dropped.

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51042 entries, 0 to 51716
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   name                         51042 non-null  object 
 1   online_order                 51042 non-null  object 
 2   book_table                   51042 non-null  object 
 3   rate                         51042 non-null  float64
 4   votes                        51042 non-null  int64  
 5   location                     51042 non-null  object 
 6   rest_type                    51042 non-null  object 
 7   cuisines                     51042 non-null  object 
 8   approx_cost(for two people)  51042 non-null  object 
 9   listed_in(type)              51042 non-null  object 
 10  listed_in(city)              51042 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 4.7+ MB


In [76]:
df.head(10)

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),listed_in(type),listed_in(city)
0,Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,Buffet,Banashankari
1,Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,Buffet,Banashankari
2,San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,Buffet,Banashankari
3,Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300,Buffet,Banashankari
4,Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600,Buffet,Banashankari
5,Timepass Dinner,Yes,No,3.8,286,Basavanagudi,Casual Dining,North Indian,600,Buffet,Banashankari
6,Rosewood International Hotel - Bar & Restaurant,No,No,3.6,8,Mysore Road,Casual Dining,"North Indian, South Indian, Andhra, Chinese",800,Buffet,Banashankari
7,Onesta,Yes,Yes,4.6,2556,Banashankari,"Casual Dining, Cafe","Pizza, Cafe, Italian",600,Cafes,Banashankari
8,Penthouse Cafe,Yes,No,4.0,324,Banashankari,Cafe,"Cafe, Italian, Continental",700,Cafes,Banashankari
9,Smacznego,Yes,No,4.2,504,Banashankari,Cafe,"Cafe, Mexican, Italian, Momos, Beverages",550,Cafes,Banashankari
