In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

import re
from collections import Counter
from PIL import Image

%matplotlib inline

# Read the curated restaurants CSV, using its first column as the index.
restaurants_data = pd.read_csv(
    'TA_restaurants_curated.csv',
    index_col=0,
    encoding='utf8',
)

# Preview the first rows of the loaded frame.
restaurants_data.head()

Unnamed: 0,Name,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,Martine of Martine's Table,Amsterdam,"['French', 'Dutch', 'European']",1.0,5.0,$$ - $$$,136.0,"[['Just like home', 'A Warm Welcome to Wintry ...",/Restaurant_Review-g188590-d11752080-Reviews-M...,d11752080
1,De Silveren Spiegel,Amsterdam,"['Dutch', 'European', 'Vegetarian Friendly', '...",2.0,4.5,$$$$,812.0,"[['Great food and staff', 'just perfect'], ['0...",/Restaurant_Review-g188590-d693419-Reviews-De_...,d693419
2,La Rive,Amsterdam,"['Mediterranean', 'French', 'International', '...",3.0,4.5,$$$$,567.0,"[['Satisfaction', 'Delicious old school restau...",/Restaurant_Review-g188590-d696959-Reviews-La_...,d696959
3,Vinkeles,Amsterdam,"['French', 'European', 'International', 'Conte...",4.0,5.0,$$$$,564.0,"[['True five star dinner', 'A superb evening o...",/Restaurant_Review-g188590-d1239229-Reviews-Vi...,d1239229
4,Librije's Zusje Amsterdam,Amsterdam,"['Dutch', 'European', 'International', 'Vegeta...",5.0,4.5,$$$$,316.0,"[['Best meal.... EVER', 'super food experience...",/Restaurant_Review-g188590-d6864170-Reviews-Li...,d6864170


In [7]:
# Summarise the raw frame: column dtypes, non-null counts and memory usage.
restaurants_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125527 entries, 0 to 1666
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Name               125527 non-null  object 
 1   City               125527 non-null  object 
 2   Cuisine Style      94176 non-null   object 
 3   Ranking            115876 non-null  float64
 4   Rating             115897 non-null  float64
 5   Price Range        77672 non-null   object 
 6   Number of Reviews  108183 non-null  float64
 7   Reviews            115911 non-null  object 
 8   URL_TA             125527 non-null  object 
 9   ID_TA              125527 non-null  object 
dtypes: float64(3), object(7)
memory usage: 10.5+ MB


# Data Cleaning
- Convert Ranking to a categorical datatype and Number of Reviews from float to int (missing counts are filled with 0)

In [8]:
# Store Ranking as a pandas categorical column.
restaurants_data['Ranking'] = restaurants_data['Ranking'].astype('category')

# Missing review counts are filled with 0 first so that the column can be
# rounded and cast to an integer dtype.
restaurants_data['Number of Reviews'] = (
    restaurants_data['Number of Reviews']
    .fillna(0)
    .round(0)
    .astype('int')
)

- Remove duplicated rows by ID_TA

In [9]:
# Count the rows whose ID_TA has already appeared earlier in the frame.
print(
    restaurants_data.loc[restaurants_data['ID_TA'].duplicated(), 'ID_TA'].count()
)

201


In [10]:
# Keep only the first row seen for each ID_TA; later repeats are discarded.
restaurants_data = restaurants_data[
    ~restaurants_data.duplicated(subset='ID_TA', keep='first')
]

# Re-check row counts and dtypes after de-duplication.
restaurants_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125326 entries, 0 to 1666
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   Name               125326 non-null  object  
 1   City               125326 non-null  object  
 2   Cuisine Style      94047 non-null   object  
 3   Ranking            115710 non-null  category
 4   Rating             115734 non-null  float64 
 5   Price Range        77574 non-null   object  
 6   Number of Reviews  125326 non-null  int32   
 7   Reviews            115745 non-null  object  
 8   URL_TA             125326 non-null  object  
 9   ID_TA              125326 non-null  object  
dtypes: category(1), float64(1), int32(1), object(7)
memory usage: 10.0+ MB


- Rename columns removing blank spaces and capital letters

In [11]:
# Switch the listed columns to lower-case, underscore-separated names.
# URL_TA and ID_TA are not part of this mapping and keep their names.
restaurants_data.rename(
    columns={
        'Name': 'name',
        'City': 'city',
        'Cuisine Style': 'cuisine_style',
        'Ranking': 'ranking',
        'Rating': 'rating',
        'Price Range': 'price_range',
        'Number of Reviews': 'reviews_number',
        'Reviews': 'reviews',
    },
    inplace=True,
)

- Replace ratings equal to -1.0 with 0

In [12]:
# Report how many rows currently have a rating of -1.0.
print(restaurants_data[restaurants_data.rating == -1.0].city.count())

# Replace -1 ratings with 0 and assign the result back to the column.
# Calling replace(..., inplace=True) on the `restaurants_data.rating`
# accessor is a chained in-place operation: recent pandas versions warn
# about it, and with Copy-on-Write enabled it no longer updates the frame.
restaurants_data['rating'] = restaurants_data['rating'].replace(-1, 0)

41


- Normalize Review links

In [13]:
# Prefix the site host onto the review paths stored in URL_TA.
# Rows that already start with the host are left untouched, so re-running
# this cell does not stack the prefix a second time.
ta_host = "www.tripadvisor.com"
already_prefixed = restaurants_data["URL_TA"].str.startswith(ta_host, na=False)
restaurants_data["URL_TA"] = restaurants_data["URL_TA"].where(
    already_prefixed, ta_host + restaurants_data["URL_TA"]
)
restaurants_data.head()

Unnamed: 0,name,city,cuisine_style,ranking,rating,price_range,reviews_number,reviews,URL_TA,ID_TA
0,Martine of Martine's Table,Amsterdam,"['French', 'Dutch', 'European']",1.0,5.0,$$ - $$$,136,"[['Just like home', 'A Warm Welcome to Wintry ...",www.tripadvisor.com/Restaurant_Review-g188590-...,d11752080
1,De Silveren Spiegel,Amsterdam,"['Dutch', 'European', 'Vegetarian Friendly', '...",2.0,4.5,$$$$,812,"[['Great food and staff', 'just perfect'], ['0...",www.tripadvisor.com/Restaurant_Review-g188590-...,d693419
2,La Rive,Amsterdam,"['Mediterranean', 'French', 'International', '...",3.0,4.5,$$$$,567,"[['Satisfaction', 'Delicious old school restau...",www.tripadvisor.com/Restaurant_Review-g188590-...,d696959
3,Vinkeles,Amsterdam,"['French', 'European', 'International', 'Conte...",4.0,5.0,$$$$,564,"[['True five star dinner', 'A superb evening o...",www.tripadvisor.com/Restaurant_Review-g188590-...,d1239229
4,Librije's Zusje Amsterdam,Amsterdam,"['Dutch', 'European', 'International', 'Vegeta...",5.0,4.5,$$$$,316,"[['Best meal.... EVER', 'super food experience...",www.tripadvisor.com/Restaurant_Review-g188590-...,d6864170


- Delete restaurants without cuisine styles

In [32]:
# Keep only the rows whose cuisine_style is not null.
restaurant_data = restaurants_data[restaurants_data['cuisine_style'].notna()]

# Number of restaurants remaining after the filter.
print(restaurant_data['name'].count())
restaurant_data.head()

94047


Unnamed: 0,name,city,cuisine_style,ranking,rating,price_range,reviews_number,reviews,URL_TA,ID_TA
0,Martine of Martine's Table,Amsterdam,"['French', 'Dutch', 'European']",1.0,5.0,$$ - $$$,136,"[['Just like home', 'A Warm Welcome to Wintry ...",www.tripadvisor.com/Restaurant_Review-g188590-...,d11752080
1,De Silveren Spiegel,Amsterdam,"['Dutch', 'European', 'Vegetarian Friendly', '...",2.0,4.5,$$$$,812,"[['Great food and staff', 'just perfect'], ['0...",www.tripadvisor.com/Restaurant_Review-g188590-...,d693419
2,La Rive,Amsterdam,"['Mediterranean', 'French', 'International', '...",3.0,4.5,$$$$,567,"[['Satisfaction', 'Delicious old school restau...",www.tripadvisor.com/Restaurant_Review-g188590-...,d696959
3,Vinkeles,Amsterdam,"['French', 'European', 'International', 'Conte...",4.0,5.0,$$$$,564,"[['True five star dinner', 'A superb evening o...",www.tripadvisor.com/Restaurant_Review-g188590-...,d1239229
4,Librije's Zusje Amsterdam,Amsterdam,"['Dutch', 'European', 'International', 'Vegeta...",5.0,4.5,$$$$,316,"[['Best meal.... EVER', 'super food experience...",www.tripadvisor.com/Restaurant_Review-g188590-...,d6864170


# Copy of the clean dataset

In [46]:
# Take the working copy from `restaurant_data`, the frame produced by the
# cuisine-style filter. Copying `restaurants_data` here would carry the rows
# without a cuisine style forward, because the filter result was stored
# under the singular name and `restaurants_data` itself was never filtered.
restaurants = restaurant_data.copy()