In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

import re
from collections import Counter
from PIL import Image

from reviews_scraping import get_restaurant_reviews

%matplotlib inline

restaurants_data = pd.read_csv('TA_restaurants_curated.csv', encoding='utf8', index_col=0)

restaurants_data.head()

Unnamed: 0,Name,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,Martine of Martine's Table,Amsterdam,"['French', 'Dutch', 'European']",1.0,5.0,$$ - $$$,136.0,"[['Just like home', 'A Warm Welcome to Wintry ...",/Restaurant_Review-g188590-d11752080-Reviews-M...,d11752080
1,De Silveren Spiegel,Amsterdam,"['Dutch', 'European', 'Vegetarian Friendly', '...",2.0,4.5,$$$$,812.0,"[['Great food and staff', 'just perfect'], ['0...",/Restaurant_Review-g188590-d693419-Reviews-De_...,d693419
2,La Rive,Amsterdam,"['Mediterranean', 'French', 'International', '...",3.0,4.5,$$$$,567.0,"[['Satisfaction', 'Delicious old school restau...",/Restaurant_Review-g188590-d696959-Reviews-La_...,d696959
3,Vinkeles,Amsterdam,"['French', 'European', 'International', 'Conte...",4.0,5.0,$$$$,564.0,"[['True five star dinner', 'A superb evening o...",/Restaurant_Review-g188590-d1239229-Reviews-Vi...,d1239229
4,Librije's Zusje Amsterdam,Amsterdam,"['Dutch', 'European', 'International', 'Vegeta...",5.0,4.5,$$$$,316.0,"[['Best meal.... EVER', 'super food experience...",/Restaurant_Review-g188590-d6864170-Reviews-Li...,d6864170


In [22]:
restaurants_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125527 entries, 0 to 1666
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Name               125527 non-null  object 
 1   City               125527 non-null  object 
 2   Cuisine Style      94176 non-null   object 
 3   Ranking            115876 non-null  float64
 4   Rating             115897 non-null  float64
 5   Price Range        77672 non-null   object 
 6   Number of Reviews  108183 non-null  float64
 7   Reviews            115911 non-null  object 
 8   URL_TA             125527 non-null  object 
 9   ID_TA              125527 non-null  object 
dtypes: float64(3), object(7)
memory usage: 10.5+ MB


# Data 
- Name: name of the restaurant
- City: city location of the restaurant
- Cuisine Style: cuisine style(s) of the restaurant, in a Python list object (94 046 non-null)
    - French, Dutch, European, Vegetarian Friendly, etc.
- Ranking: rank of the restaurant among the total number of restaurants in the city as a float object (115 645 non-null)
    - Float type
- Rating: rate of the restaurant on a scale from 1 to 5, as a float object (115 658 non-null)
    - Float between -1 and 5
- Price Range: price range of the restaurant among 3 categories , as a categorical type (77 555 non-null)
    - $, $$, $$$, $$$$, $-$$, etc.
- Number of Reviews: number of reviews that customers have let to the restaurant, as a float object (108 020 non-null)
    - Float object
- Reviews: 2 reviews that are displayed on the restaurants scrolling page of the city, as a list of list object where the first list contains the 2 reviews, and the second le dates when these reviews were written (115 673 non-null)
    - Python List of two objects
- URL_TA: part of the URL of the detailed restaurant page that comes after 'www.tripadvisor.com' as a string object (124 995 non-null)
- ID_TA: identification of the restaurant in the TA database constructed a one letter and a number (124 995 non-null)

# Data Cleaning

## Turn ranking into categorical datatype 

In [23]:
restaurants_data['Ranking'] = restaurants_data['Ranking'].astype('category')

## Turn Reviews from float to int

In [24]:
restaurants_data['Number of Reviews'] = restaurants_data['Number of Reviews'].fillna(0)
restaurants_data['Number of Reviews'] = restaurants_data['Number of Reviews'].round(0).astype('int')

## Remove duplicated rows by ID_TA
    - There are duplicated rows by ID_TA (125 527 entries and 201 duplicated values)
    - We will remove the duplicated values and keep only the first ones

In [25]:
print(restaurants_data[restaurants_data.ID_TA.duplicated() == True].ID_TA.count())

201


In [26]:
restaurants_data = restaurants_data.drop_duplicates('ID_TA', keep='first')
restaurants_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125326 entries, 0 to 1666
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   Name               125326 non-null  object  
 1   City               125326 non-null  object  
 2   Cuisine Style      94047 non-null   object  
 3   Ranking            115710 non-null  category
 4   Rating             115734 non-null  float64 
 5   Price Range        77574 non-null   object  
 6   Number of Reviews  125326 non-null  int64   
 7   Reviews            115745 non-null  object  
 8   URL_TA             125326 non-null  object  
 9   ID_TA              125326 non-null  object  
dtypes: category(1), float64(1), int64(1), object(7)
memory usage: 10.4+ MB


In [27]:
print(restaurants_data[restaurants_data.ID_TA.duplicated() == True].ID_TA.count())

0


## Rename columns removing blank spaces and capital letters

In [28]:
restaurants_data.rename(columns={'Name': 'name',
            'City': 'city',
            'Cuisine Style':'cuisine_style',
            'Ranking': 'ranking',
            'Rating': 'rating',
            'Price Range':'price_range',
            'Number of Reviews':'reviews_number',
            'Reviews': 'reviews',
            'URL_TA': 'url_ta',
            'ID_TA': 'id_ta'}, inplace=True)

## Replace reviews with rating == -1.0 and setting them to 0
    - There are some reviews bellow 0, which is impossible

In [29]:
print(restaurants_data[restaurants_data.rating == -1.0].city.count())

41


In [30]:
restaurants_data.rating.replace(-1, 0, inplace=True)
print(restaurants_data[restaurants_data.rating == -1.0].city.count())

0


## Eliminate Null values in cuisine_style
    - There are several values of cuisine style null and we decided that these restaurants didn't have enough information, so we will remove them from the dataset.

In [31]:
restaurants_data = restaurants_data[restaurants_data.cuisine_style.isnull()== False]
print(restaurants_data.name.count())
restaurants_data.info()

94047
<class 'pandas.core.frame.DataFrame'>
Int64Index: 94047 entries, 0 to 1666
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   name            94047 non-null  object  
 1   city            94047 non-null  object  
 2   cuisine_style   94047 non-null  object  
 3   ranking         88947 non-null  category
 4   rating          88933 non-null  float64 
 5   price_range     77574 non-null  object  
 6   reviews_number  94047 non-null  int64   
 7   reviews         88942 non-null  object  
 8   url_ta          94047 non-null  object  
 9   id_ta           94047 non-null  object  
dtypes: category(1), float64(1), int64(1), object(7)
memory usage: 8.0+ MB


## Eliminate Null values in ranking
    - There are several values of ranking null and we decided that these restaurants didn't have enough information, so we will remove them from the dataset.

In [34]:
restaurants_data = restaurants_data[restaurants_data.ranking.isnull()== False]
print(restaurants_data.name.count())
restaurants_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88947 entries, 0 to 1596
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   name            88947 non-null  object  
 1   city            88947 non-null  object  
 2   cuisine_style   88947 non-null  object  
 3   ranking         88947 non-null  category
 4   rating          88853 non-null  float64 
 5   price_range     75406 non-null  object  
 6   reviews_number  88947 non-null  int64   
 7   reviews         88852 non-null  object  
 8   url_ta          88947 non-null  object  
 9   id_ta           88947 non-null  object  
dtypes: category(1), float64(1), int64(1), object(7)
memory usage: 7.6+ MB


## Eliminate Null values in rating
    - There are several values of rating null and we decided that these restaurants didn't have enough information, so we will remove them from the dataset.

In [35]:
restaurants_data = restaurants_data[restaurants_data.rating.isnull()== False]
print(restaurants_data.name.count())
restaurants_data.info()

88853
<class 'pandas.core.frame.DataFrame'>
Int64Index: 88853 entries, 0 to 1593
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   name            88853 non-null  object  
 1   city            88853 non-null  object  
 2   cuisine_style   88853 non-null  object  
 3   ranking         88853 non-null  category
 4   rating          88853 non-null  float64 
 5   price_range     75339 non-null  object  
 6   reviews_number  88853 non-null  int64   
 7   reviews         88852 non-null  object  
 8   url_ta          88853 non-null  object  
 9   id_ta           88853 non-null  object  
dtypes: category(1), float64(1), int64(1), object(7)
memory usage: 7.6+ MB


## Eliminate Null values in price_range
    - There are several values of price range null and we decided that these restaurants didn't have enough information, so we will remove them from the dataset.

In [36]:
restaurants_data = restaurants_data[restaurants_data.price_range.isnull()== False]
print(restaurants_data.name.count())
restaurants_data.info()

75339
<class 'pandas.core.frame.DataFrame'>
Int64Index: 75339 entries, 0 to 1593
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   name            75339 non-null  object  
 1   city            75339 non-null  object  
 2   cuisine_style   75339 non-null  object  
 3   ranking         75339 non-null  category
 4   rating          75339 non-null  float64 
 5   price_range     75339 non-null  object  
 6   reviews_number  75339 non-null  int64   
 7   reviews         75338 non-null  object  
 8   url_ta          75339 non-null  object  
 9   id_ta           75339 non-null  object  
dtypes: category(1), float64(1), int64(1), object(7)
memory usage: 6.5+ MB


## Normalize Review links
    - It is missing the "www.tripadvisor.com" from the URL_TA column

In [39]:
restaurants_data["url_ta"] = "www.tripadvisor.com" + restaurants_data["url_ta"]
restaurants_data.head()

Unnamed: 0,name,city,cuisine_style,ranking,rating,price_range,reviews_number,reviews,url_ta,id_ta
0,Martine of Martine's Table,Amsterdam,"['French', 'Dutch', 'European']",1.0,5.0,$$ - $$$,136,"[['Just like home', 'A Warm Welcome to Wintry ...",www.tripadvisor.comwww.tripadvisor.comwww.trip...,d11752080
1,De Silveren Spiegel,Amsterdam,"['Dutch', 'European', 'Vegetarian Friendly', '...",2.0,4.5,$$$$,812,"[['Great food and staff', 'just perfect'], ['0...",www.tripadvisor.comwww.tripadvisor.comwww.trip...,d693419
2,La Rive,Amsterdam,"['Mediterranean', 'French', 'International', '...",3.0,4.5,$$$$,567,"[['Satisfaction', 'Delicious old school restau...",www.tripadvisor.comwww.tripadvisor.comwww.trip...,d696959
3,Vinkeles,Amsterdam,"['French', 'European', 'International', 'Conte...",4.0,5.0,$$$$,564,"[['True five star dinner', 'A superb evening o...",www.tripadvisor.comwww.tripadvisor.comwww.trip...,d1239229
4,Librije's Zusje Amsterdam,Amsterdam,"['Dutch', 'European', 'International', 'Vegeta...",5.0,4.5,$$$$,316,"[['Best meal.... EVER', 'super food experience...",www.tripadvisor.comwww.tripadvisor.comwww.trip...,d6864170


# Copy of the clean dataset

In [40]:
restaurants = restaurants_data.copy()

# Web scraping the reviews of each one of the restaurants

In [44]:
for index, row in restaurants.iterrows():
    print('Getting reviews from ' + str(row.name))
    get_restaurant_reviews(row.id_ta, row.url_ta)

Getting reviews from 0
d11752080
Getting reviews from 1
d693419
Getting reviews from 2
d696959
Getting reviews from 3
d1239229
Getting reviews from 4
d6864170
Getting reviews from 5
d696902
Getting reviews from 6
d1014732
Getting reviews from 7
d697058
Getting reviews from 8
d697009
Getting reviews from 9
d1955652
Getting reviews from 10
d10275170
Getting reviews from 11
d1014753
Getting reviews from 12
d7695005
Getting reviews from 13
d3893242
Getting reviews from 14
d1408533
Getting reviews from 15
d3200493
Getting reviews from 16
d8562698
Getting reviews from 17
d8567150
Getting reviews from 18
d6022573
Getting reviews from 19
d10071792
Getting reviews from 20
d3589045
Getting reviews from 21
d8528923
Getting reviews from 22
d2213743
Getting reviews from 24
d7003171
Getting reviews from 25
d1309073
Getting reviews from 26
d2315768
Getting reviews from 27
d4177229
Getting reviews from 28
d1504781
Getting reviews from 29
d3198485
Getting reviews from 30
d2292732
Getting reviews from 3