# __Web Scraping Restaurants from Zomato__

In [32]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from ipywidgets import interactive, HBox, VBox, widgets, interact, interactive_output

## __Step 1: Scraping the 270 best restaurants in Lisbon__

Zomato requires the user to be identified through their user-agent. 
For more context: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent

__Google "what is my user agent" to find out your user-agent string.__ This code will not work without your specific user agent.

In [33]:
# Zomato needs a User-Agent header to identify the client; replace this
# string with the one reported for your own browser.
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:95.0) Gecko/20100101 Firefox/95.0'

headers = {'User-Agent': user_agent}

url = 'https://www.zomato.com/grande-lisboa'
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Surface an HTTP error (e.g. a rejected User-Agent) here, rather than as a
# confusing IndexError when the expected markup is missing further down.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

The scraped website, https://www.zomato.com/grande-lisboa, displays a menu with the 30 most popular localities in Greater Lisbon, and a link for each to find restaurants in this locality. The 30 localities will be put in a list called ___area_names___ and the links for the restaurants page in each locality in ___restaurant_links___.

In [34]:
# Of all <div> elements carrying this class, the last one is used as the
# container for the locality menu.
all_content = soup.findAll('div', {'class':'sc-bke1zw-0 fIuLDK'})[-1]

# One <a> per locality: its href is the restaurants page for that locality.
restaurants_info = all_content.findAll('a')
restaurant_links = [anchor['href'] for anchor in restaurants_info]

print(f"Number of links: {len(restaurant_links)}")

# One <h5> per locality: the heading text minus its last two
# space-separated words is kept as the area name.
areas_info = all_content.findAll('h5')
area_names = [' '.join(heading.text.split(' ')[:-2]) for heading in areas_info]

print(f"Number of areas: {len(areas_info)}")

Number of links: 30
Number of areas: 30


In [35]:
# Repeat every area name 9 times (one entry per restaurant scraped for that
# area) so the list lines up row-for-row with the restaurants in the final dataframe.
full_area_names = [area_name for area_name in area_names for _ in range(9)]

## __Step 2 - Getting 9 restaurants for each of the 30 areas__

In [36]:
restaurant_names = []
tags = []
ratings = []
prices = []
links_rest = []

# Seconds to wait for each area page before giving up instead of hanging the cell.
REQUEST_TIMEOUT = 30

# for each of the 30 areas
# (`area` is not used inside the loop; the area column is built separately
# from area_names.)
for (link, area) in zip(restaurant_links, area_names):
    resp2 = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail on an HTTP error here rather than on an IndexError just below.
    resp2.raise_for_status()
    soup2 = BeautifulSoup(resp2.text, "html.parser")
    all_contents2 = soup2.findAll('div', {'class':'sc-iQtOjA dDlaSX'})[0]

    # restaurant names: every <h4> inside the results container
    restaurant_names.extend(heading.text for heading in all_contents2.findAll('h4'))

    # Tags and prices both live in <p> elements. The html class name is
    # dynamic, so a regex on its stable prefix is used, and the query is run
    # once for both fields.
    paragraphs = all_contents2.findAll('p', class_=re.compile("sc-1hez2tp-0 sc"))
    for position, paragraph in enumerate(paragraphs):
        if 'para dois' not in paragraph.text:
            continue
        # The tags are taken from the paragraph just before the price
        # paragraph. If a price paragraph is ever first, store an empty
        # string instead of wrapping around to the last paragraph.
        tags.append(paragraphs[position - 1].text if position > 0 else '')
        # Read the whole leading number so three-digit prices are not
        # truncated (slicing the first two characters turned 100 into 10).
        leading_number = re.match(r'\s*(\d+)', paragraph.text)
        prices.append(float(leading_number.group(1)) if leading_number else float('nan'))

    # ratings for this area
    rating_nodes = all_contents2.findAll('div', {'class':'sc-1q7bklc-1 cILgox'})
    ratings.extend(node.text for node in rating_nodes)

    # Links for this area: anchors whose markup contains 'info'. Only every
    # second match is kept.
    # NOTE(review): presumably each restaurant has two such anchors; confirm against the page.
    links_info = all_contents2.findAll('a', class_=re.compile("sc-"))
    lnks_info = [anchor for anchor in links_info if 'info' in str(anchor)][::2]
    links_rest.extend('https://www.zomato.com' + anchor['href'] for anchor in lnks_info)

In [37]:
# Sanity check: every scraped field should have one entry per restaurant (270 in total)
for scraped_field in (restaurant_names, tags, ratings, prices, links_rest):
    print(len(scraped_field))

270
270
270
270
270


## __Step 3 - Creating dataframe and saving to .csv__

In [38]:
# Column name -> scraped list; all lists must be the same length.
restaurant_data = {
    'Name': restaurant_names,
    'City Area': full_area_names,
    'Price For Two': prices,
    'Ratings': ratings,
    'Tags': tags,
    'Link': links_rest,
}

# Ratings are scraped as text; values that are not numeric become NaN.
df = pd.DataFrame(restaurant_data).assign(
    Ratings=lambda frame: pd.to_numeric(frame['Ratings'], errors='coerce')
)
df.head()

Unnamed: 0,Name,City Area,Price For Two,Ratings,Tags,Link
0,Boa-Bao,Chiado,48.0,4.7,"Oriental, Tailandesa, Malaia, Filipina, Vietna...",https://www.zomato.com/pt/boabao/info
1,Páteo - Bairro do Avillez,Chiado,50.0,4.5,"Portuguesa, Mediterrânica, Marisqueira, Peixe ...",https://www.zomato.com/pt/grande-lisboa/páteo-...
2,Ao 26 Vegan Food Project,Chiado,40.0,4.7,"Vegetariana, Comida saudável, Vegan",https://www.zomato.com/pt/grande-lisboa/ao-26-...
3,Cantinho do Avillez,Chiado,80.0,4.6,"Portuguesa, Autor",https://www.zomato.com/pt/cantinhodoavillez/info
4,TOPO Chiado,Chiado,45.0,4.0,"Bebidas, Pizza, Hamburgueria, Snacks",https://www.zomato.com/pt/grande-lisboa/topo-c...


In [39]:
# Persist the scraped table (without the index column) so later cells can reload it.
df.to_csv(path_or_buf='restaurants_info.csv', index=False)

In [40]:
# Reload the saved table and drop incomplete rows (for example ratings that
# were coerced to NaN because they were not numeric).
# dropna() leaves gaps in the index, so reset it: the row labels are then
# contiguous 0..n-1 again and lookups by row number keep working.
df = pd.read_csv('restaurants_info.csv').dropna().reset_index(drop=True)
df.head()

Unnamed: 0,Name,City Area,Price For Two,Ratings,Tags,Link
0,Boa-Bao,Chiado,48.0,4.7,"Oriental, Tailandesa, Malaia, Filipina, Vietna...",https://www.zomato.com/pt/boabao/info
1,Páteo - Bairro do Avillez,Chiado,50.0,4.5,"Portuguesa, Mediterrânica, Marisqueira, Peixe ...",https://www.zomato.com/pt/grande-lisboa/páteo-...
2,Ao 26 Vegan Food Project,Chiado,40.0,4.7,"Vegetariana, Comida saudável, Vegan",https://www.zomato.com/pt/grande-lisboa/ao-26-...
3,Cantinho do Avillez,Chiado,80.0,4.6,"Portuguesa, Autor",https://www.zomato.com/pt/cantinhodoavillez/info
4,TOPO Chiado,Chiado,45.0,4.0,"Bebidas, Pizza, Hamburgueria, Snacks",https://www.zomato.com/pt/grande-lisboa/topo-c...


## __Step 4 - Building a browser with ipy widgets__

In [41]:
# This generates a list of all the unique tags that have been assigned to the
# restaurants in our dataset, in order of first appearance.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace: the previous loop variable `tags` overwrote the scraped `tags`
# list, which broke re-running the dataframe cell afterwards.
tags_list = list(dict.fromkeys(
    single_tag
    for tag_string in df['Tags']
    for single_tag in tag_string.split(', ')
))

# This defines a variable for when the user wants to choose "All" - this is only used in our restaurant Food Browser
all_option = '--All--'

# This creates a list for the areas
areas_list = sorted( list(set(list(df['City Area']))) + [all_option] )

# Food type shown in the dropdown -> tags that count as that food type.
# NOTE(review): the sample rows shown in this notebook carry Portuguese tags
# (e.g. "Tailandesa", "Vegetariana", "Portuguesa") while these labels are
# English; confirm the labels match the language of the scraped tags.
restaurant_types = {'Asian': ['Japanese', 'Sushi', 'Chinese', 'Oriental','Vietnamese','Korean','Indian','Thai','Malaysian','Asian','Ramen'],
                   'Vegan/Vegetarian': ['Vegetarian', 'Vegan', 'Juices'],
                   'Italian': ['Pizza', 'Italian'],
                   'Fast Food': ['Pizza','Burger', 'Gourmet Fast Food', 'Street Food','Fast Food', 'BBQ', 'Ice Cream', 'Kebab', 'American', 'Grill'],
                   'Healthy Food': ['Vegetarian', 'Vegan', 'Healthy Food', 'Mediterranean', 'Seafood', 'Fresh Fish', 'Juices', 'Tea'],
                   'European': ['Portuguese', 'Italian', 'Mediterranean', 'Spanish', 'Austrian', 'British', 'French','German','Tapas'],
                   'Drinks, Coffee & Snacks': ['Tea','Tapas','Street Food', 'Ice Cream','Beverages', 'Cafe','Desserts','Bakery', 'Coffee', 'Crepes','Finger Food', 'Juices'],
                   'Latin American': ['Mexican','Peruvian','Latin American', 'Argentine', 'Brazilian'],
                   'Portuguese': ['Portuguese', 'Petiscos', 'Madeiran'],
                   'Seafood': ['Fresh Fish','Seafood', 'Sushi'],
                   'African': ['Mozambican', 'Cape Verdean', 'Moroccan', 'African'],
                   'Middle Eastern': ['Kebab', 'Middle Eastern', 'Iranian', 'Lebanese']}

food_types = sorted(list(restaurant_types.keys()) + [all_option])

In [42]:
# Please run this code for the drop-down menus to work
def browser(a, f, p, r):
    """Display the restaurants in ``df`` that satisfy all four filters.

    Parameters
    ----------
    a : str
        City area to keep, or ``all_option`` to keep every area.
    f : str
        Key of ``restaurant_types`` naming the food type to keep, or
        ``all_option`` to keep every food type.
    p : number
        Maximum "Price For Two" (inclusive).
    r : number
        Minimum rating (inclusive).

    Prints a message when nothing matches; otherwise displays the filtered
    dataframe. Returns None.
    """
    # city area
    if a != all_option:
        area_mask = df['City Area'] == a
    else:
        area_mask = ~df['City Area'].isna()

    # food type: keep a row when at least one of its tags belongs to the
    # chosen food type. The mask is computed over the Series itself, so it
    # does not depend on the index being a contiguous 0..n-1 range. Looking
    # rows up by number raised KeyError or skipped rows after dropna().
    if f != all_option:
        wanted_tags = set(restaurant_types[f])
        food_mask = df['Tags'].map(
            lambda tag_string: not wanted_tags.isdisjoint(tag_string.split(', '))
        ).astype(bool)
    else:
        food_mask = ~df['Tags'].isna()

    # price range
    price_mask = df['Price For Two'] <= p
    # minimum rating
    rating_mask = df['Ratings'] >= r

    # displaying the dataframe
    df_to_display = df[area_mask & food_mask & price_mask & rating_mask]
    if len(df_to_display) == 0:
        print("Oops! There are no restaurants that match your search. PLease try reducing the filters.")
    else:
        display(df_to_display)

# Dropdowns for the two categorical filters.
city_area = widgets.Dropdown(
    options=areas_list,
    description='City Area',
)
food_type = widgets.Dropdown(
    options=food_types,
    description='Food Type',
)

# Slider bounds come from the data: the price slider starts at the maximum and
# the rating slider at the minimum, so no row is filtered out initially.
price_column = df['Price For Two']
rating_column = df['Ratings']

max_price = widgets.IntSlider(
    min=price_column.min(),
    max=price_column.max(),
    value=price_column.max(),
    description='Max. Price For Two',
    style={'description_width': 'initial'},
)
min_rating = widgets.FloatSlider(
    min=rating_column.min(),
    max=rating_column.max(),
    value=rating_column.min(),
    step=0.1,
    description='Minimum Rating',
    style={'description_width': 'initial'},
)

# Re-run browser() whenever any control changes, rendering into `out`.
controls = {"a": city_area, "f": food_type, "p": max_price, "r": min_rating}
out = interactive_output(browser, controls)

# Layout: dropdowns on the first row, sliders on the second.
hbox1 = HBox([city_area, food_type])
hbox2 = HBox([max_price, min_rating])
ui = VBox([hbox1, hbox2])

display(ui, out)

VBox(children=(HBox(children=(Dropdown(description='City Area', options=('--All--', 'Alcântara', 'Algés', 'Alv…

Output()