# Web scrapping - Berlin Restaurants

Source webpage: *https://www.berlin.de/restaurants/stadtteile/*

##### Importing python libraries:

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import sys

##### Copying progress bar function:

In [2]:
# Progress bar function
# Source: https://stackoverflow.com/questions/3160699/python-progress-bar

def progressbar(it, prefix="", size=60, out=sys.stdout):
    count = len(it)
    def show(j):
        x = int(size*j/count)
        print("{}[{}{}] {}/{}".format(prefix, "#"*x, "."*(size-x), j, count), 
                end='\r', file=out, flush=True)
    show(0)
    for i, item in enumerate(it):
        yield item
        show(i+1)
    print("\n", flush=True, file=out)

##### Selecting main page using Beautiful Soup:

In [3]:
page_url = "https://www.berlin.de/restaurants/stadtteile/"
page = requests.get(page_url)

soup = BeautifulSoup(page.content, 'html.parser')
#print(soup.prettify())

##### Finding links for each cityzone:

In [4]:
page_list_zones = soup.find('ul', class_ = 'decoda-list')
list_all_zones = page_list_zones.find_all('a', href=True)

##### Creating sublinks for each cityzone:

In [5]:
zone_sublinks = []

for el in list_all_zones:
    zone_sublinks.append(el['href'])

zone_sublinks

['/restaurants/stadtteile/charlottenburg/',
 '/restaurants/stadtteile/friedrichshain/',
 '/restaurants/stadtteile/hellersdorf/',
 '/restaurants/stadtteile/hohenschoenhausen/',
 '/restaurants/stadtteile/koepenick/',
 '/restaurants/stadtteile/kreuzberg/',
 '/restaurants/stadtteile/lichtenberg/',
 '/restaurants/stadtteile/marzahn/',
 '/restaurants/stadtteile/mitte/',
 '/restaurants/stadtteile/neukoelln/',
 '/restaurants/stadtteile/pankow/',
 '/restaurants/stadtteile/prenzlauer-berg/',
 '/restaurants/stadtteile/reinickendorf/',
 '/restaurants/stadtteile/schoeneberg/',
 '/restaurants/stadtteile/spandau/',
 '/restaurants/stadtteile/steglitz/',
 '/restaurants/stadtteile/tempelhof/',
 '/restaurants/stadtteile/tiergarten/',
 '/restaurants/stadtteile/treptow/',
 '/restaurants/stadtteile/wedding/',
 '/restaurants/stadtteile/weissensee/',
 '/restaurants/stadtteile/wilmersdorf/',
 '/restaurants/stadtteile/zehlendorf/']

##### Creating complete links for each cityzone:

In [6]:
zone_links = []

for zone in zone_sublinks:
    zone_page_url = 'https://www.berlin.de' + zone
    zone_links.append(zone_page_url)

zone_links

['https://www.berlin.de/restaurants/stadtteile/charlottenburg/',
 'https://www.berlin.de/restaurants/stadtteile/friedrichshain/',
 'https://www.berlin.de/restaurants/stadtteile/hellersdorf/',
 'https://www.berlin.de/restaurants/stadtteile/hohenschoenhausen/',
 'https://www.berlin.de/restaurants/stadtteile/koepenick/',
 'https://www.berlin.de/restaurants/stadtteile/kreuzberg/',
 'https://www.berlin.de/restaurants/stadtteile/lichtenberg/',
 'https://www.berlin.de/restaurants/stadtteile/marzahn/',
 'https://www.berlin.de/restaurants/stadtteile/mitte/',
 'https://www.berlin.de/restaurants/stadtteile/neukoelln/',
 'https://www.berlin.de/restaurants/stadtteile/pankow/',
 'https://www.berlin.de/restaurants/stadtteile/prenzlauer-berg/',
 'https://www.berlin.de/restaurants/stadtteile/reinickendorf/',
 'https://www.berlin.de/restaurants/stadtteile/schoeneberg/',
 'https://www.berlin.de/restaurants/stadtteile/spandau/',
 'https://www.berlin.de/restaurants/stadtteile/steglitz/',
 'https://www.berl

##### Creating empty list for dictionaries with restaurant informations:

In [7]:
all_restaurants_list = []

##### Finding links for each restaurant in each cityzone (function):

In [8]:
def find_restaurant_links(zone_link):
    # Opening page of each restaurant:
    zone_page = requests.get(zone_link)
    local_soup = BeautifulSoup(zone_page.content, 'html.parser')
    #print(local_soup.prettify())
    # Finding links for each restarant from each zone by searching specific "text" in hrefs:
    restaurant_sublink = local_soup.find_all('a', href=lambda href: href and "restaurants/adressen" in href and "html" in href, title=False)

    # Creating temporary list of sublinks for each zone:
    subzone_sublinks = []

    # Adding all sublinks info list:
    for el in restaurant_sublink:
        subzone_sublinks.append(el['href'])

    # Adding "htpps - prefix" for each link:
    for zone in subzone_sublinks:
        zone_page_url = 'https://www.berlin.de' + zone
        all_restaurants_list.append(zone_page_url)
    #print(len(all_restaurants_list))

    return all_restaurants_list

##### Looping through all zones:

In [9]:
for link in zone_links:
    find_restaurant_links(link)

##### Number of the restaurants:


In [10]:
len(all_restaurants_list)

2734

##### Creating the empty main_list where all info dictionaries will be collected:

In [11]:
main_list = []

##### Getting dictionaries data - name, type of restaurant, address, zip-code & phone:

In [12]:
def restaurant_info(restaurant):
    
    # Exploring restaurant subpage:
    restaurant_page = requests.get(restaurant)
    local_soup = BeautifulSoup(restaurant_page.content, 'html.parser')
    #print(local_soup.prettify())
   
    # Finding data field:
    restaurant_data = local_soup.find('div', class_ = 'befi-address')

    # Excluding name, address, zip code and phone from field:
    restaurant_name_tag = restaurant_data.find('div')
    restaurant_name = restaurant_name_tag.text.strip()

    restaurant_address_tag = restaurant_name_tag.find_next('div')
    restaurant_address = restaurant_address_tag.text.strip()

    restaurant_zip_tag = restaurant_data.find('span')
    restaurant_zip = restaurant_zip_tag.text.strip()

    restaurant_phone_tag = restaurant_zip_tag.find_next('a')
    restaurant_phone = restaurant_phone_tag.text.strip()
    # Checking if the phone number exists, otherwise writing "None" as a value:
    if not restaurant_phone.startswith("("):
        restaurant_phone = 'None'

    # Finding the restaurant type in http address (link):
    text = (restaurant.split("adressen/",1)[1]).split("/",1)[0]

    # Splitting and merging text to get correct string type:
    splittet_text = text.split('-',1)
    if len(splittet_text) == 1:
        new_name = splittet_text[0].capitalize()
    else:
        new_name = splittet_text[0].capitalize() + ' ' + splittet_text[1].capitalize()


    # Creating dictionary with restaurant info according new created dataset:
    restaurant_data[restaurant] = {'Name': restaurant_name,
                        'Restaurant Type': new_name,
                        'Address': restaurant_address,
                        'Zip Code': restaurant_zip,
                        'Phone': restaurant_phone}
    
    return restaurant_data[restaurant]

##### Looping through all restaurants and appending infos to new dataset:

In [13]:
start_time = time.time()

for i in progressbar(range(len(all_restaurants_list)), "Computing: ", 50):
    new_info = restaurant_info(all_restaurants_list[i])
    # Appending the dictionary to the main_list:
    main_list.append(new_info)

end_time =  time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {round(elapsed_time,2)}s")

Computing: [####..............................................] 231/2734

KeyboardInterrupt: 

##### Checking list format and length:

In [None]:
main_list
len(main_list)

2734

##### Creating new dataset for restaurants:

In [None]:
table_view = {'Name': [], 'Restaurant_Type': [], 'Address': [], 'Zip_Code': [], 'Phone': []}

dataset_restaurants = pd.DataFrame.from_records(main_list)

##### Exporting dataset as .csv file:

In [16]:
dataset_restaurants.to_csv('dataset_berlin_restaurants.csv', index=False)