## HTML Essentials

In [1]:
%%html
<!-- Minimal page skeleton: a head with a title, and a body with a heading, a paragraph and a link. -->
<html>
    <head>
        <title>Page Title</title>
    </head>
    <body>
        <h1>This is a Heading</h1>
        <p>This is a paragraph.</p>
        <a href="https://www.w3schools.com/">This is a link to tutorials</a>
    </body>
</html>

## CSS Classes

In [2]:
%%html
<!-- The class attribute lists two space-separated class names: "center" and "medium". -->
<!-- NOTE(review): no CSS rules for these classes are visible in this notebook, so the paragraph presumably renders unstyled - confirm. -->
<p class="center medium">This paragraph refers to two classes.</p>

In [3]:
# Same markup as the first %%html cell, kept as a raw string so that it can be
# handed to a parser instead of being rendered.
html_string = r"""<html>
    <head>
        <title>Page Title</title>
    </head>
    <body>
        <h1>This is a Heading</h1>
        <p>This is a paragraph.</p>
        <a href="https://www.w3schools.com/">This is a link to tutorials</a>
    </body>
</html>"""

In [4]:
from bs4 import BeautifulSoup

In [5]:
# Parse the HTML string into a navigable tree, using the 'html.parser' backend.
html_soup = BeautifulSoup(html_string, 'html.parser')

In [6]:
# Printing the soup serialises the parsed tree back to HTML text.
print(html_soup)

<html>
<head>
<title>Page Title</title>
</head>
<body>
<h1>This is a Heading</h1>
<p>This is a paragraph.</p>
<a href="https://www.w3schools.com/">This is a link to tutorials</a>
</body>
</html>


In [7]:
# Accessing a tag name as an attribute returns that tag from the document (here the <h1>).
html_soup.h1

<h1>This is a Heading</h1>

In [8]:
# .text yields the tag's text content without the surrounding markup.
html_soup.h1.text

'This is a Heading'

In [9]:
# The <body> tag is returned together with all of its children.
html_soup.body

<body>
<h1>This is a Heading</h1>
<p>This is a paragraph.</p>
<a href="https://www.w3schools.com/">This is a link to tutorials</a>
</body>

In [10]:
# The link (<a>) tag of the document.
html_soup.a

<a href="https://www.w3schools.com/">This is a link to tutorials</a>

In [11]:
# .get('href') reads the value of the tag's href attribute.
html_soup.a.get('href')

'https://www.w3schools.com/'

### Requests & Responses

In [12]:
import requests

In [13]:
# Download a single directory entry page.
# requests applies no timeout by default, so an explicit one keeps the cell from
# hanging indefinitely when the server stalls.
response = requests.get(
    'https://www.bundes-telefonbuch.de/nuernberg/firma/karosseriefachbetrieb-hofer-tb1150222',
    timeout=10,
)

In [14]:
# Show only the beginning of the page: the complete HTML is far too long to be
# useful as cell output and bloats the saved notebook.
print(response.text[:1000])

<!DOCTYPE html>
<html lang="de">
<head>
    <meta charset="utf-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge"/>
<title>Karosseriefachbetrieb Hofer in 90431, Nürnberg</title>
<meta name="description" content=" ✓ Karosseriefachbetrieb Hofer ⌂ Hans-Bunte-Str. 47, 90431 Nürnberg. 6 Bilder, Telefonnummer und Anschrift finden Sie im Bundestelefonbuch."/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="google-site-verification" content="3OBXd2NBPoTz5Mxt-TZqneKf3PLYXGnZ9QDvEJUnH4w"/>
<meta name="msvalidate.01" content="5571DC1344E41A655D0D4C965A626E77"/> <!-- Bing -->
<script type="text/javascript">window.___gcfg = {lang: 'de'}</script>
<script src="https://apis.google.com/js/platform.js" async defer></script>
<meta name="robots" content="index,follow,archive"/>
    <link rel="canonical" href="https://www.bundes-telefonbuch.de/nuernberg/firma/karosseriefachbetrieb-hofer-tb1150222"/>



<!-- Begin Cookie Consent plugin by Silktide - http://silktide.c

In [15]:
# HTTP status of the response; 200 is the standard "OK" code.
response.status_code

200

### Beautiful Soup

In [16]:
# Parse the downloaded entry page with the 'html.parser' backend.
entry_soup = BeautifulSoup(response.text, 'html.parser')

In [17]:
# Show only the beginning of the parsed document; printing the whole tree floods
# the cell output.
print(str(entry_soup)[:1000])

<!DOCTYPE html>

<html lang="de">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<title>Karosseriefachbetrieb Hofer in 90431, Nürnberg</title>
<meta content=" ✓ Karosseriefachbetrieb Hofer ⌂ Hans-Bunte-Str. 47, 90431 Nürnberg. 6 Bilder, Telefonnummer und Anschrift finden Sie im Bundestelefonbuch." name="description"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="3OBXd2NBPoTz5Mxt-TZqneKf3PLYXGnZ9QDvEJUnH4w" name="google-site-verification"/>
<meta content="5571DC1344E41A655D0D4C965A626E77" name="msvalidate.01"/> <!-- Bing -->
<script type="text/javascript">window.___gcfg = {lang: 'de'}</script>
<script async="" defer="" src="https://apis.google.com/js/platform.js"></script>
<meta content="index,follow,archive" name="robots"/>
<link href="https://www.bundes-telefonbuch.de/nuernberg/firma/karosseriefachbetrieb-hofer-tb1150222" rel="canonical"/>
<!-- Begin Cookie Consent plugin by Silktide - http://silktide.com/c

In [18]:
# The page's <h1> contains an icon tag plus the company name padded with whitespace.
entry_soup.h1

<h1 class="noMarginTop">
<i class="fa fa-check-square-o text-success" data-placement="top" data-toggle="tooltip" title="Karosseriefachbetrieb Hofer ist verifiziert. Die Richtigkeit der eingetragenen Firmendaten wurde von Karosseriefachbetrieb Hofer bestätigt und die Inhaberschaft von Bundestelefonbuch überprüft."></i>
                                                Karosseriefachbetrieb Hofer            </h1>

In [21]:
# strip() removes the padding whitespace around the heading text, leaving just the name.
entry_soup.h1.text.strip()

'Karosseriefachbetrieb Hofer'

In [23]:
# Look up the <a> tag whose class attribute is 'detail-email'.
entry_soup.find('a', {'class':'detail-email'})

<a class="detail-email" href="mailto:hofer-n@t-online.de">hofer-n@t-online.de</a>

In [24]:
# Read the link target of the e-mail tag; the value still carries the 'mailto:' prefix.
entry_soup.find('a', {'class':'detail-email'}).get('href')

'mailto:hofer-n@t-online.de'

In [25]:
# The <div class="detail-address"> wraps the street, postal code and city in separate <span> tags.
address_tag = entry_soup.find('div', {'class':'detail-address'})
print(address_tag)

<div class="detail-address" itemprop="address" itemscope="" itemtype="http://schema.org/PostalAddress">
<span itemprop="streetAddress">
                            Hans-Bunte-Str.  47                        </span>
<br/>
<span itemprop="postalCode">90431</span>
<span itemprop="addressLocality">Nürnberg</span>
</div>


In [27]:
# .text concatenates all nested strings; strip() only trims both ends, so the
# inner padding and newlines between the parts remain.
address_tag.text.strip()

'Hans-Bunte-Str. \xa047                        \n\n90431\nNürnberg'

In [28]:
# stripped_strings is a generator, so displaying it shows only the generator object;
# it has to be consumed (e.g. with list()) to see the individual strings.
address_tag.stripped_strings

<generator object stripped_strings at 0x1086a7678>

In [29]:
# Materialise the generator: one whitespace-trimmed string per text fragment.
address_parts = list(address_tag.stripped_strings)
print(address_parts)

['Hans-Bunte-Str. \xa047', '90431', 'Nürnberg']


In [30]:
# Join the fragments into one line. Note that the non-breaking space (\xa0) inside
# the street fragment is still present in the result.
' '.join(address_parts)

'Hans-Bunte-Str. \xa047 90431 Nürnberg'

## Create Scrapy Project 

In [None]:
scrapy startproject yellow_pages

In [None]:
cd yellow_pages
scrapy genspider telefonbuch bundes-telefonbuch.de

## Scrapy Shell for debugging

In [None]:
# Terminal command, not Python: it opens Scrapy's interactive shell on the given page.
# NOTE(review): the following cells (fetch(...), response.css(...)) look like input for
# that shell's prompt rather than for the notebook kernel - confirm.
scrapy shell 'www.bundes-telefonbuch.de'

### Scrapy: Fill Out Forms

In [None]:
# Build a form submission from the form with id 'searchForm' on the current page,
# filling the 'what' and 'where' fields, and fetch the resulting page.
# NOTE(review): `fetch` is not defined anywhere in this notebook; presumably this is the
# Scrapy shell shortcut, which also rebinds `response` - confirm.
fetch(scrapy.FormRequest.from_response(response, formid='searchForm', formdata={'what':'auto', 'where':'Muenchen'}))

In [None]:
# Select all <div class="companyBox"> elements of the result page via a CSS selector.
response.css('div.companyBox')

In [None]:
# Parse the search result page so that it can be navigated with BeautifulSoup.
results_soup = BeautifulSoup(response.text,'html.parser')

In [None]:
# prettify() returns the document as an indented string; as the last expression of
# the cell the whole string is displayed.
results_soup.prettify()

### Extract entries

In [None]:
# find() returns a single <div class="companyBox"> (or None if there is none).
entry = results_soup.find('div',{'class':'companyBox'})

In [None]:
# Collect every result box on the page and show how many were found.
# (The soup variable is `results_soup`; the previous `result_soup` raised a NameError.)
entries = results_soup.find_all('div', {'class':'companyBox'})
len(entries)

### Paging

In [None]:
# Locate the pagination container of the result page.
# (The soup variable is `results_soup`; the previous `result_soup` raised a NameError.)
paging = results_soup.find('div',{'id':'pagination'})

In [None]:
# All links inside the pagination container. find() yields None when a page has no
# pagination block, so fall back to an empty list instead of failing on None.
pages = paging.find_all('a') if paging is not None else []
len(pages)

In [None]:
# The "next page" link is the pagination anchor labelled with the '∨' glyph.
# Take the href of the first such anchor, or None when no anchor matches.
next_page = next(
    (link.get('href') for link in pages if link.text.strip() == '∨'),
    None,
)

In [None]:
from urllib.parse import urljoin

start_urls = ['http://bundes-telefonbuch.de/']
if next_page:
    # urljoin resolves the href against the site root. Plain concatenation would
    # produce 'http://bundes-telefonbuch.de//...' for a root-relative href and a
    # broken URL for an absolute one.
    next_page = urljoin(start_urls[0], next_page)
print(next_page)

### Get the details of the entry

In [None]:
from urllib.parse import urljoin

# Link target of the first anchor inside the result box.
entry_url = entry.a.get('href')
# Resolve the href against the site root with urljoin; plain string concatenation
# would yield a double slash for root-relative hrefs and break absolute ones.
fetch(scrapy.http.Request(urljoin(start_urls[0], entry_url)))

In [None]:
# Parse the fetched detail page; this rebinds `entry_soup` to the new document.
entry_soup = BeautifulSoup(response.text,'html.parser')

In [None]:
# Company name: the text of the <h1>, trimmed of surrounding whitespace.
name = entry_soup.h1.text.strip()
print(name)

In [None]:
# Container holding the postal address of the entry.
address_div = entry_soup.find('div', {'class':'detail-address'})
# Raw text, including the padding whitespace and newlines between the parts.
print(address_div.text)
# stripped_strings is a generator, so this prints the generator object, not its values.
print(address_div.stripped_strings)

In [None]:
# One whitespace-trimmed string per text fragment of the address block.
address_parts = list(address_div.stripped_strings)
print(address_parts)
# Join the fragments, then split/join once more: str.split() without arguments splits
# on any Unicode whitespace, so the non-breaking space (\xa0) inside the street
# fragment and any doubled spaces collapse into single regular spaces.
address = ' '.join(' '.join(address_parts).split())
print(address)

In [None]:
# Not every entry lists an e-mail address; find() then returns None, so guard the
# attribute lookup instead of failing with AttributeError.
email_tag = entry_soup.find('a',{'class':'detail-email'})
# The href keeps its 'mailto:' scheme prefix.
email = email_tag.get('href') if email_tag is not None else None
print(email)

In [None]:
# Not every entry lists a homepage; find() then returns None, so guard the
# attribute lookup instead of failing with AttributeError.
website_tag = entry_soup.find('a',{'class':'detail-homepage'})
website = website_tag.get('href') if website_tag is not None else None
print(website)

In [None]:
# Phone and fax numbers live in <span> tags with matching class names.
# (The two selectors were swapped before, so `tel` held the fax number and vice versa.)
tel = entry_soup.find('span', {'class':'detail-phone'}).text.strip()
fax = entry_soup.find('span', {'class':'detail-fax'}).text.strip()
print(tel, fax)

## Run crawler

In [None]:
scrapy crawl telefonbuch -a sector="Auto" -a city="Muenchen" -o "Auto_Muenchen.csv"