In [None]:
import requests
from bs4 import BeautifulSoup
page = requests.get('http://pythonscraping.com/pages/page3.html')
html = page.text
# print(page)
soup = BeautifulSoup(html, 'html.parser')
print(soup)

In [17]:
fish = soup.find(name='tr', attrs={'id': 'gift3', 'class': 'gift'}) # If it doesn't find anything it returns None

print(fish)
print(type(fish))

<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>
<class 'bs4.element.Tag'>


<class 'bs4.element.Tag'> allows us to keep 'finding' tags inside it

Let's look at all td (table data) tags.

In [7]:
fish_row = fish.find_all('td') # This returns a list where each item corresponds to each td tag 


In [8]:
fish_row

[<td>
 Fish Painting
 </td>,
 <td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td>,
 <td>
 $10,005.00
 </td>,
 <td>
 <img src="../img/gifts/img3.jpg"/>
 </td>]

In [11]:
print(fish_row[0].text)


Fish Painting



The first td tag corresponds to the title column

The second one corresponds to the description column

The third one corresponds to the price column

The last one corresponds to the image (we'll ignore this by now)

In [12]:
title = fish_row[0].text
description = fish_row[1].text
price = fish_row[2].text

print(title)
print(description)
print(price)


Fish Painting


If something seems fishy about this painting, it's because it's a fish! Also hand-painted by trained monkeys!


$10,005.00



In [22]:
# Fish is pointing at the fish row. The next sibling is the parrot. We can also find this from the fish tag
siblings = fish.find_next_siblings()

In [23]:
siblings

[<tr class="gift" id="gift4"><td>
 Dead Parrot
 </td><td>
 This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
 </td><td>
 $0.50
 </td><td>
 <img src="../img/gifts/img4.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift5"><td>
 Mystery Box
 </td><td>
 If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. <span class="excitingNote">Keep your friends guessing!</span>
 </td><td>
 $1.50
 </td><td>
 <img src="../img/gifts/img6.jpg"/>
 </td></tr>]

Or you can find its children or child!

In [25]:
# print(fish.findChild())
print(fish.findChildren())


[<td>
Fish Painting
</td>, <td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td>, <span class="excitingNote">Also hand-painted by trained monkeys!</span>, <td>
$10,005.00
</td>, <td>
<img src="../img/gifts/img3.jpg"/>
</td>, <img src="../img/gifts/img3.jpg"/>]


# Challenge: What is a Method?

In [28]:
page = requests.get('https://en.wikipedia.org/wiki/Python_(programming_language)')
html = page.text
soup = BeautifulSoup(html, 'html.parser')

In [29]:
method_header = soup.find(id='Method')

In [30]:
h3_tag = method_header.findParent()

AttributeError: 'NoneType' object has no attribute 'findParent'

In [None]:
h3_sibling = h3_tag.find_next_sibling()

In [None]:
method_definition = h3_sibling.text

/html/body/div[3]/div[3]/div[5]/div[1]/p[28]

# Selenium

In [68]:
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("http://www.python.org")

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys # 
driver = webdriver.Chrome('chromedriver')
# driver = webdriver.Chrome('./chromedriver')
driver.get("http://www.python.org")
# assert "Python" in driver.title

# elem = driver.find_element_by_xpath('//*[@id="content"]/div/section/div[5]/p[2]/a[1]')
# elem.click()
# elem.send_keys("method")
# elem.send_keys(Keys.RETURN)
# assert "No results found." not in driver.page_source
# driver.close()

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


In [61]:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [54]:
elem.send_keys("method")

In [55]:
elem.send_keys(Keys.RETURN)

In [56]:
driver.close()

Let's see an example using xpath:

This is the syntax for Xpath: Xpath =//tagname[@Attribute=’value’]

Wherein:

//: Used to select the current node.

tagname: Name of the tag of a particular node.

@: Used to select the select attribute.

Attribute: Name of the attribute of the node.

Value: Value of the attribute

`/button` find **child** (not all) tags of type button, of the element

`//div/button` - finds all of the button tags inside div tags anywhere on the page

`//div[@id='custom_id']` - finds all div tags with the attribute (`@`) `id` equal to `custom_id`, anywhere on the page

You can look for the xpath while inspecting the HTML or XML code

Remember that `.` indicates a relative path, so it will start to look from the relative address



In [33]:
table_of_contents = driver.find_element_by_xpath('//ul[@class="list-recent-events menu"]')
contents = table_of_contents.find_elements_by_xpath('.//li') # Observe the '.' to indicate that we are looking from the table of contents variable

In [42]:
links = []
for post in contents:
    links.append(post.find_element_by_xpath('.//a').get_attribute("href"))

In [43]:
links

['https://www.python.org/download/releases/2.2.3/descrintro',
 'https://www.python.org/download/releases/2.2.2/descrintro',
 'https://www.python.org/download/releases/2.2/descrintro',
 'https://www.python.org/download/releases/2.2.1/descrintro',
 'https://www.python.org/dev/peps/pep-0213/',
 'https://www.python.org/dev/peps/pep-0307/',
 'https://www.python.org/dev/peps/pep-3124/',
 'https://www.python.org/dev/peps/pep-0447/',
 'https://www.python.org/dev/peps/pep-0252/',
 'https://www.python.org/dev/peps/pep-3114/',
 'https://www.python.org/dev/peps/pep-0346/',
 'https://www.python.org/download/releases/2.2.2/bugs',
 'https://www.python.org/dev/peps/pep-0342/',
 'https://www.python.org/download/releases/2.2.1/bugs',
 'https://www.python.org/download/releases/2.2.3/bugs',
 'https://www.python.org/dev/peps/pep-3119/',
 'https://www.python.org/dev/peps/pep-0323/',
 'https://www.python.org/download/releases/2.2/bugs',
 'https://www.python.org/dev/peps/pep-0579/',
 'https://www.python.org/d

In [52]:
import time
definitions = {'Link': [], 'Description': []}
for link in links[:3]:
    driver.get(link)
    definitions['Link'].append(link)
    intro_title = driver.find_element_by_xpath('//a[@name="introduction"]')
    h3 = intro_title.find_element_by_xpath("..")
    paragraph = h3.find_element_by_xpath("./following-sibling::p")
    print(paragraph.text)
    definitions['Description'].append(paragraph.text)
    time.sleep(1)

Python 2.2 introduces the first phase of "type/class unification". This is a series of changes to Python intended to remove most of the differences between built-in types and user-defined classes. Perhaps the most obvious one is the restriction against using built-in types (such as the type of lists and dictionaries) as a base class in a class statement.
Python 2.2 introduces the first phase of "type/class unification". This is a series of changes to Python intended to remove most of the differences between built-in types and user-defined classes. Perhaps the most obvious one is the restriction against using built-in types (such as the type of lists and dictionaries) as a base class in a class statement.
Python 2.2 introduces the first phase of "type/class unification". This is a series of changes to Python intended to remove most of the differences between built-in types and user-defined classes. Perhaps the most obvious one is the restriction against using built-in types (such as the

In [3]:
driver = webdriver.Chrome('./chromedriver') 
URL = "https://www.zoopla.co.uk/new-homes/property/london/?q=London&results_sort=newest_listings&search_source=new-homes&page_size=25&pn=1&view_type=list"
driver.get(URL)

Look for the cookies button, and accept the cookies

In [None]:
for button in buttons:
    if button.text == "Accept all cookies":
        relevant_button = button
        break



relevant_button.click()

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
webdriver.Chrome('./chromedriver') 
# def accept_cookies(driver):
    
# driver = webdriver.Chrome('./chromedriver') 
# URL = "https://www.zoopla.co.uk/new-homes/property/london/?q=London&results_sort=newest_listings&search_source=new-homes&page_size=10&pn=1&view_type=list"
# driver.get(URL)

# search_bar = driver.find_element_by_xpath('//*[@id="modal"]/div/div[1]/form/input')
# elem.send_keys("gcs")
# elem.send_keys(Keys.RETURN)
# house_list = driver.find_element_by_xpath('//*[@id="__next"]/div[5]/div[2]/main/div[2]/div[2]')
# houses = house_list.find_elements_by_xpath('./div')

<selenium.webdriver.chrome.webdriver.WebDriver (session="ba6a0f849d04962c401f4a366ce8632f")>

In [42]:
search_bar = driver.find_element_by_xpath('//*[@id="modal"]/div/div[1]/form/input')
search_bar.send_keys("gcs")

In [43]:
search_bar.send_keys(Keys.RETURN)

In [31]:
link_list = []

for house in houses[:5]:
    link = house.find_element_by_tag_name('a').get_attribute('href')
    link_list.append(link)


In [32]:
link_list

['https://www.zoopla.co.uk/new-homes/details/59158715/',
 'https://www.zoopla.co.uk/new-homes/details/59158715/',
 'https://www.zoopla.co.uk/new-homes/details/59158715/',
 'https://www.zoopla.co.uk/new-homes/details/59158715/',
 'https://www.zoopla.co.uk/new-homes/details/59158715/']

Iterate though all the houses in that page. Look for patterns, google regex if you want to go the extra mile

![](images/zoopla.png)

Try to get as many properties as possible (sqmt, number of bedrooms, n of bathrooms, floors, address...)

In [55]:
data = {"sale_price": [], "num_bedrooms": [], "sqft": [], "description": [], "address": []} # You can populate the lists with what you find