# XPath approach

In [31]:
from lxml import html
import requests

page = requests.get("https://www.comp.nus.edu.sg/~lekhsian/sws3023/page.html")
page.content

b'<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="UTF-8" />\n    <meta\n      name="description"\n      content="This is a dummy page for learning selectors"\n    />\n    <meta http-equiv="X-UA-Compatible" content="ie=edge" />\n    <title>Welcome to SWS3023</title>\n  </head>\n  <body>\n    <div class="article" id="a0042">\n      <h1>Cupcake Article</h1>\n      <div class="header">\n        Gummies pie drag\xc3\xa9e pastry lemon drops. Sweet roll bonbon tootsie roll\n        cake. Lollipop sweet roll icing sesame snaps chocolate bar apple pie\n        cake sweet roll biscuit.\n        <div>this is a third level div</div>\n      </div>\n      <p>\n        Topping sesame snaps marzipan. Tootsie roll chocolate bar sesame snaps\n        muffin tart souffl\xc3\xa9 jujubes. Gummies carrot cake cake ice cream sesame\n        snaps bear claw danish. Jelly beans sweet roll jujubes caramels cupcake\n        biscuit.\n      </p>\n      <p>\n        Halvah chocolate oat cake tiramisu topping

In [32]:
#parse the page
#need to use the lxml package
tree = html.fromstring(page.content)

In [33]:
headings = tree.xpath("//h1/text()")
headings

['Cupcake Article', 'Cheese Article', 'Office Article']

In [34]:
#notice that only the first 2 article is selected
articles = tree.xpath("//div[@class='article']")
articles

[<Element div at 0x21bedfb0e50>, <Element div at 0x21bedfb0e00>]

In [35]:
#if we want all articles, we should check that the class attribute contains article instead of equals article
all_articles = tree.xpath("//div[contains(@class,'article')]")
all_articles

[<Element div at 0x21bedfb0e50>,
 <Element div at 0x21bedfb0e00>,
 <Element div at 0x21bedfb8f90>]

In [36]:
#get the text content of each element
for article in articles:
    print(article.text_content())


      Cupcake Article
      
        Gummies pie dragée pastry lemon drops. Sweet roll bonbon tootsie roll
        cake. Lollipop sweet roll icing sesame snaps chocolate bar apple pie
        cake sweet roll biscuit.
        this is a third level div
      
      
        Topping sesame snaps marzipan. Tootsie roll chocolate bar sesame snaps
        muffin tart soufflé jujubes. Gummies carrot cake cake ice cream sesame
        snaps bear claw danish. Jelly beans sweet roll jujubes caramels cupcake
        biscuit.
      
      
        Halvah chocolate oat cake tiramisu topping apple pie lollipop jelly-o
        cake. Topping cotton candy sweet marzipan apple pie. Tart ice cream bear
        claw marshmallow.
      
    

      Cheese Article
      
        The big cheese red leicester rubber cheese. Stilton taleggio halloumi
        croque monsieur bocconcini cheese triangles cheesecake boursin. Ricotta
        paneer caerphilly cheese slices emmental airedale manchego babybel.
     

In [37]:
type(articles[0])

lxml.html.HtmlElement

In [38]:
#if we want the html of each element
for article in articles:
    print(html.tostring(article))

b'<div class="article" id="a0042">\n      <h1>Cupcake Article</h1>\n      <div class="header">\n        Gummies pie drag&#233;e pastry lemon drops. Sweet roll bonbon tootsie roll\n        cake. Lollipop sweet roll icing sesame snaps chocolate bar apple pie\n        cake sweet roll biscuit.\n        <div>this is a third level div</div>\n      </div>\n      <p>\n        Topping sesame snaps marzipan. Tootsie roll chocolate bar sesame snaps\n        muffin tart souffl&#233; jujubes. Gummies carrot cake cake ice cream sesame\n        snaps bear claw danish. Jelly beans sweet roll jujubes caramels cupcake\n        biscuit.\n      </p>\n      <p>\n        Halvah chocolate oat cake tiramisu topping apple pie lollipop jelly-o\n        cake. Topping cotton candy sweet marzipan apple pie. Tart ice cream bear\n        claw marshmallow.\n      </p>\n    </div>\n    <div class="article" id="a0043">\n      <h1>Cheese Article</h1>\n      <div class="header">\n        The big cheese red leicester rubb

In [39]:
firstArticle = articles[0]
firstArticle

<Element div at 0x21bedfb0e50>

In [40]:
#possible to find elements within element (notice the . in front)
paragraphs_in_first_article = firstArticle.xpath(".//p/text()")
paragraphs_in_first_article

['\n        Topping sesame snaps marzipan. Tootsie roll chocolate bar sesame snaps\n        muffin tart soufflé jujubes. Gummies carrot cake cake ice cream sesame\n        snaps bear claw danish. Jelly beans sweet roll jujubes caramels cupcake\n        biscuit.\n      ',
 '\n        Halvah chocolate oat cake tiramisu topping apple pie lollipop jelly-o\n        cake. Topping cotton candy sweet marzipan apple pie. Tart ice cream bear\n        claw marshmallow.\n      ']

In [41]:
#alternatively if you want the syntax weird, you could parse the html of the node before performing
#the xpath expression
firstArticleParsed = html.fromstring(html.tostring(firstArticle))
firstArticleParsed.xpath("//p/text()")

['\n        Topping sesame snaps marzipan. Tootsie roll chocolate bar sesame snaps\n        muffin tart soufflé jujubes. Gummies carrot cake cake ice cream sesame\n        snaps bear claw danish. Jelly beans sweet roll jujubes caramels cupcake\n        biscuit.\n      ',
 '\n        Halvah chocolate oat cake tiramisu topping apple pie lollipop jelly-o\n        cake. Topping cotton candy sweet marzipan apple pie. Tart ice cream bear\n        claw marshmallow.\n      ',
 '\n        Dolcelatte halloumi swiss. Pepper jack brie who moved my cheese danish\n        fontina monterey jack rubber cheese manchego cheese slices. Melted\n        cheese cauliflower cheese rubber cheese jarlsberg cheese on toast\n        fromage frais macaroni cheese halloumi.\n      ',
 '\n        Goat who moved my cheese cheese strings. Monterey jack ricotta\n        mozzarella swiss smelly cheese goat cheese strings edam. Halloumi paneer\n        babybel cow manchego blue castello smelly cheese macaroni cheese.\n 

# CSS Selectors approach

In [42]:
from bs4 import BeautifulSoup
import re

In [43]:
soup = BeautifulSoup(page.content, 'html.parser')

In [44]:
div_headers = soup.select("div.header")
div_headers

[<div class="header">
         Gummies pie dragée pastry lemon drops. Sweet roll bonbon tootsie roll
         cake. Lollipop sweet roll icing sesame snaps chocolate bar apple pie
         cake sweet roll biscuit.
         <div>this is a third level div</div>
 </div>,
 <div class="header">
         The big cheese red leicester rubber cheese. Stilton taleggio halloumi
         croque monsieur bocconcini cheese triangles cheesecake boursin. Ricotta
         paneer caerphilly cheese slices emmental airedale manchego babybel.
         Emmental mascarpone cheeseburger who moved my cheese feta.
       </div>]

In [45]:
for div_header in div_headers:
    text = div_header.get_text(strip=True)
    #replace multiple spaces with single space
    text = re.sub("\s+", " ", text)
    print(text)
    print("\n")

Gummies pie dragée pastry lemon drops. Sweet roll bonbon tootsie roll cake. Lollipop sweet roll icing sesame snaps chocolate bar apple pie cake sweet roll biscuit.this is a third level div


The big cheese red leicester rubber cheese. Stilton taleggio halloumi croque monsieur bocconcini cheese triangles cheesecake boursin. Ricotta paneer caerphilly cheese slices emmental airedale manchego babybel. Emmental mascarpone cheeseburger who moved my cheese feta.


