## LXML 

### Example 1: Reading XML file and traversing through elements

In [None]:
from lxml import etree

In [None]:
# Shell escape: runs the `degit` tool through npx with the book's GitHub
# repository path (Chapter03 sub-folder) and `chp3` as the target folder.
# NOTE(review): needs Node.js/npx and network access; `-f` presumably forces
# the copy into an existing folder — confirm against the degit docs.
!npx degit PacktPublishing/Hands-On-Web-Scraping-with-Python/Chapter03 -f chp3

In [None]:
import os
# Enter the downloaded chapter folder only once. Guarding on the current
# folder name keeps this cell re-runnable: a second bare chdir('chp3') would
# look for chp3/chp3 and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'chp3':
    os.chdir('chp3')

In [None]:
# Read the raw bytes of the sample XML document. The context manager closes
# the file handle as soon as the content has been read (the original left it
# open until garbage collection).
with open("food.xml", "rb") as xml_file:
    xml = xml_file.read()

In [None]:
# Parse the in-memory XML content read from food.xml; the result is bound to `tree`.
tree = etree.XML(xml) 
# Alternative call for in-memory content:
#tree = etree.fromstring(xml)
# NOTE(review): etree.parse() takes a file name or file object, not the XML
# content itself, so this variant would need "food.xml" instead of `xml` — confirm.
#tree = etree.parse(xml)

In [None]:
# Show the parsed object followed by its Python type, one per line.
print(tree, type(tree), sep="\n")

In [None]:
# Visit every element yielded by tree.iter() and print "tag - text" for each.
for node in tree.iter():
    tag_and_text = (node.tag, node.text)
    print("%s - %s" % tag_and_text)

In [None]:
# Same listing as before, limited to the <price> and <name> elements.
wanted_tags = ('price', 'name')
for node in tree.iter(*wanted_tags):
    print("%s - %s" % (node.tag, node.text))

In [None]:
# Hand the file name to parse() so lxml opens and decodes the file itself;
# per the original note this avoids the encoding error seen when the content
# is read manually.
tree = etree.parse("food.xml")

# Print the text content of every <name> element.
name_nodes = tree.iter('name')
for name_node in name_nodes:
    print(name_node.text)

In [None]:
# Print "tag - text" for the <name>, <rating> and <feedback> elements.
for node in tree.iter('name', 'rating', 'feedback'):
    line = "{} - {}".format(node.tag, node.text)
    print(line)

### Example 2: Reading HTML doc using lxml.html

In [None]:
from lxml import html
from urllib.request import urlopen

In [None]:
# Download the form page once and derive both objects from that single
# response (the original fetched the same URL twice and never closed either
# response). The context manager releases the HTTP connection after parsing.
with urlopen('http://httpbin.org/forms/post') as response:
    tree = html.parse(response)  # document-level tree
root = tree.getroot()            # root element of that tree

In [None]:
# Compare the Python types of the root element and of the document tree.
for parsed in (root, tree):
    print(type(parsed))

In [None]:
# First <p> found anywhere below the root.
p = root.find('.//p')
first_paragraph_text = p.text_content()
print(first_paragraph_text)  # Customer name:

# Text of the first <label> located directly inside a <p>.
label_text = root.findtext('.//p/label')
print(label_text)  # Customer name:

In [None]:
# Collect every <p> element below the root, then print each one's full text.
elemP = root.findall('.//p')
for paragraph in elemP:
    paragraph_text = paragraph.text_content()
    print(paragraph_text)

In [None]:
# value attributes of the <input> elements nested in <p><label>
input_values = root.xpath('//p/label/input/@value')
print(input_values)

# text nodes of every <legend>
legend_texts = root.xpath('//legend/text()')
print(legend_texts)

In [None]:
# cssselect is required by the .cssselect() calls used below. The explicit
# %pip magic installs into the environment of the running kernel and does not
# depend on IPython's automagic being enabled.
%pip install cssselect

In [None]:
# Print the text of every <label> that is a direct child of a <p>.
for label in root.cssselect('p>label'):
    label_text = label.text_content()
    print(label_text)

In [None]:
# Print the text of every <p> that is a direct child of the <form>.
for paragraph in root.cssselect('form > p'):
    paragraph_text = paragraph.text_content()
    print(paragraph_text)

In [None]:
# Inspect the first form of the page; the trailing comments show the values
# noted by the notebook author.
form = root.forms[0]
print(form.action)   #http://httpbin.org/post
print(form.keys())   #['method', 'action']
print(form.items())  #[('method', 'post'), ('action', '/post')]
print(form.method)   # POST

### Example 3: Reading & parsing HTML for retrieving HTML form type element attributes

In [None]:
import requests

In [None]:
# Fetch the form page. The timeout stops the cell from hanging indefinitely on
# an unresponsive server, and raise_for_status() surfaces HTTP errors instead
# of silently parsing an error page.
response = requests.get('http://httpbin.org/forms/post', timeout=30)
response.raise_for_status()
# build the DOM Tree
tree = html.fromstring(response.text)

In [None]:
# For every <input> element show its tag and the different views lxml offers
# on its attributes, followed by a blank separator.
report = "Element: %s \n\tvalues(): %s \n\tattrib: %s \n\titems(): %s \n\tkeys(): %s"
for field in tree.iter('input'):
    details = (field.tag, field.values(), field.attrib, field.items(), field.keys())
    print(report % details)
    print("\n")

## Web Scraping using LXML

### Example 1: Extract selected data using lxml.html.parse from single page

In [None]:
import lxml.html

In [None]:
musicUrl = "http://books.toscrape.com/catalogue/category/books/music_14/index.html"
# Download with urllib (imported earlier in this notebook) and let lxml parse
# the open response. Passing the URL straight to lxml.html.parse() relies on
# libxml2's minimal built-in HTTP client, which cannot handle HTTPS and is not
# available in every libxml2 build. base_url keeps the document URL that the
# direct-URL call would have recorded.
with urlopen(musicUrl) as response:
    doc = lxml.html.parse(response, base_url=musicUrl)

In [None]:
#This path has been found using Devtools manually
#base element: the <article> inside the first <li> of the product list
articles = doc.xpath("//*[@id='default']/div/div/div/div/section/div[2]/ol/li[1]/article")[0]
#individual element inside base
#NOTE(review): every expression below starts with '//'. In XPath that selects
#from the document root even when called on an element, so each call returns
#the values of all matching nodes on the page (one list entry per book), not
#only those inside `articles`. A leading './/' would restrict the query to the
#base element — confirm which behaviour is intended before changing it, since
#the following cells zip these lists together.
title = articles.xpath("//h3/a/text()")
price = articles.xpath("//div[2]/p[contains(@class,'price_color')]/text()")
#text()[normalize-space()] keeps only the text nodes that are not pure whitespace
availability = articles.xpath("//div[2]/p[2][contains(@class,'availability')]/text()[normalize-space()]")
imageUrl = articles.xpath("//div[1][contains(@class,'image_container')]/a/img/@src")
#full class attribute value; the 'star-rating ' prefix is removed in the cleaning cell
starRating = articles.xpath("//p[contains(@class,'star-rating')]/@class")

In [None]:
# Normalise the scraped values:
#  - trim surrounding whitespace from the availability text
#  - turn the relative image paths into absolute URLs
#  - drop the 'star-rating ' prefix from the class attribute
stock = [text.strip() for text in availability]
images = [src.replace('../../../..', 'http://books.toscrape.com') for src in imageUrl]
rating = [css_class.replace('star-rating ', '') for css_class in starRating]

# Show each column of the result on its own line.
for column in (title, price, stock, images, rating):
    print(column)

In [None]:
#Merging all
# Build one (title, price, stock, image, rating) tuple per book. The zip is
# materialised into a list so that dataSet is still usable after printing; a
# bare zip object is a one-shot iterator and would be empty once consumed.
dataSet = list(zip(title, price, stock, images, rating))
print(dataSet)

### Example 2: Scraping from multiple pages & loop with Xpath

In [None]:
from lxml.etree import XPath

In [None]:
# Site root; substituted for the '../../../..' prefix of relative image paths
# when the scraped rows are assembled.
baseUrl = "http://books.toscrape.com/"
#Main URL
# NOTE(review): bookUrl is not referenced by the code visible in this
# section — confirm whether it is still needed.
bookUrl = "http://books.toscrape.com/catalogue/category/books/food-and-drink_33/index.html"
#Page URL Pattern obtained (eg: page-1.html, page-2.html...)
# The page number and ".html" are appended to this prefix for each request.
pageUrl = "http://books.toscrape.com/catalogue/category/books/food-and-drink_33/page-"

In [None]:
# Scrape every page of the category and collect one row per book in dataSet:
# [title, price, availability, absolute image URL, rating word].

# The XPath expressions are the same for every page, so compile them once
# instead of on every loop iteration.
articles = XPath("//*[@id='default']//ol/li[position()>0]")
titlePath = XPath(".//article[contains(@class,'product_pod')]/h3/a/text()")
pricePath = XPath(".//article/div[2]/p[contains(@class,'price_color')]/text()")
stockPath = XPath(".//article/div[2]/p[2][contains(@class,'availability')]/text()[normalize-space()]")
imagePath = XPath(".//article/div[1][contains(@class,'image_container')]/a/img/@src")
starRating = XPath(".//article/p[contains(@class,'star-rating')]/@class")

dataSet = []
page = 1
totalPages = 1  # corrected after the first page has been read
while page <= totalPages:
    print("Rows in Dataset: "+str(len(dataSet)))

    # Download via urllib (imported earlier in this notebook) rather than
    # libxml2's minimal built-in HTTP client, and parse the open response.
    currentUrl = pageUrl+str(page)+".html"
    with urlopen(currentUrl) as response:
        doc = lxml.html.parse(response, base_url=currentUrl).getroot()

    if page == 1:
        # The first and third <strong> of the result form are read as the total
        # number of articles and the number shown per page.
        perPageArticles = doc.xpath("//*[@id=\"default\"]//form/strong[3]/text()")
        totalArticles = doc.xpath("//*[@id=\"default\"]//form/strong[1]/text()")
        if totalArticles and perPageArticles:
            # Ceiling division: a partially filled last page still counts.
            # round() under-counted here (25/20 -> 1, 50/20 -> 2).
            totalPages = -(-int(totalArticles[0]) // int(perPageArticles[0]))
            print(str(totalArticles[0])+" Results, showing "+str(perPageArticles[0])+" Articles per page")
        # Otherwise the page-size figure is missing (presumably a category that
        # fits on one page), so totalPages stays at 1 instead of raising IndexError.

    #used to find page URL pattern
    nextPage = doc.xpath("//*[@id=\"default\"]//ul[contains(@class,'pager')]/li[2]/a/@href")
    if len(nextPage) > 0:
        print("Scraping Page "+str(page)+" of "+str(totalPages)+". NextPage > "+str(nextPage[0]))
    else:
        print("Scraping Page "+str(page)+" of "+str(totalPages))

    #looping through 'articles' found in 'doc' i.e each <li><article> found in Page Source
    for row in articles(doc):
        title = titlePath(row)[0]
        price = pricePath(row)[0]
        availability = stockPath(row)[0].strip()
        image = imagePath(row)[0]
        rating = starRating(row)[0]
        #cleaning and formatting applied to image and rating; strip() removes the
        #blank left behind once the 'star-rating' prefix is dropped
        dataSet.append([title, price, availability,
                        image.replace('../../../..', baseUrl),
                        rating.replace('star-rating', '').strip()])

    page += 1 #updating Page Count for While loop

#Final Dataset with data from all pages.
print(dataSet)

### Example 3: Using lxml.cssselect to scrape content from a page

In [None]:
from lxml.cssselect import CSSSelector

In [None]:
url = 'https://developer.ibm.com/announcements/category/data-science/?fa=date%3ADESC&fb='
# The timeout prevents the cell from blocking forever on a stalled connection;
# raise_for_status() fails loudly on an HTTP error instead of handing an error
# page to the parser.
url_get = requests.get(url, timeout=30)
url_get.raise_for_status()
# Parse the raw response bytes into a full HTML document.
tree = html.document_fromstring(url_get.content)

In [None]:
# Build one record per announcement card:
# [link, type, date, title, excerpt, [category labels]]
announcements = []
articles = tree.cssselect('.ibm--card > a.ibm--card__block_link')
for card in articles:
    link = card.get('href')

    type_heading = card.cssselect('div.ibm--card__body > h5')[0]
    atype = type_heading.text.strip()

    date_node = card.cssselect('div.ibm--card__body > h5 > .ibm--card__date')[0]
    adate = date_node.text

    title_node = card.cssselect('div.ibm--card__body > h3.ibm--card__title')[0]
    title = title_node.text_content()

    excerpt_node = card.cssselect(' div.ibm--card__body > p.ibm--card__excerpt')[0]
    excerpt = excerpt_node.text

    category = card.cssselect('div.ibm--card__bottom > p.cpt-byline__categories span')
    # Keep every category label except the '+' marker; this replaces the
    # earlier approach of picking the first two spans by index.
    labels = [span.text for span in category if span.text != '+']

    announcements.append([link, atype, adate, title, excerpt, labels])
print(announcements)