# Exploring Pyquery

In [None]:
!npx degit PacktPublishing/Hands-On-Web-Scraping-with-Python/Chapter04 -f chp4

In [None]:
import os
os.chdir('chp4')

## Loading documents

In [None]:
pip install pyquery

In [None]:
from pyquery import PyQuery as pq
from urllib.request import urlopen

In [None]:
# Fetch the page inside a context manager so the HTTP response is closed as
# soon as its body has been read, then parse the raw bytes with PyQuery.
with urlopen("http://www.example.com") as httpResponse:
    response = httpResponse.read()
docTree = pq(response)

In [None]:
pq("https://www.python.org")

In [None]:
site=pq("https://www.python.org")
print(type(site))

In [None]:
pq("https://www.samsclub.com")

In [None]:
 doc = pq('http://www.example.com', parser = 'html') # explicitly selects the 'html' parser; parser='xml' is another option
 print(type(doc))

In [None]:
# Read the saved page through a context manager so the file handle is closed
# as soon as its contents are in memory.
with open('test.html', 'r') as htmlFile:
    pgsource = htmlFile.read()
print(type(pgsource))

In [None]:
page=pq(pgsource)
print(type(page))

## Element traversing, attr & pseudo-class

In [None]:
page('title')

In [None]:
page.find('title').text()

In [None]:
page.find('meta[name="description"]').attr('content')

In [None]:
page.find('meta[name="keywords"]').attr('content')

In [None]:
btn=page('a.button').html()
btn

In [None]:
page('ul.menu')

In [None]:
page('nav:first')

In [None]:
page('ul:last')

In [None]:
page(':header')

In [None]:
page(':input')

In [None]:
page(':empty')

In [None]:
page(':empty:odd')

In [None]:
page.find('a:last').attr('href')

In [None]:
page.find('a:eq(0)').text()

In [None]:
page.find('a:lt(5)').text()
#eq : equal
#lt : less than
#gt: greater than

In [None]:
page('p:contains("python.org")').text()

In [None]:
page('h1.site-headline:first a img')

In [None]:
#.is_ OR .has_class
page('h1.site-headline:first a img').is_('.python-logo')

## Iterating

In [None]:
meta=page.find('meta[content*="Python.org"]')
[item.attr('name') for item in meta.items() if item.attr('name') is not None]

In [None]:
[item.attr('property') for item in meta.items() if item.attr('property') is not None]

In [None]:
social = page.find('a:contains("Socialize") + ul.subnav li a') 
[item.text() for item in social.items() if item.text() is not None]

In [None]:
[item.attr('href') for item in social.items() if item.attr('href') is not None]

In [None]:
webdevs = page.find('div.applications-widget:first ul.menu li:contains("Web Development") a')
[item.text() for item in webdevs.items() if item.text() is not None]

In [None]:
# Build one [time, title, url] entry per item of the event widget's menu.
upcomingevents = page.find('div.event-widget ul.menu li')
eventsList = []
for event in upcomingevents.items():
    # The same anchor supplies both the event title and its link.
    eventLink = event.find('a[href*="events/python"]')
    time, title, url = event.find('time').text(), eventLink.text(), eventLink.attr('href')
    eventsList.append([time, title, url])

eventsList

In [None]:
buttons = page.find('a.button')
for item in buttons.items():
  print(item.text(),' :: ',item.attr('href'))

#buttons = page.find('a.button:even/odd') print even/odd result

# Web Scraping using pyquery

## Example 1: Scraping DS announcements

In [None]:
from pyquery import PyQuery as pq
import requests

In [None]:
dataSet=list()

In [None]:
sourceUrl='https://developer.ibm.com/announcements/'
def read_url(url):
    """Fetch ``url`` with requests and return its content as a PyQuery object.

    The raw response body is passed to PyQuery unchanged. A timeout is set
    because ``requests`` otherwise waits indefinitely on a stalled server;
    when it expires, ``requests`` raises its timeout exception.
    """
    pageSource = requests.get(url, timeout=30).content
    return pq(pageSource)

In [None]:
def get_details(page):
    """Scrape one listing page and append its articles to ``dataSet``.

    ``page`` is the listing URL; it is loaded through ``read_url``. For every
    article card that carries a link, one row
    ``[link, type, date, title, excerpt, categories]`` is appended to the
    module-level ``dataSet``. Title and excerpt are stored UTF-8 encoded and
    the categories are joined with commas.
    """
    from urllib.parse import urljoin

    response = read_url(page)

    articles = response.find('.ibm--card > a.ibm--card__block_link')
    print("\nTotal articles found :", len(articles), ' in Page: ', page)
    for article in articles.items():
        link = article.attr('href')
        if not link:
            # Cards without a link are not recorded.
            continue

        articlebody = article.find('div.ibm--card__body')
        dateNode = articlebody.find('h5 > .ibm--card__date')
        adate = dateNode.text()
        # Remove the date node so the <h5> text read next excludes the date.
        dateNode.remove()
        atype = articlebody.find('h5').text().strip()
        title = articlebody.find('h3.ibm--card__title').text().encode('utf-8')
        excerpt = articlebody.find('p.ibm--card__excerpt').text().encode('utf-8')
        category = article.find('div.ibm--card__bottom > p.cpt-byline__categories span')

        # Resolve the href against the listing root: site-relative links
        # become absolute, and already-absolute links are kept intact
        # (a plain string replace would prepend the root a second time).
        link = urljoin(sourceUrl, str(link))
        # A span without text has .text == None, which str.join rejects.
        categories = [span.text for span in category
                      if span.text is not None and span.text != '+']
        dataSet.append([link, atype, adate, title, excerpt, ",".join(categories)])

In [None]:
# Listing URL of the data-science category, sorted by date descending.
mainUrl = sourceUrl + "category/data-science/?fa=date:DESC&fb="
# Build the URLs of listing pages 1 and 2, then scrape each of them in turn.
pageTemplate = sourceUrl + "category/data-science/page/%(page)s?fa=date:DESC&fb="
pageUrls = [pageTemplate % {'page': number} for number in range(1, 3)]
for listingUrl in pageUrls:
    get_details(listingUrl)
print("\nTotal articles collected: ", len(dataSet))
print(dataSet)

In [None]:
# just to verify, as it's giving different output from that mentioned in the book
!python "example1_ibm_announcements.py"

## Example 2: Scraping information from nested links

In [None]:
sourceUrl = 'http://quotes.toscrape.com/tag/books/'
dataSet = list()
keys = ['quote_tags','author_url','author_name','born_date','born_location','quote_title']

In [None]:
def read_url(url):
    """Load ``url`` with PyQuery and return the parsed page.

    PyQuery fetches and parses the URL itself, so the resulting object is
    returned directly instead of being wrapped in a second PyQuery instance.
    """
    return pq(url)


def get_details(page):
    """Walk every page of a quotes listing and append one record per quote to ``dataSet``.

    ``page`` is the listing's base URL; ``'page/<n>'`` is appended to it,
    starting at 1 and continuing while the pager contains a ``li.next``
    entry. For each quote that has an author link, the author page is loaded
    as well, and a dict keyed by the module-level ``keys`` is appended to the
    module-level ``dataSet``. The quote text is truncated to 50 characters.
    """
    nextPage = True
    pageNo = 1
    while nextPage:
        response = read_url(page + 'page/' + str(pageNo))
        # Keep paginating only while the pager still contains a 'next' entry.
        nextPage = bool(response.find("ul.pager:has('li.next')"))

        quotes = response.find('.quote')
        print("\nTotal Quotes found :", len(quotes), ' in Page: ', pageNo)
        for quote in quotes.items():
            title = quote.find('[itemprop="text"]:first').text()
            author = quote.find('[itemprop="author"]:first').text()
            authorLink = quote.find('a[href*="/author/"]:first').attr('href')
            tags = quote.find('.tags [itemprop="keywords"]').attr('content')

            if not authorLink:
                # Quotes without an author link are not recorded.
                continue

            authorLink = 'http://quotes.toscrape.com' + authorLink
            linkDetail = read_url(authorLink)
            born_date = linkDetail.find('.author-born-date').text()
            born_location = linkDetail.find('.author-born-location').text()
            # Strip only the leading "in "; str.replace would also delete
            # every later "in " occurring inside the place name itself.
            if born_location.startswith('in '):
                born_location = born_location[len('in '):]
            dataSet.append(dict(zip(keys, [tags, authorLink, author, born_date, born_location, title[0:50]])))
        pageNo += 1

In [None]:
# Scrape all pages of the listing, then report what was collected.
get_details(sourceUrl)
print("\nTotal Quotes collected: ", len(dataSet))
print(dataSet)
# One summary line per collected record.
for record in dataSet:
    print(record['author_name'], ' born on ', record['born_date'], ' in ', record['born_location'])

In [None]:
!python "example2_quotes_authors.py"

## Example 3: AHL playoff results

In [None]:
import re

In [None]:
sourceUrl = 'http://www.flyershistory.com/cgi-bin/ml-poffs.cgi'
dataSet = list()
keys = ['year','month','day','game_date','team1', 'team1_score', 'team2', 'team2_score', 'game_status']

def read_url(url):
    """Load ``url`` with PyQuery and return the parsed page.

    PyQuery fetches and parses the URL itself, so the resulting object is
    returned directly instead of being wrapped in a second PyQuery instance.
    """
    return pq(url)

In [None]:
page = read_url(sourceUrl)

# Rows of the table that directly follows the 'AHL Playoff Results' heading.
tableRows = page.find("h1:contains('AHL Playoff Results') + table tr")
print("\nTotal rows found :", len(tableRows))

for tr in tableRows.items():
    # Look the row's cells up once instead of re-querying for every column.
    cells = tr.find('td')
    team1 = cells.eq(1).text()
    if team1 == '':
        # Rows whose second cell is empty are skipped.
        continue

    game_date = cells.eq(0).text()
    dates = re.search(r'(.*)-(.*)-(.*)', game_date)
    if dates is None:
        # The date cell is not in the three-part, dash-separated form;
        # skip the row instead of failing on dates.group().
        continue

    team1_score = cells.eq(2).text()
    team2 = cells.eq(4).text()
    team2_score = cells.eq(5).text()

    # Game status should start with 'W' or 'L'; when cell 6 does not,
    # the status is read from cell 7 instead.
    game_status = cells.eq(6).text()
    if not re.match(r'[WL]', game_status):
        game_status = cells.eq(7).text()

    # Break the date down: first part is the day, second the month, third the year.
    day, month, year = dates.groups()
    # Expand two-digit years: 68 and above -> 19xx, below 68 -> 20xx.
    # Anything that is not exactly two digits is kept as found.
    if len(year) == 2 and year.isdigit():
        year = ('19' if int(year) >= 68 else '20') + year

    # Append this game's data list to the dataSet.
    dataSet.append([year, month, day, game_date, team1, team1_score, team2, team2_score, game_status])

In [None]:
print("\nTotal Game Status, found :", len(dataSet))
print(dataSet)

In [None]:
!python "example3_AHL.py"

## Example 4: Collecting URLs from sitemap.xml

In [None]:
sitemap=requests.get("https://webscraping.com/sitemap.xml").content

In [None]:
sitemap.decode()

### Case 1: Using html parser

In [None]:
urlHTML=pq(sitemap,parser='html')

In [None]:
print("Children Length: ",urlHTML.children().__len__())
print("First Children: ",urlHTML.children().eq(0))
print("Inner Child/First Children: ",urlHTML.children().children().eq(0))

In [None]:
# Text of every <loc> element (html-parsed tree) whose content contains "blog".
blogLocs = urlHTML.children().find('loc:contains("blog")')
dataSet = [loc.text() for loc in blogLocs.items()]
print("Length of dataSet: ", len(dataSet))
print(dataSet)

### Case 2: Using XML parser

In [None]:
urlXML=pq(sitemap,parser='xml')

In [None]:
print("Children Length: ",urlXML.children().__len__())

In [None]:
print("First Children: ", urlXML.children().eq(0))
print("Inner Child/First Children: ", urlXML.children().children().eq(0))

In [None]:
# Text of every <loc> element (xml-parsed tree) whose content contains "blog".
blogLocs = urlXML.children().find('loc:contains("blog")')
dataSet = [loc.text() for loc in blogLocs.items()]
print("Length of dataSet: ", len(dataSet))
print(dataSet)

In [None]:
for url in urlXML.children().children().items():
 print(url)
 break

In [None]:
# Query again after remove_namespaces() (presumably so the plain 'loc'
# selector can match the elements -- confirm against the sitemap).
# The list is rebuilt from scratch so that re-running this cell does not
# append the same URLs to an already populated dataSet.
blogLocs = urlXML.remove_namespaces().children().find('loc:contains("blog")')
dataSet = [loc.text() for loc in blogLocs.items()]
print("Length of dataSet: ", len(dataSet))
print(dataSet)

In [None]:
print("URLs using Children: ",urlXML.children().text()) 
#print("URLs using Children: ",urlXML.children().children().text()) 
#print("URLs using Children: ",urlXML.text())

In [None]:
# Split the combined text on whitespace. str.split() treats runs of
# whitespace as one separator and returns [] for empty text, so no empty
# strings end up in the list (re.split(r'\s', ...) would produce them).
blogXML = urlXML.children().text().split()
print("Length of blogXML: ",len(blogXML))

In [None]:
# Keep only the entries in which the pattern 'blog' occurs at least once.
dataSet = [entry for entry in blogXML if re.findall(r'blog', entry)]
print("Length of dataSet: ",len(dataSet))
print("Blog Urls: ",dataSet)