### Getting pages: How to Request on the Internet

In [3]:
import urllib
from urllib.parse import quote_plus, urljoin
from urllib.request import urlopen

In [4]:
#using urllib
# Open the response as a context manager so the underlying connection is
# closed as soon as the body has been read, and pass a timeout (seconds) so an
# unresponsive server cannot hang the notebook.
with urlopen('http://google.com', timeout=10) as response:
    google = response.read()  # raw page body as bytes
print(google[:200])

b'<!doctype html><html dir="rtl" itemscope="" itemtype="http://schema.org/WebPage" lang="ar"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/goog'


In [5]:
url = 'http://google.com?q='
# quote_plus escapes the search phrase for use in a query string
# (spaces become '+').
url_with_query = url + quote_plus('python web scraping')

# NOTE(review): the first 200 bytes printed for this request are identical to
# the plain homepage request above, so the query may not be reaching a search
# endpoint -- confirm the intended URL.

# Context manager closes the connection once the body is read; the timeout
# (seconds) keeps an unresponsive server from hanging the notebook.
with urlopen(url_with_query, timeout=10) as response:
    web_search = response.read()

print(web_search[:200])

b'<!doctype html><html dir="rtl" itemscope="" itemtype="http://schema.org/WebPage" lang="ar"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/goog'


In [6]:
# using requests
import requests

# requests applies no timeout by default, so a stalled server would block this
# cell forever; give it an explicit limit in seconds.
google = requests.get('http://google.com', timeout=10)

# Inspect the main parts of the response object.
print(google.status_code)
print(google.content[:200])
print(google.headers)
print(google.cookies.items())

200
b'<!doctype html><html dir="rtl" itemscope="" itemtype="http://schema.org/WebPage" lang="ar"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/goog'
{'Date': 'Mon, 13 Nov 2023 00:07:37 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'Content-Security-Policy-Report-Only': "object-src 'none';base-uri 'self';script-src 'nonce-w7bqbPGG6rPQJprmHry4Vw' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/other-hp", 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'Content-Length': '8834', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2023-11-13-00; expires=Wed, 13-Dec-2023 00:07:37 GMT; path=/; domain=.google.com; Secure, AEC=Ackid1TludjAo0vmXhlD3vnnSEWIFu0Lh17Sz6z3JGiRWUAFU9eQtg_SDA; expires=Sat, 11-Ma

### Reading a Web Page with Beautiful Soup

In [7]:
from bs4 import BeautifulSoup, Tag
import requests
# Send a browser-like User-Agent header with the request.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}

# Explicit timeout (seconds): requests would otherwise wait indefinitely.
page = requests.get('https://enoughproject.org/get-involved/take-action', headers=headers, timeout=10)

# Name the parser explicitly. Without it Beautiful Soup picks whichever parser
# happens to be installed (and warns about it), so the resulting tree can
# differ between machines. lxml is used because this notebook already depends
# on it (`from lxml import html`).
bs = BeautifulSoup(page.content, 'lxml')

print(bs.title)
print(bs.find_all('a'))
print(bs.find_all('p'))

<title>Take Action - The Enough Project</title>
[<a class="scroll-top" id="page-scroll-top">
<img alt="Scroll to top" src="https://enoughproject.org/wp-content/themes/argenta/images/scroll-top.svg"/>
</a>, <a class="skip-link screen-reader-text" href="#main">Skip to content</a>, <a href="https://enoughproject.org/" rel="home">
<span class="first-logo">
<img alt="The Enough Project" src="https://enoughproject.org/wp-content/uploads/2017/03/enough_logo_horizontal_nosubtext_white-02-Copy-300x129.png"/>
</span>
</a>, <a class="menu-link main-menu-link item-title" href="https://enoughproject.org/about">About</a>, <a class="menu-link sub-menu-link" href="https://enoughproject.org/about">About</a>, <a class="menu-link sub-menu-link" href="https://enoughproject.org/about/john-prendergast">Founding Director</a>, <a class="menu-link sub-menu-link" href="https://enoughproject.org/about/our-team-2">Our Team</a>, <a class="menu-link sub-menu-link" href="https://enoughproject.org/about/the-sentry">T

In [8]:
# Materialise the direct children of <head> (tags and text nodes alike).
header_children = list(bs.head.children)
print(header_children)

['\n', <meta charset="utf-8"/>, '\n', <meta content="width=device-width, initial-scale=1.0" name="viewport"/>, '\n', <link href="https://gmpg.org/xfn/11" rel="profile"/>, '\n', <link href="https://enoughproject.org/xmlrpc.php" rel="pingback"/>, '\n', <title>Take Action - The Enough Project</title>, '\n', <meta content="max-image-preview:large" name="robots"/>, '\n', ' Google Tag Manager for WordPress by gtm4wp.com ', '\n', <script data-cfasync="false" data-pagespeed-no-defer="">
	var gtm4wp_datalayer_name = "dataLayer";
	var dataLayer = dataLayer || [];
</script>, '\n', ' End Google Tag Manager for WordPress by gtm4wp.com ', '\n', ' This site is optimized with the Yoast SEO plugin v4.9 - https://yoast.com/wordpress/plugins/seo/ ', '\n', <link href="https://enoughproject.org/get-involved/take-action" rel="canonical"/>, '\n', <meta content="en_US" property="og:locale"/>, '\n', <meta content="article" property="og:type"/>, '\n', <meta content="Take Action - The Enough Project" property="o

In [9]:
# Look up the element whose id attribute is "masthead".
navigation_bar = bs.find(id='masthead')

# Print every node nested anywhere below that element.
# NOTE: `d` stays bound to the last node visited once the loop ends, and the
# following cell reads `d`, so this loop must run before it.
for d in navigation_bar.descendants:
    print(d)



<div class="header-wrap">
<div class="site-branding">
<p class="site-title"> <a href="https://enoughproject.org/" rel="home">
<span class="first-logo">
<img alt="The Enough Project" src="https://enoughproject.org/wp-content/uploads/2017/03/enough_logo_horizontal_nosubtext_white-02-Copy-300x129.png"/>
</span>
</a>
</p> </div><!-- .site-branding -->
<div class="right">
<nav class="main-nav" id="site-navigation">
<div class="close">
<span class="icon ion-ios-close-empty"></span>
</div>
<div id="mega-menu-wrap">
<ul class="menu" id="primary-menu"><li class="mega-menu-item nav-item menu-item-depth-0 has-submenu" id="nav-menu-item-11157-65514d928ea4e"><a class="menu-link main-menu-link item-title" href="https://enoughproject.org/about">About</a>
<div class="sub-nav"><ul class="menu-depth-1 sub-menu sub-nav-group">
<li class="mega-menu-item sub-nav-item menu-item-depth-1" id="nav-menu-item-16006-65514d928eaca"><a class="menu-link sub-menu-link" href="https://enoughproject.org/about">About</

In [10]:
# Pick the last node under the masthead explicitly instead of relying on the
# loop variable `d` that leaked out of the previous cell's for-loop; this cell
# now only needs `navigation_bar` to exist.
last_descendant = list(navigation_bar.descendants)[-1]

# Walk backwards through the nodes that share a parent with that last node.
for sibling in last_descendant.previous_siblings:
    print(sibling)

 .header-wrap 
<div class="header-wrap">
<div class="site-branding">
<p class="site-title"> <a href="https://enoughproject.org/" rel="home">
<span class="first-logo">
<img alt="The Enough Project" src="https://enoughproject.org/wp-content/uploads/2017/03/enough_logo_horizontal_nosubtext_white-02-Copy-300x129.png"/>
</span>
</a>
</p> </div><!-- .site-branding -->
<div class="right">
<nav class="main-nav" id="site-navigation">
<div class="close">
<span class="icon ion-ios-close-empty"></span>
</div>
<div id="mega-menu-wrap">
<ul class="menu" id="primary-menu"><li class="mega-menu-item nav-item menu-item-depth-0 has-submenu" id="nav-menu-item-11157-65514d928ea4e"><a class="menu-link main-menu-link item-title" href="https://enoughproject.org/about">About</a>
<div class="sub-nav"><ul class="menu-depth-1 sub-menu sub-nav-group">
<li class="mega-menu-item sub-nav-item menu-item-depth-1" id="nav-menu-item-16006-65514d928eaca"><a class="menu-link sub-menu-link" href="https://enoughproject.org/a

In [11]:
# finding the pieces of content
# Text blocks and headings are selected separately by tag name and CSS class.
ta_divs = bs.find_all('div', attrs={'class': 'wpb_text_column'})
ta_heads = bs.find_all('h6', attrs={'class': 'vc_custom_heading'})
print(f'{len(ta_divs)} {len(ta_heads)}')

7 6


In [12]:
# Pair each heading with its text block. The text blocks are offset by one
# relative to the headings, so counting starts at 1.
for position, heading in enumerate(ta_heads, start=1):
    title = heading
    link = heading.a
    about = ta_divs[position].find_all('p')

    print(title, link, about)

<h6 class="vc_custom_heading" style="color: #f47b20;text-align: left"><a href="https://enoughproject.org/get-involved/take-action/south-sudan-support-use-robust-financial-tools-actors-highlighted-sentry-report">South Sudan: Support Use of Robust Financial Tools on Actors Highlighted in Sentry Report</a></h6> <a href="https://enoughproject.org/get-involved/take-action/south-sudan-support-use-robust-financial-tools-actors-highlighted-sentry-report">South Sudan: Support Use of Robust Financial Tools on Actors Highlighted in Sentry Report</a> [<p dir="ltr">Tweet at the U.S. State Department and Department of the Treasury as well as the UK’s Foreign Commonwealth Office urging them to use the groundbreaking information in this report to take action in support of peace in South Sudan.</p>, <p><a href="https://enoughproject.org/get-involved/take-action/south-sudan-support-use-robust-financial-tools-actors-highlighted-sentry-report">Take Action Now</a></p>]
<h6 class="vc_custom_heading" style="

In [13]:
# Build one record per heading: its text, the href of its link, and the text
# of every paragraph in the matching text block (offset by one, hence start=1).
all_data = []
for position, heading in enumerate(ta_heads, start=1):
    all_data.append({
        'title': heading.get_text(),
        'link': heading.a.get('href'),
        'about': [p.get_text() for p in ta_divs[position].find_all('p')],
    })

print(all_data)

[{'title': 'South Sudan: Support Use of Robust Financial Tools on Actors Highlighted in Sentry Report', 'link': 'https://enoughproject.org/get-involved/take-action/south-sudan-support-use-robust-financial-tools-actors-highlighted-sentry-report', 'about': ['Tweet at the U.S. State Department and Department of the Treasury as well as the UK’s Foreign Commonwealth Office urging them to use the groundbreaking information in this report to take action in support of peace in South Sudan.', 'Take Action Now']}, {'title': 'Tell UK to Address Connections to Human Rights Violations and Corruption in South Sudan', 'link': 'https://enoughproject.org/get-involved/take-action/tell-uk-address-connections-human-rights-violations-corruption-south-sudan', 'about': ['Contact the joint FCO-DFID Minister for Africa, Andrew Stephenson, MP, and urge him to leverage the information from the recent Sentry report on South Sudan to take action and focus on networks including those where British citizens\xa0are r

### Reading a Web Page with LXML

In [14]:
from lxml import html

url = 'https://www.enoughproject.org/get-involved/take-action'

# This cell parses a local HTML file rather than fetching `url`.
saved_page = 'take_action_enough_project.html'
page = html.parse(saved_page)
root = page.getroot()

# Every <div> carrying the "views-row" class.
ta_divs = root.cssselect('div.views-row')
print(ta_divs)

[<Element div at 0x19e4b1b7980>, <Element div at 0x19e4b1b7f20>, <Element div at 0x19e4b1b79d0>, <Element div at 0x19e4b2fa490>]


In [15]:
# One record per row: heading text, the href of the heading's link, and the
# text of every paragraph inside the row.
all_data = []
for action_div in ta_divs:
    heading = action_div.cssselect('h2')[0]
    all_data.append({
        'title': heading.text_content(),
        'link': heading.find('a').get('href'),
        'about': [p.text_content() for p in action_div.cssselect('p')],
    })
print(all_data)

[{'title': 'South Sudan: On August 17th, Implement "Plan B" ', 'link': 'https://ssl1.americanprogress.org/o/507/p/dia/action3/common/public/?action_KEY=391', 'about': ["During President Obama's recent trip to Africa, the international community set a deadline of August 17 for a peace deal to be signed by South Sudan's warring parties. The President warned that if an agreement is not reached, it will be 'necessary for us to move forward with a different\xa0plan.' \xa0With conflict raging since December 2013, the world can no longer sit by as they have while past agreements have been broken.", '\xa0', "Read our latest brief on the issue:\nBeyond Deadlock: Recommendations for Obama's Plan B on South Sudan", 'Tell President Obama that if there is no agreement by\xa0August 17 between the warring parties, to implement and enforce a strong "Plan\xa0B."']}, {'title': 'Urge your Members of Congress to Support the counter-LRA Resolution', 'link': 'https://ssl1.americanprogress.org/o/507/p/dia/ac

In [16]:
# what are the main differences between find and cssselect?
# With a bare tag name, find()/findall() only look at direct children of the
# element they are called on.
print(root.find('div'))
head = root.find('head')
print(head)
print(head.findall('script'))

None
<Element head at 0x19e4b2fb570>
[<Element script at 0x19e4b2fb570>, <Element script at 0x19e4b2fb700>, <Element script at 0x19e4b2fb750>, <Element script at 0x19e4b2fb7a0>, <Element script at 0x19e4b2fb7f0>, <Element script at 0x19e4b2fb840>]


In [17]:
# Run the equivalent lookups through CSS selectors for comparison.
for selector in ('div', 'head script'):
    print(root.cssselect(selector))

[<Element div at 0x19e4b2fbb10>, <Element div at 0x19e4b2faa30>, <Element div at 0x19e4b2fbd90>, <Element div at 0x19e4b2fbde0>, <Element div at 0x19e4b2fbe30>, <Element div at 0x19e4b2fbe80>, <Element div at 0x19e4b2fbed0>, <Element div at 0x19e4b2fbf20>, <Element div at 0x19e4b2fbf70>, <Element div at 0x19e4b2fbfc0>, <Element div at 0x19e4b514050>, <Element div at 0x19e4b5140a0>, <Element div at 0x19e4b5140f0>, <Element div at 0x19e4b514140>, <Element div at 0x19e4b514190>, <Element div at 0x19e4b5141e0>, <Element div at 0x19e4b514230>, <Element div at 0x19e4b514280>, <Element div at 0x19e4b5142d0>, <Element div at 0x19e4b514320>, <Element div at 0x19e4b514370>, <Element div at 0x19e4b5143c0>, <Element div at 0x19e4b514410>, <Element div at 0x19e4b514460>, <Element div at 0x19e4b5144b0>, <Element div at 0x19e4b1b7980>, <Element div at 0x19e4b514500>, <Element div at 0x19e4b514550>, <Element div at 0x19e4b5145a0>, <Element div at 0x19e4b5145f0>, <Element div at 0x19e4b514640>, <Elemen

In [18]:
# a quick parser for an emoji cheat sheet
file_path = 'emoji_cheat_sheet.html'

# Read the whole file as UTF-8 text.
with open(file_path, encoding='utf-8') as source:
    resp = source.read()

# Parse the markup, then step down to the first <h2> directly under <body>.
page = html.document_fromstring(resp)
body = page.find('body')
top_header = body.find('h2')
print(top_header.text)

People


In [19]:
# Every element that follows the first header at the same level.
headers_and_lists = list(top_header.itersiblings())
print(headers_and_lists)

[<Element ul at 0x19e4b50caa0>, <Element h2 at 0x19e4b50cb40>, <Element ul at 0x19e4b50cb90>, <Element h2 at 0x19e4b50cbe0>, <Element ul at 0x19e4b50cc30>, <Element h2 at 0x19e4b50cc80>, <Element ul at 0x19e4b50ccd0>, <Element h2 at 0x19e4b50cd20>, <Element ul at 0x19e4b50cd70>, <Element h3 at 0x19e4b50cdc0>, <Element ul at 0x19e4b50ce10>, <Element div at 0x19e4b50ce60>, <Element script at 0x19e4b50ceb0>, <Element script at 0x19e4b50cf00>, <Element script at 0x19e4b50cf50>]


In [20]:
# Keep only the section headers and the lists.
wanted_tags = ('ul', 'h2', 'h3')
proper_headers_and_lists = [
    sibling
    for sibling in top_header.itersiblings()
    if sibling.tag in wanted_tags
]
print(proper_headers_and_lists)

[<Element ul at 0x19e4b50caa0>, <Element h2 at 0x19e4b50cb40>, <Element ul at 0x19e4b50cb90>, <Element h2 at 0x19e4b50cbe0>, <Element ul at 0x19e4b50cc30>, <Element h2 at 0x19e4b50cc80>, <Element ul at 0x19e4b50ccd0>, <Element h2 at 0x19e4b50cd20>, <Element ul at 0x19e4b50cd70>, <Element h3 at 0x19e4b50cdc0>, <Element ul at 0x19e4b50ce10>]


### A Case for XPath

In [26]:
page = html.parse('emoji_cheat_sheet.html')

# Section headings (h2/h3) and lists (ul), each selected across the whole
# document.
proper_headers = page.xpath('//h2|//h3')
proper_lists = page.xpath('//ul')


all_emoji = []

# Pair the n-th heading with the n-th list.
for header, list_cont in zip(proper_headers, proper_lists):
    section = header.text
    # Iterate the element directly; lxml documents getchildren() as deprecated.
    for li in list_cont:
        emoji_dict = {}
        spans = li.xpath('div/span')

        # The first span, when present, may carry the image path in data-src.
        link = spans[0].get('data-src') if spans else None
        if link:
            # Resolve the path against the document's base URL rather than
            # gluing the two strings together: concatenation is only correct
            # when the base ends in '/', and it raises when base_url is None.
            emoji_dict['emoji_link'] = urljoin(li.base_url or '', link)
        else:
            emoji_dict['emoji_link'] = None

        if len(spans) > 1:
            # The second span holds the emoji handle.
            emoji_dict['emoji_handle'] = spans[1].text_content()
        else:
            # Fewer than two spans: take the handle from the wrapping div's
            # text instead of indexing past the end of `spans`.
            emoji_dict['emoji_handle'] = li.xpath('div')[0].text_content()

        emoji_dict['section'] = section
        all_emoji.append(emoji_dict)
print(all_emoji)

