### Getting pages: How to Request on the Internet

In [6]:
import urllib
from urllib.parse import quote_plus
from urllib.request import urlopen

In [4]:
#using urllib
google = urlopen('http://google.com')
google = google.read()
print(google[:200])

b'<!doctype html><html dir="rtl" itemscope="" itemtype="http://schema.org/WebPage" lang="ar"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/goog'


In [8]:
url = 'http://google.com?q='
url_with_query = url + quote_plus('python web scraping')

web_search = urlopen(url_with_query)
web_search = web_search.read()

print(web_search[:200])

b'<!doctype html><html dir="rtl" itemscope="" itemtype="http://schema.org/WebPage" lang="ar"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/goog'


In [11]:
# using requests
import requests

google = requests.get('http://google.com')

print(google.status_code)
print(google.content[:200])
print(google.headers)
print(google.cookies.items())

200
b'<!doctype html><html dir="rtl" itemscope="" itemtype="http://schema.org/WebPage" lang="ar"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/goog'
{'Date': 'Sat, 11 Nov 2023 22:29:41 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'Content-Security-Policy-Report-Only': "object-src 'none';base-uri 'self';script-src 'nonce-TErdxzImNNUjy4TD64kmhQ' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/other-hp", 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'Content-Length': '8797', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2023-11-11-22; expires=Mon, 11-Dec-2023 22:29:41 GMT; path=/; domain=.google.com; Secure, AEC=Ackid1SWtfA8kAWiJwLMRukQMoi-nwVXwXRmf4E0_7-pXrEaxv9ZE_IgtA; expires=Thu, 09-Ma

### Reading a Web Page with Beautiful Soup

In [48]:
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} 

page = requests.get('https://enoughproject.org/get-involved/take-action', headers=headers)

bs = BeautifulSoup(page.content)

print(bs.title)
print(bs.find_all('a'))
print(bs.find_all('p'))

<title>Take Action - The Enough Project</title>
[<a class="scroll-top" id="page-scroll-top">
<img alt="Scroll to top" src="https://enoughproject.org/wp-content/themes/argenta/images/scroll-top.svg"/>
</a>, <a class="skip-link screen-reader-text" href="#main">Skip to content</a>, <a href="https://enoughproject.org/" rel="home">
<span class="first-logo">
<img alt="The Enough Project" src="https://enoughproject.org/wp-content/uploads/2017/03/enough_logo_horizontal_nosubtext_white-02-Copy-300x129.png"/>
</span>
</a>, <a class="menu-link main-menu-link item-title" href="https://enoughproject.org/about">About</a>, <a class="menu-link sub-menu-link" href="https://enoughproject.org/about">About</a>, <a class="menu-link sub-menu-link" href="https://enoughproject.org/about/john-prendergast">Founding Director</a>, <a class="menu-link sub-menu-link" href="https://enoughproject.org/about/our-team-2">Our Team</a>, <a class="menu-link sub-menu-link" href="https://enoughproject.org/about/the-sentry">T

In [49]:
header_children = [c for c in bs.head.children]
print(header_children)

['\n', <meta charset="utf-8"/>, '\n', <meta content="width=device-width, initial-scale=1.0" name="viewport"/>, '\n', <link href="https://gmpg.org/xfn/11" rel="profile"/>, '\n', <link href="https://enoughproject.org/xmlrpc.php" rel="pingback"/>, '\n', <title>Take Action - The Enough Project</title>, '\n', <meta content="max-image-preview:large" name="robots">
<!-- Google Tag Manager for WordPress by gtm4wp.com -->
<script data-cfasync="false" data-pagespeed-no-defer="">
	var gtm4wp_datalayer_name = "dataLayer";
	var dataLayer = dataLayer || [];
</script>
<!-- End Google Tag Manager for WordPress by gtm4wp.com -->
<!-- This site is optimized with the Yoast SEO plugin v4.9 - https://yoast.com/wordpress/plugins/seo/ -->
<link href="https://enoughproject.org/get-involved/take-action" rel="canonical">
<meta content="en_US" property="og:locale">
<meta content="article" property="og:type"/>
<meta content="Take Action - The Enough Project" property="og:title"/>
<meta content="https://enoughproj

In [50]:
navigation_bar = bs.find(id='masthead')

for d in navigation_bar.descendants:
    print(d)



<div class="header-wrap">
<div class="site-branding">
<p class="site-title"> <a href="https://enoughproject.org/" rel="home">
<span class="first-logo">
<img alt="The Enough Project" src="https://enoughproject.org/wp-content/uploads/2017/03/enough_logo_horizontal_nosubtext_white-02-Copy-300x129.png"/>
</span>
</a>
</p> </div><!-- .site-branding -->
<div class="right">
<nav class="main-nav" id="site-navigation">
<div class="close">
<span class="icon ion-ios-close-empty"></span>
</div>
<div id="mega-menu-wrap">
<ul class="menu" id="primary-menu"><li class="mega-menu-item nav-item menu-item-depth-0 has-submenu" id="nav-menu-item-11157-65503c48497d2"><a class="menu-link main-menu-link item-title" href="https://enoughproject.org/about">About</a>
<div class="sub-nav"><ul class="menu-depth-1 sub-menu sub-nav-group">
<li class="mega-menu-item sub-nav-item menu-item-depth-1" id="nav-menu-item-16006-65503c484982f"><a class="menu-link sub-menu-link" href="https://enoughproject.org/about">About</

In [51]:
for s in d.previous_siblings:
    print(s)

 .header-wrap 
<div class="header-wrap">
<div class="site-branding">
<p class="site-title"> <a href="https://enoughproject.org/" rel="home">
<span class="first-logo">
<img alt="The Enough Project" src="https://enoughproject.org/wp-content/uploads/2017/03/enough_logo_horizontal_nosubtext_white-02-Copy-300x129.png"/>
</span>
</a>
</p> </div><!-- .site-branding -->
<div class="right">
<nav class="main-nav" id="site-navigation">
<div class="close">
<span class="icon ion-ios-close-empty"></span>
</div>
<div id="mega-menu-wrap">
<ul class="menu" id="primary-menu"><li class="mega-menu-item nav-item menu-item-depth-0 has-submenu" id="nav-menu-item-11157-65503c48497d2"><a class="menu-link main-menu-link item-title" href="https://enoughproject.org/about">About</a>
<div class="sub-nav"><ul class="menu-depth-1 sub-menu sub-nav-group">
<li class="mega-menu-item sub-nav-item menu-item-depth-1" id="nav-menu-item-16006-65503c484982f"><a class="menu-link sub-menu-link" href="https://enoughproject.org/a

In [59]:
# finding the pieces of content
ta_divs = bs.find_all('div', class_='wpb_text_column')
print(len(ta_divs))

7


In [70]:
for ta in ta_divs:
    title = ta.previous_sibling
    link = None
    about = ta.find_all('p')
    
    previous_sibling = ta.previous_sibling
    if previous_sibling and isinstance(previous_sibling,  bs4.element.Tag):
        link = previous_sibling.a
    print(title, link, about)

NameError: name 'bs4' is not defined

In [None]:
all_data = []
for ta in ta_divs:
    data_dict = {}
    data_dict['title'] = ta.h2.get_text()
    data_dict['link'] = ta.a.get_text()    
    data_dict['about'] = [p.get_text() for p in ta.find_all('p')] 
    all_data.append(data_dict)

print(all_data)