In [None]:
# Uncomment to install the third-party packages this notebook imports
# (Pillow provides the `PIL` module used in the image section).
# %pip install beautifulsoup4 requests pillow

## HTTP Requests


- GET: retrieves data from a server (e.g. web pages, API responses)
- POST: sends data to a server (e.g. login and registration forms)


In [1]:
# https://requests.readthedocs.io/en/latest/
import requests

In [2]:
# Fetch the Wikipedia portal page. requests has no default timeout, so an
# explicit one keeps this cell from hanging forever if the server never answers.
url = "https://www.wikipedia.org/"
response = requests.get(url, timeout=10)

In [4]:
# HTTP status code of the response; 200 means the request succeeded.
response.status_code

200

In [6]:
# Response headers behave like a case-insensitive dict; Content-Type is the
# media type the server reports for the body.
response.headers["Content-Type"]

'text/html'

In [7]:
# Response body decoded to a `str`.
response.text

'<!DOCTYPE html>\n<html lang="en" class="no-js">\n<head>\n<meta charset="utf-8">\n<title>Wikipedia</title>\n<meta name="description" content="Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.">\n<script>\ndocument.documentElement.className = document.documentElement.className.replace( /(^|\\s)no-js(\\s|$)/, "$1js-enabled$2" );\n</script>\n<meta name="viewport" content="initial-scale=1,user-scalable=yes">\n<link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png">\n<link rel="shortcut icon" href="/static/favicon/wikipedia.ico">\n<link rel="license" href="//creativecommons.org/licenses/by-sa/4.0/">\n<style>\n.sprite{background-image:linear-gradient(transparent,transparent),url(portal/wikipedia.org/assets/img/sprite-8bb90067.svg);background-repeat:no-repeat;display:inline-block;vertical-align:middle}.svg-Commons-logo_sister{background-position:0 0;width:47px;height:47px}.svg-MediaWiki-logo_sister{

In [9]:
# Fetch a small JSON document from a mock API.
url = "https://mocki.io/v1/d4867d8b-b5d5-4a48-a4ab-79131b5809b8"
response = requests.get(url, timeout=10)  # never wait indefinitely for the server
# Surface 4xx/5xx answers here rather than as a confusing JSON decode error later.
response.raise_for_status()

In [11]:
# The JSON body as a plain string (not yet parsed).
response.text

'[{"name":"Harry Potter","city":"London"},{"name":"Don Quixote","city":"Madrid"},{"name":"Joan of Arc","city":"Paris"},{"name":"Rosa Park","city":"Alabama"}]'

In [28]:
# The same body as raw, undecoded bytes.
response.content

b'[{"name":"Harry Potter","city":"London"},{"name":"Don Quixote","city":"Madrid"},{"name":"Joan of Arc","city":"Paris"},{"name":"Rosa Park","city":"Alabama"}]'

In [15]:
# Parse the JSON body into Python objects (here, a list of dicts).
response.json()

[{'name': 'Harry Potter', 'city': 'London'},
 {'name': 'Don Quixote', 'city': 'Madrid'},
 {'name': 'Joan of Arc', 'city': 'Paris'},
 {'name': 'Rosa Park', 'city': 'Alabama'}]

In [31]:
# Passing a dict as `data=` sends it form-encoded
# (application/x-www-form-urlencoded), like an HTML form submission.
payload = {"key1": "value1", "key2": "value2"}
# requests has no default timeout; set one so the cell cannot hang forever.
r = requests.post("https://httpbin.org/post", data=payload, timeout=10)

In [32]:
# httpbin echoes the request back as JSON; the submitted payload appears under "form".
r.json()

{'args': {},
 'data': '',
 'files': {},
 'form': {'key1': 'value1', 'key2': 'value2'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Content-Length': '23',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.28.2',
  'X-Amzn-Trace-Id': 'Root=1-658d23c0-6ccdfd224477943b7cb14777'},
 'json': None,
 'origin': '151.241.62.204',
 'url': 'https://httpbin.org/post'}

## HTML


- Tags

  1. Paired tags (an opening and a closing tag around content): p, a, table, ul, ... `<a href="https://..."> link </a>`

  2. Void (self-closing) tags with no content: img, link, ... `<img src="https://..." alt="logo" />`

- Use the browser's Inspect tool (right-click → Inspect) to explore a page's HTML structure


In [35]:
# Download the raw HTML of the Wikipedia article on Python.
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
response = requests.get(url, timeout=10)  # never wait indefinitely for the server
# Stop here with a clear HTTP error if the page could not be fetched.
response.raise_for_status()

In [36]:
# Show only the first 1000 characters: the full page is far too large to read
# as cell output and would bloat the saved notebook.
response.text[:1000]



## BeautifulSoup


In [2]:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4 import BeautifulSoup

In [3]:
# Download the article that the BeautifulSoup examples below will parse.
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
response = requests.get(url, timeout=10)  # never wait indefinitely for the server
# Stop here with a clear HTTP error rather than parsing an error page.
response.raise_for_status()

In [4]:
# Parse the HTML with Python's built-in "html.parser" (no extra dependency needed).
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# prettify() re-indents the whole document; print only the first 1000
# characters so the output stays readable.
print(soup.prettify()[:1000])

In [44]:
# Text inside the page's <title> tag.
soup.title.string

'Python (programming language) - Wikipedia'

In [49]:
# Collect every <a> tag in the document and count them.
links = soup.find_all("a")
len(links)

2310

In [51]:
# find() returns only the first matching tag; .text is its visible text.
soup.find("a").text

'Jump to content'

In [52]:
# Read the tag's href attribute; .get() returns None instead of raising
# when the attribute is missing.
soup.find("a").get("href")

'#bodyContent'

In [71]:
# All text in the document with the markup removed. Preview the first 1000
# characters only; the full page text is too long for cell output.
soup.get_text()[:1000]



In [72]:
# Same text, but each fragment is stripped of surrounding whitespace and the
# fragments are joined with newlines. Preview the first 1000 characters only.
soup.get_text(separator="\n", strip=True)[:1000]



## Extract Text


In [74]:
# Fetch and parse the article for the text-extraction examples.
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
response = requests.get(url, timeout=10)  # never wait indefinitely for the server
# Stop here with a clear HTTP error rather than parsing an error page.
response.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(response.text, "html.parser")

In [81]:
# Look up the element with id "mw-content-text"; the paragraph extraction
# below searches inside it.
content = soup.find(id="mw-content-text")
# Preview the first 1000 characters only; the whole section is too large to
# read as cell output.
content.prettify()[:1000]



In [85]:
# Keep only the <p> tags that still contain text once surrounding
# whitespace is stripped (an empty string is falsy).
paragraphs = [tag for tag in content.find_all("p") if tag.text.strip()]

In [87]:
# Number of non-empty paragraphs found in the article body.
len(paragraphs)

82

In [92]:
# Plain text of the first five paragraphs. Citation markers such as "[31]"
# and trailing newlines are kept as-is.
texts = [p.text for p in paragraphs[:5]]

In [93]:
# Display the extracted paragraph strings.
texts

['Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.[31]\n',
 'Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.[32][33]\n',
 'Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python\xa00.9.0.[34] Python\xa02.0 was released in 2000. Python\xa03.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python\xa02.7.18, released in 2020, was the last release of Python\xa02.[35]\n',
 'Python consistently ranks as one of the most popular programming languages, and has gained widespread use in the machine learning community.[36

## Extract Images


In [113]:
from PIL import Image
from io import BytesIO

# Fetch and parse the article for the image-extraction examples.
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
response = requests.get(url, timeout=10)  # never wait indefinitely for the server
# Stop here with a clear HTTP error rather than parsing an error page.
response.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(response.text, "html.parser")

In [103]:
# First <img> inside the first <table> inside <main>
# (presumably the infobox logo on this page — TODO confirm).
# Each find() returns None when nothing matches, so this chain raises
# AttributeError if the page layout changes.
img = soup.find("main").find("table").find("img")

In [105]:
# The <img> tag's src attribute, i.e. the image address as written in the page's HTML.
img_src = img.get("src")

In [118]:
def download_image(src, filename="python.png", timeout=10):
    """Download the image at ``src`` and save it to ``filename``.

    Args:
        src: Absolute URL of the image to download.
        filename: Destination path. Pillow infers the output format from the
            extension. Defaults to "python.png", the name that was
            previously hard-coded.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    r = requests.get(src, timeout=timeout)
    # Fail with a clear HTTP error instead of letting Pillow choke on an
    # HTML error page.
    r.raise_for_status()
    # The context manager releases the image's resources once it is saved.
    with Image.open(BytesIO(r.content)) as image:
        image.save(filename)

In [120]:
from urllib.parse import urljoin

# An <img> src may be protocol-relative ("//host/..."), path-relative, or
# already absolute. urljoin resolves all three against the page's URL, whereas
# blindly prefixing "https:" only works for the protocol-relative form.
download_image(urljoin(response.url, img_src))