In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# `BeatifulSoup` ...

is a Python package that lets you analyse strings in fancy ways.

To do that we need to read a string from somewhere, for example a web page.
This is done with the `urllib` package.

In [2]:
url  = "https://en.wikipedia.org/wiki/Python_(programming_language)"
page = urlopen(url)

print(page)
print(dir(page))

<http.client.HTTPResponse object at 0x7f5616d3d198>
['__abstractmethods__', '__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_checkClosed', '_checkReadable', '_checkSeekable', '_checkWritable', '_check_close', '_close_conn', '_get_chunk_left', '_method', '_peek_chunked', '_read1_chunked', '_read_and_discard_trailer', '_read_next_chunk_size', '_read_status', '_readall_chunked', '_readinto_chunked', '_safe_read', '_safe_readinto', 'begin', 'chunk_left', 'chunked', 'close', 'closed', 'code', 'debuglevel', 'detach', 'fileno', 'flush', 'fp', 'getcode', 'getheader', 'gethea

In [3]:
page_string = page.read()
print(page_string)

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Python (programming language) - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Python_(programming_language)","wgTitle":"Python (programming language)","wgCurRevisionId":836693124,"wgRevisionId":836693124,"wgArticleId":23862,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages with reference errors","Pages with incorrect ref formatting","All articles with dead external links","Articles with dead external links from March 2018","Articles with permanently dead external links","All articles with unsourced statements","Articles with unsource

In [4]:
soup = BeautifulSoup(page_string)
print(soup.prettify())



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Python (programming language) - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Python_(programming_language)","wgTitle":"Python (programming language)","wgCurRevisionId":836693124,"wgRevisionId":836693124,"wgArticleId":23862,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages with reference errors","Pages with incorrect ref formatting","All articles with dead external links","Articles with dead external links from March 2018","Articles with permanently dead external links","All articles with unsourced statements","Artic

In [5]:
# Much better than just doing it line by line is to define a function ;)

def get_soup(url):
    page = urlopen(url)
    page_string = page.read()
    soup = BeautifulSoup(page_string, "html5lib")
    return soup

In [6]:
url  = "https://en.wikipedia.org/wiki/Python_(programming_language)"
soup = get_soup(url)

# Let's count some words!

In [7]:
def count_word(string, word):
    word = word.lower()
    words = string.lower().split()
    count = 0
    for w in words:
        if word == w:
            count += 1
    return count

In [8]:
ptags = soup.find_all("p")

pythons = 0
for p in ptags:
    pstr = p.get_text()
    pythons += count_word(pstr, "Python")

print("'Python'/'python' occurrs {} times.".format(pythons))

'Python'/'python' occurrs 107 times.


# Or save somthing ...

In [9]:
fname = "filename.txt"
with open(fname, 'w') as f:
    f.write(soup.p.get_text())

# And read it again ...

In [10]:
def get_file_content(fname):
    with open(fname, "r") as f:
        string = f.read()
    print(string)

get_file_content("filename.txt")

Python is an interpreted high-level programming language for general-purpose programming. Created by Guido van Rossum and first released in 1991, Python has a design philosophy that emphasizes code readability, notably using significant whitespace. It provides constructs that enable clear programming on both small and large scales.[26]


# Or maybe even append something to the file!

In [11]:
with open("filename.txt", "a") as f:
    f.write("\n\n Woooooow!")

get_file_content("filename.txt")

Python is an interpreted high-level programming language for general-purpose programming. Created by Guido van Rossum and first released in 1991, Python has a design philosophy that emphasizes code readability, notably using significant whitespace. It provides constructs that enable clear programming on both small and large scales.[26]

 Woooooow!
