# In this notebook we show how to scrape data from web pages using [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/), a Python library.
<br><br>

In [1]:
# To install only the requirements of this notebook, uncomment the lines below and run this cell

# ===========================

!pip install numpy==1.19.5
!pip install beautifulsoup4==4.6.3

# ===========================

Collecting numpy==1.19.5
  Downloading numpy-1.19.5.zip (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: numpy
  Building wheel for numpy (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for numpy [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[1187 lines of output][0m
  [31m   [0m Running from numpy source directory.
  [31m   [0m numpy/random/_bounded_integers.pxd.in has not changed
  [31m   [0m numpy/random/_philox.pyx has not changed
  [31m   [0m numpy/random/_bounded_integers.pyx.in has 

In [2]:
# To install the requirements for the entire chapter, uncomment the lines below and run this cell

# ===========================

try :
    import google.colab
    !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch2/ch2-requirements.txt | xargs -n 1 -L 1 pip install
except ModuleNotFoundError :
    !pip install -r "ch2-requirements.txt"

# ===========================

Collecting numpy==1.19.5
  Using cached numpy-1.19.5.zip (7.3 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pandas==1.1.5
  Downloading pandas-1.1.5.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting pytesseract==0.3.7
  Downloading pytesseract-0.3.7.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting spacy==2.2.4
  Downloading spacy-2.2.4.tar.gz (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies

In [3]:
# making the necessary imports
from pprint import pprint
from bs4 import BeautifulSoup
from urllib.request import urlopen 

In [11]:
myurl = "https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python" # specify the url

# Query the website so that it returns an HTML page.
# The `with` block closes the HTTP connection as soon as the body has been read,
# and the timeout keeps the cell from hanging forever on an unresponsive server.
with urlopen(myurl, timeout=30) as response:
    html = response.read()

# Parse the raw HTML in `html` and store it in Beautiful Soup format.
soupified = BeautifulSoup(html, 'html.parser')

Because the parsed HTML page (`soupified`) is large, we show only part of its output (the first 2000 characters).

In [5]:
#pprint(soupified.prettify())      # for printing the full HTML structure of the webpage

In [12]:
# Pretty-print only the first 2000 characters of the indented HTML,
# which is enough to get an idea of the structure of the webpage.
html_preview = soupified.prettify()[:2000]
pprint(html_preview)

('<!DOCTYPE html>\n'
 '<html class="html__responsive " itemscope="" '
 'itemtype="https://schema.org/QAPage" lang="en">\n'
 ' <head>\n'
 '  <title>\n'
 '   python - How do I get the current time? - Stack Overflow\n'
 '  </title>\n'
 '  <link '
 'href="https://cdn.sstatic.net/Sites/stackoverflow/Img/favicon.ico?v=ec617d715196" '
 'rel="shortcut icon"/>\n'
 '  <link '
 'href="https://cdn.sstatic.net/Sites/stackoverflow/Img/apple-touch-icon.png?v=c78bd457575a" '
 'rel="apple-touch-icon"/>\n'
 '  <link '
 'href="https://cdn.sstatic.net/Sites/stackoverflow/Img/apple-touch-icon.png?v=c78bd457575a" '
 'rel="image_src"/>\n'
 '  <link href="/opensearch.xml" rel="search" title="Stack Overflow" '
 'type="application/opensearchdescription+xml"/>\n'
 '  <link '
 'href="https://stackoverflow.com/questions/415511/how-do-i-get-the-current-time" '
 'rel="canonical">\n'
 '   <meta content="width=device-width, height=device-height, '
 'initial-scale=1.0, minimum-scale=1.0" name="viewport"/>\n'
 '   <meta

In [13]:
# Get the <title> tag of the web page; as the last expression in the cell,
# its value is displayed as the cell output.
soupified.title

<title>python - How do I get the current time? - Stack Overflow</title>

In [14]:
def find_post_body(soup, post_class, body_class="s-prose js-post-body"):
    """Locate a post on the page and the <div> holding its text.

    Parameters
    ----------
    soup : parsed page (or any tag) to search in.
    post_class : value of the `class` attribute of the post's outer <div>,
        e.g. "question" or "answer".
    body_class : value of the `class` attribute of the inner <div> that
        contains the post text.

    Returns
    -------
    (post, body) : the outer post tag and the inner body tag.

    Raises
    ------
    ValueError : if either tag cannot be found (for example because the page
        layout changed), instead of an opaque AttributeError on None.
    """
    # find() returns the first matching tag in document order, or None.
    post = soup.find("div", {"class": post_class})
    body = post.find("div", {"class": body_class}) if post is not None else None
    if body is None:
        raise ValueError(
            "Could not find a <div class=%r> containing <div class=%r>; "
            "the page layout may have changed." % (post_class, body_class)
        )
    return post, body


# Find the necessary tag and the class it belongs to for the question.
question, questiontext = find_post_body(soupified, "question")
print("Question: \n", questiontext.get_text().strip())

# Same for the answer: this picks the first answer block on the page.
answer, answertext = find_post_body(soupified, "answer")
print("Best answer: \n", answertext.get_text().strip())

Question: 
 How do I get the current time?
Best answer: 
 Use datetime:
>>> import datetime
>>> now = datetime.datetime.now()
>>> now
datetime.datetime(2009, 1, 6, 15, 8, 24, 78915)
>>> print(now)
2009-01-06 15:08:24.789150

For just the clock time without the date:
>>> now.time()
datetime.time(15, 8, 24, 78915)
>>> print(now.time())
15:08:24.789150


To save typing, you can import the datetime object from the datetime module:
>>> from datetime import datetime

Then remove the prefix datetime. from all of the above.


Beautiful Soup is one of many libraries that allow us to scrape web pages. Depending on your needs, you can choose among the many available options, such as Beautiful Soup, Scrapy, Selenium, etc.

In [24]:
# A short string that mixes plain ASCII with emoji characters.
mySimpleTex = 'Oh my oh my ❗️🏧'

# Convert the text to its UTF-8 byte representation; the non-ASCII
# characters show up as multi-byte escape sequences when printed.
mySimpleTextEnc = bytes(mySimpleTex, "utf-8")

print(mySimpleTextEnc)

b'Oh my oh my \xe2\x9d\x97\xef\xb8\x8f\xf0\x9f\x8f\xa7'
