In [1]:
# Web Scrape Wikipedia example 1
# https://en.wikipedia.org/wiki/COVID-19_pandemic_in_the_Republic_of_Ireland
# IPython magic: set the Completer's use_jedi option to False.
%config Completer.use_jedi = False

In [2]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Download the whole HTML page and parse it, so that individual elements
# (such as the first h1 heading) can be looked up afterwards.
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_in_the_Republic_of_Ireland"

# The timeout stops the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)

# Stop on a 4xx/5xx answer instead of parsing an error page by mistake.
response.raise_for_status()

html = response.content

# webpage is a BeautifulSoup object. Naming the parser explicitly avoids the
# "no parser was explicitly specified" warning and keeps the parse the same
# on machines that have different parsers installed.
webpage = BeautifulSoup(html, "html.parser")

# print(webpage.prettify())

In [4]:
# Look up the first <h1> element on the page, then read just its text.
h1 = webpage.find("h1")

h1.text  # same result as h1.get_text()

'COVID-19 pandemic in the Republic of Ireland'

In [5]:
# Narrow the search to the <h1> whose id attribute is "firstHeading".
h1_page_heading = webpage.find("h1", id="firstHeading")

h1_page_heading

<h1 class="firstHeading" id="firstHeading" lang="en">COVID-19 pandemic in the Republic of Ireland</h1>

In [6]:
# The same lookup written as a CSS selector: tag h1, id firstHeading,
# class firstHeading. An alternative, shorter selector is "h1#firstHeading".
h1_page_heading2 = webpage.select_one("h1#firstHeading.firstHeading")
h1_page_heading2

<h1 class="firstHeading" id="firstHeading" lang="en">COVID-19 pandemic in the Republic of Ireland</h1>

In [7]:
# select() returns a list of every match, so take the first element
# before reading the heading text.
h1_page_headings = webpage.select("h1#firstHeading")
page_title = h1_page_headings[0].text  # same result as .get_text()
page_title

'COVID-19 pandemic in the Republic of Ireland'

In [8]:
# Try and download a specific image from the page
# Build the full URL of one specific image so that it can be downloaded.
wikipedia_web_address = "https://en.wikipedia.org"

# NOTE: the hash-like last segment of this path has changed before (the earlier
# value is kept in the comment below), so it may need updating again.
#image_scr = "/api/rest_v1/page/graph/png/COVID-19_pandemic_in_the_Republic_of_Ireland/0/f742a3ab683d2e18ce3da6857a5b57d0a1eb8c2d.png"
image_scr = "/api/rest_v1/page/graph/png/COVID-19_pandemic_in_the_Republic_of_Ireland/0/902d16b30bcac33076e2304cf2fdf15d5006bde2.png"

# Site address + path = absolute image URL.
image_url = "".join((wikipedia_web_address, image_scr))
image_url

'https://en.wikipedia.org/api/rest_v1/page/graph/png/COVID-19_pandemic_in_the_Republic_of_Ireland/0/902d16b30bcac33076e2304cf2fdf15d5006bde2.png'

In [9]:
# Download the image. The timeout keeps the cell from hanging if the server
# never answers.
response2 = requests.get(image_url, timeout=30)

# Raise on a 4xx/5xx status so that an error body is never mistaken for
# (and later saved as) the image itself.
response2.raise_for_status()
response2

<Response [200]>

In [10]:
# Raw body of the image response, as bytes.
image_content = response2.content

In [11]:
# image_content holds bytes (binary data), not text like an HTML or txt file,
# so the output file has to be opened in binary mode.

# Save the image as test_image.png.
# "wb" opens the file for writing in binary format: an existing file is
# overwritten, and a missing file is created.
with open("test_image.png", mode="wb") as image_file:
    image_file.write(image_content)
    print("File written")

File written


In [12]:
#image_url = "https://en.wikipedia.org/api/rest_v1/page/graph/png/COVID-19_pandemic_in_the_Republic_of_Ireland/0/f742a3ab683d2e18ce3da6857a5b57d0a1eb8c2d.png"

In [13]:
# Use a regex to pull the image file name (for example 'black-cat1.jpg') off
# the end of the image URL. Only the common extensions png, gif, jpg and bmp
# are matched.
#
# How the pattern reads:
#   /                   a slash directly before the file name
#   [\w-]+              one or more word characters (letters, digits 0-9,
#                       underscore) or hyphens, because some file names look
#                       like 'black-cat1.jpg'
#   [.]                 a literal dot (written as a character class)
#   (png|gif|jpg|bmp)   group 2: any one of these extensions
#   $                   the match must reach the end of the string
# Group 1 wraps the file name plus its extension, without the leading slash.
image_name_pattern = r'/([\w-]+[.](png|gif|jpg|bmp))$'

image_filename = re.search(image_name_pattern, image_url)

# re.search returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError on the .group() calls below.
if image_filename is None:
    raise ValueError(
        "Could not find an image file name at the end of {!r}".format(image_url)
    )

# image_filename is a re.Match object.
# Match objects (can) contain groups of matches relating to
# what was matched in the () parts of the regex expression.

print(image_filename.group()) # the whole matched pattern
print(image_filename.group(0)) # the whole matched pattern
print(image_filename.group(1)) # the first group ([\w-]+[.](png|gif|jpg|bmp))
print(image_filename.group(2)) # the second group (png|gif|jpg|bmp)

/902d16b30bcac33076e2304cf2fdf15d5006bde2.png
/902d16b30bcac33076e2304cf2fdf15d5006bde2.png
902d16b30bcac33076e2304cf2fdf15d5006bde2.png
png


In [14]:
# Save the image again, this time under the file name taken from the URL
# (group 1 of the regex match, which has no leading slash).
filename = image_filename.group(1)
with open(filename, mode='wb') as image_file:
    image_file.write(image_content)
    print('File has been written')

File has been written
