In [1]:
# Web scrape Wikipedia example 2
%config Completer.use_jedi = False
import requests
import re
from bs4 import BeautifulSoup
from textblob import TextBlob
import os

In [2]:
# URL of our webpage
webpage_url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_in_the_Republic_of_Ireland"
# webpage_url = "http://www.rte.ie"

In [3]:
# Try and scape all the images from a wikipedia page and save them to a sub folder
response = requests.get(webpage_url)

# Get the base path of the URL
from urllib.parse import urlparse

parsed_url = urlparse(webpage_url)
base_url = parsed_url[0] + "://" + parsed_url[1]
base_url

'https://en.wikipedia.org'

In [4]:
# Make a subdirectory for our images
from datetime import datetime

# Auto genertae the folder name for the script to use by using the base url and the current time and date
folder_name = parsed_url[1] + datetime.now().strftime(" %m %d %Y %H-%M-%S")

if not os.path.exists(folder_name):
    os.makedirs(folder_name)
    print(f"Folder '{folder_name}' has been created.")
else:
    print(f"Folder '{folder_name}' already exists.")

Folder 'en.wikipedia.org 10 07 2020 10-11-55' has been created.


In [5]:
# Use BS to get all the image tags and get all the image scr values
html = response.content # html text
webpage_soup = BeautifulSoup(html)

# webpage_soup

# all_imgs is a list of img tag BS objects
all_imgs = webpage_soup.find_all("img")

print(len(all_imgs))

49


In [6]:
# Make a list of the the image sources
img_source_urls = [ img["src"] for img in all_imgs]

# Some images ont he page may be the same so lets get the unique list
img_source_urls = list(set(img_source_urls))

img_source_urls
# images that start with a single '/'' are relative paths - to make the full url we add 'https://en.wikipedia.org'
# images that start with a double '//' are absolute paths - to make the full url we need to add just 'https:'

['//upload.wikimedia.org/wikipedia/commons/thumb/5/5f/Pandemic_pints.jpg/120px-Pandemic_pints.jpg',
 '//upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Pharmacy_in_Ireland_during_the_2020_coronavirus_pandemic.jpg/120px-Pharmacy_in_Ireland_during_the_2020_coronavirus_pandemic.jpg',
 '//upload.wikimedia.org/wikipedia/commons/thumb/3/31/Queue_at_Irish_grocery_store_during_the_2020_coronavirus_pandemic.jpg/90px-Queue_at_Irish_grocery_store_during_the_2020_coronavirus_pandemic.jpg',
 '//upload.wikimedia.org/wikipedia/commons/thumb/3/3c/Collection_of_books_at_an_Irish_primary_school_during_the_2020_pandemic.jpg/90px-Collection_of_books_at_an_Irish_primary_school_during_the_2020_pandemic.jpg',
 '/static/images/footer/wikimedia-button.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/0/02/Bus_%C3%89ireann_social_distancing_signs_on_a_bus_in_October_2020.jpg/120px-Bus_%C3%89ireann_social_distancing_signs_on_a_bus_in_October_2020.jpg',
 '//upload.wikimedia.org/wikipedia/commons/thumb/8/8c

In [7]:
# Test can we extract all the image names from the list of image sources
# with our regular expression
image_regex = r'/([\w.%-]+[.](png|gif|jpg|bmp))$'

for source in img_source_urls:
    match = re.search(image_regex,source)
    if match:
        print(match.group(1))

120px-Pandemic_pints.jpg
120px-Pharmacy_in_Ireland_during_the_2020_coronavirus_pandemic.jpg
90px-Queue_at_Irish_grocery_store_during_the_2020_coronavirus_pandemic.jpg
90px-Collection_of_books_at_an_Irish_primary_school_during_the_2020_pandemic.jpg
wikimedia-button.png
120px-Bus_%C3%89ireann_social_distancing_signs_on_a_bus_in_October_2020.jpg
120px-Bram_Stoker_Park_social_distancing_01.jpg
50f6aad61a2a8195a12d35a437de8ea0ebbcd3b8.png
d93ec6db9af8497e6acc3aae6d4017d823b46f8b.png
9bcd5f4ee98e1c15516d51f481f89072995f73e3.png
120px-Supermarket_social_distancing_signs.jpg
poweredby_mediawiki_88x31.png
120px-Porterstown_Park_in_lockdown.jpg
16px-Folder_Hexagonal_Icon.svg.png
d03079a48ab27bc93af806e49216889d80e437ee.png
120px-First_Day_of_School.jpg
f864b1831cf5f9c59cbd36212fd099de1598a3a0.png
1309fecde84289d41061dafede8fe6631f3406fd.png
43fdcb8fc91d4c156198e30d0a1175f25fbb37e3.png
d602cd929fed40d7eacad4fed77f36837efcf258.png
90px-A_barber_shop_on_June%2C_29%2C_2020%2C_the_first_day_of_the_th

In [8]:
# Next step would be to iterate through all the img_source_urls
# Make a request and get each images content
# Write it to a file using the image name that we extract using our regex expression on each image url
# NOTE: Need to add on either 'http://', "https://" or the base_url dependng on weather the url starts with
# '//' for an absolute path or '/' for relative path
# NOTE: Check to see first if the regex re.search is None before attempt to request the URL

count_imgs_written = 0

for img_source in img_source_urls:
    # if the image path (src) starts with "//" its an absolute path
    if img_source.startswith("//"):
        full_img_url = parsed_url[0] + ":" + img_source
    # if the image path already starts with 'http' then it already the full url path
    elif img_source.startswith("http"):
        full_img_url = img_source
    # else we assume it's a relative path that starts with '/' and we need to concat with our base url
    else:
        full_img_url = base_url + img_source
    
    # The full and correct image URL should now be full_img_url
       
    
    # Make sure we have valid img filename using regex(.gif,.jpg etc....)
    # img_file_name will either be a re.match object or 'None' if there was no match
    img_file_name_match = re.search(image_regex,full_img_url)
    
    # print(full_img_url,"\n")
    
    # if img_file_name is None (Meaning that no re.MatchPObject was returned by re.search)
    if not img_file_name_match:
        print(f"COULD NOT FIND A VALID IMAGE NAME IN:\n '{full_img_url}'\n")
        continue # back to the start of the for loop if this happens
    
    # Otherwise we have an img_file_name and lets try and download that image file 
    # using the img url and write it the images folder with the file name
    # filepath show be like "images\myimage.jpg" which we can construct by
    # joining folder_name with the img_file_name
    # The inage file name should be h first group in out re.Match object img_file_name
    # due the way we constructed the regex expression 
    
    # Assign the group(1) string to img_file_name tp holdthe file name
    img_file_name = img_file_name_match.group(1)
    
    # Make the full file path where the file should be written
    filepath = os.path.join(folder_name,img_file_name)
     
    # Trying to write the files was casing an issue for one filename/filepath as the filepath was too
    # long for the winndows file system (over 250 characters)
    # ...so we can catch any exception thrown by the 'with open(filepath,"wb") as f:' statement
    try:
        # Open a file for writing bytes of data "wb"
        with open(filepath,"wb") as f:       
            # Get the contents of the image from the request for the image url
            image_content = requests.get(full_img_url).content
            # Write the image content to the file
            f.write(image_content)
            print(f"IMAGE:\n '{img_file_name}'has been written to FOLDER: '{folder_name}'\n")
            count_imgs_written += 1                  
    except:
        print(f"AN ERROR OCCURRED. COULD NOT WRITE FILE: {filepath}")
        
print(f"TOTAL NUMBER OF FILES WRITTEN: {count_imgs_written}")

IMAGE:
 '120px-Pandemic_pints.jpg'has been written to FOLDER: 'en.wikipedia.org 10 07 2020 10-11-55'

IMAGE:
 '120px-Pharmacy_in_Ireland_during_the_2020_coronavirus_pandemic.jpg'has been written to FOLDER: 'en.wikipedia.org 10 07 2020 10-11-55'

IMAGE:
 '90px-Queue_at_Irish_grocery_store_during_the_2020_coronavirus_pandemic.jpg'has been written to FOLDER: 'en.wikipedia.org 10 07 2020 10-11-55'

IMAGE:
 '90px-Collection_of_books_at_an_Irish_primary_school_during_the_2020_pandemic.jpg'has been written to FOLDER: 'en.wikipedia.org 10 07 2020 10-11-55'

IMAGE:
 'wikimedia-button.png'has been written to FOLDER: 'en.wikipedia.org 10 07 2020 10-11-55'

IMAGE:
 '120px-Bus_%C3%89ireann_social_distancing_signs_on_a_bus_in_October_2020.jpg'has been written to FOLDER: 'en.wikipedia.org 10 07 2020 10-11-55'

IMAGE:
 '120px-Bram_Stoker_Park_social_distancing_01.jpg'has been written to FOLDER: 'en.wikipedia.org 10 07 2020 10-11-55'

IMAGE:
 '50f6aad61a2a8195a12d35a437de8ea0ebbcd3b8.png'has been writt