### Web Scrape Wikipedia example 2
### https://en.wikipedia.org/wiki/COVID-19_pandemic_in_the_Republic_of_Ireland


In [1]:
%config Completer.use_jedi = False

import os
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from textblob import TextBlob

In [2]:
# Make sure the folder that will hold the downloaded images is available,
# and report whether it had to be created.

foldername = 'images'
if os.path.exists(foldername):
    print(f'Folder {foldername} already exists.')
else:
    os.makedirs(foldername)
    print(f'Folder {foldername} has been created.')

Folder images already exists.


In [3]:
# Wikipedia article whose images are scraped in this notebook.
url = 'https://en.wikipedia.org/wiki/COVID-19_pandemic_in_the_Republic_of_Ireland'

In [4]:
# Fetch the HTML of the article page.
#
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error (4xx/5xx)
# instead of letting later cells parse an error page.

response = requests.get(url, timeout=30)
response.raise_for_status()
response

<Response [200]>

In [5]:
# Build the site root of the page URL: its scheme and host joined by '://'
# (for the article URL used here this is 'https://en.wikipedia.org').
#
# urlparse is imported in the notebook's import cell. The named attributes
# .scheme and .netloc are the same values as positions 0 and 1 of the parse
# result, but say what they are.

parsed_url = urlparse(url)
base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
base_url


'https://en.wikipedia.org'

In [6]:
# Parse the downloaded HTML and collect every <img> tag on the page.
#
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (so results can differ between machines)
# and emits a warning. 'html.parser' ships with Python, so no extra
# dependency is needed.

soup = BeautifulSoup(response.content, 'html.parser')

all_images = soup.find_all('img')
print(f'There is: {len(all_images)} images on this page')

There is: 49 images on this page


In [7]:
# Collect the src attribute of every image tag.
# Tags with a missing or empty src are skipped; indexing img['src'] directly
# would raise KeyError on the first <img> that has no src attribute.
images_sources = [img['src'] for img in all_images if img.get('src')]
images_sources

['//upload.wikimedia.org/wikipedia/commons/thumb/a/a9/Confirmed_cases_of_COVID-19_in_Ireland_by_county.webm/220px--Confirmed_cases_of_COVID-19_in_Ireland_by_county.webm.jpg',
 '//upload.wikimedia.org/wikipedia/commons/thumb/8/8f/Confirmed_cases_of_COVID-19_per_100k_inhabitants_in_Ireland.webm/220px--Confirmed_cases_of_COVID-19_per_100k_inhabitants_in_Ireland.webm.jpg',
 '/api/rest_v1/page/graph/png/COVID-19_pandemic_in_the_Republic_of_Ireland/0/4a4c75fb1d1ff19c2d21b42abc2b0835c8e1ad12.png',
 '/api/rest_v1/page/graph/png/COVID-19_pandemic_in_the_Republic_of_Ireland/0/9d94d24c11c1ecb40d77a5d0d8fb9c558b645edf.png',
 '/api/rest_v1/page/graph/png/COVID-19_pandemic_in_the_Republic_of_Ireland/0/1309fecde84289d41061dafede8fe6631f3406fd.png',
 '/api/rest_v1/page/graph/png/COVID-19_pandemic_in_the_Republic_of_Ireland/0/ef49aa69acd8b67e55d96372774cf57a7b7700fc.png',
 '/api/rest_v1/page/graph/png/COVID-19_pandemic_in_the_Republic_of_Ireland/0/8758e4d6b9691a45073b7c0eb05c0e152887a2a9.png',
 '/api/r

### <font color='red'>When an image source starts with a single `/` it is a RELATIVE path: we need to add the base URL in front.</font> <br>

### <font color='red'>If an image source starts with a double `//` it is a protocol-relative URL that already contains the host: all we need is to add 'https:' in front.</font>

In [11]:
# Pattern that pulls the image file name out of an image source.
#   (?i)         - match case-insensitively, so '.JPG' / '.PNG' are accepted too
#   /            - the last '/' before the file name
#   group(1)     - the file name itself (word characters, '.', '%' and '-'),
#                  ending in one of the listed image extensions
#   group(2)     - the extension alone
#   $            - the name must be the very end of the source, so sources that
#                  end in anything else (e.g. a query string) do not match
img_regex = r'(?i)/([\w.%-]+[.](png|gif|jpe?g|bmp|svg|webp))$'


# Dry run of the pattern over every image source: print the matched text
# (the whole match, so it includes the leading '/'); sources in which the
# pattern finds nothing are skipped without a message.
for source in images_sources:
    found = re.search(img_regex, source)
    if found is None:
        continue
    print(found.group())
   

/220px--Confirmed_cases_of_COVID-19_in_Ireland_by_county.webm.jpg
/220px--Confirmed_cases_of_COVID-19_per_100k_inhabitants_in_Ireland.webm.jpg
/4a4c75fb1d1ff19c2d21b42abc2b0835c8e1ad12.png
/9d94d24c11c1ecb40d77a5d0d8fb9c558b645edf.png
/1309fecde84289d41061dafede8fe6631f3406fd.png
/ef49aa69acd8b67e55d96372774cf57a7b7700fc.png
/8758e4d6b9691a45073b7c0eb05c0e152887a2a9.png
/ebd6e5e2ae198fd765116fbca51f5e87111a48a2.png
/355d4a8fead212453b682614171b41897938a2da.png
/10429fae5ab785e1831b667cd57c1837e2e229d1.png
/e11f1f8f64868d5e422975be6717e9342be66ce8.png
/50f6aad61a2a8195a12d35a437de8ea0ebbcd3b8.png
/7f1678c44e5a535f8a33b88edbf9be819c53107a.png
/5263381c4e3ebeabf2157a73d4381182ee81ce80.png
/440px-CoViD-19_IE.svg.png
/d03079a48ab27bc93af806e49216889d80e437ee.png
/43fdcb8fc91d4c156198e30d0a1175f25fbb37e3.png
/90px-A_barber_shop_on_June%2C_29%2C_2020%2C_the_first_day_of_the_third_phase_of_the_lifting_of_public_health_restrictions.jpg
/90px-Shop_door_in_Ireland_during_the_2020_coronavirus_pand

In [19]:
# Download every image referenced on the page into the image folder.
#
# Each source is resolved against the page URL with urljoin, which handles
# every form a src can take:
#   '//host/path'  -> inherits the page's scheme (https, not plain http)
#   '/path'        -> gets the page's scheme and host
#   'http(s)://…'  -> already absolute, returned unchanged
#   'path'         -> resolved relative to the page (plain concatenation with
#                     the base URL would glue host and path together)
for source in images_sources:
    image = urljoin(url, source)

    image_name = re.search(img_regex, image)
    if not image_name:
        print(f'Could not find an image name in {image}')
        continue

    # Request the image BEFORE opening the target file, so a failed download
    # does not leave an empty file behind. raise_for_status() rejects HTTP
    # error responses so an error page is never saved under an image name,
    # and the timeout keeps one stalled request from hanging the whole loop.
    try:
        image_response = requests.get(image, timeout=30)
        image_response.raise_for_status()
    except requests.RequestException as error:
        # Best effort: report the failure and carry on with the next image.
        print(f'Could not download {image}: {error}')
        continue

    image_content = image_response.content

    # group(1) is the bare file name (no leading '/'). Images that share a
    # file name overwrite each other in the folder.
    filepath = os.path.join(foldername, image_name.group(1))
    with open(filepath, 'wb') as f:
        f.write(image_content)

print('All images are written')

Could not find an image name in http://en.wikipedia.org/wiki/Special:CentralAutoLogin/start?type=1x1
All images are written
