### Web Scrape Wikipedia example 2
### https://en.wikipedia.org/wiki/COVID-19_pandemic_in_the_Republic_of_Ireland


In [1]:
%config Completer.use_jedi = False

import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from textblob import TextBlob

In [2]:
# Make sure the output folder for the downloaded images exists,
# and report whether it had to be created.

foldername = 'tatts'
if os.path.exists(foldername):
    print(f'Folder {foldername} already exists.')
else:
    os.makedirs(foldername)
    print(f'Folder {foldername} has been created.')

Folder tatts has been created.


In [3]:
# Page whose images are going to be downloaded.
url = (
    "https://www.tattersalls.com"
    "/news/four-top-1-000-000-guineas-on-opening-day-of-tattersalls-october-book-1"
)

In [4]:
# Fetch the HTML of the page; the images themselves are downloaded in a later cell.
# - timeout: without one, requests waits indefinitely on an unresponsive server.
# - raise_for_status(): fail here on a 4xx/5xx answer rather than parsing an error page.

response = requests.get(url, timeout=10)
response.raise_for_status()
response

<Response [200]>

In [5]:
# Build the site root (scheme + host) from the page URL.

from urllib.parse import urlparse

parsed_url = urlparse(url)
# .scheme and .netloc are the first two fields of the parse result.
base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
base_url


'https://www.tattersalls.com'

In [6]:
# Parse the downloaded page and collect every <img> tag on it.
# The parser is named explicitly: when it is omitted, BeautifulSoup picks
# whichever parser happens to be installed (and emits a warning), so the
# parsed tree can differ from one machine to another.

soup = BeautifulSoup(response.content, 'html.parser')

all_images = soup.find_all('img')
print(f'There is: {len(all_images)} images on this page')

There is: 12 images on this page


In [7]:
# Collect the src value of every image tag.
# Tags with a missing or empty src are skipped: img['src'] would raise
# KeyError on a tag that has no src attribute at all.
images_sources = [img['src'] for img in all_images if img.get('src')]
images_sources

['/img/logo.svg',
 '/img/social-facebook--gold.svg',
 '/img/social-twitter--gold.svg',
 '/img/social-email--gold.svg',
 'https://www.tattersalls.com/img/news/big/2020oc1lot174.jpg',
 '/img/news/big/2020OC1Lot109.jpg',
 '/img/news/big/2020oc1lot41.jpg',
 '/img/news/big/2020oc1lot162.jpg',
 '/img/logo-icon.svg',
 '/img/newmarket/social-facebook.svg',
 '/img/newmarket/social-twitter.svg',
 '/img/newmarket/social-instagram.svg']

### <font color='red'>When an image source starts with a single `/` it is a RELATIVE path (relative to the site root), so we need to add the base URL in front.</font> <br>

### <font color='red'>If an image source starts with a double `//` it already contains the host and is only missing the scheme, so all we need to do is add 'https:' in front.</font>

In [8]:
# Sanity check: print the file-name part of every source the pattern recognises.
# The pattern matches a final path segment ending in .png/.gif/.jpg/.bmp;
# capture group 1 is the file name without the leading slash.
img_regex = r'/([\w.%-]+[.](png|gif|jpg|bmp))$'


name_matches = (re.search(img_regex, src) for src in images_sources)
for found in name_matches:
    if found is not None:
        print(found.group())
   

/2020oc1lot174.jpg
/2020OC1Lot109.jpg
/2020oc1lot41.jpg
/2020oc1lot162.jpg


In [9]:
# Download every image source whose file name matches img_regex into foldername.
#
# For each source:
#   1. resolve it to an absolute URL,
#   2. skip it (with a message) if no image file name can be extracted,
#   3. download it, and only then write the bytes to disk.
# Sources that fail to download are reported and counted instead of being
# saved as empty or broken files.

failed_downloads = []

for image in images_sources:
    # urljoin resolves the source against the page URL the way a browser does:
    # absolute URLs are left untouched, '//host/...' inherits the page's scheme,
    # '/path' is attached to the site root and 'path' to the page's directory.
    image_url = urljoin(url, image)

    image_name = re.search(img_regex, image_url)

    if not image_name:
        print(f'Could not find an image name in {image_url}')
        continue

    # Fetch before opening the target file, so a failed request never leaves
    # an empty file behind; raise_for_status() keeps error pages from being
    # saved under an image name.
    try:
        image_response = requests.get(image_url, timeout=10)
        image_response.raise_for_status()
    except requests.RequestException as error:
        print(f'Could not download {image_url}: {error}')
        failed_downloads.append(image_url)
        continue

    # group(1) is the bare file name, without the leading slash
    filepath = os.path.join(foldername, image_name.group(1))

    with open(filepath, 'wb') as f:
        f.write(image_response.content)

if failed_downloads:
    print(f'{len(failed_downloads)} image(s) could not be downloaded')
else:
    print('All images are written')

Could not find an image name in https://www.tattersalls.com/img/logo.svg
Could not find an image name in https://www.tattersalls.com/img/social-facebook--gold.svg
Could not find an image name in https://www.tattersalls.com/img/social-twitter--gold.svg
Could not find an image name in https://www.tattersalls.com/img/social-email--gold.svg
Could not find an image name in https://www.tattersalls.com/img/logo-icon.svg
Could not find an image name in https://www.tattersalls.com/img/newmarket/social-facebook.svg
Could not find an image name in https://www.tattersalls.com/img/newmarket/social-twitter.svg
Could not find an image name in https://www.tattersalls.com/img/newmarket/social-instagram.svg
All images are written
