# Scraping images from a Fandom wiki

## Install missing modules

In [None]:
import os

In [None]:
# Install the third-party packages this notebook needs.
# Running pip as `sys.executable -m pip` targets the interpreter that executes
# this cell; a bare `pip` found on PATH may belong to another Python environment.
# `requests` is part of the list because the notebook imports it further down.
import subprocess
import sys

subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '--quiet',
     'jupyterlab', 'beautifulsoup4', 'pillow', 'requests'],
    check=False,  # as with the former os.system call, a pip failure does not raise
)

## Imports

In [None]:
import shutil
import subprocess

import requests
from bs4 import BeautifulSoup


In [None]:
# Into the Breach wiki pages to scrape; all of them live under the same /wiki/ root.
_wiki_root = "https://intothebreach.fandom.com/wiki/"
pilots = _wiki_root + "Pilots"
veks = _wiki_root + "Vek"
mechs = _wiki_root + "Mechs"
# Short page name -> page URL
pages = dict(pilots=pilots, veks=veks, mechs=mechs)

## Doing one page first

In [None]:
# Fetch the Pilots page over HTTP and keep the response body as text (the raw HTML)
pilots_html = requests.get(pilots).text

In [None]:
# Parse the downloaded HTML into a searchable tree, using the 'html.parser' backend
soup = BeautifulSoup(pilots_html, 'html.parser')

In [None]:
soup.title.string

### Pulling one image only

In [None]:
# Collect every <img …> tag of the parsed page into a list
images = soup.find_all('img')

In [None]:
# Pick a single <img> tag to experiment with. Index 4 is an arbitrary choice;
# it raises IndexError if the page holds fewer than five <img> tags.
test = images[4]

In [None]:
# Read the image URL from the tag's 'src' attribute (.get() gives None when the attribute is absent)
src = test.attrs.get('src')

In [None]:
src

In [None]:
# Locate where "png" starts in the URL, so that everything after the extension can be trimmed.
# str.index raises ValueError when "png" does not occur in the string.
src.index('png')

In [None]:
# Keep the URL up to and including "png" (+3 == len("png")); whatever follows the extension is dropped
src = src[:src.index('png')+3]

In [None]:
# Download the image with wget (-c: resume a partial download, -q: quiet).
# The URL is handed over as a separate argument rather than pasted into a shell
# command line, so characters such as & ? ' ; in a scraped URL are never
# interpreted by a shell. Raises FileNotFoundError if wget is not installed.
subprocess.run(["wget", "-cq", src], check=False)

# curl version: comment out the wget call above and uncomment the two lines below
#name = images[4].attrs.get("data-image-name").replace(" ", "_")
#subprocess.run(["curl", "--silent", "--output", name, src], check=False)

### Pulling all the images from one page



In [None]:
# Collect every <img> tag of the parsed page again, this time to download all of them
images = soup.find_all('img')

In [None]:
# Download every PNG referenced by the page's <img> tags into the current directory.
for image in images:
    # The URL is either in 'src' or in 'data-src'. Take the first of the two that
    # is a real http(s) link containing ".png"; inline "data:" URIs, relative
    # paths and non-PNG images are skipped instead of raising.
    src = ''
    for attr in ('src', 'data-src'):
        candidate = image.attrs.get(attr) or ''
        if not candidate.startswith(('http://', 'https://')):
            continue
        cut = candidate.lower().find('.png')
        if cut != -1:
            # trim whatever follows the extension
            src = candidate[:cut + len('.png')]
            break
    if not src:
        continue
    # Name the file after 'data-image-name' when the tag has one, otherwise
    # after the last segment of the URL.
    name = image.attrs.get('data-image-name') or src.rsplit('/', 1)[-1]
    # basename() keeps a scraped name containing '/' from writing outside the
    # current directory.
    name = os.path.basename(name.replace(' ', '_'))
    if not name:
        continue
    # download with curl; arguments are passed as a list, so no shell parses
    # the scraped URL or file name
    subprocess.run(["curl", "--silent", "--output", name, src], check=False)
    # alternative : download with wget
    #subprocess.run(["wget", "-cq", src], check=False)
    

## clean up - delete downloaded png files

In [None]:
# Delete the PNG files sitting in the current working directory.
# Matching on '.png' (with the dot) avoids removing a file whose name merely
# ends in the letters "png"; the isfile() check skips directories, which
# os.remove cannot delete.
files = [f for f in os.listdir() if f.endswith('.png') and os.path.isfile(f)]
for f in files:
    os.remove(f)

### Download all the pngs in each page to subfolders

In [None]:
pages

In [None]:
def _png_url(image):
    """Return the PNG URL of an <img> tag, trimmed right after ".png", or '' if it has none."""
    # The URL can be in 'src' or in 'data-src'; 'src' is checked first.
    for attr in ('src', 'data-src'):
        candidate = image.attrs.get(attr) or ''
        # Only real links can be downloaded: skips inline "data:" URIs and relative paths.
        if not candidate.startswith(('http://', 'https://')):
            continue
        cut = candidate.lower().find('.png')
        if cut != -1:
            return candidate[:cut + len('.png')]
    return ''


def wiki_image_download(pages):
    """Download every PNG referenced by each wiki page into its own sub-folder.

    For each entry the page HTML is fetched and parsed, every <img> tag is
    scanned, and each PNG link found is downloaded with ``wget`` into a folder
    named after the dict key (created in the current working directory if it
    does not exist). A progress line is printed every 10 scanned tags and a
    summary line once per page.

    Args:
        pages: dict mapping a folder name to the URL of the page to scan.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched,
            including when the server does not answer within 30 seconds.
        FileNotFoundError: if the ``wget`` executable is not installed.
    """
    for dir_name, page_url in pages.items():
        # get the html and turn it into a beautiful soup; the timeout keeps an
        # unresponsive server from blocking the call forever
        html = requests.get(page_url, timeout=30).text
        soup = BeautifulSoup(html, 'html.parser')
        # scan the soup for all <img> tags
        images = soup.find_all('img')
        # create subfolder
        os.makedirs(dir_name, exist_ok=True)
        # iterate over the images. count is used to print some text every 10 images
        for count, image in enumerate(images, 1):
            # '' when the tag carries no downloadable png link
            src = _png_url(image)
            if src:
                # download in the subfolder (-c: resume partial files, -q: quiet).
                # Arguments are passed as a list, so the scraped URL is never
                # parsed by a shell.
                subprocess.run(
                    ["wget", "-cq", src, f"--directory-prefix={dir_name}"],
                    check=False,
                )
                # alternative : download with curl, uncomment the 2 lines below
                #name = os.path.basename(src)
                #subprocess.run(["curl", "--silent", "--output", os.path.join(dir_name, name), src], check=False)
            if count % 10 == 0:
                print(f"Scanned {count} images. Note that some might not be proper pngs and thus not downloaded.")
        print(f"Finished downloading {len(os.listdir(dir_name))} images in page {dir_name}.")

In [None]:
# Download the PNGs of every page listed in `pages` (pilots, veks, mechs), one sub-folder per page
wiki_image_download(pages)

## Now let's try our code on another similar wiki

In [None]:
# Common prefix of the Slay the Spire wiki page URLs built in the next cell
slay_the_spire_base_url = "https://slay-the-spire.fandom.com/wiki/"

In [None]:
# The four characters whose card pages will be scanned
slay_the_spire_characters = ['Ironclad', 'Silent', 'Defect', 'Watcher']

# Character name -> URL of its "<Name>_Cards" page
slay_the_spire_pages = dict(
    (name, slay_the_spire_base_url + name + '_Cards')
    for name in slay_the_spire_characters
)


In [None]:
slay_the_spire_pages

In [None]:
# Run the same downloader on the Slay the Spire card pages: one sub-folder per character
wiki_image_download(slay_the_spire_pages)

## Final clean up, delete the subfolders

<div class='alert alert-danger'>
    
Use the cell below to delete all the subfolders containing the downloaded images, if you don't need them.
    
</div>

In [None]:
# The download sub-folders are named after the keys of the two page dicts
folders_to_delete = [*pages, *slay_the_spire_pages]

# ignore_errors=True: a folder that does not exist (or cannot be removed) is skipped silently
for folder in folders_to_delete:
    shutil.rmtree(folder, ignore_errors=True)
