In [86]:
## manipulate stuff from internet
import requests
import bs4

## manipulate local files
import shutil
import os

## for converting strings properly
import string

## for timing info
import time

In [78]:
# Substrings that mark a file name as acceptable; any single hit is enough.
SAFEWORDS = [
    'Dream',
    'HOME',
    'Spr',
    '20th Anniversary',
    ' s',
    'OD',
]
def has_safeword(s):
    '''returns True when s meets the "has a safeword" requirement, else False.

    s qualifies if it contains any entry of SAFEWORDS, or if it contains
    'Café Mix' together with either 'guest' or 'staff'.
    All checks are plain, case-sensitive substring tests.
    '''
    if any(word in s for word in SAFEWORDS):
        return True
    # 'Café Mix' alone is not enough; it must be paired with 'guest' or 'staff'.
    return 'Café Mix' in s and ('guest' in s or 'staff' in s)
    
def is_probably_image_of_poke(filename, pokename, pokenumber):
    '''returns whether filename probably represents a decent image of the pokemon or not.

    filename   -- image file name; underscores are treated as spaces.
    pokename   -- pokemon name; compared in string.capwords() form.
    pokenumber -- pokedex number; compared zero-padded to at least 3 digits.

    A file name is accepted when either:
      1. it starts with the padded number and the next character exists and
         is not a space (e.g. '385Jirachi.png'), or
      2. it contains the padded number or the capitalised name AND
         has_safeword() accepts it.
    '''
    number = str(pokenumber).zfill(3)  #zfill(3) pads with leading 0s up to length 3.
    readable = filename.replace('_', ' ')

    # Slice (not index) the character after the number so a file name that is
    # exactly the number cannot raise IndexError, and use len(number) so that
    # dex numbers of 1000 and above (4 digits) are matched as well.
    following = readable[len(number):len(number) + 1]
    if readable.startswith(number) and following not in ('', ' '):
        return True

    name = string.capwords(pokename)
    if (number in readable or name in readable) and has_safeword(readable):
        return True

    return False

def local_file(name, folder, i=0):
    '''builds the local path for a downloaded file, e.g. folder/000-name.

    i is converted to a string and zero-padded to at least 3 characters, then
    joined to name with a '-' and placed inside folder.
    '''
    prefix = str(i).zfill(3)
    basename = '-'.join((prefix, name))
    return os.path.join(folder, basename)

def requests_get_SAFE(url, valid_codes=(200,), **kwargs):
    '''returns requests.get(url, **kwargs) after checking the status code.

    url         -- address to fetch.
    valid_codes -- collection of acceptable HTTP status codes ((200,) by default).
    **kwargs    -- forwarded unchanged to requests.get.

    Raises AssertionError when the response status code is not in valid_codes.
    '''
    r = requests.get(url, **kwargs)
    # Raise explicitly instead of using an assert statement: asserts are
    # stripped when Python runs with -O, which would silently disable the check.
    if r.status_code not in valid_codes:
        raise AssertionError(
            'Bad status_code, got '+str(r.status_code)+' but expected value in list'+str(list(valid_codes)))
    return r

In [None]:
## constants ##
# Local directory that downloaded images are written under.
DATASET_DIR = 'dataset/scraped/'

# Parser name handed to bs4.BeautifulSoup.
PARSER = 'html.parser'

# Site root plus the two wiki URL prefixes derived from it.
BASE_URL = 'https://archives.bulbagarden.net'
ARCHIVE_URL = '{}/wiki/Category:'.format(BASE_URL)
FILE_URL = '{}/wiki/File:'.format(BASE_URL)
# Length of the '/wiki/File:' path prefix, i.e. FILE_URL without BASE_URL.
L_PREFIX = len(FILE_URL) - len(BASE_URL)

# Which pokemon to fetch images for, and its pokedex number.
pokemon = 'Jirachi'
pokenumber = 385

def download_images(pokemon, pokenumber, filepath=None):
    '''downloads images of pokemon from the Bulbagarden Archives site (BASE_URL).

    pokemon    -- pokemon name; its string.capwords() form (spaces replaced by
                  underscores) selects the category page to scrape.
    pokenumber -- pokedex number, used by is_probably_image_of_poke to filter files.
    filepath   -- target directory; defaults to os.path.join(DATASET_DIR, pokemon).
                  Created (including missing parents) when it does not exist.

    Each accepted image is saved as local_file(picname, filepath, picnum), where
    picnum is the image's position among all image links on the category page.
    Raises AssertionError (from requests_get_SAFE) on an unexpected status code.
    '''
    # Function-scope import so this block carries its own dependency.
    from urllib.parse import urljoin

    ## set up local stuff ##
    if filepath is None:
        filepath = os.path.join(DATASET_DIR, pokemon)
    # makedirs also creates missing parent directories such as DATASET_DIR itself.
    os.makedirs(filepath, exist_ok=True)

    ## begin getting images ##
    # Capitalise only the pokemon name; running capwords over the whole URL
    # would lowercase everything after its first character.
    pics_url  = ARCHIVE_URL + string.capwords(pokemon).replace(' ', '_')
    pics_page = requests_get_SAFE(pics_url) #page with many images on it. e.g. .../wiki/Category:Jirachi
    pics_soup = bs4.BeautifulSoup(pics_page.content, PARSER)
    # Strip the leading '/wiki/File:' (L_PREFIX characters) from every image link.
    picnames  = [a.get('href')[L_PREFIX:] for a in pics_soup.find_all('a', class_='image')]
    picnames_and_nums = [
        (picname, picnum)
        for picnum, picname in enumerate(picnames)
        if is_probably_image_of_poke(picname, pokemon, pokenumber)
    ]
    for picname, picnum in picnames_and_nums:
        pic_page = requests_get_SAFE(FILE_URL + picname)
        pic_soup = bs4.BeautifulSoup(pic_page.content, PARSER)
        # urljoin resolves protocol-relative ('//host/..'), site-relative
        # ('/path') and already-absolute hrefs alike.
        img_url = urljoin(BASE_URL, pic_soup.find('a', class_='internal').get('href'))
        # Close the streamed response once the body has been copied to disk.
        with requests_get_SAFE(img_url, stream=True) as r:
            with open(local_file(picname, filepath, picnum), 'wb') as f:
                r.raw.decode_content = True  #decode gzip/deflate while reading the raw stream
                shutil.copyfileobj(r.raw, f)