In [142]:
## manipulate stuff from internet
import requests
import bs4

## manipulate local files
import shutil
import os

## for converting strings properly
import string

## for timing info
import time

## for list of all the pokemon
import  DataProcessing as dp #ProjectOAK file with data  processing functions.

In [105]:
### functions! ###
## determine if image name implies image is probably image of pokemon. ##
SAFEWORDS = ['Dream', 'HOME', 'Spr', '20th Anniversary', ' s', 'OD']
def has_safeword(s):
    '''returns whether s satisfies the "has a safeword" requirement.'''
    for i in SAFEWORDS:
        if i in s: return True
    if 'Café Mix' in s:
        if 'guest' in s or 'staff' in s: return True
    return False
    
def is_probably_image_of_poke(filename, pokename, pokenumber):
    '''returns whether filename probably represents a decent image of the pokemon or not.'''
    N = str(pokenumber).zfill(3)  #zfill(3) makes leading 0s until len(string)==3.
    Fname = filename.replace('_', ' ').replace("'","%27")
    if (Fname[:3]==N) and (Fname[3]!=' '): return True
    
    P = string.capwords(pokename)
    if ((N in Fname) or (P in Fname)) and has_safeword(Fname): return True
    
    return False

## naming convention for how to save downloaded files. ##
def local_file(name, folder, i=0):
    '''returns name for local file. e.g. folder/000-name'''
    return os.path.join(folder, str(i).zfill(3)+'-'+name)

## get stuff from url and ensure the request actually went through. ##
def requests_get_SAFE(url, valid_codes=[200,], **kwargs):
    '''returns requests.get(url, **kwargs). Also asserts that status code is in valid_codes(==[200,] by default).'''
    r = requests.get(url, **kwargs)
    assert (r.status_code in valid_codes), \
            'Bad status_code, got '+str(r.status_code)+' but expected value in list'+str(valid_codes)
    return r

## convenience function for clearing a line during print statements. ##
def _print_clear(N=80):
    '''clears current printed line of up to N characters, and returns cursor to beginning of the line.
    debugging: make sure to use print(..., end=''), else your print statement will go to the next line.
    '''
    print('\r'+' '*N+'\r',end='')

In [130]:
## constants ##
DATASET_DIR = 'dataset/scraped/'  #location for data to go to, on local machine.

PARSER      = 'html.parser'
BASE_URL    = 'https://archives.bulbagarden.net'
ARCHIVE_URL = BASE_URL + '/wiki/Category:'
FILE_URL    = BASE_URL + '/wiki/File:'
L_PREFIX    = len('/wiki/File:')

def picnames_of_poke(pokemon):
    '''returns the names of all the (online) pics of pokemon'''
    pics_url  = string.capwords(ARCHIVE_URL + pokemon).replace(' ', '_')
    pics_page = requests_get_SAFE(pics_url) #page with many images on it. e.g. .../wiki/Category:Jirachi
    pics_soup = bs4.BeautifulSoup(pics_page.content, PARSER)
    picnames  = [x.get('href')[L_PREFIX:] for x in pics_soup.find_all('a', class_='image')]
    return picnames
    
def probably_good_picnames_and_nums(picnames, pokemon, pokenumber):
    '''returns [(picnames[i], i) for i in range(len(picnames)) if picnames[i] is probably an image of pokemon.'''
    return [(picnames[i], i) for i in range(len(picnames)) if is_probably_image_of_poke(picnames[i], pokemon, pokenumber)]

def download_images(pokemon, pokenumber, filepath=None, MAXN=None):
    '''downloads images of pokemon from bulbapedia!
    filepath = where to save images locally. defaults to os.path.join(DATASET_DIR, pokemon).
    MAXN = max number of images to download for this pokemon. defaults to None (get all the images). Mainly for debugging.
    returns number of images it tried to download.
    '''
    now=time.time()
    ## set up local stuff ##
    filepath  = filepath if filepath is not None else os.path.join(DATASET_DIR, pokemon)
    if not os.path.exists(filepath):
        os.mkdir(filepath)
    ## get list of images (at page of many images) ##
    picnames  = picnames_of_poke(pokemon)
    picnames_and_nums = probably_good_picnames_and_nums(picnames, pokemon, pokenumber)[:MAXN]
    for p in range(len(picnames_and_nums)):
        picname, picnum = picnames_and_nums[p]
        print('Downloading {:s} pic {:3d} of {:3d}; ({:3d} of {:3d} online): {:s}'.format(
               pokemon, p+1, len(picnames_and_nums), picnum+1, len(picnames), picname    ), end='')
        pic_url = FILE_URL + picname
        pic_page = requests_get_SAFE(pic_url)
        pic_soup = bs4.BeautifulSoup(pic_page.content, PARSER)
        img_url = 'https:'+pic_soup.find('a', class_='internal').get('href')
        r = requests_get_SAFE(img_url, stream=True)
        with open(local_file(picname, filepath, picnum), 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
        _print_clear(N=200)
    print('Completed download of {:3d} images for {:20s} in {:5.2f} seconds.'.format(len(picnames_and_nums), pokemon, time.time()-now))
    return len(picnames_and_nums) #returns number of

In [132]:
pokemon     = "Farfetch'd" #which poke to get images for.
pokenumber  = 122       #pokemon's pokedex number
#picnames_of_poke(pokemon) #tells which images exist for this pokemon. useful for debugging (esp. if no images are downloaded.)
#download_images(pokemon, pokenumber, MAXN=3) #actually download some images.

Completed download of   0 images for Farfetch'd           in  0.65 seconds.


In [143]:
csvdata, cc = dp.read_csv(dp.CSVFILE2)

Took  0.01 seconds to read data from dataset/Pokedex_Ver6.csv


In [154]:
pokerows    = csvdata[:,cc.CODE]=='1'
POKENAMES   = csvdata[:,cc.NAME][pokerows]
POKENUMBERS = csvdata[:,cc.NUMBER][pokerows]

In [160]:
for i in range(len(POKENAMES)):
    pokemon, pokenumber = POKENAMES[i], POKENUMBERS[i]
    failed = []
    try:
        download_images(pokemon, pokenumber)
    except:
        print('failed to get any images for pokemon {:s}'.format(pokemon))
        failed+=[pokemon]

Completed download of  76 images for Bulbasaur            in 42.31 seconds.                                                                                                                             
Completed download of  76 images for Ivysaur              in 41.30 seconds.                                                                                                                             
Completed download of 114 images for Venusaur             in 69.98 seconds.                                                                                                                             
Completed download of  77 images for Charmander           in 46.74 seconds.                                                                                                                             
Completed download of  75 images for Charmeleon           in 43.79 seconds.                                                                                                                         

In [161]:
failed

[]

In [163]:
#woops I did a bug (failed=[] inside loop), so it didn't track which ones failed... I just got them manually & put here:
failed = ['Nidoran F', 'Nidoran M', 'Ursaring','Ho-Oh', 'Porygon-Z', 'Flabebe', 'Meowstic M', 'Convisquire', 'Conviknight', 'Indeedee M', 'Zamazanta']
failed_partly = ['Tentacruel', 'Grimer', 'Weezing', 'Ditto', 'Wobbuffet', 'Remoraid', 'Stantler', 'Blaziken', 'Spoink', 'Lumineon', 'Sawsbuck', 'Lampent', 'Quilladin']

#note some of the 'failed' names are just typos in the csv... e.g. Conviknight should be Corviknight.

In [173]:
#let's try the failed_partly list again; it's possible there was just a disconnection issue or something that messed it up.
partlyrows    = [x[0] for x in dp.vectorized_rows_where(failed_partly, csvdata)]
partlynames   = csvdata[:,cc.NAME][partlyrows]
partlynumbers = csvdata[:,cc.NUMBER][partlyrows]

In [174]:
failed2 = []
for i in range(len(partlyrows)):
    pokemon, pokenumber = partlynames[i], partlynumbers[i]
    try:
        download_images(pokemon, pokenumber)
    except:
        print('failed to get any images for pokemon {:s}'.format(pokemon))
        failed2+=[pokemon, pokenumber]

Completed download of  68 images for Tentacruel           in 37.55 seconds.                                                                                                                             
Completed download of  82 images for Grimer               in 45.04 seconds.                                                                                                                             
Completed download of  79 images for Weezing              in 40.56 seconds.                                                                                                                             
Completed download of  67 images for Ditto                in 43.25 seconds.                                                                                                                             
Completed download of  88 images for Wobbuffet            in 44.27 seconds.                                                                                                                         

**It looks like these partial failures were probably due to internet connection issues or maybe Bulbapedia didn't like my continual requests for data. I don't know enough about data scraping to really diagnose why it failed the first time, but hey, it worked now so we're good to go :)**

Just leaving a quick note here; the bulbapedia pages the images were scraped from from have a max of 200 images per page. Usually the most relevant images are at the top but some pokemon have many many images, such as over 900 for Eevee, and some of the images on pages after the first page are actually decent. It is surely *possible* to get more data by going through these pages, though for now let's just ignore them because we have a decent amount of data already.

In [None]:
#TODO: get images for the FULLY failed pokemon.
#This will probably require going through one by one and figuring out how they are referred to on Bulbapedia.

#TODO: note that any files which have a shiny version (e.g. 385.png & 385_s.png) usually are clean images.
#Previously, '_s' was even used as a safeword (safe to include those images).
#The to do here is to use this info to also get non-shiny versions of any images
#    which were included by merit of the '_s' safeword, if/when such images actually exist.
#    This is likely to be a high payout in terms of getting good quality data.