In [1]:
# import modules

import requests
from bs4 import BeautifulSoup
import re
from time import sleep
import csv
import pandas as pd

In [2]:
# url of pokemon list
url = 'https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number'

# mine content
r = requests.get(url)

# gather text
html_content = r.text

# convert html into beautiful soup object
soup = BeautifulSoup(html_content, 'lxml')

In [3]:
# Title tag of soup object
soup.title.string

'List of Pokémon by National Pokédex number - Bulbapedia, the community-driven Pokémon encyclopedia'

In [4]:
# Create pokemon list of Bulbapedia links

# gather all lines with links from beautiful soup object
http_links = soup.find_all('a', {'href': re.compile(r'\(Pok%C3%A9mon\)')})

# append link to pokemon list
pokemon_links = []
for link in http_links:
    if link['href'] not in pokemon_links:
        pokemon_links.append(link['href'])

In [5]:
# check for 809 Pokemon (includes Meltan & Melmetal)
len(pokemon_links)

809

In [6]:
# url of pokemon 
url_poke = 'https://bulbapedia.bulbagarden.net' + pokemon_links[1]

# mine content
r_poke = requests.get(url_poke)

# gather text
html_content_poke = r_poke.text

soup_poke = BeautifulSoup(html_content_poke, 'lxml')

In [7]:
soup.find_all('a', {'id': re.compile(r'top')})

[<a id="top"></a>]

In [8]:
# filter for stats

# hp
soup_poke.find_all('th', {'style': re.compile(r'#FA92B2; width: 30px')})[0].string.strip()

# attack
soup_poke.find_all('th', {'style': re.compile(r'#F5AC78; width: 30px')})[0].string.strip()

# stats and corresponding color bands on bulbapedia stat table
stats = ['hp', 'attack', 'defense', 'sp.atk', 'sp.def', 'speed']
colorbands = ['#FA92B2','#F5AC78','#FAE078','#9DB7F5','#A7DB8D','#FA92B2']

In [9]:
soup_poke.find_all('big')[0].string.strip()#, {'style': re.compile(r'#FA92B2; width: 30px')})

'Bulbasaur'

In [10]:
# function to pull a list of pokemon names with stats from bulbapedia

def poke_web_scrape(url_list):
    '''
    Scrape bulbapedia webite for pokemon name and stats
    
    input: list of wiki suffix
    output: list of lists with pokemon name and stats
    '''
    for poke_link in url_list:
        
        # obtain html
        url_poke = 'https://bulbapedia.bulbagarden.net' + poke_link
        r_poke = requests.get(url_poke)
        html_content_poke = r_poke.text
        soup_poke = r_poke = BeautifulSoup(html_content_poke, 'lxml')
        
        # initialize list
        poke_stat_list = []
        
        # obtain pokemon name and append to list
        poke_name = soup_poke.find_all('big')[0].string.strip()
        poke_stat_list.append(poke_name)
        
        # obtain hp, attack, defense, sp.atk, sp.def, and speed stats
        for colorband in colorbands:
            stat_string = colorband+'; width: 30px'
            stat_value = soup_poke.find_all('th', {'style': re.compile(stat_string)})[0].string.strip()
            poke_stat_list.append(stat_value)
            
        poke_table.append(poke_stat_list)    
    
    
    sleep(1)
    
    return poke_table

In [11]:
poke_table = []
table_values = poke_web_scrape(pokemon_links)

In [12]:
headers = ['Name'] + stats

pokemon = pd.DataFrame(table_values, columns=headers)
pokemon.head()

Unnamed: 0,Name,hp,attack,defense,sp.atk,sp.def,speed
0,Victini,100,100,100,100,100,100
1,Bulbasaur,45,49,49,65,65,45
2,Ivysaur,60,62,63,80,80,60
3,Venusaur,80,82,83,100,100,80
4,Charmander,65,52,43,60,50,65


In [13]:
pokemon.to_csv('data\pokemon.csv', index=False)