In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import wikipedia as wp
from io import StringIO

Creating data for visualization. Taken from pokemondb site and from wikipedia. Going to merge them both because I want generation data in the dataframe as well, which is not present when scraping pokemondb

In [2]:
page = requests.get("https://pokemondb.net/pokedex/all")

dex = pd.read_html(StringIO(page.text), attrs = {'id': 'pokedex'}, index_col = '#')

dex[0].to_csv("output.csv", encoding='utf-8')

In [3]:
html = wp.page("List_of_Pokémon").html().encode("UTF-8")
old_df = pd.read_html(html)[2]  # Try 2nd table first as most pages contain contents table first

I unfortunately couldn't figure out a way to turn this into a neat little table using just pd.melt and pd.pivot commands. Maybe in the future I would be able to, but not now.

In [4]:
df = old_df.copy()

def process_generation(gen_num, gen_name, index_limit):
    gen_df = pd.concat([pd.Series(gen_num, index=np.arange(155)), df[gen_name]['Name']], axis=1)
    gen_df.columns = ['Generation', 'Name']
    gen_df.Name = gen_df.Name.map(lambda c: c[:-1] if c[-1].isupper() else c)
    return gen_df.loc[:index_limit]

generations = [
    (1, 'Generation I', 150),
    (2, 'Generation II', 99),
    (3, 'Generation III', 134),
    (4, 'Generation IV', 106),
    (5, 'Generation V', 156),
    (6, 'Generation VI', 71),
    (7, 'Generation VII', 87),
    (8, 'Generation VIII', 95),
    (9, 'Generation IX', 119),
]

# Process all generations and concatenate results
gens = pd.concat([process_generation(gen_num, gen_name, index_limit) for gen_num, gen_name, index_limit in generations])
gens

Unnamed: 0,Generation,Name
0,1.0,Bulbasaur
1,1.0,Ivysaur
2,1.0,Venusaur
3,1.0,Charmander
4,1.0,Charmeleon
...,...,...
115,9.0,Raging Bolt
116,9.0,Iron Boulder
117,9.0,Iron Crown
118,9.0,Terapagos


In [5]:
final = pd.DataFrame(dex[0]).merge(gens, how = 'left', on = 'Name')
final.Generation[final.Generation.isnull()]

3      NaN
7      NaN
8      NaN
12     NaN
19     NaN
        ..
1203   NaN
1204   NaN
1211   NaN
1212   NaN
1213   NaN
Name: Generation, Length: 241, dtype: float64

In [6]:
def label(name):
    keyword_map = {
        'Mega': 6,
        'Alolan': 7,
        'Galarian': 8,
        'Hisuian': 8,
        'Paldean': 9,
        'Therian': 5,
        'Incarnate': 5,
        'Partner': 8,
        'Nidoran♂': 1,
        'Nidoran♀': 1,
        'Breed': 9,
        'Castform': 3,
        'Primal': 7,
        'Deoxys': 3,
        'Burmy': 4,
        'Wormadam': 4,
        'Porygon-Z': 4,
        'Rotom': 4,
        'Red-Striped': 5,
        'Blue-Striped': 5,
        'White-Striped': 8,
        'Darmanitan': 5,
        'Kyurem': 5,
        'Keldeo': 5,
        'Meloetta': 5,
        'Genesect': 5,
        'Ash-Greninja': 7,
        'Flabébé': 6,
        'Meowstic': 6,
        'Aegislash': 6,
        'Pumpkaboo': 6,
        'Gourgeist': 6,
        '50%': 6,
        '10%': 7,
        'Complete Forme': 7,
        'Hoopa': 6,
        'Oricorio': 7,
        'Rockruff': 7,
        'Lycanroc': 7,
        'Wishiwashi': 7,
        'Cranidos': 4,
        'Rampardos': 4,
        'Shieldon': 4,
        'Bastiodon': 4,
        'Origin Forme': 4,
        'Shaymin': 4,
        'Altered Forme': 4,
        'Minior': 7,
        'Necrozma': 7,
        'Toxtricity': 8,
        'Eiscue': 8,
        'Indeedee': 8,
        'Morpeko': 8,
        'Zacian': 8,
        'Zamazenta': 8,
        'Eternamax': 8,
        'Urshifu': 8,
        'Calyrex': 8,
        'Bloodmoon': 8,
        'Oinkologne': 9,
        'Basculegion': 8,
        'Maushold': 9,
        'Squawkabilly': 9,
        'Palafin': 9,
        'Tatsugiri': 9,
        'Dudunsparce': 9,
        'Gimmighoul': 9,
        'Koraidon': 9,
        'Miraidon': 9,
        'Ogerpon': 9,
        'Terapagos': 9
    }

    # Iterate through the keyword map and check for presence in name
    for keyword, label_value in keyword_map.items():
        if keyword in name:
            return label_value
            
final.Generation = final.Generation.fillna(final.Name[final.Generation.isnull()].map(label))

In [7]:
final.Generation = final.Generation.astype(int)

In [8]:
final.to_csv('Pokemon.csv', index = False)

If there is any easier way to do this then I am going to curse the Gods up above. I think this was the only way to do it though, especially since I definitely wanted the pokemondb

More data just to get the names list of pseudo-legendary and legendary and mythical pokemon. Mostly goign through all this because it would be much too bothersome to have to go and type them all down manually.

In [9]:
url = 'https://www.serebii.net/pokemon/legendary.shtml'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [10]:
sub = [item.text for item in soup.find_all('table')][1:56]
leg = [item.text for item in soup.find_all('table')][57:84]
mythical = [item.text for item in soup.find_all('table')][85:]

In [11]:
sub_leg = []
for item in sub:
    item = item.split()
    if item[0] in ['Type:', 'Tapu']:
        sub_leg.append(item[0] + ' ' + item[1])
    elif 'Until' in item[0]:
        sub_leg.append(item[0].replace('Until', ''))
    else:
        sub_leg.append(item[0])

leg = [item.split()[0] for item in leg]
mythical = [item.split()[0] for item in mythical]

In [12]:
pd.DataFrame(sub_leg).to_csv('sub_leglist.csv', index = False)
pd.DataFrame(leg).to_csv('leglist.csv', index = False)
pd.DataFrame(mythical).to_csv('mythicallist.csv', index = False)

Pseudo legendary scraping here

In [13]:
url = 'https://bulbapedia.bulbagarden.net/wiki/Pseudo-legendary_Pok%C3%A9mon'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [14]:
pseudo_list = [item.text for item in soup.find_all('div', {'class': 'roundy'})][1:12]
pseudo_list = [item.split()[::2][1:] for item in pseudo_list]
pseudo_list.pop(9)
pseudo_list = [item for sublist in pseudo_list for item in sublist]
pseudo_list.extend(['Hisuian Goodra', 'Hisuian Sliggoo'])

In [18]:
pd.DataFrame(pseudo_list).to_csv('pseudolist.csv', index = False)