# Pokemon Project // Data Scraping Part

In [1]:
# importing the libraries and modules used in this notebook

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

In [2]:
# we download and parse the web page that lists every pokemon

page = requests.get("https://pokemondb.net/pokedex/all")
# fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error page
page.raise_for_status()
soup = BeautifulSoup(page.content,'html.parser')

In [5]:
# sanity check: text of the first pokedex-number cell found on the page
first_number_cell = soup.find_all('span', class_="infocard-cell-data")[0]
number_test = first_number_cell.get_text()
number_test

'001'

### Pokedex Number

In [6]:
# we collect the pokedex number of every pokemon listed on the page

pokedex_number = []

def get_pokedex_number():
    """Append the pokedex number of every listed pokemon to ``pokedex_number``.

    Reads the module-level ``soup``: every ``<span class="infocard-cell-data">``
    holds one number as text (e.g. '001'), which is converted to ``int``.
    Returns nothing; the module-level list is extended in place.
    """
    number_cells = soup.find_all('span', class_="infocard-cell-data")
    pokedex_number.extend(int(cell.get_text()) for cell in number_cells)

In [7]:
# fill pokedex_number from the parsed listing page
get_pokedex_number()

### Name

In [10]:
# we collect the name of every pokemon, rewriting the few names that need special handling

pokemon_name = []

def get_pokemon_name():
    """Append the name of every listed pokemon to ``pokemon_name``.

    Reads the module-level ``soup``: every ``<a class="ent-name">`` holds one
    name. Accented 'é' / 'è' are replaced by a plain 'e', and a handful of names
    containing symbols or punctuation are replaced by a hyphenated form.
    Returns nothing; the module-level list is extended in place.
    """
    # exact-match replacements, applied after the accents have been removed
    special_names = {
        'Nidoran♀': 'Nidoran-f',
        'Nidoran♂': 'Nidoran-m',
        'Mr. Mime': 'Mr-Mime',
        'Mime Jr.': 'Mime-Jr',
        'Type: Null': 'Type-Null',
    }
    for link in soup.find_all('a', class_="ent-name"):
        plain_name = link.get_text().replace('é','e').replace('è','e')
        pokemon_name.append(special_names.get(plain_name, plain_name))

In [11]:
# fill pokemon_name from the parsed listing page
get_pokemon_name()

In [13]:
# we pair up the two lists built above and turn them into a dataframe (one row per pair)

number_name_pairs = list(zip(pokedex_number, pokemon_name))
df_pokemon = pd.DataFrame(number_name_pairs, columns = ['Pokedex Number', 'Name'])

In [14]:
# display the dataframe built so far (pokedex number and name)
df_pokemon

Unnamed: 0,Pokedex Number,Name
0,1,Bulbasaur
1,2,Ivysaur
2,3,Venusaur
3,3,Venusaur
4,4,Charmander
...,...,...
921,805,Stakataka
922,806,Blacephalon
923,807,Zeraora
924,808,Meltan


### URL

In [15]:
# we build the url of every pokemon's personal page:
# base url + lower-cased name, with spaces turned into hyphens

base_url = "https://pokemondb.net/pokedex/"
url_list = [base_url + name.lower().replace(' ','-') for name in df_pokemon['Name']]

In [17]:
# and we store these urls in our dataframe as a new 'URL' column
# (url_list holds one url per row, in row order)

df_pokemon['URL'] = url_list

### Types

In [19]:
# download and parse one sample page (Mr. Mime) to explore the layout of a pokemon page
sample_url = "https://pokemondb.net/pokedex/mr-mime"
pagetest = requests.get(sample_url)
souptest = BeautifulSoup(pagetest.content,'html.parser')

In [20]:
# sample page: the first cell of table row 1 holds the type names separated by a space
type_cell = souptest.find_all('tr')[1].find('td')
type_cell.get_text().strip().split(" ")

['Psychic', 'Fairy']

In [21]:
# we get the types for all pokemon (all pokemon have either 1 or 2 types)

type1 = []
type2 = []

def get_types():
    """Append every pokemon's first and second type to ``type1`` / ``type2``.

    For each row of ``df_pokemon`` (in row order) the page at ``row['URL']`` is
    parsed; the text of the first ``<td>`` of the ``<tr>`` at index 1 is stripped
    and split on spaces. The first token goes to ``type1``; the second goes to
    ``type2``, or the string "None" when the pokemon has a single type.

    Rows sharing a URL (the duplicated "Mega" rows) reuse the first download
    instead of requesting the same page again.
    """
    types_by_url = {}  # url -> list of type names found on that page
    for url in df_pokemon['URL']:
        url = str(url)
        if url not in types_by_url:
            page = requests.get(url)
            soup = BeautifulSoup(page.content,'html.parser')
            types_by_url[url] = soup.find_all('tr')[1].find('td').get_text().strip().split(" ")
        types = types_by_url[url]
        type1.append(types[0])
        type2.append(types[1] if len(types) > 1 else "None")

In [22]:
# fill type1 and type2 (one entry per row of df_pokemon); this downloads the pokemon pages
get_types()

In [24]:
# we store both type lists in our dataframe
for column_name, column_values in (('Type1', type1), ('Type2', type2)):
    df_pokemon[column_name] = column_values

### Height

In [26]:
# sample page: text of the first cell of table row 3 (the height entry)
height_row = souptest.find_all('tr')[3]
raw_height = height_row.find('td').get_text()

In [27]:
# first decimal number in the text; raw string so that \d and \. reach the regex
# engine untouched (in a normal string they are invalid escape sequences)
float(re.findall(r"\d+\.\d+", raw_height)[0])

1.3

In [28]:
# we parse all pokemon personal pages and get the pokemon's height (in m)

height_list = []

def get_height():
    """Append every pokemon's height to ``height_list`` as a float.

    For each row of ``df_pokemon`` (in row order) the page at ``row['URL']`` is
    parsed; the height is the first decimal number found in the text of the
    first ``<td>`` of the ``<tr>`` at index 3.

    Rows sharing a URL (the duplicated "Mega" rows) reuse the first download
    instead of requesting the same page again.

    Raises:
        IndexError: if the cell text contains no decimal number.
    """
    height_by_url = {}  # url -> height parsed from that page
    for url in df_pokemon['URL']:
        url = str(url)
        if url not in height_by_url:
            page = requests.get(url)
            soup = BeautifulSoup(page.content,'html.parser')
            raw_height = soup.find_all('tr')[3].find('td').get_text()
            # raw string: "\d" in a normal string is an invalid escape sequence
            height_by_url[url] = float(re.findall(r"\d+\.\d+", raw_height)[0])
        height_list.append(height_by_url[url])

In [29]:
# fill height_list (one entry per row of df_pokemon); this downloads the pokemon pages
get_height()

### Weight

In [31]:
# sample page: text of the first cell of table row 4 (the weight entry)
weight_row = souptest.find_all('tr')[4]
raw_weight = weight_row.find('td').get_text()

In [32]:
# first decimal number in the text; raw string so that \d and \. reach the regex
# engine untouched (in a normal string they are invalid escape sequences)
float(re.findall(r"\d+\.\d+", raw_weight)[0])

54.5

In [33]:
# we parse all pokemon personal pages and get the pokemon's weight (in kg)

weight_list = []

def get_weight():
    """Append every pokemon's weight to ``weight_list`` as a float.

    For each row of ``df_pokemon`` (in row order) the page at ``row['URL']`` is
    parsed; the weight is the first decimal number found in the text of the
    first ``<td>`` of the ``<tr>`` at index 4.

    Rows sharing a URL (the duplicated "Mega" rows) reuse the first download
    instead of requesting the same page again.

    Raises:
        IndexError: if the cell text contains no decimal number.
    """
    weight_by_url = {}  # url -> weight parsed from that page
    for url in df_pokemon['URL']:
        url = str(url)
        if url not in weight_by_url:
            page = requests.get(url)
            soup = BeautifulSoup(page.content,'html.parser')
            raw_weight = soup.find_all('tr')[4].find('td').get_text()
            # raw string: "\d" in a normal string is an invalid escape sequence
            weight_by_url[url] = float(re.findall(r"\d+\.\d+", raw_weight)[0])
        weight_list.append(weight_by_url[url])

In [34]:
# fill weight_list (one entry per row of df_pokemon); this downloads the pokemon pages
get_weight()

In [36]:
# and we store both lists in our dataframe

for column_name, column_values in (('Height', height_list), ('Weight', weight_list)):
    df_pokemon[column_name] = column_values

### HP

In [38]:
# download and parse one sample page (Salamence) to locate the stats table rows
sample_url = "https://pokemondb.net/pokedex/salamence"
pagetest = requests.get(sample_url)
souptest = BeautifulSoup(pagetest.content,'html.parser')

In [39]:
# sample page: 4th-from-last cell of table row 15 (the value later stored as base HP)
hp_cells = souptest.find_all('tr')[15].find_all('td')
hp_cells[-4].get_text()

'95'

In [40]:
# we parse all pokemon personal pages and get the pokemon's base HP (at lvl 1), 
# min HP (at lvl 100) and max HP (at lvl 100)

base_HP_list = []
min_HP_list = []
max_HP_list = []

def get_HP():
    """Append every pokemon's base / min / max HP to the three HP lists.

    For each row of ``df_pokemon`` (in row order) the page at ``row['URL']`` is
    parsed; in the ``<tr>`` at index 15 the ``<td>`` cells at positions -4, -2
    and -1 are read as the base, min and max value and converted to ``int``.

    The row is looked up once per page, and rows sharing a URL (the duplicated
    "Mega" rows) reuse the first download instead of requesting the page again.
    """
    stats_by_url = {}  # url -> (base, min, max)
    for url in df_pokemon['URL']:
        url = str(url)
        if url not in stats_by_url:
            page = requests.get(url)
            soup = BeautifulSoup(page.content,'html.parser')
            cells = soup.find_all('tr')[15].find_all('td')
            stats_by_url[url] = tuple(int(cells[i].get_text()) for i in (-4, -2, -1))
        base_HP, min_HP, max_HP = stats_by_url[url]
        base_HP_list.append(base_HP)
        min_HP_list.append(min_HP)
        max_HP_list.append(max_HP)

In [41]:
# fill the three HP lists (one entry per row of df_pokemon); this downloads the pokemon pages
get_HP()

In [42]:
# and we store the three HP lists in our dataframe

hp_columns = (('Base HP', base_HP_list), ('Min HP', min_HP_list), ('Max HP', max_HP_list))
for column_name, column_values in hp_columns:
    df_pokemon[column_name] = column_values

### Attack

In [44]:
# we parse all pokemon personal pages and get the pokemon's base Attack (at lvl 1), 
# min Attack (at lvl 100) and max Attack (at lvl 100)

base_attack_list = []
min_attack_list = []
max_attack_list = []

def get_attack():
    """Append every pokemon's base / min / max Attack to the three Attack lists.

    For each row of ``df_pokemon`` (in row order) the page at ``row['URL']`` is
    parsed; in the ``<tr>`` at index 16 the ``<td>`` cells at positions -4, -2
    and -1 are read as the base, min and max value and converted to ``int``.

    The row is looked up once per page, and rows sharing a URL (the duplicated
    "Mega" rows) reuse the first download instead of requesting the page again.
    """
    stats_by_url = {}  # url -> (base, min, max)
    for url in df_pokemon['URL']:
        url = str(url)
        if url not in stats_by_url:
            page = requests.get(url)
            soup = BeautifulSoup(page.content,'html.parser')
            cells = soup.find_all('tr')[16].find_all('td')
            stats_by_url[url] = tuple(int(cells[i].get_text()) for i in (-4, -2, -1))
        base_attack, min_attack, max_attack = stats_by_url[url]
        base_attack_list.append(base_attack)
        min_attack_list.append(min_attack)
        max_attack_list.append(max_attack)

In [45]:
# fill the three Attack lists (one entry per row of df_pokemon); this downloads the pokemon pages
get_attack()

In [46]:
# and we store the three Attack lists in our dataframe

attack_columns = (('Base Attack', base_attack_list), ('Min Attack', min_attack_list), ('Max Attack', max_attack_list))
for column_name, column_values in attack_columns:
    df_pokemon[column_name] = column_values

### Defense

In [47]:
# we parse all pokemon personal pages and get the pokemon's base Defense (at lvl 1), 
# min Defense (at lvl 100) and max Defense (at lvl 100)

base_defense_list = []
min_defense_list = []
max_defense_list = []

def get_defense():
    """Append every pokemon's base / min / max Defense to the three Defense lists.

    For each row of ``df_pokemon`` (in row order) the page at ``row['URL']`` is
    parsed; in the ``<tr>`` at index 17 the ``<td>`` cells at positions -4, -2
    and -1 are read as the base, min and max value and converted to ``int``.

    The row is looked up once per page, and rows sharing a URL (the duplicated
    "Mega" rows) reuse the first download instead of requesting the page again.
    """
    stats_by_url = {}  # url -> (base, min, max)
    for url in df_pokemon['URL']:
        url = str(url)
        if url not in stats_by_url:
            page = requests.get(url)
            soup = BeautifulSoup(page.content,'html.parser')
            cells = soup.find_all('tr')[17].find_all('td')
            stats_by_url[url] = tuple(int(cells[i].get_text()) for i in (-4, -2, -1))
        base_defense, min_defense, max_defense = stats_by_url[url]
        base_defense_list.append(base_defense)
        min_defense_list.append(min_defense)
        max_defense_list.append(max_defense)

In [48]:
# fill the three Defense lists (one entry per row of df_pokemon); this downloads the pokemon pages
get_defense()

In [49]:
# and we store the three Defense lists in our dataframe

defense_columns = (('Base Defense', base_defense_list), ('Min Defense', min_defense_list), ('Max Defense', max_defense_list))
for column_name, column_values in defense_columns:
    df_pokemon[column_name] = column_values

### Sp. Attack

In [51]:
# we parse all pokemon personal pages and get the pokemon's base Sp. Attack (at lvl 1), 
# min Sp. Attack (at lvl 100) and max Sp. Attack (at lvl 100)

base_sp_attack_list = []
min_sp_attack_list = []
max_sp_attack_list = []

def get_sp_attack():
    """Append every pokemon's base / min / max Sp. Attack to the three Sp. Attack lists.

    For each row of ``df_pokemon`` (in row order) the page at ``row['URL']`` is
    parsed; in the ``<tr>`` at index 18 the ``<td>`` cells at positions -4, -2
    and -1 are read as the base, min and max value and converted to ``int``.

    The row is looked up once per page, and rows sharing a URL (the duplicated
    "Mega" rows) reuse the first download instead of requesting the page again.
    """
    stats_by_url = {}  # url -> (base, min, max)
    for url in df_pokemon['URL']:
        url = str(url)
        if url not in stats_by_url:
            page = requests.get(url)
            soup = BeautifulSoup(page.content,'html.parser')
            cells = soup.find_all('tr')[18].find_all('td')
            stats_by_url[url] = tuple(int(cells[i].get_text()) for i in (-4, -2, -1))
        base_sp_attack, min_sp_attack, max_sp_attack = stats_by_url[url]
        base_sp_attack_list.append(base_sp_attack)
        min_sp_attack_list.append(min_sp_attack)
        max_sp_attack_list.append(max_sp_attack)

In [52]:
# fill the three Sp. Attack lists (one entry per row of df_pokemon); this downloads the pokemon pages
get_sp_attack()

In [53]:
# and we store the three Sp. Attack lists in our dataframe

sp_attack_columns = (('Base Sp. Attack', base_sp_attack_list), ('Min Sp. Attack', min_sp_attack_list), ('Max Sp. Attack', max_sp_attack_list))
for column_name, column_values in sp_attack_columns:
    df_pokemon[column_name] = column_values

### Sp. Defense

In [55]:
# we parse all pokemon personal pages and get the pokemon's base Sp. Defense (at lvl 1), 
# min Sp. Defense (at lvl 100) and max Sp. Defense (at lvl 100)

base_sp_defense_list = []
min_sp_defense_list = []
max_sp_defense_list = []

def get_sp_defense():
    """Append every pokemon's base / min / max Sp. Defense to the three Sp. Defense lists.

    For each row of ``df_pokemon`` (in row order) the page at ``row['URL']`` is
    parsed; in the ``<tr>`` at index 19 the ``<td>`` cells at positions -4, -2
    and -1 are read as the base, min and max value and converted to ``int``.

    The row is looked up once per page, and rows sharing a URL (the duplicated
    "Mega" rows) reuse the first download instead of requesting the page again.
    """
    stats_by_url = {}  # url -> (base, min, max)
    for url in df_pokemon['URL']:
        url = str(url)
        if url not in stats_by_url:
            page = requests.get(url)
            soup = BeautifulSoup(page.content,'html.parser')
            cells = soup.find_all('tr')[19].find_all('td')
            stats_by_url[url] = tuple(int(cells[i].get_text()) for i in (-4, -2, -1))
        base_sp_defense, min_sp_defense, max_sp_defense = stats_by_url[url]
        base_sp_defense_list.append(base_sp_defense)
        min_sp_defense_list.append(min_sp_defense)
        max_sp_defense_list.append(max_sp_defense)

In [56]:
# fill the three Sp. Defense lists (one entry per row of df_pokemon); this downloads the pokemon pages
get_sp_defense()

In [57]:
# and we store the three Sp. Defense lists in our dataframe

sp_defense_columns = (('Base Sp. Defense', base_sp_defense_list), ('Min Sp. Defense', min_sp_defense_list), ('Max Sp. Defense', max_sp_defense_list))
for column_name, column_values in sp_defense_columns:
    df_pokemon[column_name] = column_values

### Speed

In [59]:
# we parse all pokemon personal pages and get the pokemon's base Speed (at lvl 1), 
# min Speed (at lvl 100) and max Speed (at lvl 100)

base_speed_list = []
min_speed_list = []
max_speed_list = []

def get_speed():
    """Append every pokemon's base / min / max Speed to the three Speed lists.

    For each row of ``df_pokemon`` (in row order) the page at ``row['URL']`` is
    parsed; in the ``<tr>`` at index 20 the ``<td>`` cells at positions -4, -2
    and -1 are read as the base, min and max value and converted to ``int``.

    The row is looked up once per page, and rows sharing a URL (the duplicated
    "Mega" rows) reuse the first download instead of requesting the page again.
    """
    stats_by_url = {}  # url -> (base, min, max)
    for url in df_pokemon['URL']:
        url = str(url)
        if url not in stats_by_url:
            page = requests.get(url)
            soup = BeautifulSoup(page.content,'html.parser')
            cells = soup.find_all('tr')[20].find_all('td')
            stats_by_url[url] = tuple(int(cells[i].get_text()) for i in (-4, -2, -1))
        base_speed, min_speed, max_speed = stats_by_url[url]
        base_speed_list.append(base_speed)
        min_speed_list.append(min_speed)
        max_speed_list.append(max_speed)

In [60]:
# fill the three Speed lists (one entry per row of df_pokemon); this downloads the pokemon pages
get_speed()

In [61]:
# and we store the three Speed lists in our dataframe

speed_columns = (('Base Speed', base_speed_list), ('Min Speed', min_speed_list), ('Max Speed', max_speed_list))
for column_name, column_values in speed_columns:
    df_pokemon[column_name] = column_values

In [63]:
# we don't want the "Mega" forms of our pokemon (around 100 values out of 900): they show up
# as fully duplicated rows, so we keep the first occurrence of each row and renumber the index

deduplicated = df_pokemon.drop_duplicates()
df_pokemon = deduplicated.reset_index(drop=True)

In [64]:
# display the de-duplicated dataframe with all scraped columns
df_pokemon

Unnamed: 0,Pokedex Number,Name,URL,Type1,Type2,Height,Weight,Base HP,Min HP,Max HP,...,Max Defense,Base Sp. Attack,Min Sp. Attack,Max Sp. Attack,Base Sp. Defense,Min Sp. Defense,Max Sp. Defense,Base Speed,Min Speed,Max Speed
0,1,Bulbasaur,https://pokemondb.net/pokedex/bulbasaur,Grass,Poison,0.7,6.9,45,200,294,...,216,65,121,251,65,121,251,45,85,207
1,2,Ivysaur,https://pokemondb.net/pokedex/ivysaur,Grass,Poison,1.0,13.0,60,230,324,...,247,80,148,284,80,148,284,60,112,240
2,3,Venusaur,https://pokemondb.net/pokedex/venusaur,Grass,Poison,2.0,100.0,80,270,364,...,291,100,184,328,100,184,328,80,148,284
3,4,Charmander,https://pokemondb.net/pokedex/charmander,Fire,,0.6,8.5,39,188,282,...,203,60,112,240,50,94,218,65,121,251
4,5,Charmeleon,https://pokemondb.net/pokedex/charmeleon,Fire,,1.1,19.0,58,226,320,...,236,80,148,284,65,121,251,80,148,284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
804,805,Stakataka,https://pokemondb.net/pokedex/stakataka,Rock,Steel,5.5,820.0,61,232,326,...,573,53,99,225,101,186,331,13,27,137
805,806,Blacephalon,https://pokemondb.net/pokedex/blacephalon,Fire,Ghost,1.8,13.0,53,216,310,...,225,151,276,441,79,146,282,107,197,344
806,807,Zeraora,https://pokemondb.net/pokedex/zeraora,Electric,,1.5,44.5,88,286,380,...,273,102,188,333,80,148,284,143,261,423
807,808,Meltan,https://pokemondb.net/pokedex/meltan,Steel,,0.2,8.0,46,202,296,...,251,55,103,229,35,67,185,34,65,183


### Total

In [65]:
# we compute the total for each stats category (base at lvl 1, min at lvl 100 and max at lvl 100):
# the six stat columns of a category are added up, in order, into '<category> Total'

stat_names = ['HP', 'Attack', 'Defense', 'Sp. Attack', 'Sp. Defense', 'Speed']

for category in ('Base', 'Min', 'Max'):
    running_total = df_pokemon[category + ' ' + stat_names[0]]
    for stat in stat_names[1:]:
        running_total = running_total + df_pokemon[category + ' ' + stat]
    df_pokemon[category + ' Total'] = running_total

### Legendary

In [66]:
# we copy and paste a legendary pokemon list found on Wikipedia
# (a single string; the names are separated by ' • ' and are split into a list further down)

legendary_raw_list = 'Articuno • Zapdos • Moltres • Mewtwo • Mew • Raikou • Entei • Suicune • Lugia • Ho-oh • Celebi • Regirock • Regice • Registeel • Latias • Latios • Kyogre • Groudon • Rayquaza • Jirachi • Deoxys • Uxie • Mesprit • Azelf • Dialga • Palkia • Heatran • Regigigas • Giratina • Cresselia • Phione • Manaphy • Darkrai • Shaymin • Arceus • Victini • Cobalion • Terrakion • Virizion • Tornadus • Thundurus • Reshiram • Zekrom • Landorus • Kyurem • Keldeo • Meloetta • Genesect • Xerneas • Yveltal • Zygarde • Diancie • Hoopa • Volcanion • Type-Null • Silvally • Tapu Koko • Tapu Lele • Tapu Bulu • Tapu Fini • Cosmog • Cosmoem • Solgaleo • Lunala • Necrozma • Magearna • Marshadow • Zeraora • Meltan • Melmetal'

In [67]:
# swap every bullet separator for a comma
legendary_raw_list = ','.join(legendary_raw_list.split('•'))

In [68]:
# display the comma-separated string
legendary_raw_list

'Articuno , Zapdos , Moltres , Mewtwo , Mew , Raikou , Entei , Suicune , Lugia , Ho-oh , Celebi , Regirock , Regice , Registeel , Latias , Latios , Kyogre , Groudon , Rayquaza , Jirachi , Deoxys , Uxie , Mesprit , Azelf , Dialga , Palkia , Heatran , Regigigas , Giratina , Cresselia , Phione , Manaphy , Darkrai , Shaymin , Arceus , Victini , Cobalion , Terrakion , Virizion , Tornadus , Thundurus , Reshiram , Zekrom , Landorus , Kyurem , Keldeo , Meloetta , Genesect , Xerneas , Yveltal , Zygarde , Diancie , Hoopa , Volcanion , Type-Null , Silvally , Tapu Koko , Tapu Lele , Tapu Bulu , Tapu Fini , Cosmog , Cosmoem , Solgaleo , Lunala , Necrozma , Magearna , Marshadow , Zeraora , Meltan , Melmetal'

In [69]:
# split the string on commas; each item still carries the spaces that surrounded the separator
legendary_list = legendary_raw_list.split(',')

In [71]:
# remove the leading/trailing whitespace around every name
legendary_list = list(map(str.strip, legendary_list))

In [72]:
# display the cleaned list of legendary names
legendary_list

['Articuno',
 'Zapdos',
 'Moltres',
 'Mewtwo',
 'Mew',
 'Raikou',
 'Entei',
 'Suicune',
 'Lugia',
 'Ho-oh',
 'Celebi',
 'Regirock',
 'Regice',
 'Registeel',
 'Latias',
 'Latios',
 'Kyogre',
 'Groudon',
 'Rayquaza',
 'Jirachi',
 'Deoxys',
 'Uxie',
 'Mesprit',
 'Azelf',
 'Dialga',
 'Palkia',
 'Heatran',
 'Regigigas',
 'Giratina',
 'Cresselia',
 'Phione',
 'Manaphy',
 'Darkrai',
 'Shaymin',
 'Arceus',
 'Victini',
 'Cobalion',
 'Terrakion',
 'Virizion',
 'Tornadus',
 'Thundurus',
 'Reshiram',
 'Zekrom',
 'Landorus',
 'Kyurem',
 'Keldeo',
 'Meloetta',
 'Genesect',
 'Xerneas',
 'Yveltal',
 'Zygarde',
 'Diancie',
 'Hoopa',
 'Volcanion',
 'Type-Null',
 'Silvally',
 'Tapu Koko',
 'Tapu Lele',
 'Tapu Bulu',
 'Tapu Fini',
 'Cosmog',
 'Cosmoem',
 'Solgaleo',
 'Lunala',
 'Necrozma',
 'Magearna',
 'Marshadow',
 'Zeraora',
 'Meltan',
 'Melmetal']

In [73]:
# we create a list containing a boolean for each Pokemon (True if legendary, False if not legendary)

# a set gives constant-time membership tests
legendary_names = set(legendary_list)

# real booleans rather than the strings "True"/"False": the string "False" is truthy,
# so filtering or testing on the string column would treat every pokemon as legendary
legendary = [name in legendary_names for name in df_pokemon['Name']]

In [76]:
# store the legendary flags (one per row, in row order) in our dataframe
df_pokemon['Legendary'] = legendary

### Generation

In [78]:
# according to their pokedex number, we define the generation which each pokemon belongs to (ranges found on the internet)

# (exclusive upper bound of the generation's pokedex numbers, generation);
# any number of 722 or more belongs to the last generation
generation_bounds = [(152, 1), (252, 2), (387, 3), (494, 4), (650, 5), (722, 6)]
last_generation = 7

gen_list = []

for number in df_pokemon['Pokedex Number']:
    if not number > 0:
        # without this check such a row would silently reuse the generation of the previous row
        raise ValueError("invalid Pokedex Number: " + str(number))
    # the first bound the number falls under decides the generation
    gen = next((g for bound, g in generation_bounds if number < bound), last_generation)
    gen_list.append(gen)

In [80]:
# store the generation numbers (one per row, in row order) in our dataframe
df_pokemon['Generation'] = gen_list

In [82]:
# display the complete dataframe, including totals, legendary flag and generation
df_pokemon

Unnamed: 0,Pokedex Number,Name,URL,Type1,Type2,Height,Weight,Base HP,Min HP,Max HP,...,Min Sp. Defense,Max Sp. Defense,Base Speed,Min Speed,Max Speed,Base Total,Min Total,Max Total,Legendary,Generation
0,1,Bulbasaur,https://pokemondb.net/pokedex/bulbasaur,Grass,Poison,0.7,6.9,45,200,294,...,121,251,45,85,207,318,711,1435,False,1
1,2,Ivysaur,https://pokemondb.net/pokedex/ivysaur,Grass,Poison,1.0,13.0,60,230,324,...,148,284,60,112,240,405,871,1624,False,1
2,3,Venusaur,https://pokemondb.net/pokedex/venusaur,Grass,Poison,2.0,100.0,80,270,364,...,184,328,80,148,284,525,1091,1884,False,1
3,4,Charmander,https://pokemondb.net/pokedex/charmander,Fire,,0.6,8.5,39,188,282,...,94,218,65,121,251,309,694,1417,False,1
4,5,Charmeleon,https://pokemondb.net/pokedex/charmeleon,Fire,,1.1,19.0,58,226,320,...,121,251,80,148,284,405,870,1624,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
804,805,Stakataka,https://pokemondb.net/pokedex/stakataka,Rock,Steel,5.5,820.0,61,232,326,...,186,331,13,27,137,570,1168,1989,False,7
805,806,Blacephalon,https://pokemondb.net/pokedex/blacephalon,Fire,Ghost,1.8,13.0,53,216,310,...,146,282,107,197,344,570,1167,1990,False,7
806,807,Zeraora,https://pokemondb.net/pokedex/zeraora,Electric,,1.5,44.5,88,286,380,...,148,284,143,261,423,600,1228,2048,True,7
807,808,Meltan,https://pokemondb.net/pokedex/meltan,Steel,,0.2,8.0,46,202,296,...,67,185,34,65,183,300,679,1395,True,7


### Data export

In [81]:
# Now that our dataset is finally complete, we export it to a .csv file.
# The path is relative to the notebook's working directory, so the export works on
# any machine instead of only where the folder C:\Users\Nathan exists.

df_pokemon.to_csv('pokemon_dataset.csv', index=False)