**Project-2:** *Gotta fetch 'em all* - Enriching the dataset

**Libraries**

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from pandas import json_normalize
import json
import os
from dotenv import load_dotenv
import time

# Importing the dataset and defining my goals

The dataset was obtained from the kaggle website:
[Pokémon dataset](https://www.kaggle.com/datasets/abcsds/pokemon)

In [587]:
# Importing and checking how the dataset looks like

pokemon = pd.read_csv("./data/pokemon.csv")
pokemon = pokemon.set_index('#')
pokemon.head() 

Unnamed: 0_level_0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [266]:
pokemon.tail()

Unnamed: 0_level_0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True
721,Volcanion,Fire,Water,600,80,110,120,130,90,70,6,True


- As of today, the Pokémon games consist of 8 generations, adding up a total of **#905** creatures. Thus, this dataset from kaggle is outdated. In addition, it is missing some categories that would make for an interesting investigation. 
- Luckily enough, there's an **API** called [PokéAPI](https://pokeapi.co/) that contains huge amounts of information about the pokémon world. I will make use of this API to first add the missing pokémon to my dataset with their respective categories.

## My goal

In [174]:
# To start off, this would be my goal dataset:

goal = {'Name':['Pokémon_1', 'Pokémon_2', 'Pokémon_3'],
            'Height':['x', 'y', 'z'],
            'Weight':['x', 'y', 'z'],
            'Type 1':['Type_a', 'Type_b', 'Type_c'],
            'Type 1':['Type_a', 'Type_b', 'Type_c'],
            'Total':['x', 'y', 'z'],
            'HP':['x', 'y', 'z'],
            'Attack':['x', 'y', 'z'],
            'Defense':['x', 'y', 'z'],
            'Sp. Atk':['x', 'y', 'z'],
            'Sp. Def':['x', 'y', 'z'],
            'Speed':['x', 'y', 'z'],
            'Habitat':['x', 'y', 'z'],
            'Generation':['1', '2', '3'],
            'Legendary':['True', 'False', 'True'],
}
  
# Create DataFrame
goal_df = pd.DataFrame(goal)
goal_df

Unnamed: 0,Name,Height,Weight,Type 1,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Habitat,Generation,Legendary
0,Pokémon_1,x,x,Type_a,x,x,x,x,x,x,x,x,1,True
1,Pokémon_2,y,y,Type_b,y,y,y,y,y,y,y,y,2,False
2,Pokémon_3,z,z,Type_c,z,z,z,z,z,z,z,z,3,True


# Enriching the dataset with an API - fetching the missing pokemon

- According to the docs, the PokéAPI does not require authentication and is free and open to use.
- They also removed the rate limit but still encourage the users to limit the number of their requests.
- For the moment I will limit my requests to the missing pokémon of my dataset, since I would also like to request other information from the API.

In [None]:
# Link for requesting pokémon:
# url_api = https://pokeapi.co/api/v2/pokemon/{id or name}/

## Trying one request

I will first try with a single request to see if I get what I need.

In [57]:
response_api = requests.get(f"https://pokeapi.co/api/v2/pokemon/{810}/")
response_api # Request accepted

<Response [200]>

In [58]:
# Looking at the content of the request. We got the first pokémon of the 8th gen:
# response_api.content
# response_api.json()

first_poke = json_normalize(response_api.json())
first_poke # We got lots of info: 129 columns

Unnamed: 0,abilities,base_experience,forms,game_indices,height,held_items,id,is_default,location_area_encounters,moves,...,sprites.versions.generation-vi.x-y.front_shiny,sprites.versions.generation-vi.x-y.front_shiny_female,sprites.versions.generation-vii.icons.front_default,sprites.versions.generation-vii.icons.front_female,sprites.versions.generation-vii.ultra-sun-ultra-moon.front_default,sprites.versions.generation-vii.ultra-sun-ultra-moon.front_female,sprites.versions.generation-vii.ultra-sun-ultra-moon.front_shiny,sprites.versions.generation-vii.ultra-sun-ultra-moon.front_shiny_female,sprites.versions.generation-viii.icons.front_default,sprites.versions.generation-viii.icons.front_female
0,"[{'ability': {'name': 'overgrow', 'url': 'http...",62,"[{'name': 'grookey', 'url': 'https://pokeapi.c...",[],3,[],810,True,https://pokeapi.co/api/v2/pokemon/810/encounters,"[{'move': {'name': 'mega-punch', 'url': 'https...",...,,,,,,,,,https://raw.githubusercontent.com/PokeAPI/spri...,


### Cleaning and accessing flatened information

In [60]:
# Dropping sprites columns with regex
# Dropping other unnecessary columns

first_poke = first_poke[first_poke.columns.drop(list(first_poke.filter(regex='sprites.*')))]
first_poke = first_poke.drop(labels=['game_indices', 'held_items', 'is_default', 'past_types', 'species.name', 'species.url', 'order', 'forms'], axis=1)
first_poke.shape

(1, 10)

In [62]:
first_poke

# Will need to extract the 'stats' and types' from the respective columns

Unnamed: 0,abilities,base_experience,height,id,location_area_encounters,moves,name,stats,types,weight
0,"[{'ability': {'name': 'overgrow', 'url': 'http...",62,3,810,https://pokeapi.co/api/v2/pokemon/810/encounters,"[{'move': {'name': 'mega-punch', 'url': 'https...",grookey,"[{'base_stat': 50, 'effort': 0, 'stat': {'name...","[{'slot': 1, 'type': {'name': 'grass', 'url': ...",50


**Accessing stats**

In [75]:
# first_poke['stats'][0]

In [81]:
first_poke['stats'][0][0]['base_stat'] # value of the stat HP -> in a new column 'HP'
first_poke['stats'][0][1]['base_stat'] # value of the stat attack -> in new column 'Attack'
first_poke['stats'][0][2]['base_stat'] # value of the stat defense -> in new column 'Defense'
first_poke['stats'][0][3]['base_stat'] # value of the stat special-attack -> in new column 'Sp. Atk'
first_poke['stats'][0][4]['base_stat'] # value of the stat special-defense -> in new column 'Sp. Def'
first_poke['stats'][0][5]['base_stat'] # value of the stat speed -> in new column 'Speed'

65

**Accessing types**

In [79]:
# first_poke['types'][0]

In [47]:
first_poke['types'][0][0]['type']['name'] # If it has only 1 type -> in new column 'Type 1'
# first_poke['types'][0][1]['type']['name'] # If it had a second type -> in new column 'Type 2'

'grass'

## Looping for many requests to get the missing pokemon

Let's fetch all the missing pokemon:

In [33]:
def fetchMissing(a, b):
    '''This functions receives two pokemon id numbers as integers ('a' and 'b'),
    and returns a dataframe containing the specified pokemon from the ids 'a' to 'b'.
    '''
    missing_pokes = []
    for i in range(a,b+1):
        time.sleep(1)
        print(f"Fetching pokemon with id: {i}")
        response_api = requests.get(f"https://pokeapi.co/api/v2/pokemon/{i}/").json()
        missing_pokes.append(response_api)
    
    return json_normalize(missing_pokes)

In [38]:
# missing_all = fetchMissing(722,905)

In [264]:
# missing_all.sample()

In [37]:
# Exporting new dataset of missing pokemon
missing_all.to_csv('./data/pokemon_missing.csv', index = False)

### Debugging

**fetchMissing function**

In [16]:
# When I first tried the fetchMissing function, when reaching pokemon 902 it returned an error message:
# JSONDecodeError: Expecting value: line 1 column 1 (char 0)
# Tried defining a new fetchMissing function but ended up returning a dataframe with no info in it.
# Then tried adding a time.sleep(1) to the original function and it worked just fine.

In [98]:
# def fetchMissing(a, b):
#     '''This functions receives two pokemon id numbers as integers ('a' and 'b'),
#     and returns a dataframe containing the specified pokemon from the ids 'a' to 'b'.
#     '''
#     missing_pokes = []
#     for i in range(a,b+1):
#         response_api = requests.get(f"https://pokeapi.co/api/v2/pokemon/{i}/")
#         try:
#             response_api.json()
#             missing_pokes.append(response_api)
#         except JSONDecodeError:
#             print(f"Encountered a JSONDecodeError for pokemon id {i}")
               
#     return json_normalize(missing_pokes)

### Extracting flatened information

In [481]:
# Importing dataset to work on
missing_pokes = pd.read_csv("./data/pokemon_missing.csv")
# missing_pokes

**Extracting stats**

In [482]:
def extractStats(df):
    '''This function receives a dataframe and extracts the HP, Attack, Defense, Sp. Atk, Sp. Def, and Speed from 'stats'.
    It returns a df with the extracted values of each stat for each pokemon in new columns.
    '''
    HP = []
    Attack = []
    Defense = []
    Sp_Atk = []
    Sp_Def = []
    Speed = []
    
    for i in range(len(df)):
        HP.append(int("".join([x for x in missing_pokes.iloc[i]['stats'].split(",")[0] if x.isdigit()])))
        Attack.append(int("".join([x for x in missing_pokes.iloc[i]['stats'].split(",")[4] if x.isdigit()])))
        Defense.append(int("".join([x for x in missing_pokes.iloc[i]['stats'].split(",")[8] if x.isdigit()])))
        Sp_Atk.append(int("".join([x for x in missing_pokes.iloc[i]['stats'].split(",")[12] if x.isdigit()])))
        Sp_Def.append(int("".join([x for x in missing_pokes.iloc[i]['stats'].split(",")[16] if x.isdigit()])))
        Speed.append(int("".join([x for x in missing_pokes.iloc[i]['stats'].split(",")[20] if x.isdigit()])))
    
    df['HP'] = HP
    df['Attack'] = Attack
    df['Defense'] = Defense
    df['Sp. Atk'] = Sp_Atk
    df['Sp. Def'] = Sp_Def
    df['Speed'] = Speed
    
    return df

In [483]:
missing_pokes = extractStats(missing_pokes)

## Cleaning and adding new columns

**Cleaning columns**

In [484]:
def cleanColumns(df):
    '''This functions receives a dataframe and drops unwanted columns from the dataframe.
    '''
    df = df.drop(labels=list(df.filter(regex='sprites.*')), axis=1)
    df = df.drop(labels=['game_indices', 
                         'held_items', 
                         'is_default', 
                         'past_types', 
                         'species.name', 
                         'species.url', 
                         'order', 
                         'forms', 
                         'abilities', 
                         'base_experience', 
                         'location_area_encounters', 
                         'moves',
                         'stats', 
                         'types'], axis=1)
    df.columns = df.columns.str.title()
    
    return df

In [485]:
missing_pokes = cleanColumns(missing_pokes)

**Transforming Height and Weight to 'm' and 'kg, respectively**

In [486]:
def transformMetrics(df):
    '''This function receives a dataframe and transforms heights and weights to meters and kilograms, respectively.
    '''
    df['Height'] = df['Height'].apply(lambda x: x / 10)
    df['Weight'] = df['Weight'].apply(lambda x: x / 10)
    return df

In [487]:
missing_pokes = transformMetrics(missing_pokes)

**Adding Total score column**

In [488]:
def calculateTotal(df):
    '''This function receives a dataframe and calculates the Total score of a pokemon
    based on the sum of its stats (Hp, Attack, Defense, Sp. Atk, Sp. Def, Speed).
    It returns the dataframe with the Total column.
    '''
    df['Total'] = df['Hp'] + df['Attack'] + df['Defense'] + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']
    return df

In [489]:
missing_pokes = calculateTotal(missing_pokes)

**Adding Generation column using the ID number**

In [490]:
def setGeneration(df):
    '''This function receives a dataframe and returns the generation to which the pokemon belongs based on its ID.
    '''
    generation_7 = [n for n in range(722,810)]
    generation_8 = [n for n in range(810,905)]
    
    df['Generation'] = df['Id'].apply(lambda x: 7 if x in generation_7 else 8)
    
    return df

In [491]:
missing_pokes = setGeneration(missing_pokes)

**Adding Legendary Column using ID number**

In [492]:
def setLegendary(df):
    '''This function receives a dataframe and returns the legendary status (True/False) of a pokemon based on its ID.
    '''
    gen_7_legendaries = [772,773,785,786,787,788,789,790,791,792,793,800] 
    gen_8_legendaries = [888,889,890,891,892,894,895,896,897,898,905]
    legendaries = gen_7_legendaries + gen_8_legendaries
    
    df['Legendary'] = df['Id'].apply(lambda x: True if x in legendaries else False)
    
    return df

In [493]:
missing_pokes = setLegendary(missing_pokes)

**Cleaning names**

In [494]:
names_before = missing_pokes['Name'].to_list()

In [495]:
def cleanNames(df):
    '''This function receives a dataframe and cleans the Name column.
    '''
    for i in df['Name']:
        if "-" in i:
            df['Name'] = df['Name'].str.replace(i, i.split("-")[0])
    df['Name'] = df['Name'].str.title()
    
    return df

In [496]:
missing_pokes = cleanNames(missing_pokes)

In [497]:
names_after = missing_pokes['Name'].to_list()

- ***Checking the new names***

In [498]:
# Checking which names were changed. 
# Might need to re-name some before Web Scraping, otherwise would not be able to request them (wikidex)

different = []
for i in names_after:
    if i.lower() not in names_before:
        different.append(i)
print(different)

# Names to change:
# Type -> Codigo_Cero
# Jangmo -> Jangmo-o
# Hakamo -> Hakamo-o
# Kommo -> Kommo-o
# Mr -> Mr._Rime

# This I need to access through Id:
# Tapu -> Tapu_Koko
# Tapu -> Tapu_Lele
# Tapu -> Tapu_Bulu
# Tapu -> Tapu_Fini

['Oricorio', 'Lycanroc', 'Wishiwashi', 'Type', 'Minior', 'Mimikyu', 'Jangmo', 'Hakamo', 'Kommo', 'Tapu', 'Tapu', 'Tapu', 'Tapu', 'Toxtricity', 'Mr', 'Eiscue', 'Indeedee', 'Morpeko', 'Urshifu', 'Basculegion', 'Enamorus']


In [499]:
def nameChange(df):
    '''This function receives a dataframe and updates the name of some pokemon.
    '''
    names_change = {'Type' : 'Codigo_Cero',
               'Jangmo' : 'Jangmo-o',
               'Hakamo' : 'Hakamo-o',
               'Kommo' : 'Kommo-o',
               'Mr' : 'Mr._Rime'}
    
    df['Name'] = df['Name'].replace(names_change)
    df.loc[df['Id'] == 785, 'Name'] = "Tapu_Koko"
    df.loc[df['Id'] == 786, 'Name'] = "Tapu_Lele"
    df.loc[df['Id'] == 787, 'Name'] = "Tapu_Bulu"
    df.loc[df['Id'] == 788, 'Name'] = "Tapu_Fini"
    
    return df

In [500]:
missing_pokes = nameChange(missing_pokes)

In [504]:
# Way cleaner dataset
missing_pokes

Unnamed: 0,Height,Id,Name,Weight,Hp,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
0,0.3,722,Rowlet,1.5,68,55,55,50,50,42,320,7,False
1,0.7,723,Dartrix,16.0,78,75,75,70,70,52,420,7,False
2,1.6,724,Decidueye,36.6,78,107,75,100,100,70,530,7,False
3,0.4,725,Litten,4.3,45,65,40,60,40,70,320,7,False
4,0.7,726,Torracat,25.0,65,85,50,80,50,90,420,7,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,2.4,901,Ursaluna,290.0,130,140,105,45,80,50,550,8,False
180,3.0,902,Basculegion,110.0,120,112,65,80,75,78,530,8,False
181,1.3,903,Sneasler,43.0,80,130,60,40,80,120,510,8,False
182,2.5,904,Overqwil,60.5,85,115,95,65,65,85,510,8,False


## Adding Types using Web Scraping

In [507]:
pokemon_url = "https://www.wikidex.net/wiki/Bulbasaur"
# To get the info of all of them will need to iterate on the url f"{Pokémon}"
response = requests.get(pokemon_url)
response

<Response [200]>

In [568]:
html = response.content
soup = BeautifulSoup(html, "html.parser")
types = soup.find("p")
types.getText().split("tipo ")[1].split(" ")[0]

'planta/veneno'

In [570]:
def getType(list_of_pokes):
    '''This functions appends the Type of each pokemon to a list.
    If the pokemon is not found in the wiki, it appends "NaN" instead.
    '''
    types_list = []
    
    for i in range(len(list_of_pokes)):
        pokemon_url = f"https://www.wikidex.net/wiki/{list_of_pokes[i]}"
        response = requests.get(pokemon_url)
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
        types = soup.find("p")
        try:
            types_list.append(types.getText().split("tipo ")[1].split(" ")[0])
        except IndexError:
            types_list.append('NaN')
        
    return types_list

In [580]:
poke_list = missing_pokes['Name'].to_list()
# poke_list

In [581]:
all_tipos = getType(poke_list)

In [583]:
# Adding new column Types (in spanish) to the dataset
missing_pokes['Types'] = all_tipos
missing_pokes

Unnamed: 0,Height,Id,Name,Weight,Hp,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary,Types
0,0.3,722,Rowlet,1.5,68,55,55,50,50,42,320,7,False,planta/volador
1,0.7,723,Dartrix,16.0,78,75,75,70,70,52,420,7,False,planta/volador
2,1.6,724,Decidueye,36.6,78,107,75,100,100,70,530,7,False,planta/fantasma
3,0.4,725,Litten,4.3,45,65,40,60,40,70,320,7,False,fuego
4,0.7,726,Torracat,25.0,65,85,50,80,50,90,420,7,False,fuego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,2.4,901,Ursaluna,290.0,130,140,105,45,80,50,550,8,False,tierra/normal
180,3.0,902,Basculegion,110.0,120,112,65,80,75,78,530,8,False,agua/fantasma
181,1.3,903,Sneasler,43.0,80,130,60,40,80,120,510,8,False,lucha/veneno
182,2.5,904,Overqwil,60.5,85,115,95,65,65,85,510,8,False,siniestro/veneno


In [619]:
def cleanTypes(df):
    '''This function receives a dataframe, splits the column Type and translates the content, from ES to EN.
    '''
    df[["Type 1", "Type 2"]] = df["Types"].str.split("/", 1, expand = True)
    df = df.drop(labels="Types", axis=1)
    
    translate = {"normal" : "Normal",
                 "fuego" : "Fire",
                 "agua" : "Water",
                 "planta" : "Grass",
                 "volador" : "Flying",
                 "tierra" : "Ground",
                 "roca" : "Rock",
                 "eléctrico" : "Electric",
                 "bicho" : "Bug",
                 "lucha" : "Fighting",
                 "psíquico" : "Psychic",
                 "veneno" : "Poison",
                 "fantasma" : "Ghost",
                 "hielo" : "Ice",
                 "dragón" : "Dragon",
                 "acero" : "Steel",
                 "siniestro" : "Dark",
                 "hada" : "Fairy"}
        
    df['Type 1'] = df['Type 1'].replace(translate)
    df['Type 2'] = df['Type 2'].replace(translate)
    
    return df

In [620]:
missing_pokes = cleanTypes(missing_pokes)

In [640]:
missing_pokes.sample()

Unnamed: 0,Height,Id,Name,Weight,Hp,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary,Type 1,Type 2
38,2.1,760,Bewear,135.0,120,125,80,55,60,60,500,7,False,Normal,Fighting


## Re-organizing columns of missing poke-dataset

In [633]:
def organizeMissing(df):
    '''This function re-organizes the columns to match the original pokemon dataset from kaggle.
    '''
    df = df.set_index("#", drop = True)
    df = df.reindex(columns = ["Name", "Type 1", "Type 2", "Total", "Hp", "Attack", "Defense", "Sp. Atk", "Sp. Def", "Speed", "Generation", "Legendary"])
    return df

In [638]:
missing_pokes_clean = organizeMissing(missing_pokes)

In [645]:
missing_pokes_clean.sample()

Unnamed: 0_level_0,Name,Type 1,Type 2,Total,Hp,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
805,Stakataka,Rock,Steel,570,61,131,211,53,101,13,7,False


In [646]:
# Exporting dataset before re-organize
missing_pokes.to_csv('./data/pokemon_missing_new.csv', index = False)

In [641]:
# Exporting dataset clean
missing_pokes_clean.to_csv('./data/pokemon_missing_clean.csv', index = False)

# Enriching the ORIGINAL kaggle dataset using Web Scraping

In [271]:
# Importing
pokemon = pd.read_csv("./data/pokemon.csv")
pokemon = pokemon.set_index('#')
# pokemon.head()

## Weight

### Trying one request

In [4]:
# Wikidex
pokemon_url = "https://www.wikidex.net/wiki/Bulbasaur"
# To get the info of all of them will need to iterate on the url f"{Pokémon}"

In [5]:
response = requests.get(pokemon_url)
response

<Response [200]>

In [6]:
html = response.content
# html

In [7]:
soup = BeautifulSoup(html, "html.parser")
# soup

In [8]:
weight = soup.find_all("tr", attrs = {"title" : "Peso del Pokémon"})
weight

[<tr title="Peso del Pokémon">
 <th><a href="/wiki/Lista_de_Pok%C3%A9mon_por_peso" title="Lista de Pokémon por peso">Peso</a>
 </th>
 <td>6,9 kg
 </td></tr>]

In [9]:
# Weight in kg of bulbasur
float(weight[0].getText().strip().split('\n')[-1].split(" ")[0].replace(",","."))

6.9

### Looping for many requests

In [35]:
def getWeight(list_of_pokes):
    '''This functions appends the weight of each pokemon to a list.
    If the pokemon is not found in the wiki, it appends "NaN" instead.
    '''
    weight_list = []
    
    for i in range(len(list_of_pokes)):
        pokemon_url = f"https://www.wikidex.net/wiki/{list_of_pokes[i]}"
        response = requests.get(pokemon_url)
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
        weight = soup.find_all("tr", attrs = {"title" : "Peso del Pokémon"})
        
        try:
            weight_float = float(weight[0].getText().strip().split('\n')[-1].split(" ")[0].replace(",","."))
            weight_list.append(weight_float)
        except IndexError:
            weight_list.append('NaN')
        
    return weight_list

In [280]:
all_pokemon_list = pokemon['Name'].to_list()

In [49]:
# all_weights = getWeight(all_pokemon_list)
# print(all_weights)

In [43]:
# Adding new column Weight to the pokedex
pokemon['Weight (kg)'] = all_weights

In [48]:
# Exporting dataset with added weight
pokemon.to_csv('./data/pokemon_weight.csv', index = False)

In [79]:
# pokemon_nan_weight = pokemon[pokemon['Weight (kg)'] == 'NaN']['Name'].to_list()
# pokemon_nan_weight
# Megaevolution and special forms: weight not added properly

## Height

In [272]:
# Importing
pokemon = pd.read_csv("./data/pokemon_weight.csv")
pokemon.head()

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Weight (kg)
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,6.9
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False,13.0
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,100.0
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,
4,Charmander,Fire,,309,39,52,43,60,50,65,1,False,8.5


In [273]:
def getHeight(list_of_pokes):
    '''This functions appends the height of each pokemon to a list.
    If the pokemon is not found in the wiki, it appends "NaN" instead.
    '''
    height_list = []
    
    for i in range(len(list_of_pokes)):
        pokemon_url = f"https://www.wikidex.net/wiki/{list_of_pokes[i]}"
        response = requests.get(pokemon_url)
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
        height = soup.find_all("tr", attrs = {"title" : "Altura del Pokémon"})
        
        try:
            height_float = float(height[0].getText().strip().split('\n')[-1].split(" ")[0].replace(",","."))
            height_list.append(height_float)
        except IndexError:
            height_list.append('NaN')
        
    return height_list

In [274]:
all_pokemon_list = pokemon['Name'].to_list()

In [275]:
all_heights = getHeight(all_pokemon_list)

In [276]:
# Adding new column Height to the pokedex
pokemon['Height (m)'] = all_heights
pokemon

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Weight (kg),Height (m)
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,6.9,0.7
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False,13.0,1.0
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,100.0,2.0
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,,
4,Charmander,Fire,,309,39,52,43,60,50,65,1,False,8.5,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True,8.8,0.7
796,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True,,
797,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True,,
798,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True,,


In [277]:
# Exporting dataset with added height
pokemon.to_csv('./data/pokemon_weight_height.csv', index = False)

# Bulbapedia ->>> catch rate

In [None]:
# Using bulbapedia

In [None]:
pokemon_url_bulb = "https://bulbapedia.bulbagarden.net/wiki/Bulbasaur_(Pok%C3%A9mon)"

In [None]:
# Execute navigator.userAgent in Chrome developer console, withing the page - Ctrl+Shit+J
# Add that User-Agent into headers

headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"}
response_bulb = requests.get(pokemon_url_2, headers = headers)
response_bulb

In [None]:
html_bulb = response_bulb.content
soup_bulb = BeautifulSoup(html_bulb, "html.parser")
catchrate = soup_bulb.find_all("tr", attrs = {"title" : "Peso del Pokémon"})