# Pokemon Analysis

-----**TO DELETE**-----
* Type prediction using Logistic Regression, Decision Trees, Random Forest, Support Vector Machines, K-Nearest Neighbors, Naive Bayes, Gradient Boosting
    * Use base stats
* Image recognition with Convolutional Neural Networks

In [28]:
import pandas as pd
import numpy as np
import requests
import bs4

### Verify scrapability via robots.txt

In [26]:
def allowed_by_robots_txt(url):
    """
    Returns a boolean value representing if a url is allowed
    to be scraped, according to the site's robots.txt.

    The robots.txt file is fetched from the host of `url` and the rules
    that apply to generic crawlers (``User-agent: *``) are evaluated
    against the path of `url`. A url that no rule matches is allowed.
    ---
    url: string representing url to scrape (must include the scheme,
         e.g. "https://...")
    ---
    Raises requests.HTTPError if robots.txt answers with a 4xx/5xx status.
    """
    # Imported locally so the function only relies on the standard library
    # plus the `requests` import the notebook already has.
    from urllib.parse import urlsplit
    from urllib.robotparser import RobotFileParser

    # Derive the robots.txt location from the *argument* so that each call
    # checks the site it was asked about.
    parts = urlsplit(url)
    robots_txt_url = f"{parts.scheme}://{parts.netloc}/robots.txt"

    # Bounded wait: without a timeout a stalled server blocks forever.
    response = requests.get(robots_txt_url, timeout=10)
    response.raise_for_status()

    # Let the standard-library parser apply the robots.txt semantics:
    # rules are path *prefixes*, an empty "Disallow:" allows everything,
    # and rules are grouped per user agent.
    parser = RobotFileParser()
    parser.parse(response.text.splitlines())

    # "*" selects the default rule group, i.e. the one for any crawler.
    return parser.can_fetch("*", url)

In [27]:
# Base-stat table pages on pokemondb.net, one per generation.
gen1_url = "https://pokemondb.net/pokedex/stats/gen1"
gen2_url = "https://pokemondb.net/pokedex/stats/gen2"
gen3_url = "https://pokemondb.net/pokedex/stats/gen3"

# Ask robots.txt about every page before scraping any of them.
gen1_allowed, gen2_allowed, gen3_allowed = (
    allowed_by_robots_txt(page) for page in (gen1_url, gen2_url, gen3_url)
)

# Report the verdict for each generation on its own line.
print(
    f'Gen 1 scrapable: {gen1_allowed}\nGen 2 scrapable: {gen2_allowed}\nGen 3 scrapable: {gen3_allowed}'
)

Gen 1 scrapable: True
Gen 2 scrapable: True
Gen 3 scrapable: True


### Scrape Data

In [124]:
def get_pokedex(url):
    """
    Returns a DataFrame object that contains the Pokédex for the url to the specified generation.

    The first row of the stats table supplies the column names and every
    following row that contains data cells becomes one record. Values are
    kept as the raw cell text (strings); no numeric conversion is done.
    ---
    url: string representing url to scrape
    ---
    Raises Exception if the page does not answer with status code 200,
    and ValueError if the stats table is missing or a row has fewer
    cells than the header.
    """
    # Make request to site (bounded wait so a stalled server cannot hang the run)
    response = requests.get(url, timeout=10)

    # Check to see if response was successful
    if response.status_code != 200:
        raise Exception(f"Error: Unable to fetch content. Status code: {response.status_code}.")

    # create soup object and locate the container that wraps the stats table
    soup = bs4.BeautifulSoup(response.text, features='lxml')
    table_wrapper = soup.find('div', class_='resp-scroll')
    if table_wrapper is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"No 'resp-scroll' table container found at {url}.")

    rows = table_wrapper.find_all('tr')
    if not rows:
        raise ValueError(f"The stats table at {url} contains no rows.")

    # get column data: the first row holds the header cells
    header_row, *body_rows = rows
    columns = [cell.text for cell in header_row.find_all('th')]

    # get data: look the cells up once per row rather than once per column
    records = []
    for row in body_rows:
        cells = row.find_all('td')
        if not cells:
            # Rows without data cells carry no Pokémon entry.
            continue
        if len(cells) < len(columns):
            # A short row would otherwise be silently padded with missing values.
            raise ValueError(
                f"Row has {len(cells)} cells but the header has {len(columns)} columns."
            )
        records.append([cell.text for cell in cells[:len(columns)]])

    # combine column data and raw data to form DataFrame
    return pd.DataFrame(records, columns=columns)

In [116]:
# Scrape each generation's stats page into its own DataFrame.
gen1, gen2, gen3 = (
    get_pokedex(page) for page in (gen1_url, gen2_url, gen3_url)
)

In [120]:
# Preview the first rows of the Gen 1 table to sanity-check the scrape.
gen1.head()

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,Grass Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass Poison,525,80,82,83,100,100,80
3,4,Charmander,Fire,309,39,52,43,60,50,65
4,5,Charmeleon,Fire,405,58,64,58,80,65,80


In [121]:
# Preview the first rows of the Gen 2 table to sanity-check the scrape.
gen2.head()

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,152,Chikorita,Grass,318,45,49,65,49,65,45
1,153,Bayleef,Grass,405,60,62,80,63,80,60
2,154,Meganium,Grass,525,80,82,100,83,100,80
3,155,Cyndaquil,Fire,309,39,52,43,60,50,65
4,156,Quilava,Fire,405,58,64,58,80,65,80


In [122]:
# Preview the first rows of the Gen 3 table to sanity-check the scrape.
gen3.head()

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,252,Treecko,Grass,310,40,45,35,65,55,70
1,253,Grovyle,Grass,405,50,65,45,85,65,95
2,254,Sceptile,Grass,530,70,85,65,105,85,120
3,255,Torchic,Fire,310,45,60,40,70,50,45
4,256,Combusken,Fire Fighting,405,60,85,60,85,60,55


### EDA

In [133]:
# Stack the three generation tables row-wise into one frame; the index is
# rebuilt so it runs continuously instead of restarting per generation.
generation_frames = [gen1, gen2, gen3]
df = pd.concat(generation_frames, ignore_index=True)

In [134]:
# The combined frame should start with the Gen 1 rows (concatenated first).
df.head()

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,Grass Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass Poison,525,80,82,83,100,100,80
3,4,Charmander,Fire,309,39,52,43,60,50,65
4,5,Charmeleon,Fire,405,58,64,58,80,65,80


In [135]:
# The combined frame should end with the Gen 3 rows (concatenated last);
# this also shows the final value of the rebuilt index.
df.tail()

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
387,385,Jirachi,Steel Psychic,600,100,100,100,100,100,100
388,386,Deoxys Normal Forme,Psychic,600,50,150,50,150,50,150
389,386,Deoxys Attack Forme,Psychic,600,50,180,20,180,20,150
390,386,Deoxys Defense Forme,Psychic,600,50,70,160,70,160,90
391,386,Deoxys Speed Forme,Psychic,600,50,95,90,95,90,180
