In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn import preprocessing as pre
from sklearn.metrics import r2_score
import logging

In [3]:
# Raise the root logger threshold to CRITICAL so that lower-severity
# records emitted through `logging` by imported libraries are not shown.
logger = logging.getLogger(name=None)
logger.setLevel(level=logging.CRITICAL)

In [4]:
# Read the raw CSV. The handle is opened in a `with` block so it is closed
# as soon as pandas has finished parsing (the original left it open).
with open('./data/pokemon.csv') as filename:
    pokemon = pd.read_csv(filename)
pokemon.head()

Unnamed: 0,name,pokedex_number,abilities,typing,hp,attack,defense,special_attack,special_defense,speed,...,ground_attack_effectiveness,fly_attack_effectiveness,psychic_attack_effectiveness,bug_attack_effectiveness,rock_attack_effectiveness,ghost_attack_effectiveness,dragon_attack_effectiveness,dark_attack_effectiveness,steel_attack_effectiveness,fairy_attack_effectiveness
0,Bulbasaur,1,Overgrow~Chlorophyll,Grass~Poison,45,49,49,65,65,45,...,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5
1,Ivysaur,2,Overgrow~Chlorophyll,Grass~Poison,60,62,63,80,80,60,...,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5
2,Venusaur,3,Overgrow~Chlorophyll,Grass~Poison,80,82,83,100,100,80,...,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5
3,Venusaur Gmax,3,Overgrow~Chlorophyll,Grass~Poison,80,82,83,100,100,80,...,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5
4,Venusaur Mega,3,Thick Fat,Grass~Poison,80,100,123,122,120,80,...,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5


In [5]:
# List every column available in the raw frame.
pokemon.columns

Index(['name', 'pokedex_number', 'abilities', 'typing', 'hp', 'attack',
       'defense', 'special_attack', 'special_defense', 'speed', 'height',
       'weight', 'genus', 'gen_introduced', 'female_rate', 'genderless',
       'baby_pokemon', 'legendary', 'mythical', 'is_default',
       'forms_switchable', 'base_experience', 'capture_rate', 'egg_groups',
       'egg_cycles', 'base_happiness', 'can_evolve', 'evolves_from',
       'primary_color', 'shape', 'number_pokemon_with_typing',
       'normal_attack_effectiveness', 'fire_attack_effectiveness',
       'water_attack_effectiveness', 'electric_attack_effectiveness',
       'grass_attack_effectiveness', 'ice_attack_effectiveness',
       'fighting_attack_effectiveness', 'poison_attack_effectiveness',
       'ground_attack_effectiveness', 'fly_attack_effectiveness',
       'psychic_attack_effectiveness', 'bug_attack_effectiveness',
       'rock_attack_effectiveness', 'ghost_attack_effectiveness',
       'dragon_attack_effectiveness', '

In [6]:
# Get rid of Mega / Gmax forms.
# A single boolean mask replaces the row-by-row drop inside a loop over the
# very column being mutated. 'Meganium' contains 'Mega' but is a regular
# species, so it is kept. Missing names are treated as "not an alternate form".
is_alt_form = (
    pokemon.name.str.contains('Gmax|Mega', regex=True, na=False)
    & pokemon.name.ne('Meganium')
)
gmax_names = pokemon.name[is_alt_form].tolist()
# Keep the removed rows around (fresh 0..n-1 index, original row order).
gmax_megas = pokemon.loc[is_alt_form].reset_index(drop=True)
# Drop in place so `pokemon` stays the same object and keeps its row labels.
pokemon.drop(index=pokemon.index[is_alt_form], inplace=True)

In [7]:
# Every distinct typing string; dual types are written as 'A~B'.
types = pokemon.typing.drop_duplicates()
# Typings without a '~' are the single ("mono") types.
monotypes = [typing for typing in types if '~' not in typing]
# One '<type>_attack_effectiveness' column exists per type; the Flying
# column is spelled 'fly_...', so it is appended explicitly.
effectives = [
    f'{typing.lower()}_attack_effectiveness'
    for typing in monotypes
    if typing != 'Flying'
] + ['fly_attack_effectiveness']
pokemon.drop(columns=effectives, inplace=True)

In [8]:
# Split 'Primary~Secondary' into two columns; mono-type rows get a missing
# second_type.
twotype = (
    pokemon.typing.str.split('~', expand=True)
    .rename(columns={0: 'primary_type', 1: 'second_type'})
)

In [9]:
# Per-Pokemon table of types, size and base stats, indexed by name.
# Built with a column-wise concat: stacking Series as rows and transposing
# (the previous approach) coerces every column to object dtype.
pokestats = pd.concat(
    [
        pokemon[['name', 'pokedex_number']],
        twotype[['primary_type', 'second_type']],
        pokemon[['weight', 'height', 'hp', 'attack', 'defense',
                 'special_attack', 'special_defense', 'speed']],
    ],
    axis=1,
).set_index('name')

In [10]:
# Preview the first five rows of the stats table.
pokestats.head(n=5)

Unnamed: 0_level_0,pokedex_number,primary_type,second_type,weight,height,hp,attack,defense,special_attack,special_defense,speed
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Bulbasaur,1,Grass,Poison,69,7,45,49,49,65,65,45
Ivysaur,2,Grass,Poison,130,10,60,62,63,80,80,60
Venusaur,3,Grass,Poison,1000,20,80,82,83,100,100,80
Charmander,4,Fire,,85,6,39,52,43,60,50,65
Charmeleon,5,Fire,,190,11,58,64,58,80,65,80


In [11]:
# Group the stats table by primary type and build, for each mono type:
#   bytypes[t]       - the six base-stat columns as numeric values
#   bytypes_desc[t]  - describe() of those stats, rounded to whole numbers
#   by2types_desc[t] - the same summary broken down by secondary type
statsbytype = pokestats.groupby('primary_type')
stat_cols = ['hp', 'attack', 'defense', 'special_attack', 'special_defense', 'speed']
bytypes = dict.fromkeys(monotypes)
bytypes_desc = dict.fromkeys(monotypes)
by2types_desc = dict.fromkeys(monotypes)

for t in monotypes:
    group = statsbytype.get_group(t)
    # Convert only the stat columns; `errors='ignore'` is deprecated in
    # pandas.to_numeric, and 'second_type' is text that must stay as-is.
    tpnum = group[stat_cols].apply(pd.to_numeric)
    tp = pd.concat([group[['second_type']], tpnum], axis=1)
    bytypes[t] = tpnum
    bytypes_desc[t] = tp.describe().round(0)
    by2types_desc[t] = tp.groupby('second_type').describe().round(0)

# Andrews Curves for Type Stats

In [12]:
# For every mono type: scale each Pokemon's six-stat row with
# sklearn's `normalize`, then transpose so that rows are stats and columns
# are Pokemon, and tag each row with its stat name (the class column used by
# the Andrews-curve plots).
stat_labels = ['hp','attack','defense','special_attack','special_defense','speed']
typenorm = {}
for t in monotypes:
    by_stat = pd.DataFrame(pre.normalize(bytypes[t]), columns=list(range(6))).T
    by_stat['Stat'] = stat_labels
    typenorm[t] = by_stat
typenorm['Bug'].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,Stat
0,0.538816,0.556415,0.361649,0.47724,0.515339,0.388623,0.287456,0.346844,0.477712,0.376595,...,0.301793,0.301793,0.236757,0.322301,0.429896,0.272037,0.320092,0.34001,0.27787,hp
1,0.359211,0.222566,0.271237,0.417585,0.286299,0.538093,0.574911,0.54917,0.437903,0.349695,...,0.276644,0.276644,0.33146,0.537169,0.558463,0.524916,0.256074,0.238007,0.208403,attack
2,0.419079,0.612056,0.301374,0.35793,0.572598,0.239153,0.451716,0.462459,0.398094,0.322795,...,0.301793,0.301793,0.378811,0.601629,0.558463,0.141766,0.256074,0.544016,0.509429,defense
3,0.239474,0.278207,0.542474,0.23862,0.286299,0.269047,0.369586,0.346844,0.318475,0.484193,...,0.47784,0.47784,0.189405,0.257841,0.212939,0.524916,0.320092,0.34001,0.370493,special_attack
4,0.239474,0.278207,0.482199,0.23862,0.286299,0.478305,0.451716,0.462459,0.437903,0.403494,...,0.352092,0.352092,0.284108,0.386762,0.212939,0.141766,0.576166,0.612018,0.55574,special_defense


In [13]:
# Eighteen types laid out as three 2x3 grids, one grid per figure.
types1 = np.array([
    ['Grass', 'Water', 'Fire'],
    ['Normal', 'Flying', 'Bug'],
])
types2 = np.array([
    ['Psychic', 'Ghost', 'Dark'],
    ['Ground', 'Rock', 'Fighting'],
])
types3 = np.array([
    ['Electric', 'Ice', 'Poison'],
    ['Dragon', 'Fairy', 'Steel'],
])
# Panel background colour for each type, matching the grids above cell by cell.
colortypes1 = np.array([
    ['xkcd:pale green', 'xkcd:baby blue', 'xkcd:rose'],
    ['xkcd:beige', 'xkcd:light periwinkle', 'xkcd:pale olive green'],
])
colortypes2 = np.array([
    ['xkcd:pale magenta', 'xkcd:dusty lavender', 'xkcd:almost black'],
    ['xkcd:sandy brown', 'xkcd:grey brown', 'xkcd:pale orange'],
])
colortypes3 = np.array([
    ['xkcd:neon yellow', 'xkcd:ice blue', 'xkcd:soft purple'],
    ['xkcd:sky blue', 'xkcd:pale pink', 'xkcd:steel blue'],
])
# One line colour per stat, passed as `color=` to the Andrews-curve plots.
colorstats = [
    'xkcd:fire engine red',
    'xkcd:brick orange',
    'xkcd:sunflower yellow',
    'xkcd:clear blue',
    'xkcd:lawn green',
    'xkcd:magenta',
]

In [14]:
# First round of types: one Andrews-curve panel per type in `types1`,
# each drawn on that type's background colour, saved as a single image.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(18,15), facecolor='xkcd:light blue grey')
for panel, ptype, background in zip(axs.flat, types1.flat, colortypes1.flat):
    pd.plotting.andrews_curves(typenorm[ptype], class_column='Stat', ax=panel, color=colorstats)
    panel.set_facecolor(background)
    panel.set_title(f'{ptype}-Type Pokemon Stat Curves')
plt.tight_layout()
plt.savefig('./assets/plots/type-stats/type-stats1.png')
plt.close('all')

In [15]:
# Second round of types: same layout as the first figure, for `types2`.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(18,15), facecolor='xkcd:light blue grey')
for panel, ptype, background in zip(axs.flat, types2.flat, colortypes2.flat):
    pd.plotting.andrews_curves(typenorm[ptype], class_column='Stat', ax=panel, color=colorstats)
    panel.set_facecolor(background)
    panel.set_title(f'{ptype}-Type Pokemon Stat Curves')
plt.tight_layout()
plt.savefig('./assets/plots/type-stats/type-stats2.png')
plt.close('all')

In [16]:
# Last round of types: same layout as the first two figures, for `types3`.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(18,15), facecolor='xkcd:light blue grey')
for panel, ptype, background in zip(axs.flat, types3.flat, colortypes3.flat):
    pd.plotting.andrews_curves(typenorm[ptype], class_column='Stat', ax=panel, color=colorstats)
    panel.set_facecolor(background)
    panel.set_title(f'{ptype}-Type Pokemon Stat Curves')
plt.tight_layout()
plt.savefig('./assets/plots/type-stats/type-stats3.png')
plt.close('all')

# Weight/Height vs Stats

## Weight

In [17]:
# Turn each column into an (n, 1) array and scale it with sklearn's
# `normalize(..., axis=0)`, i.e. the whole column is divided by its norm.
weight, hp, attack, defense, sp_attack, sp_defense, speed = (
    pre.normalize(pokestats[column].values.reshape(-1, 1), axis=0)
    for column in ('weight', 'hp', 'attack', 'defense',
                   'special_attack', 'special_defense', 'speed')
)

## Weight Frequency (Log-Transform)

In [18]:
# Histogram of the scaled weights with a logarithmic count axis.
fig, ax = plt.subplots(facecolor='xkcd:light blue grey')
ax.hist(weight,bins=100,color='xkcd:dark lilac',label='Log-Frequency')
ax.legend()
ax.set_yscale('log')
ax.set_xlabel('Weight')
ax.set_facecolor('xkcd:pale turquoise')
# Title typo fixed ("Tranform" -> "Transform").
ax.set_title('Log-Transform Frequency of Pokemon Weight')
plt.savefig('./assets/plots/weight-stats/weight_freq.png')
plt.close('all')

## Weight vs. Stats

In [19]:
# Tukey fences: anything at or beyond 1.5 * IQR from the quartiles is an outlier.
weight_whisker = 1.5*stats.iqr(weight)
lower = np.quantile(weight,.25) - weight_whisker
upper = np.quantile(weight,.75) + weight_whisker
# Index tuple of the outlying entries, reused to pick the matching stat values.
heavy_weight = np.where((weight <= lower) | (weight >= upper))
bad_weight, bad_hp, bad_attack, bad_defense, bad_spattack, bad_spdefense, bad_speed = (
    column[heavy_weight]
    for column in (weight, hp, attack, defense, sp_attack, sp_defense, speed)
)

In [20]:
# Complement of the outlier selection: entries strictly inside the fences.
light_weight = np.where((weight > lower) & (weight < upper))
new_weight, light_hp, light_attack, light_defense, light_spattack, light_spdefense, light_speed = (
    column[light_weight]
    for column in (weight, hp, attack, defense, sp_attack, sp_defense, speed)
)

In [21]:
# Scatter of scaled weight against each scaled stat (2 x 3 grid).
# In-range points use the per-stat colour; weight outliers are grey.
fig, axs = plt.subplots(2,3,figsize=(12,10),facecolor='xkcd:light blue grey')
colors = np.array([['xkcd:rose pink','xkcd:orange pink','xkcd:ugly yellow'],['xkcd:bluey green', 'xkcd:vibrant blue', 'xkcd:bright violet']])
lwstat = np.array([[light_hp,light_attack,light_defense],[light_spattack,light_spdefense,light_speed]])
badstat = np.array([[bad_hp,bad_attack,bad_defense],[bad_spattack,bad_spdefense,bad_speed]])
stat_name = np.array([['Hp','Attack','Defense'],['Sp. Attack','Sp. Defense','Speed']])
for i in range(2):
    for j in range(3):
        axs[i,j].scatter(new_weight,lwstat[i,j],c=colors[i,j],alpha=0.75,edgecolor='xkcd:dark grey')
        axs[i,j].scatter(bad_weight,badstat[i,j],c='xkcd:brownish grey', edgecolor='xkcd:dark grey',alpha=0.5,label='Outliers')
        axs[i,j].set_xlabel('Pokemon Weight (lbs)')
        axs[i,j].set_ylabel(f'Pokemon {stat_name[i,j]}')
        axs[i,j].set_title(f'Pokemon Weight versus {stat_name[i,j]}',fontsize=14)
        axs[i,j].set_facecolor('xkcd:very light pink')
        # Legend only for the panel just drawn: top row bottom-right, bottom
        # row top-right. Previously legend() was also called on the other
        # row's still-empty axes, and repeated for every panel.
        axs[i,j].legend(loc='lower right' if i == 0 else 'upper right')
plt.tight_layout()
plt.savefig('./assets/plots/weight-stats/weight_stats.png')
plt.close('all')

## Weight Outliers Removed

In [22]:
# Histogram of the scaled weights once the IQR outliers are excluded.
fig, ax = plt.subplots(facecolor='xkcd:light blue grey')
ax.hist(new_weight, bins=50, color='xkcd:dark lilac', label='Frequency')
ax.legend()
ax.set(
    xlabel='Weight',
    title='Frequency of Pokemon Weight \n (Outliers Removed)',
    facecolor='xkcd:pale turquoise',
)
plt.savefig('./assets/plots/weight-stats/lightweight_freq.png')
plt.close('all')

In [23]:
# Same weight-vs-stat grid as before, but showing only the in-range points.
fig, axs = plt.subplots(2, 3, figsize=(12,10), facecolor='xkcd:light blue grey')
for i, j in np.ndindex(2, 3):
    panel = axs[i,j]
    panel.scatter(new_weight, lwstat[i,j], c=colors[i,j], alpha=0.75, edgecolor='xkcd:dark grey')
    panel.set_xlabel('Pokemon Weight (lbs)')
    panel.set_ylabel(f'Pokemon {stat_name[i,j]}')
    panel.set_title(f'Pokemon Weight versus {stat_name[i,j]} \n (Weight Outliers Removed)', fontsize=14)
    panel.set_facecolor('xkcd:very light pink')
plt.tight_layout()
plt.savefig('./assets/plots/weight-stats/lightweight_stats.png')
plt.close('all')

## Height

In [24]:
# Column-wise scaling (sklearn `normalize`, axis=0) of height and the six stats.
height = pre.normalize(np.reshape(pokestats.height.values,(-1,1)),axis=0)
hp = pre.normalize(np.reshape(pokestats.hp.values,(-1,1)),axis=0)
attack = pre.normalize(np.reshape(pokestats.attack.values,(-1,1)),axis=0)
defense = pre.normalize(np.reshape(pokestats.defense.values,(-1,1)),axis=0)
# The height cells below read `sp_attack` / `sp_defense`, so define those
# names here as well instead of relying on the Weight section having run.
sp_attack = pre.normalize(np.reshape(pokestats.special_attack.values,(-1,1)),axis=0)
sp_defense = pre.normalize(np.reshape(pokestats.special_defense.values,(-1,1)),axis=0)
speed = pre.normalize(np.reshape(pokestats.speed.values,(-1,1)),axis=0)
# Original spellings kept as aliases for anything that still uses them.
spattack, spdefense = sp_attack, sp_defense

## Height Frequency

In [25]:
# Histogram of the scaled heights with a logarithmic count axis.
fig, ax = plt.subplots(facecolor='xkcd:light blue grey')
ax.hist(height,bins=100,color='xkcd:turquoise',label='Log-Frequency')
ax.legend()
ax.set_yscale('log')
ax.set_xlabel('Height')
ax.set_facecolor('xkcd:pale lilac')
# Title typo fixed ("Tranform" -> "Transform").
ax.set_title('Log-Transform Frequency of Pokemon Height')
plt.savefig('./assets/plots/height-stats/height_freq.png')
plt.close('all')

## Height vs. Stats

In [26]:
# Tukey fences for height (note: rebinds `lower` / `upper` from the weight section).
height_whisker = 1.5*stats.iqr(height)
lower = np.quantile(height,.25) - height_whisker
upper = np.quantile(height,.75) + height_whisker
# Index tuple of the outlying heights, reused to pick the matching stat values.
out_height = np.where((height <= lower) | (height >= upper))
bad_height, badh_hp, badh_attack, badh_defense, badh_spattack, badh_spdefense, badh_speed = (
    column[out_height]
    for column in (height, hp, attack, defense, sp_attack, sp_defense, speed)
)

In [27]:
# Heights strictly inside the fences, plus the stat values of those same rows.
short_height = np.where((height > lower) & (height < upper))
new_height, short_hp, short_attack, short_defense, short_spattack, short_spdefense, short_speed = (
    column[short_height]
    for column in (height, hp, attack, defense, sp_attack, sp_defense, speed)
)

In [28]:
# Scatter of scaled height against each scaled stat (2 x 3 grid).
# In-range points use the per-stat colour; height outliers are grey.
fig, axs = plt.subplots(2,3,figsize=(12,10),facecolor='xkcd:light blue grey')
shstat = np.array([[short_hp,short_attack,short_defense],[short_spattack,short_spdefense,short_speed]])
badstat = np.array([[badh_hp,badh_attack,badh_defense],[badh_spattack,badh_spdefense,badh_speed]])

for i in range(2):
    for j in range(3):
        axs[i,j].scatter(new_height,shstat[i,j],c=colors[i,j],alpha=0.75,edgecolor='xkcd:dark grey')
        axs[i,j].scatter(bad_height,badstat[i,j],c='xkcd:brownish grey',alpha=0.5,edgecolor='xkcd:dark grey',label='Outliers')
        axs[i,j].set_xlabel('Pokemon Height (ft)')
        axs[i,j].set_ylabel(f'Pokemon {stat_name[i,j]}')
        axs[i,j].set_title(f'Pokemon Height versus {stat_name[i,j]}',fontsize=14)
        axs[i,j].set_facecolor('xkcd:very light pink')
        # Legend only for the panel just drawn: top row bottom-right, bottom
        # row top-right. Previously legend() was also called on the other
        # row's still-empty axes, and repeated for every panel.
        axs[i,j].legend(loc='lower right' if i == 0 else 'upper right')
plt.tight_layout()
plt.savefig('./assets/plots/height-stats/height_stats.png')
plt.close('all')

## Height Outliers Removed

In [29]:
# Histogram of the scaled heights once the IQR outliers are excluded.
fig, ax = plt.subplots(facecolor='xkcd:light blue grey')
ax.hist(new_height, bins=20, color='xkcd:turquoise', label='Frequency')
ax.legend()
ax.set(
    xlabel='Height',
    title='Frequency of Pokemon Height \n (Outliers Removed)',
    facecolor='xkcd:pale lilac',
)
plt.savefig('./assets/plots/height-stats/shortheight_freq.png')
plt.close('all')

In [30]:
# Same height-vs-stat grid as before, but showing only the in-range points.
fig, axs = plt.subplots(2,3,figsize=(12,10),facecolor='xkcd:light blue grey')
for i in range(2):
    for j in range(3):
        axs[i,j].scatter(new_height,shstat[i,j],c=colors[i,j],alpha=0.75,edgecolor='xkcd:dark grey')
        # Unit label fixed: this axis is height, so use the same "(ft)" label
        # as the other height scatter figure rather than "(lbs)".
        axs[i,j].set_xlabel('Pokemon Height (ft)')
        axs[i,j].set_ylabel(f'Pokemon {stat_name[i,j]}')
        axs[i,j].set_title(f'Pokemon Height versus {stat_name[i,j]} \n (Height Outliers Removed)',fontsize=14)
        axs[i,j].set_facecolor('xkcd:very light pink')
plt.tight_layout()
plt.savefig('./assets/plots/height-stats/shortheight_stats.png')
plt.close('all')

In [31]:
# Basic descriptive columns. Selecting columns directly keeps each column's
# dtype; stacking Series as rows and transposing turns everything into object.
pokebasics = pokemon[['name', 'pokedex_number', 'abilities', 'typing',
                      'gen_introduced', 'can_evolve', 'evolves_from',
                      'number_pokemon_with_typing']].copy()

In [32]:
# Name, Pokedex number and primary colour. Direct column selection keeps the
# original dtypes (the row-stack + transpose approach yields object columns).
pokecolor = pokemon[['name', 'pokedex_number', 'primary_color']].copy()

In [33]:
# Use the Pokemon name as the row index (the 'name' column is consumed).
pokecolor = pokecolor.set_index('name')

In [34]:
# Attach primary and secondary type by matching on the name index.
pokecolor = (
    pokecolor
    .merge(pokestats.primary_type, left_index=True, right_index=True)
    .merge(pokestats.second_type, left_index=True, right_index=True)
)

In [35]:
# Bar charts of primary-colour counts for each type in `types1`.
# NOTE(review): this rebinds `colors`, which the weight/height scatter cells
# index as a 2-D array; re-running those cells after this one will fail.
colors = {shade: None for shade in set(pokecolor.primary_color.values)}
colortype = {ptype: None for ptype in monotypes}
fig, axs = plt.subplots(2, 3, figsize=(20,12), facecolor='xkcd:light blue grey')
for panel, ptype, bar_color in zip(axs.flat, types1.flat, colortypes1.flat):
    of_type = pokecolor.primary_type == ptype
    tcount = pokecolor.primary_color.loc[of_type].value_counts()
    poketotal1 = len(pokecolor.pokedex_number.loc[of_type])
    bars = tcount.plot.bar(rot=0, color=bar_color, ax=panel, edgecolor='k', width=.85, label=f"Total no. of Primarily\n{ptype}-type Pokemon:\n{poketotal1}")
    bars.set_ylabel("Color Frequency", fontsize=16)
    bars.set_title(f"Color Distribution for {ptype}-Type Pokemon", fontsize=18)
    bars.set_facecolor('xkcd:light peach')
    bars.legend(loc='upper right', fontsize=14)
    plt.setp(bars.get_xticklabels(), fontsize=14)
    colortype[ptype] = bars
plt.tight_layout()
plt.savefig('./assets/plots/color-count/colorbytype1.png')
plt.close('all')

In [36]:
# Bar charts of primary-colour counts for each type in `types2`.
# NOTE(review): this rebinds `colors`, which the weight/height scatter cells
# index as a 2-D array; re-running those cells after this one will fail.
colors = {shade: None for shade in set(pokecolor.primary_color.values)}
colortype = {ptype: None for ptype in monotypes}
fig, axs = plt.subplots(2, 3, figsize=(20,12), facecolor='xkcd:light blue grey')
for panel, ptype, bar_color in zip(axs.flat, types2.flat, colortypes2.flat):
    of_type = pokecolor.primary_type == ptype
    tcount = pokecolor.primary_color.loc[of_type].value_counts()
    poketotal2 = len(pokecolor.pokedex_number.loc[of_type])
    bars = tcount.plot.bar(rot=0, color=bar_color, ax=panel, edgecolor='k', width=.85, label=f"Total no. of Primarily\n{ptype}-type Pokemon:\n{poketotal2}")
    bars.set_ylabel("Color Frequency", fontsize=16)
    bars.set_title(f"Color Distribution for {ptype}-Type Pokemon", fontsize=18)
    bars.set_facecolor('xkcd:light peach')
    bars.legend(loc='upper right', fontsize=14)
    plt.setp(bars.get_xticklabels(), fontsize=14)
    colortype[ptype] = bars

plt.tight_layout()
plt.savefig('./assets/plots/color-count/colorbytype2.png')
plt.close('all')

In [37]:
# Bar charts of primary-colour counts for each type in `types3`.
# NOTE(review): this rebinds `colors`, which the weight/height scatter cells
# index as a 2-D array; re-running those cells after this one will fail.
colors = {shade: None for shade in set(pokecolor.primary_color.values)}
colortype = {ptype: None for ptype in monotypes}
fig, axs = plt.subplots(2, 3, figsize=(20,12), facecolor='xkcd:light blue grey')
for panel, ptype, bar_color in zip(axs.flat, types3.flat, colortypes3.flat):
    of_type = pokecolor.primary_type == ptype
    tcount = pokecolor.primary_color.loc[of_type].value_counts()
    poketotal3 = len(pokecolor.pokedex_number.loc[of_type])
    bars = tcount.plot.bar(rot=0, color=bar_color, ax=panel, edgecolor='k', width=.85, label=f"Total no. of Primarily\n{ptype}-type Pokemon:\n{poketotal3}")
    bars.set_ylabel("Color Frequency", fontsize=16)
    bars.set_title(f"Color Distribution for {ptype}-Type Pokemon", fontsize=18)
    bars.set_facecolor('xkcd:light peach')
    plt.setp(bars.get_xticklabels(), fontsize=14)
    bars.legend(loc='upper right', fontsize=14)
    colortype[ptype] = bars
plt.tight_layout()
plt.savefig('./assets/plots/color-count/colorbytype3.png')
plt.close('all')


In [38]:
# Generation, legendary/mythical flags and base stats per Pokemon.
# Direct column selection keeps numeric and boolean dtypes; the previous
# row-stack + transpose construction turned every column into object.
pokegen = pokemon[['name', 'pokedex_number', 'gen_introduced', 'legendary',
                   'mythical', 'hp', 'attack', 'defense', 'special_attack',
                   'special_defense', 'speed']].copy()

In [39]:
# Use the Pokemon name as the row index (the 'name' column is consumed).
pokegen = pokegen.set_index('name')

In [40]:
# Grouping by generation; reused by later cells.
gengroup = pokegen.groupby('gen_introduced')
# Per row: 1 for legendary plus 1 for mythical (same counting as adding the
# two per-generation lengths), then summed per generation in one pass.
special_flags = pokegen.legendary.eq(True).astype(int) + pokegen.mythical.eq(True).astype(int)
legendno = (
    special_flags.groupby(pokegen.gen_introduced.to_numpy()).sum()
    # Generations 1-8 in order; a generation with no rows counts as 0
    # instead of raising KeyError from get_group().
    .reindex(range(1, 9), fill_value=0)
    .astype(int)
    .tolist()
)

In [41]:
# Line chart: legendary + mythical count for generations 1 through 8.
fig, ax = plt.subplots(facecolor='xkcd:light blue grey')
ax.plot(range(1,9), legendno, color='xkcd:light eggplant', marker='o', markersize=6)
ax.set(
    facecolor='xkcd:light gold',
    xlabel='Generation Introduced',
    ylabel='Number of Legendary and Mythical Pokemon',
    title='The Number of Legendary and Mythical Pokemon\nIntroduced Each Generation',
)
plt.savefig('./assets/plots/generation-plots/legendaries.png')
plt.close('all')

In [42]:
# Number of Pokemon flagged `forms_switchable` in each generation 1-8.
# One grouped sum replaces rebuilding the groupby on every loop iteration;
# a generation with no rows yields 0 rather than a KeyError.
forms = (
    pokemon.forms_switchable.eq(True)
    .groupby(pokemon.gen_introduced.to_numpy()).sum()
    .reindex(range(1, 9), fill_value=0)
    .astype(int)
    .tolist()
)

In [43]:
# Line chart: form-switchable Pokemon count for generations 1 through 8.
fig, ax = plt.subplots(facecolor='xkcd:light blue grey')
ax.plot(range(1,9), forms, color='xkcd:light eggplant', marker='o', markersize=6)
ax.set(
    facecolor='xkcd:light gold',
    xlabel='Generation Introduced',
    ylabel='Number of Pokemon with Alternate Forms',
    title='The Number of Pokemon with Alternate Forms\nIntroduced Each Generation',
)
plt.savefig('./assets/plots/generation-plots/forms.png')
plt.close('all')

In [44]:
# Mean and median of each base stat for generations 1-8.
# Stat names are listed explicitly (same names and order as the 'Stat'
# column previously borrowed from typenorm['Bug']), and each generation's
# rows are fetched once and converted to numeric before aggregating.
gen_stat_names = ['hp', 'attack', 'defense', 'special_attack', 'special_defense', 'speed']
stats_per_gen = {
    gen: gengroup.get_group(gen)[gen_stat_names].apply(pd.to_numeric)
    for gen in range(1, 9)
}
meanstatbygen = {s: [stats_per_gen[gen][s].mean() for gen in range(1, 9)] for s in gen_stat_names}
medstatbygen = {s: [stats_per_gen[gen][s].median() for gen in range(1, 9)] for s in gen_stat_names}

In [45]:
# Base-stat column names arranged as the 2 x 3 grid used by the stat figures.
statarray = np.array([
    ['hp', 'attack', 'defense'],
    ['special_attack', 'special_defense', 'speed'],
])
# Line colour for each stat (same grid positions as `statarray`).
colorarray = np.array([
    ['xkcd:fire engine red', 'xkcd:brick orange', 'xkcd:sunflower yellow'],
    ['xkcd:clear blue', 'xkcd:lawn green', 'xkcd:magenta'],
])
# Pastel companion colour for each stat panel.
compcolors = np.array([
    ['xkcd:pale light green', 'xkcd:pastel blue', 'xkcd:pastel purple'],
    ['xkcd:pastel orange', 'xkcd:pale pink', 'xkcd:pastel yellow'],
])

In [46]:
# One panel per stat: solid line = mean, dashed line = median, generations 1-8.
fig, axs = plt.subplots(2, 3, figsize=(18,15), facecolor='xkcd:light blue grey')
generations = range(1,9)
for i, j in np.ndindex(2, 3):
    panel = axs[i,j]
    stat_key = statarray[i,j]
    pretty = stat_key.capitalize()
    panel.plot(generations, meanstatbygen[stat_key], color=colorarray[i,j], marker='o', markersize=6, label='Mean', linewidth=3)
    panel.plot(generations, medstatbygen[stat_key], color=colorarray[i,j], linestyle='dashed', marker='o', markersize=6, label='Median', linewidth=2)
    panel.set_facecolor(compcolors[i,j])
    panel.set_xlabel('Generation', fontsize=16)
    panel.set_ylabel(f'{pretty}', fontsize=16)
    panel.set_title(f'Mean vs. Median Pokemon {pretty}\nby Generation', fontsize=18)
    panel.legend(fontsize=16)
plt.tight_layout()
plt.savefig('./assets/plots/generation-plots/statsbygen.png')
plt.close('all')

In [47]:
# Column labels of the stats table.
pokestats.columns

Index(['pokedex_number', 'primary_type', 'second_type', 'weight', 'height',
       'hp', 'attack', 'defense', 'special_attack', 'special_defense',
       'speed'],
      dtype='object')

In [48]:
# Flat 1-D copy of the 2 x 3 stat-name grid.
stat = statarray.flatten()

def norm(s):
    """Min-max scale the values of ``s`` and return them as a list.

    Each value x becomes (x - min) / (max - min), using ``s.min()`` and
    ``s.max()``.

    Parameters
    ----------
    s : object exposing ``.min()`` / ``.max()`` (e.g. a pandas Series)
        Values convertible to float.

    Returns
    -------
    list of float
        Scaled values in input order. If every value is identical
        (max == min) the result is all zeros instead of NaN from 0/0.
    """
    ma = s.max()
    mi = s.min()
    values = np.asarray(s, dtype=float)
    span = ma - mi
    if span == 0:
        # Constant input: there is no spread to scale by.
        return [0.0] * values.size
    # Vectorised form of the former per-element Python loop.
    return ((values - mi) / span).tolist()

def normed():
    """Return a copy of ``pokemon`` without baby or legendary rows, with the
    columns named in ``stat`` min-max scaled by ``norm``.

    Reads the module-level ``pokemon`` frame and ``stat`` array. All other
    columns are carried over unchanged, in the original column order.
    """
    # Build the row filter once instead of re-filtering every column with
    # two chained boolean .loc lookups.
    keep = (pokemon.baby_pokemon == False) & (pokemon.legendary == False)
    newdf = pokemon.loc[keep].copy()
    for x in newdf.columns:
        if x in stat:
            newdf[x] = norm(newdf[x])
    return newdf
        
# Build the min-max scaled frame and summarise its stat columns.
pokenorm = normed()

pokenorm.loc[:, stat].describe()

Unnamed: 0,hp,attack,defense,special_attack,special_defense,speed
count,938.0,938.0,938.0,938.0,938.0,938.0
mean,0.259112,0.403773,0.294319,0.343654,0.231592,0.350436
std,0.096425,0.161018,0.126308,0.167975,0.121525,0.161428
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.192913,0.284091,0.2,0.217647,0.142857,0.228571
50%,0.251969,0.386364,0.273333,0.311765,0.214286,0.342857
75%,0.30315,0.511364,0.363333,0.470588,0.309524,0.462857
max,1.0,1.0,1.0,1.0,1.0,1.0


In [49]:
# D'Agostino-Pearson normality test (scipy.stats.normaltest) per scaled stat.
for column in stat:
    result = stats.normaltest(pokenorm[column])
    print(column, result)

hp NormaltestResult(statistic=423.3784799106801, pvalue=1.1601955866799265e-92)
attack NormaltestResult(statistic=25.63053927017659, pvalue=2.7189368036309605e-06)
defense NormaltestResult(statistic=184.13888925338196, pvalue=1.0345422859145147e-40)
special_attack NormaltestResult(statistic=56.677829636701084, pvalue=4.926808377965786e-13)
special_defense NormaltestResult(statistic=125.36952812964995, pvalue=5.9752057896584195e-28)
speed NormaltestResult(statistic=28.347191581221068, pvalue=6.990134936975859e-07)


# These stats are VERY MUCH NOT normally distributed!

In [50]:
def norm2(s):
    """Standardise the values of ``s`` (z-score) and return them as a list.

    Each value x becomes (x - mean) / std, using ``s.mean()`` and ``s.std()``
    exactly as the input object defines them.

    Parameters
    ----------
    s : object exposing ``.mean()`` / ``.std()`` (e.g. a pandas Series)
        Values convertible to float.

    Returns
    -------
    list of float
        Standardised values in input order. If the standard deviation is
        zero the result is all zeros instead of NaN/inf from a division by zero.
    """
    mu = s.mean()
    sig = s.std()
    values = np.asarray(s, dtype=float)
    if sig == 0:
        # Constant input: every value sits exactly on the mean.
        return [0.0] * values.size
    # Vectorised form of the former per-element Python loop.
    return ((values - mu) / sig).tolist()

def normed2():
    """Return a frame with every ``pokemon`` column, where the columns named
    in ``stat`` are replaced by their ``norm2`` output and the rest are
    passed through as-is (no rows are filtered out)."""
    standardized = {
        column: norm2(pokemon[column]) if column in stat else pokemon[column]
        for column in pokemon.keys()
    }
    return pd.DataFrame(standardized)

# Build the standardised frame and summarise its stat columns.
pokestandard = normed2()
pokestandard.loc[:, stat].describe()


Unnamed: 0,hp,attack,defense,special_attack,special_defense,speed
count,1036.0,1036.0,1036.0,1036.0,1036.0,1036.0
mean,-7.330044000000001e-17,-1.53513e-16,1.486638e-16,2.95345e-16,2.7862740000000004e-17,-4.7152330000000005e-17
std,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.539557,-2.409829,-2.265649,-1.980765,-1.845813,-2.148793
25%,-0.7145818,-0.7615278,-0.7669818,-0.6832168,-0.7633597,-0.7902042
50%,-0.1559159,-0.1022074,-0.1009077,-0.1966362,-0.07780571,-0.1109099
75%,0.4027499,0.7219431,0.5651665,0.6792089,0.6799119,0.738208
max,6.920518,3.392191,5.893759,3.533815,6.452998,4.474327


In [51]:
# Same normality test, this time on the standardised columns.
for column in stat:
    result = stats.normaltest(pokestandard[column])
    print(result)

NormaltestResult(statistic=459.6621397971477, pvalue=1.5333262585554654e-100)
NormaltestResult(statistic=28.37614020681585, pvalue=6.889686253939714e-07)
NormaltestResult(statistic=209.99774426712742, pvalue=2.5093961441064216e-46)
NormaltestResult(statistic=66.15548133313932, pvalue=4.310421643034454e-15)
NormaltestResult(statistic=200.22281124269642, pvalue=3.3278900028676755e-44)
NormaltestResult(statistic=27.67304467501525, pvalue=9.792074298851535e-07)


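# Not even normalization can fix this! (That is expected: min-max scaling and standardization are linear rescalings, so they change the location and spread of a distribution but not its shape.)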

## Let's plot!

In [52]:
# Density histogram plus Gaussian KDE curve for each min-max scaled stat.
# `plt.rcParams` is the same settings mapping that `pylab` exposes, so no
# extra import is needed in the middle of the notebook.
rcParams = plt.rcParams
fig, axs = plt.subplots(2,3,figsize=(18,15),facecolor="xkcd:light blue grey")
# One KDE per grid cell, built from the stat shown in that cell. The earlier
# version filled both rows from stat[0..3], so the bottom-row curves
# belonged to hp / attack / defense rather than the special stats and speed.
kd = np.empty(statarray.shape, dtype=object)
for i in range(2):
    for j in range(3):
        kd[i,j] = stats.gaussian_kde(pokenorm[statarray[i,j]].values)
yl = "Estimated Density"
# Evaluation grid over [0, 1], the range of the scaled stats.
x = np.linspace(0,1,10**4)
for i in range(2):
    for j in range(3):
        # Stray ")" removed from the histogram label.
        axs[i,j].hist(pokenorm[statarray[i,j]],bins = 'auto',density=True, label = f"{statarray[i,j].capitalize()} Density Histogram",color=compcolors[i,j])
        axs[i,j].plot(x,kd[i,j](x),label=f"Kernel Density Estimation\nfor {statarray[i,j]}",color=colorarray[i,j])
        for pos in ['top','bottom','left','right']:
            axs[i,j].spines[pos].set_color('xkcd:grass green')
        axs[i,j].set_ylabel(yl,fontsize=15)
        axs[i,j].set_xlabel(f"{statarray[i,j].capitalize()}",fontsize=25)
        axs[i,j].legend(loc='upper left')
        axs[i,j].set_facecolor(color='xkcd:charcoal grey')

# The overall heading is attached to the bottom-centre panel.
axs[1,1].set_title("\nThe Density Histograms For Each Pokemon Stat After Normalization",{'fontsize': rcParams['axes.titlesize'],
 'fontweight' : rcParams['axes.titleweight'],
 'verticalalignment': 'bottom',
 'horizontalalignment': 'center'},fontsize = 35,pad=20,color='xkcd:vivid purple')
plt.tight_layout()
plt.savefig('./assets/plots/stats-general/statdensity.png')
plt.close('all')


In [53]:
# The six base-stat column names and a short-code -> name lookup.
names = ['hp', 'attack', 'defense', 'special_attack', 'special_defense', 'speed']
name_dict = dict(
    h='hp',
    a='attack',
    d='defense',
    spa='special_attack',
    spd='special_defense',
    s='speed',
)
# Every ordered pairing of stats: distinct stats become an (x, y) tuple, while
# a stat paired with itself is stored as the bare name string (not a 1-tuple).
mix = [(x, y) if x != y else x for x in names for y in names]

In [54]:
# 6 x 6 nested list of (row stat, column stat) pairs, including the diagonal.
mixnames = [[(x, y) for y in names] for x in names]

In [55]:
# Same pairs as an array, so a pair can be addressed as namearray[i, j].
namearray = np.asarray(mixnames)
namearray

array([[['hp', 'hp'],
        ['hp', 'attack'],
        ['hp', 'defense'],
        ['hp', 'special_attack'],
        ['hp', 'special_defense'],
        ['hp', 'speed']],

       [['attack', 'hp'],
        ['attack', 'attack'],
        ['attack', 'defense'],
        ['attack', 'special_attack'],
        ['attack', 'special_defense'],
        ['attack', 'speed']],

       [['defense', 'hp'],
        ['defense', 'attack'],
        ['defense', 'defense'],
        ['defense', 'special_attack'],
        ['defense', 'special_defense'],
        ['defense', 'speed']],

       [['special_attack', 'hp'],
        ['special_attack', 'attack'],
        ['special_attack', 'defense'],
        ['special_attack', 'special_attack'],
        ['special_attack', 'special_defense'],
        ['special_attack', 'speed']],

       [['special_defense', 'hp'],
        ['special_defense', 'attack'],
        ['special_defense', 'defense'],
        ['special_defense', 'special_attack'],
        ['special_defense', '

In [56]:
# 6 x 6 scatter matrix of the min-max scaled stats.
#   off-diagonal: scatter coloured by 2-D kernel density, with a
#                 least-squares line and its r2 score in the legend
#   diagonal:     density histogram of the stat with its 1-D KDE curve
fig,axs = plt.subplots(6,6,facecolor='xkcd:light blue grey',figsize=(30,30))

for i in range(6):
    for j in range(6):
        if i != j:
            x,y = pokenorm[namearray[i,j][0]].values, pokenorm[namearray[i,j][1]].values
            xy = np.vstack([x,y])
            # Point density evaluated at every sample.
            z = stats.gaussian_kde(xy)(xy)
            # Sort by density so the densest points are drawn last (on top).
            idx = z.argsort()
            x, y, z = x[idx], y[idx], z[idx]    
            axs[i,j].scatter(x,y,c=z,cmap='plasma')
            m, b, r, p, se = stats.linregress(x,y)
            axs[i,j].plot(x,m*x+b,color='xkcd:vivid green',linewidth=4, label=f'r2 score: {round(r2_score(y,m*x+b),2)}')
            axs[i,j].legend(loc="upper right",fontsize=16)
            axs[i,j].set_ylabel(namearray[i,j][1],color="xkcd:vivid purple")
            axs[i,j].set_xlabel(namearray[i,j][0],color="xkcd:vivid purple")
            axs[i,j].set_title(f"{namearray[i,j][0]} vs. {namearray[i,j][1]}",color="xkcd:vivid purple",fontsize=16)
        else:
            x = pokenorm[namearray[i,j][0]].values
            t = np.linspace(0,1,5000)
            kd = stats.gaussian_kde(x)(t)
            axs[i,j].plot(t,kd,color='xkcd:watermelon',linewidth=4)
            axs[i,j].hist(x, bins='auto', density=True, color='xkcd:vivid purple')
            axs[i,j].set_ylabel("")
            axs[i,j].set_xlabel("")
            axs[i,j].set_title(f"{namearray[i,j][0]} Density",fontsize=16,color="xkcd:vivid purple")
        axs[i,j].grid(False)
# A bare ScalarMappable is not attached to any Axes, so the colorbar must be
# told where to take its space from; recent matplotlib refuses to guess.
# The last panel is the one the implicit "current axes" used to resolve to.
# NOTE(review): this mappable uses the default 0-1 normalisation, not the
# density values of any particular panel.
fig.colorbar(plt.cm.ScalarMappable(cmap='plasma'), ax=axs[-1,-1])
plt.tight_layout()
plt.savefig("./assets/plots/stats-general/statscatters.png")

plt.close('all')
