# EDA on Pokémon Dataset 1

This dataset is from: https://www.kaggle.com/datasets/rounakbanik/pokemon/data 

In [399]:
#Packages
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import ast

In [400]:
# Load the dataset CSV into a DataFrame
pokemon_data = pd.read_csv("pokemon_1.csv")

# Put 'name' in the first column; every other column keeps its original relative order
pokemon_data = pokemon_data[['name', *pokemon_data.columns.drop('name')]]
pokemon_data.head()

Unnamed: 0,name,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,Bulbasaur,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,Ivysaur,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,Venusaur,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,Charmander,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,Charmeleon,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


Below we look at the number of observations in our dataset and list the Pokémon that are flagged as legendary:

In [401]:
# Number of rows (observations) in the dataset
pokemon_data.shape[0]

801

In [402]:
# Boolean mask that is True for rows whose 'is_legendary' flag equals 1
legendary_filt = pokemon_data['is_legendary'].eq(1)

# Show only the names of the rows selected by the mask
pokemon_data.loc[legendary_filt, ['name']]

Unnamed: 0,name
143,Articuno
144,Zapdos
145,Moltres
149,Mewtwo
150,Mew
...,...
796,Celesteela
797,Kartana
798,Guzzlord
799,Necrozma


## Preprocessing Data

In [403]:
# Shuffle the rows reproducibly (fixed random_state), renumber the index from 0,
# and discard identifier-like columns that do not contribute to prediction
pokemon_data = (
    pokemon_data
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
    .drop(columns=['name', 'japanese_name', 'pokedex_number', 'classfication', 'generation'])
)

# A missing secondary type becomes the explicit category 'None' so that it is
# encoded like any other level
pokemon_data = pokemon_data.assign(type2=pokemon_data['type2'].fillna('None'))

# One-hot encode 'type1' and 'type2'; drop_first=True omits the first level of each
pokemon_data = pd.get_dummies(pokemon_data, columns=['type1', 'type2'], drop_first=True)

In [404]:
# Multi-Label Binarization for 'abilities' (splitting abilities into separate binary columns)

def convert_to_list(value):
    """Turn a stringified Python literal into the object it describes.

    Strings are parsed with ``ast.literal_eval`` (for example
    ``"['Overgrow', 'Chlorophyll']"`` becomes ``['Overgrow', 'Chlorophyll']``).
    Any value that is not a string is returned unchanged.

    Raises whatever ``ast.literal_eval`` raises when a string is not a
    valid Python literal.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value

# Parse each stringified ability list (e.g. "['Overgrow', 'Chlorophyll']") into a real list
pokemon_data['abilities'] = pokemon_data['abilities'].apply(convert_to_list)

mlb = MultiLabelBinarizer()

# Transform the abilities column into a binary indicator matrix, one column per ability.
# The frame is built on pokemon_data's own index: pd.concat(axis=1) aligns rows by index
# label, so a default RangeIndex here would silently misalign (and NaN-pad) rows whenever
# pokemon_data does not carry a 0..n-1 index.
abilities_encoded = pd.DataFrame(
    mlb.fit_transform(pokemon_data['abilities']),
    columns=mlb.classes_,
    index=pokemon_data.index,
)

# Swap the original abilities column for the indicator columns (appended at the end)
pokemon_data = pd.concat(
    [pokemon_data.drop(columns=['abilities']), abilities_encoded],
    axis=1,
)


In [405]:
# Columns that must be numeric before missing values can be imputed
numeric_columns = ['percentage_male', 'height_m', 'weight_kg', 'capture_rate',
                   'base_egg_steps', 'experience_growth', 'base_happiness']

# Coerce every listed column to a numeric dtype; entries that cannot be parsed become NaN
for col in numeric_columns:
    pokemon_data[col] = pd.to_numeric(pokemon_data[col], errors='coerce')

# Replacement value for missing entries in each column, computed from the coerced data
fill_values = {
    'percentage_male': -1,  # -1 for genderless Pokémon
    'height_m': pokemon_data['height_m'].median(),
    'weight_kg': pokemon_data['weight_kg'].median(),
    'capture_rate': pokemon_data['capture_rate'].mean(),
    'base_egg_steps': pokemon_data['base_egg_steps'].median(),
    'experience_growth': pokemon_data['experience_growth'].mean(),
    'base_happiness': pokemon_data['base_happiness'].median(),
}

# Fill each column with its own replacement value; column positions are unchanged
pokemon_data = pokemon_data.assign(**{
    column: pokemon_data[column].fillna(fill_value)
    for column, fill_value in fill_values.items()
})


In [406]:
# Write the preprocessed frame to CSV (row index omitted) for use by the models
pokemon_data.to_csv(path_or_buf="preprocessed_pokemon_1.csv", index=False)