In [1]:
import gc
import ast
import math
import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
from sklearn.feature_extraction import FeatureHasher

In [94]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
# Set packages options
# Disabled: would make NumPy print arrays in full instead of summarising them.
# NOTE(review): newer NumPy releases reportedly reject threshold=np.nan
# (sys.maxsize is the usual replacement) - confirm before re-enabling.
#np.set_printoptions(threshold=np.nan)
# Default size of every matplotlib figure, as (width, height) in inches.
plt.rcParams["figure.figsize"] = (12, 12)

In [3]:
# Constants
# Bar colour (hex RGB string) used for each Pokemon type name when plotting.
TypeColorMappings = {
    "Water": "#6890F0",
    "Fire": "#F08030",
    "Grass": "#78C850",
    "Dark": "#705848",
    "Electric": "#F8D030",
    "Flying": "#A890F0",
    "Normal": "#A8A878",
    "Fighting": "#C03028",
    "Poison": "#A040A0",
    "Ground": "#E0C068",
    "Psychic": "#F85888",
    "Rock": "#B8A038",
    "Ice": "#98D8D8",
    "Bug": "#A8B820",
    "Dragon": "#7038F8",
    "Ghost": "#705898",
    "Steel": "#B8B8D0",
    "Fairy": "#EE99AC",
}

def getColorList(typeCounts):
    """Return the plot colour for every Pokemon type in a counts Series.

    Parameters
    ----------
    typeCounts : pd.Series
        Series whose index labels are keys of TypeColorMappings (for example
        the result of value_counts() on a type column).

    Returns
    -------
    list of str
        Hex colour strings, in the same order as typeCounts.index.

    Raises
    ------
    AssertionError
        If typeCounts is not a pandas Series.
    KeyError
        If an index label has no entry in TypeColorMappings.
    """
    # isinstance (instead of an exact type comparison) also accepts Series subclasses.
    assert isinstance(typeCounts, pd.Series), "Argument must be a Series object"
    return [TypeColorMappings[pokemonType] for pokemonType in typeCounts.index]

def getDistinctValues(dataFrame, columnName, sep):
    """Collect the distinct tokens found in a delimiter-separated column.

    Each distinct cell value of dataFrame[columnName] is split on `sep`, and
    the union of all resulting pieces is returned as a set.
    """
    # value_counts().index yields each distinct cell value exactly once.
    uniqueCells = dataFrame[columnName].value_counts().index
    return {token for cell in uniqueCells for token in cell.split(sep)}

In [22]:
# Load the Pokemon data
# Plain relative path: a Windows-style ".\" prefix is not a path separator on
# POSIX systems, so the file would not be found there.
fileName = 'Pokemon_Cleaned.tsv'
data = pd.read_csv(fileName, header=0, sep='\t')

# Transform list of abilities and egg groups to "string" numeric values
eggGroups = getDistinctValues(data, "EggGroups", "---")
abilities = getDistinctValues(data, "Abilities", "---")

# Number the distinct values in sorted order. Enumerating the sets directly
# would hand out codes in set-iteration order, which for strings can differ
# between kernel sessions and would make the encoded columns irreproducible.
eggGroupsMappings = {eg: i for i, eg in enumerate(sorted(eggGroups))}
abilitiesMappings = {abil: i for i, abil in enumerate(sorted(abilities))}


def _encodeMultiValue(cell, mappings, sep="---"):
    """Encode one `sep`-separated cell as concatenated, zero-padded codes.

    Every token is looked up in `mappings`, the codes are sorted ascending,
    each code is left-padded with zeros to 3 digits, and the pieces are joined
    without a separator (e.g. codes [12, 3] -> "003012").
    Raises KeyError if a token is missing from `mappings`.
    """
    codes = sorted(mappings[token] for token in cell.split(sep))
    return ''.join(str(code).zfill(3) for code in codes)


# Column-wise map instead of row-by-row .loc writes: same resulting strings,
# but independent of the frame's index labels and without duplicated loops.
data["Abilities"] = data["Abilities"].map(lambda cell: _encodeMultiValue(cell, abilitiesMappings))
data["EggGroups"] = data["EggGroups"].map(lambda cell: _encodeMultiValue(cell, eggGroupsMappings))

In [33]:
# Series form of the ability mapping: index = ability names, values = integer codes.
d = pd.Series(data=abilitiesMappings)

In [48]:
# Copy of the data without the columns listed below; `data` itself is left unchanged.
reducedData = data.drop(
    columns=[
        "Pokedex#",
        "Generation",
        "Name",
        "Category",
        "Height (m)",
        "Weight (kg)",
    ]
)

In [88]:
# Hash the "Category" text into a fixed-width (4 column) numeric representation.
fh = FeatureHasher(n_features=4, input_type="string")
# With input_type="string" every sample must be an iterable of string tokens.
# A bare string is itself an iterable of characters, so passing the column
# directly hashes individual characters (and newer scikit-learn releases
# reject it outright). Wrap each category as a one-token sample instead.
categorySamples = [[category] for category in data["Category"]]
hashedFeature = fh.fit_transform(categorySamples)
# Reuse data's index so concat lines the rows up even if data is not labelled 0..n-1.
hashedColumns = pd.DataFrame(hashedFeature.toarray(), index=data.index)
x = pd.concat([data[["Name", "Category"]], hashedColumns], axis=1)

In [None]:
# Plot Type 1 and Type 2 occurrences to see the distributions
typeFig, typeAxes = plt.subplots(nrows=3, ncols=1)
typeFig.subplots_adjust(top=1.9)

type1Counts = data["Type 1"].value_counts()
# Remove the "None" label (presumably the placeholder for Pokemon without a
# second type) so it is not plotted as a type. errors="ignore" keeps the cell
# from raising KeyError when that label is absent, e.g. when the CSV reader
# has already parsed "None" as a missing value, which value_counts() skips.
type2Counts = data["Type 2"].value_counts().drop("None", errors="ignore")
# Per-type total over both columns; a type missing from one column counts as 0 there.
typesCounts = type1Counts.add(type2Counts, fill_value=0)

type1Counts.plot(title="Type 1 Occurrences", kind="bar", ax=typeAxes[0], color=getColorList(type1Counts))
type2Counts.plot(title="Type 2 Occurrences", kind="bar", ax=typeAxes[1], color=getColorList(type2Counts))
typesCounts.plot(title="Type 1 + 2 Occurrences", kind="bar", ax=typeAxes[2], color=getColorList(typesCounts))

In [None]:
# Plot Group occurrences to see the distribution
groupFig, groupAxes = plt.subplots(nrows=1, ncols=1)
groupCounts = data["Group"].value_counts()
# Draw on the axes created above explicitly rather than relying on pyplot's
# implicit "current axes", matching how the type plots pass ax=.
groupAxe = groupCounts.plot(title="Group", kind="bar", ax=groupAxes)
groupAxe.set(xlabel="Group Types", ylabel="Count")

In [None]:
# TODO: Decision Tree (placeholder - this cell contains no code yet)

In [None]:
# Get data profile
# Build a pandas_profiling report for the whole `data` frame and write it to
# Profile.html in the current working directory.
profile = pdp.ProfileReport(data)
profile.to_file("Profile.html")
# Drop this cell's reference to the report, then run a garbage-collection pass
# so the memory it held can be reclaimed. As the cell's last expression,
# gc.collect()'s return value is what the cell displays.
profile = None
gc.collect()