# Using Decision Tree & Naive Bayes to classify Legendary Pokemon

## Import libraries

In [1]:
import gc
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import pandas_profiling as pdp
# from sklearn.feature_extraction import FeatureHasher
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
import utils as ut
import decisionTree as dt
import naiveBayes as nb

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%reload_ext autoreload

In [None]:
# Notebook-wide display configuration.
# np.set_printoptions(threshold=np.nan)
# Render up to 600 columns when a DataFrame is displayed.
pd.options.display.max_columns = 600
# Default size (width, height) used for every matplotlib figure.
plt.rcParams["figure.figsize"] = (11, 6)

## Define constants and functions

In [3]:
# Constants

# Hex colour used to draw each Pokemon type in the plots.
TypeColorMappings = {
    "Water": "#6890F0",
    "Fire": "#F08030",
    "Grass": "#78C850",
    "Dark": "#705848",
    "Electric": "#F8D030",
    "Flying": "#A890F0",
    "Normal": "#A8A878",
    "Fighting": "#C03028",
    "Poison": "#A040A0",
    "Ground": "#E0C068",
    "Psychic": "#F85888",
    "Rock": "#B8A038",
    "Ice": "#98D8D8",
    "Bug": "#A8B820",
    "Dragon": "#7038F8",
    "Ghost": "#705898",
    "Steel": "#B8B8D0",
    "Fairy": "#EE99AC",
}

# Wider alternative that also removes every categorical feature (kept for reference):
# DropColumns = ["Pokedex#", "Name", "Type 1", "Type 2", "Generation", "Ability 1", "Ability 2", "Ability 3", 
#                "EggGroup 1", "EggGroup 2", "Category", "Height (m)", "Weight (kg)"]

# Columns removed from the data before it is handed to the classifiers.
DropColumns = [
    "Pokedex#",
    "Name",
    "Generation",
    "Category",
]

def getColorList(typeCounts, colorMappings=None):
    """Return the plot color for every Pokemon type in the index of ``typeCounts``.

    Parameters
    ----------
    typeCounts : pd.Series
        Series whose index holds type names (for example a ``value_counts()`` result).
    colorMappings : dict, optional
        Mapping of type name to color string. When omitted, the module-level
        ``TypeColorMappings`` is used.

    Returns
    -------
    list
        One color per index entry, in index order.

    Raises
    ------
    AssertionError
        If ``typeCounts`` is not a pandas Series.
    KeyError
        If an index entry has no color in the mapping.
    """
    # isinstance (instead of an exact type comparison) also accepts Series subclasses.
    assert isinstance(typeCounts, pd.Series), "Argument must be a Series object"
    if colorMappings is None:
        colorMappings = TypeColorMappings
    return [colorMappings[pokemonType] for pokemonType in typeCounts.index]

def getDistinctValues(dataFrame, columnName, sep):
    """Collect the distinct tokens found in a delimited string column.

    Every distinct entry of ``dataFrame[columnName]`` is split on ``sep`` and
    the resulting pieces are gathered into a single set.
    """
    uniqueEntries = dataFrame[columnName].value_counts().index
    return {piece for entry in uniqueEntries for piece in entry.split(sep)}

## Load the data from file

In [4]:
# Load the Pokemon data
# Plain relative file name (no Windows-only ".\" prefix) so the notebook finds the
# file next to it on Linux and macOS as well as on Windows.
fileName = "Pokemon_Cleaned.tsv"
# Read the textual columns as str instead of letting pandas infer their dtype.
columnTypes = {"Name": str, "Category": str, "Type 1": str, "Type 2": str, 
               "Ability 1": str, "Ability 2": str, "Ability 3": str, "Group": str}
# Tab-separated file whose first row holds the column names.
data = pd.read_csv(fileName, header=0, sep='\t', dtype=columnTypes)
data.head()

Unnamed: 0,Generation,Pokedex#,Name,Category,Type 1,Type 2,Ability 1,Ability 2,Ability 3,MaleRatio,...,EggGroup 1,EggGroup 2,HP,Attack,Defense,Sp.Attack,Sp.Defense,Speed,Total,Group
0,1.0,1.0,Bulbasaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,...,Monster,Grass,45.0,49.0,49.0,65.0,65.0,45.0,318,Ordinary
1,1.0,2.0,Ivysaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,...,Monster,Grass,60.0,62.0,63.0,80.0,80.0,60.0,405,Ordinary
2,6.0,3.0,Mega Venusaur,Seed Pokemon,Grass,Poison,Thick Fat,,,87.5,...,Monster,Grass,80.0,100.0,123.0,122.0,120.0,80.0,625,Ordinary
3,1.0,3.0,Venusaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,...,Monster,Grass,80.0,82.0,83.0,100.0,100.0,80.0,525,Ordinary
4,1.0,4.0,Charmander,Lizard Pokemon,Fire,,Blaze,Solar Power,,87.5,...,Monster,Dragon,39.0,52.0,43.0,60.0,50.0,65.0,309,Ordinary


## Plot graphs to visualize and understand the data

In [None]:
# Bar chart of the Group occurrences to see how the classes are distributed
groupFig, groupAxes = plt.subplots()
groupCounts = data["Group"].value_counts()
# Draw on the axes created above instead of relying on the implicit current axes.
groupAxe = groupCounts.plot(kind="bar", title="Group", ax=groupAxes)
groupAxe.set(xlabel="Group Types", ylabel="Count")

In [None]:
# Box plot of the Total stat, one box per Group
data.boxplot(column="Total", by="Group")

<p>According to the boxplot above, Legendary Pokemon have the highest Total stats of the three groups, despite being few in number.</p>

## PCA

In [None]:
# Remove the columns listed in DropColumns before building the model inputs.
reducedData = data.drop(columns=DropColumns)
# NOTE(review): the PCA cell that follows reads `x` and `dataWithDummies`, which are
# only defined by the commented-out lines below.
# dummies = pd.get_dummies(data[["Type 1", "Type 2", "Ability 1", "Ability 2", "Ability 3", "EggGroup 1", "EggGroup 2"]])
# dataWithDummies = pd.concat([dummies, reducedData], axis=1, join_axes=[reducedData.index])

# x = dataWithDummies.loc[:, dataWithDummies.columns != "Group"]
# y = dataWithDummies.loc[:, "Group"]

# Scale the x data
#x = StandardScaler().fit_transform(x)

In [None]:
# Project the features onto principal components and attach the Group label.
# NOTE(review): `PCA`, `x` and `dataWithDummies` are not defined by any live code in this
# notebook (the sklearn import and the lines building `x` / `dataWithDummies` are commented
# out), so this cell raises NameError on a fresh kernel until they are restored.
# NOTE(review): PCA(0.95) presumably keeps enough components to explain 95% of the
# variance - confirm against the sklearn version in use.
pca = PCA(0.95)
principalComponents = pca.fit_transform(x)
# Unpacks the 2-D shape as (rows, columns): `height` is the number of component columns.
width, height = principalComponents.shape
# One column per component, named "pca 1" .. "pca <height>".
principalDf = pd.DataFrame(data=principalComponents, columns=["pca %d" % i for i in range(1, height + 1)])
# Put the Group column next to the components (column-wise concat, aligned on the index).
pcaData = pd.concat([principalDf, dataWithDummies[["Group"]]], axis=1)

# Random half of the rows; no random_state is passed, so the sample changes between runs.
xTrain = pcaData.sample(frac=0.5)

In [None]:
# # Plot Type 1 and Type 2 occurences to see the distributions
# typeFig, typeAxes = plt.subplots(nrows=3, ncols=1)
# typeFig.subplots_adjust(top=3)

# type1Counts = data["Type 1"].value_counts()
# type2Counts = data["Type 2"].value_counts().drop("None")
# typesCounts = type1Counts.add(type2Counts, fill_value=0)

# type1Counts.plot(title="Type 1 Occurrences", kind="bar", ax=typeAxes[0], color=getColorList(type1Counts))
# type2Counts.plot(title="Type 2 Occurrences", kind="bar", ax=typeAxes[1], color=getColorList(type2Counts))
# typesCounts.plot(title="Type 1 + 2 Occurrences", kind="bar", ax=typeAxes[2], color=getColorList(typesCounts))

# fh = FeatureHasher(n_features=2, input_type="string")
# hashedFeature1 = fh.fit_transform(data["Type 1"])
# hashedFeature2 = fh.fit_transform(data["Type 2"])

# x = pd.concat([data[["Name", "Type 1", "Type 2", "Category"]], pd.DataFrame(hashedFeature1.toarray())], axis=1)
# x = pd.concat([x, pd.DataFrame(hashedFeature2.toarray())], axis=1)

## Decision Tree

In [None]:
# Train the decision tree on one part of the data and classify the remainder.
reducedData = data.drop(DropColumns, axis=1)
# NOTE(review): the Naive Bayes section calls ut.splitData("Group", reducedData, 0.60)
# with the target feature first - confirm which signature utils.splitData currently has.
training, test = ut.splitData(reducedData, 0.60)
# The decisionTree module is imported as `dt`; `dtree` is the classifier instance.
# (Building it from `dtree.DecisionTree` fails on a fresh kernel because `dtree`
# does not exist before this assignment.)
dtree = dt.DecisionTree("Group")
dtree.train(training)
pred = dtree.classify(test)

In [None]:
# Score the decision-tree predictions: confusion matrix -> precision/recall -> F-scores.
# The distinct target labels are computed once and shared by both helper calls.
groupLabels = reducedData[dtree.targetFeature].unique()
matrix = ut.buildConfusionMatrix(pred, test[dtree.targetFeature], groupLabels)
precisions, recalls = ut.getPrecisionsAndRecalls(matrix, groupLabels)
fScores = ut.computeFScores(precisions, recalls)

In [None]:
fScores

In [None]:
# Scratch check: how to detect that a DataFrame column holds booleans.
gc.collect()
m = pd.DataFrame({"bool": [True, False, False], "number": [1,2,3]})
# `.type` is the scalar *class* behind the dtype, not an instance of it, so
# isinstance(<class>, np.bool_) is always False; issubclass() is the correct test.
boolColumnType = m.dtypes["bool"].type
isBoolColumn = issubclass(boolColumnType, np.bool_)
isBoolColumn

### Naive Bayes

In [58]:
# Remove the DropColumns columns, then split into training and test sets for "Group".
# NOTE(review): 0.60 is presumably the training fraction - confirm against utils.splitData.
reducedData = data.drop(columns=DropColumns)
training, test = ut.splitData("Group", reducedData, 0.60)
# nBayes.train(training)

In [62]:
# Build the Naive Bayes classifier for the "Group" target and train it,
# passing every distinct label present in the full (unsplit) data.
nBayes = nb.NaiveBayes("Group")
groupLabels = reducedData["Group"].unique()
nBayes.train(training, groupLabels)
# nBayes._getFeatureType(data, "Name")

****Skipping MaleRatio feature for label Ultra Beast. Standard deviation = 0
****Skipping FemaleRatio feature for label Ultra Beast. Standard deviation = 0


In [63]:
nBayes._categoricalProbTable["Ability 1=Slow Start"]

Ordinary       0.001529
Legendary      0.010000
Ultra Beast    0.006536
Name: Ability 1=Slow Start, dtype: float64

In [38]:
len(nBayes._getCategoricalFeatureMappings(training)["Ability 1"])

153

In [64]:
nBayes._continuousProbTable

Unnamed: 0,MaleRatio=mean,MaleRatio=std,FemaleRatio=mean,FemaleRatio=std,Height (m)=mean,Height (m)=std,Weight (kg)=mean,Weight (kg)=std,HP=mean,HP=std,...,Defense=mean,Defense=std,Sp.Attack=mean,Sp.Attack=std,Sp.Defense=mean,Sp.Defense=std,Speed=mean,Speed=std,Total=mean,Total=std
Ordinary,51.136364,22.164846,45.108696,21.647217,1.037352,0.896952,47.670158,79.84049,64.416996,21.841872,...,69.826087,31.267694,66.956522,28.907878,67.980237,25.746741,64.778656,28.288024,408.241107,107.488987
Legendary,7.692308,25.018847,5.769231,21.36023,2.288462,1.549782,203.75,229.429258,94.384615,29.537051,...,98.269231,25.94951,115.769231,34.195869,105.096154,29.63653,98.0,22.93298,622.769231,70.042176
Ultra Beast,,,,,3.4,3.44093,308.16,406.773749,90.6,19.462785,...,85.8,35.87757,97.4,33.208433,85.0,31.080541,87.4,24.223955,534.0,65.03845


In [65]:
nBayes._categoricalProbTable

Unnamed: 0,Type 1=Normal,Type 1=Ground,Type 1=Ghost,Type 1=Dark,Type 1=Dragon,Type 1=Poison,Type 1=Water,Type 1=Bug,Type 1=Rock,Type 1=Grass,...,EggGroup 2=Water 1,EggGroup 2=Water 3,EggGroup 2=Grass,EggGroup 2=Fairy,EggGroup 2=Human-Like,EggGroup 2=Dragon,EggGroup 2=Amorphous,EggGroup 2=Water 2,EggGroup 2=Flying,EggGroup 2=Bug
Ordinary,0.12253,0.037549,0.037549,0.039526,0.033597,0.033597,0.146245,0.081028,0.051383,0.086957,...,0.083004,0.011858,0.029644,0.057312,0.021739,0.061265,0.001976,0.009881,0.003953,0.001976
Legendary,0.042857,0.028571,0.042857,0.028571,0.142857,0.014286,0.085714,0.014286,0.042857,0.057143,...,0.015625,0.015625,0.015625,0.015625,0.015625,0.015625,0.015625,0.015625,0.015625,0.015625
Ultra Beast,0.043478,0.043478,0.043478,0.043478,0.043478,0.130435,0.043478,0.086957,0.086957,0.043478,...,0.058824,0.058824,0.058824,0.058824,0.058824,0.058824,0.058824,0.058824,0.058824,0.058824


In [43]:
training[training["Group"] == "Ordinary"]["Height (m)"].mean()

1.0424547283702215

In [79]:
training["Group"].unique()

array(['Ordinary', 'Legendary', 'Ultra Beast'], dtype=object)

In [None]:
# Debug dump: print every FemaleRatio value of the training set, one per line.
for femaleRatio in training["FemaleRatio"]:
    print(femaleRatio)

In [None]:
# Hand-computed standard deviation of FemaleRatio: square root of the summed
# squared deviations from the mean, divided by the number of training rows.
mean = training["FemaleRatio"].mean()
squaredDeviations = ((ratio - mean) ** 2 for ratio in training["FemaleRatio"])
math.sqrt(sum(squaredDeviations) / len(training))

In [70]:
# Scratch check of the hand-rolled standard deviation on a degenerate sample.
s = [1] * 100
# The constant sample is replaced right away, so everything below runs on an empty list.
s = []
# NOTE(review): the divisors (100 here, len(training) below) are unrelated to len(s).
mean = sum(s) / 100
print(mean)
squaredDeviations = ((item - mean) ** 2 for item in s)
math.sqrt(sum(squaredDeviations) / len(training))

0.0


0.0

In [18]:
# Worked example of the normal (Gaussian) density evaluated at `value`
# for a distribution with the given mean and standard deviation.
value = 1.1
mean = 20
std = 5
variance = std ** 2
# 1 / sqrt(2 * pi * variance)
normalizer = 1 / math.sqrt(2 * variance * math.pi)
# -(value - mean)^2 / (2 * variance)
exponent = -(value - mean) ** 2 / (2 * variance)
prob = normalizer * math.exp(exponent)
prob

6.298736258150442e-05

In [None]:
def filter(dataFrame, column, value):
    """Return the rows of ``dataFrame`` whose ``column`` equals ``value``.

    Note: inside this notebook the name shadows Python's built-in ``filter``.
    """
    matches = dataFrame[column] == value
    return dataFrame.loc[matches]

In [None]:
filter(training, "Type 1", "Normal")["Group"].value_counts()

In [None]:
training["Group"].value_counts()

In [None]:
training["Type 1"].value_counts()

### Plot Error Rate

In [None]:
# Measure the decision-tree error rate for several training/test split ratios.
ratios = [0.2, 0.4, 0.6, 0.8]
# Reach DecisionTree through the `dt` module alias and bind the classifier to its own
# name: assigning the instance to `dt` would overwrite the imported decisionTree module,
# and the bare names DecisionTree / splitData / computeError are never imported here.
errorTree = dt.DecisionTree("Group")
x = []  # training-set size for each ratio
y = []  # error measured for each ratio

for ratio in ratios:
    print("************ Split ratio: ", ratio)
    # NOTE(review): the Naive Bayes section passes the target feature as the first
    # argument of ut.splitData - confirm the current signature.
    training, test = ut.splitData(data, ratio)
    # NOTE(review): the Decision Tree section calls train()/classify() without passing a
    # node around - confirm which API decisionTree.DecisionTree currently exposes.
    node = errorTree.train(training)
    predictions = errorTree.classify(test, node)
    # NOTE(review): assumes computeError lives in utils next to splitData - confirm.
    error = ut.computeError(predictions, test[errorTree.targetFeature])

    x.append(len(training))
    y.append(error)

In [None]:
# Error rate (y) against training-set size (x), as collected by the split-ratio loop.
plt.plot(x, y, 'bo-', label="With categorical and continuous features")
# Axis label spelling corrected ("Training", previously misspelled).
plt.xlabel("Training Set Size")
plt.ylabel("Error Rate")
plt.legend(loc="best")

In [None]:
# Get data profile
# NOTE(review): `pdp` (the pandas_profiling import is commented out in the imports cell)
# and `dataWithDummies` (only built in commented-out code) are undefined on a fresh
# kernel, so this cell raises NameError until both are restored.
profile = pdp.ProfileReport(dataWithDummies)
# Write the report to Profile.html in the working directory.
profile.to_file("Profile.html")
# Drop the reference to the report object, then run a garbage-collection pass.
profile = None
gc.collect()