# Using Decision Tree & Naive Bayes to classify Legendary Pokemon

## Import libraries

In [1]:
import gc
import math
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling as pdp
from sklearn.decomposition import PCA
# from sklearn.feature_extraction import FeatureHasher
# from sklearn.preprocessing import StandardScaler

import decisionTree as dt
import naiveBayes as nb
import utils as ut

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [120]:
%reload_ext autoreload

In [None]:
# Notebook-wide display settings for pandas and matplotlib
# np.set_printoptions(threshold=np.nan)
pd.options.display.max_columns = 600  # allow up to 600 columns when a frame is displayed
plt.rcParams.update({"figure.figsize": (11, 6)})  # default figure size (width, height)

## Define constants and functions

In [2]:
# Constants
# Hex colour string for each Pokemon type name; getColorList() looks type names up in this mapping.
TypeColorMappings = {"Water": "#6890F0", "Fire": "#F08030", "Grass": "#78C850",
                     "Dark": "#705848", "Electric": "#F8D030", "Flying": "#A890F0",
                     "Normal": "#A8A878", "Fighting": "#C03028", "Poison": "#A040A0",
                     "Ground": "#E0C068", "Psychic": "#F85888", "Rock": "#B8A038", 
                     "Ice": "#98D8D8", "Bug": "#A8B820", "Dragon": "#7038F8", 
                     "Ghost": "#705898", "Steel": "#B8B8D0", "Fairy": "#EE99AC"}

# Alternative, longer drop list (currently disabled):
# DropColumns = ["Pokedex#", "Name", "Type 1", "Type 2", "Generation", "Ability 1", "Ability 2", "Ability 3", 
#                "EggGroup 1", "EggGroup 2", "Category", "Height (m)", "Weight (kg)"]

# Columns removed from `data` (via data.drop(DropColumns, axis=1)) before the models are trained.
DropColumns = ["Pokedex#", "Name", "Generation", "Category"]

def getColorList(typeCounts):
    """Return the colour of every Pokemon type in the index of ``typeCounts``.

    Parameters:
        typeCounts: a pandas Series (or Series subclass) whose index labels are
            keys of ``TypeColorMappings``, e.g. the result of ``value_counts()``.

    Returns:
        A list of colour strings, one per index label, in index order.

    Raises:
        AssertionError: if ``typeCounts`` is not a Series.
        KeyError: if an index label is not a key of ``TypeColorMappings``.
    """
    # isinstance (rather than an exact type comparison) also accepts Series subclasses.
    assert isinstance(typeCounts, pd.Series), "Argument must be a Series object"
    return [TypeColorMappings[pokemonType] for pokemonType in typeCounts.index]

def getDistinctValues(dataFrame, columnName, sep):
    """Collect the distinct tokens of a delimiter-separated string column.

    Every distinct value of ``dataFrame[columnName]`` (as listed by
    ``value_counts()``) is split on ``sep``; the union of all resulting
    pieces is returned as a set.
    """
    distinctCells = dataFrame[columnName].value_counts().index
    return {token for cell in distinctCells for token in cell.split(sep)}

## Load the data from file

In [160]:
# Load the Pokemon data
# Plain relative file name: the former r'.\Pokemon_Cleaned.tsv' spelling only resolves on
# Windows, while this form points at the same file (next to the notebook) on every OS.
fileName = "Pokemon_Cleaned.tsv"
# Force the text columns to str so pandas does not try to infer another dtype for them.
columnTypes = {"Name": str, "Category": str, "Type 1": str, "Type 2": str, 
               "Ability 1": str, "Ability 2": str, "Ability 3": str, "Group": str}
# Tab-separated file whose first row holds the column names.
data = pd.read_csv(fileName, header=0, sep='\t', dtype=columnTypes)
data.head()

Unnamed: 0,Generation,Pokedex#,Name,Category,Type 1,Type 2,Ability 1,Ability 2,Ability 3,MaleRatio,...,EggGroup 1,EggGroup 2,HP,Attack,Defense,Sp.Attack,Sp.Defense,Speed,Total,Group
0,1.0,1.0,Bulbasaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,...,Monster,Grass,45.0,49.0,49.0,65.0,65.0,45.0,318,Ordinary
1,1.0,2.0,Ivysaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,...,Monster,Grass,60.0,62.0,63.0,80.0,80.0,60.0,405,Ordinary
2,6.0,3.0,Mega Venusaur,Seed Pokemon,Grass,Poison,Thick Fat,,,87.5,...,Monster,Grass,80.0,100.0,123.0,122.0,120.0,80.0,625,Ordinary
3,1.0,3.0,Venusaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,...,Monster,Grass,80.0,82.0,83.0,100.0,100.0,80.0,525,Ordinary
4,1.0,4.0,Charmander,Lizard Pokemon,Fire,,Blaze,Solar Power,,87.5,...,Monster,Dragon,39.0,52.0,43.0,60.0,50.0,65.0,309,Ordinary


## Plot graphs to visualize and understand the data

In [None]:
# Plot Group occurrences to see the distributions
groupCounts = data["Group"].value_counts()
groupFig, groupAxes = plt.subplots(nrows=1, ncols=1)
# Draw on the axes created above instead of relying on pyplot's "current axes"
groupAxe = groupCounts.plot(ax=groupAxes, title="Group", kind="bar")
groupAxe.set(xlabel="Group Types", ylabel="Count")

In [None]:
# Box plot of the "Total" column, one box per "Group" value
data.boxplot(column="Total", by="Group")

<p>According to the boxplot above, we can see that, despite being few in number, Legendary Pokemon have the highest Total stats compared to the other two groups.</p>

## PCA

In [None]:
reducedData = data.drop(DropColumns, axis=1)

# One-hot encode the string-valued columns so that, apart from the "Group" label,
# only numeric columns are left for the PCA cell that follows.
categoricalColumns = ["Type 1", "Type 2", "Ability 1", "Ability 2", "Ability 3", "EggGroup 1", "EggGroup 2"]
dummies = pd.get_dummies(data[categoricalColumns])
# pd.concat no longer accepts `join_axes`; both frames derive from `data` and share its
# index, so a plain column-wise concat lines the rows up. The raw categorical columns are
# dropped from reducedData because their one-hot replacements are already in `dummies`.
dataWithDummies = pd.concat([dummies, reducedData.drop(categoricalColumns, axis=1)], axis=1)

# Feature matrix (everything except the label) and the label column
x = dataWithDummies.loc[:, dataWithDummies.columns != "Group"]
y = dataWithDummies.loc[:, "Group"]

# Scale the x data
#x = StandardScaler().fit_transform(x)

In [None]:
# A float n_components asks PCA to keep enough components to explain that share of the variance
pca = PCA(0.95)
principalComponents = pca.fit_transform(x)
# shape is (number of rows, number of retained components)
rowCount, componentCount = principalComponents.shape
# Give the projected rows the same index as dataWithDummies so the column-wise concat below
# pairs every row with its own Group label instead of aligning against a fresh 0..n-1 index.
# (assumes x holds one row per row of dataWithDummies, in the same order)
principalDf = pd.DataFrame(data=principalComponents,
                           index=dataWithDummies.index,
                           columns=["pca %d" % i for i in range(1, componentCount + 1)])
pcaData = pd.concat([principalDf, dataWithDummies[["Group"]]], axis=1)

# Random half of the projected rows
xTrain = pcaData.sample(frac=0.5)

In [None]:
# # Plot Type 1 and Type 2 occurences to see the distributions
# typeFig, typeAxes = plt.subplots(nrows=3, ncols=1)
# typeFig.subplots_adjust(top=3)

# type1Counts = data["Type 1"].value_counts()
# type2Counts = data["Type 2"].value_counts().drop("None")
# typesCounts = type1Counts.add(type2Counts, fill_value=0)

# type1Counts.plot(title="Type 1 Occurrences", kind="bar", ax=typeAxes[0], color=getColorList(type1Counts))
# type2Counts.plot(title="Type 2 Occurrences", kind="bar", ax=typeAxes[1], color=getColorList(type2Counts))
# typesCounts.plot(title="Type 1 + 2 Occurrences", kind="bar", ax=typeAxes[2], color=getColorList(typesCounts))

# fh = FeatureHasher(n_features=2, input_type="string")
# hashedFeature1 = fh.fit_transform(data["Type 1"])
# hashedFeature2 = fh.fit_transform(data["Type 2"])

# x = pd.concat([data[["Name", "Type 1", "Type 2", "Category"]], pd.DataFrame(hashedFeature1.toarray())], axis=1)
# x = pd.concat([x, pd.DataFrame(hashedFeature2.toarray())], axis=1)

## Decision Tree

In [None]:
# Drop the columns listed in DropColumns, split the rows, then train and apply the tree
reducedData = data.drop(DropColumns, axis=1)
training, test = ut.splitData(reducedData, 0.60)
# The decisionTree module is imported under the alias `dt`; `dtree` is only the classifier
# instance, so it cannot be used to reach the class before it has been assigned.
dtree = dt.DecisionTree("Group")
dtree.train(training)
pred = dtree.classify(test)

In [None]:
# Distinct values of the target column, computed once and shared by both helper calls
targetValues = reducedData[dtree.targetFeature].unique()
matrix = ut.buildConfusionMatrix(pred, test[dtree.targetFeature], targetValues)
precisions, recalls = ut.getPrecisionsAndRecalls(matrix, targetValues)
fScores = ut.computeFScores(precisions, recalls)

In [None]:
fScores

In [None]:
gc.collect()
# Scratch check: how to recognise a boolean column from its dtype
m = pd.DataFrame({"bool": [True, False, False], "number": [1, 2, 3]})
boolScalarType = m.dtypes["bool"].type
# dtype.type is a class, so isinstance(<class>, np.bool_) is always False;
# issubclass is the check that actually answers the question.
isBoolSubclass = issubclass(boolScalarType, np.bool_)
boolScalarType == np.bool_

### Naive Bayes

In [141]:
import naiveBayes as nb

In [161]:
# Remove the columns listed in DropColumns, then split the remaining rows with ratio 0.60
reducedData = data.drop(columns=DropColumns)
training, test = ut.splitData(reducedData, 0.60)
# nBayes.train(training)

In [162]:
# Build a Naive Bayes classifier for the "Group" target and train it on the training split.
# NOTE(review): the recorded output of this cell ends with
# "ZeroDivisionError: float division by zero" right after the Legendary "Height (m)" line;
# the failing code lives in the naiveBayes module and is not visible in this notebook.
nBayes = nb.NaiveBayes("Group")
d = nBayes.train(training)
# nBayes._getFeatureType(data, "Name")

Ordinary MaleRatio 51.65187376725838 21.89278608241621 3011.493536670707
Ordinary FemaleRatio 44.20611439842209 21.17528534815576 2817.334484653236
Ordinary Height (m) 1.065680473372781 1.0333976734325958 6.709881142926833
Ordinary Weight (kg) 51.81794871794872 83.16938769011321 43461.7167042963
Ordinary HP 65.73570019723866 23.433472555739847 3450.2706944109727
Ordinary Attack 75.88757396449704 30.5154205725453 5850.844935313479
Ordinary Defense 71.10453648915187 30.72405102513998 5931.121541417426
Ordinary Sp.Attack 68.29388560157791 29.70431394407339 5543.945099955898
Ordinary Sp.Defense 68.12623274161736 25.393345161035075 4051.5359808532385
Ordinary Speed 65.57988165680473 28.732358904331385 5187.073880141084
Ordinary Total 414.7278106508876 108.93438607757636 74560.67803842375
Legendary MaleRatio 7.291666666666667 25.24261706543235 4003.581063417842
Legendary FemaleRatio 5.208333333333333 21.237219769689705 2833.839117932282
Legendary Height (m) 2.310416666666667 1.70559025576701

ZeroDivisionError: float division by zero

In [164]:
training[training["Group"] == "Ultra Beast"].describe()

Unnamed: 0,MaleRatio,FemaleRatio,Height (m),Weight (kg),HP,Attack,Defense,Sp.Attack,Sp.Defense,Speed,Total
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,0.0,0.0,2.5875,187.3625,78.0,102.75,87.25,111.75,76.5,91.25,547.5
std,0.0,0.0,1.609292,277.547628,20.479955,34.449756,58.782286,45.980586,28.839457,40.55948,52.576475
min,0.0,0.0,0.6,1.8,53.0,53.0,37.0,53.0,37.0,13.0,420.0
25%,0.0,0.0,1.65,22.0,65.5,73.0,51.5,68.0,63.5,77.5,562.5
50%,0.0,0.0,2.1,77.75,72.0,108.0,69.0,127.0,72.0,93.0,570.0
75%,0.0,0.0,3.65,195.9,89.0,132.5,89.5,140.5,84.5,110.5,570.0
max,0.0,0.0,5.5,820.0,109.0,139.0,211.0,173.0,131.0,151.0,570.0


In [159]:
training.head()

Unnamed: 0,Type 1,Type 2,Ability 1,Ability 2,Ability 3,MaleRatio,FemaleRatio,Height (m),Weight (kg),EggGroup 1,EggGroup 2,HP,Attack,Defense,Sp.Attack,Sp.Defense,Speed,Total,Group
425,Water,,Rattled,Shell Armor,,50.0,50.0,0.4,52.5,Water 1,,35.0,64.0,85.0,74.0,55.0,32.0,345,Ordinary
403,Water,,Competitive,Marvel Scale,Cute Charm,50.0,50.0,6.2,162.0,Water 1,Dragon,95.0,60.0,79.0,100.0,125.0,81.0,540,Ordinary
707,Dragon,,Mold Breaker,Unnerve,Rivalry,50.0,50.0,1.8,105.5,Monster,Dragon,76.0,147.0,90.0,60.0,70.0,97.0,540,Ordinary
661,Normal,,Skill Link,Cute Charm,Technician,25.0,75.0,0.4,5.8,Field,,55.0,50.0,40.0,40.0,40.0,75.0,300,Ordinary
567,Steel,Dragon,Telepathy,Pressure,,0.0,0.0,5.4,683.0,Undiscovered,,100.0,120.0,120.0,150.0,100.0,90.0,680,Legendary


In [None]:
for i in training["FemaleRatio"]: print(i)

In [168]:
# Hand-computed standard deviation of FemaleRatio (divides by the row count, not count - 1)
mean = training["FemaleRatio"].mean()
squaredDeviations = ((ratio - mean) ** 2 for ratio in training["FemaleRatio"])
math.sqrt(sum(squaredDeviations) / len(training))

24.1340315302847

In [172]:
# Sanity check: a constant list must have mean 1.0 and standard deviation 0.0
s = [1 for i in range(100)]
# Derive the count from the list itself instead of repeating the literal 100
mean = sum(s) / len(s)
print(mean)
# Divide by the size of `s` (the data being measured), not by the unrelated `training` frame
math.sqrt(sum((value - mean) ** 2 for value in s) / len(s))

1.0


0.0

In [54]:
def filter(dataFrame, column, value):
    """Return the rows of ``dataFrame`` whose ``column`` equals ``value``.

    Note: inside this notebook the name shadows Python's builtin ``filter``.
    """
    rowMatches = dataFrame[column] == value
    return dataFrame.loc[rowMatches]

In [56]:
filter(training, "Type 1", "Normal")["Group"].value_counts()

Ordinary     74
Legendary     1
Name: Group, dtype: int64

In [50]:
training["Group"].value_counts()

Ordinary       495
Legendary       61
Ultra Beast      7
Name: Group, dtype: int64

In [51]:
training["Type 1"].value_counts()

Normal      75
Water       71
Bug         51
Grass       48
Psychic     39
Fire        37
Rock        30
Electric    29
Poison      25
Ground      24
Dragon      24
Dark        23
Fairy       19
Steel       18
Fighting    18
Ice         15
Ghost       14
Flying       3
Name: Type 1, dtype: int64

### Plot Error Rate

In [None]:
# Train and evaluate one decision tree per split ratio, recording training size vs. error
ratios = [0.2, 0.4, 0.6, 0.8]
# Reach the class through the module alias and keep the instance under its own name, so the
# `dt` alias of the decisionTree module is not overwritten for the rest of the notebook.
errorTree = dt.DecisionTree("Group")
x = []  # training-set sizes
y = []  # matching error values

for ratio in ratios:
    print("************ Split ratio: ", ratio)
    training, test = ut.splitData(data, ratio)
    node = errorTree.train(training)
    predictions = errorTree.classify(test, node)
    # NOTE(review): assumes computeError is provided by utils, next to splitData — confirm
    error = ut.computeError(predictions, test[errorTree.targetFeature])

    x.append(len(training))
    y.append(error)

In [None]:
# Error rate against training-set size, using the x / y lists filled by the loop above
plt.plot(x, y, 'bo-', label="With categorical and continuous features")
plt.xlabel("Training Set size")  # label previously read "Trainging"
plt.ylabel("Error Rate")
plt.legend(loc="best")

In [None]:
# Get data profile
# Builds a pandas_profiling report for `dataWithDummies` (the frame prepared in the PCA
# section) and writes it to Profile.html.
profile = pdp.ProfileReport(dataWithDummies)
profile.to_file("Profile.html")
# Drop the reference to the report and run a garbage-collection pass afterwards.
profile = None
gc.collect()