# Using Decision Tree & Naive Bayes to classify Legendary Pokemon

## Import libraries

In [70]:
import gc
import math
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import pandas_profiling as pdp
# from sklearn.feature_extraction import FeatureHasher
# from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, StratifiedKFold

import decisionTree as dt
import naiveBayes as nb
import utils as ut

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
%reload_ext autoreload

In [None]:
# Display configuration for pandas and matplotlib
# np.set_printoptions(threshold=np.nan)
pd.options.display.max_columns = 600  # show up to 600 columns before truncating
plt.rcParams.update({"figure.figsize": (11, 6)})  # default figure size, in inches

## Define constants and functions

In [3]:
# Constants

# Hex colour associated with each Pokemon type (looked up by getColorList).
TypeColorMappings = {
    "Water": "#6890F0",
    "Fire": "#F08030",
    "Grass": "#78C850",
    "Dark": "#705848",
    "Electric": "#F8D030",
    "Flying": "#A890F0",
    "Normal": "#A8A878",
    "Fighting": "#C03028",
    "Poison": "#A040A0",
    "Ground": "#E0C068",
    "Psychic": "#F85888",
    "Rock": "#B8A038",
    "Ice": "#98D8D8",
    "Bug": "#A8B820",
    "Dragon": "#7038F8",
    "Ghost": "#705898",
    "Steel": "#B8B8D0",
    "Fairy": "#EE99AC",
}

# Broader drop list, kept for reference:
# DropColumns = ["Pokedex#", "Name", "Type 1", "Type 2", "Generation", "Ability 1", "Ability 2", "Ability 3",
#                "EggGroup 1", "EggGroup 2", "Category", "Height (m)", "Weight (kg)"]

# Columns removed from the loaded frame before it is split and used for training.
DropColumns = ["Pokedex#", "Name", "Generation", "Category"]

def getColorList(typeCounts):
    """Return the colour of every Pokemon type found in the index of ``typeCounts``.

    Parameters
    ----------
    typeCounts : pandas.Series
        Series whose index labels are keys of ``TypeColorMappings``
        (for example the result of ``value_counts()`` on a type column).

    Returns
    -------
    list of str
        One hex colour string per index label, in index order.

    Raises
    ------
    AssertionError
        If ``typeCounts`` is not a pandas Series.
    KeyError
        If an index label has no entry in ``TypeColorMappings``.
    """
    # isinstance (instead of an exact type comparison) also accepts Series subclasses.
    assert isinstance(typeCounts, pd.Series), "Argument must be a Series object"
    return [TypeColorMappings[pokemonType] for pokemonType in typeCounts.index]

def getDistinctValues(dataFrame, columnName, sep):
    """Collect the distinct tokens stored in a delimited string column.

    Every distinct non-null value of ``dataFrame[columnName]`` is split on
    ``sep`` and all resulting pieces are gathered into a single set.
    """
    distinctCells = dataFrame[columnName].value_counts().index
    return {token for cell in distinctCells for token in cell.split(sep)}

## Load the data from file

In [4]:
# Load the Pokemon data
# A bare relative file name resolves next to the notebook on every OS;
# a backslash-prefixed path such as '.\name' only works on Windows.
fileName = "Pokemon_Cleaned.tsv"
# Force the textual columns to str so pandas does not try to infer their type.
columnTypes = {"Name": str, "Category": str, "Type 1": str, "Type 2": str,
               "Ability 1": str, "Ability 2": str, "Ability 3": str, "Group": str}
data = pd.read_csv(fileName, header=0, sep='\t', dtype=columnTypes)
data.head()

Unnamed: 0,Generation,Pokedex#,Name,Category,Type 1,Type 2,Ability 1,Ability 2,Ability 3,MaleRatio,...,EggGroup 1,EggGroup 2,HP,Attack,Defense,Sp.Attack,Sp.Defense,Speed,Total,Group
0,1.0,1.0,Bulbasaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,...,Monster,Grass,45.0,49.0,49.0,65.0,65.0,45.0,318,Ordinary
1,1.0,2.0,Ivysaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,...,Monster,Grass,60.0,62.0,63.0,80.0,80.0,60.0,405,Ordinary
2,6.0,3.0,Mega Venusaur,Seed Pokemon,Grass,Poison,Thick Fat,,,87.5,...,Monster,Grass,80.0,100.0,123.0,122.0,120.0,80.0,625,Ordinary
3,1.0,3.0,Venusaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,...,Monster,Grass,80.0,82.0,83.0,100.0,100.0,80.0,525,Ordinary
4,1.0,4.0,Charmander,Lizard Pokemon,Fire,,Blaze,Solar Power,,87.5,...,Monster,Dragon,39.0,52.0,43.0,60.0,50.0,65.0,309,Ordinary


## Plot graphs to visualize and understand the data

In [None]:
# Plot Group occurrences to see the distributions
groupCounts = data["Group"].value_counts()
groupFig, groupAxes = plt.subplots(nrows=1, ncols=1)
# Draw on the axes created above instead of relying on the implicit current axes.
groupAxe = groupCounts.plot(kind="bar", title="Group", ax=groupAxes)
groupAxe.set(xlabel="Group Types", ylabel="Count")

In [None]:
# Box plot of the Total stat, one box per Group
data.boxplot(column="Total", by="Group")

<p>According to the boxplot above, despite being few in number, Legendary Pokemon have the highest Total stats compared to the other 2 groups.</p>

## PCA

In [None]:
reducedData = data.drop(DropColumns, axis=1)

# One-hot encode every string feature so PCA receives a purely numeric matrix;
# numeric columns pass through pd.get_dummies unchanged. dtype=float keeps the
# indicator columns numeric rather than boolean.
pcaFeatures = reducedData.drop("Group", axis=1)
dataWithDummies = pd.concat([pd.get_dummies(pcaFeatures, dtype=float), reducedData[["Group"]]], axis=1)

x = dataWithDummies.loc[:, dataWithDummies.columns != "Group"]
y = dataWithDummies.loc[:, "Group"]

# NOTE(review): PCA rejects NaN; this assumes the cleaned file has no missing
# numeric values. Confirm, or impute before fitting.

# Scale the x data
#x = StandardScaler().fit_transform(x)

In [None]:
# A float n_components keeps as many components as needed to explain that
# fraction (here 95%) of the variance.
pca = PCA(n_components=0.95)
principalComponents = pca.fit_transform(x)
# shape is (number of rows, number of retained components)
width, height = principalComponents.shape
# Reuse the label frame's index so the concat below pairs each projected row
# with its own Group even if the index is not a plain 0..n-1 range.
principalDf = pd.DataFrame(data=principalComponents,
                           index=dataWithDummies.index,
                           columns=["pca %d" % i for i in range(1, height + 1)])
pcaData = pd.concat([principalDf, dataWithDummies[["Group"]]], axis=1)

# Fixed seed so the same half is drawn on every run.
xTrain = pcaData.sample(frac=0.5, random_state=0)

In [None]:
# # Plot Type 1 and Type 2 occurences to see the distributions
# typeFig, typeAxes = plt.subplots(nrows=3, ncols=1)
# typeFig.subplots_adjust(top=3)

# type1Counts = data["Type 1"].value_counts()
# type2Counts = data["Type 2"].value_counts().drop("None")
# typesCounts = type1Counts.add(type2Counts, fill_value=0)

# type1Counts.plot(title="Type 1 Occurrences", kind="bar", ax=typeAxes[0], color=getColorList(type1Counts))
# type2Counts.plot(title="Type 2 Occurrences", kind="bar", ax=typeAxes[1], color=getColorList(type2Counts))
# typesCounts.plot(title="Type 1 + 2 Occurrences", kind="bar", ax=typeAxes[2], color=getColorList(typesCounts))

# fh = FeatureHasher(n_features=2, input_type="string")
# hashedFeature1 = fh.fit_transform(data["Type 1"])
# hashedFeature2 = fh.fit_transform(data["Type 2"])

# x = pd.concat([data[["Name", "Type 1", "Type 2", "Category"]], pd.DataFrame(hashedFeature1.toarray())], axis=1)
# x = pd.concat([x, pd.DataFrame(hashedFeature2.toarray())], axis=1)

## Training & Test Data

In [146]:
# Column holding the class label to predict.
target = "Group"
reducedData = data.drop(columns=DropColumns)
# presumably 0.60 is the share of rows that goes to `training`; confirm in utils.splitData
training, test = ut.splitData(target, reducedData, 0.60)
# 10 folds built from the training split only.
kTrainings, kTests = ut.kFoldCrossValidation(10, training)

In [147]:
# Fit the decision tree on the training split, then classify the test rows
# with the label column removed.
dtree = dt.DecisionTree(target)
dtree.train(training)
testFeatures = test.drop(columns=[target])
dtPred = dtree.classify(testFeatures)

Function "train" took 18.99 seconds to complete


In [148]:
# Score the decision tree predictions against the true labels of the test split.
dtLabels = reducedData[target].unique()  # every class present in the data, computed once
dtMatrix = ut.buildConfusionMatrix(dtPred, test[target], dtLabels)
dtPrecisions, dtRecalls = ut.getPrecisionsAndRecalls(dtMatrix, dtLabels)
dtFScores = ut.computeFScores(dtPrecisions, dtRecalls)
# Use `target` rather than a second hard-coded "Group" so the cell follows the configured label.
"Error {0:.2f}%".format(ut.computeError(dtPred, test[target]) * 100)

'Error 0.80%'

In [149]:
# Per-fold results of the decision tree cross-validation.
errors = []
matrices = []
precisions = []
recalls = []
fScores = []

# The set of class labels does not change between folds, so compute it once.
kLabels = reducedData[target].unique()

for kTraining, kTest in zip(kTrainings, kTests):
    # A fresh tree per fold keeps the folds independent of each other and
    # leaves `dtree` (fitted on the whole training split) untouched.
    kTree = dt.DecisionTree(target)
    kTree.train(kTraining)
    kPred = kTree.classify(kTest.drop([target], axis=1))

    kMatrix = ut.buildConfusionMatrix(kPred, kTest[target], kLabels)
    kPrecisions, kRecalls = ut.getPrecisionsAndRecalls(kMatrix, kLabels)
    kFScores = ut.computeFScores(kPrecisions, kRecalls)
    error = ut.computeError(kPred, kTest[target])

    errors.append(error)
    matrices.append(kMatrix)
    precisions.append(kPrecisions)
    recalls.append(kRecalls)
    fScores.append(kFScores)

Function "train" took 19.80 seconds to complete


  recall = confusionMatrix.loc[index, column] / confusionMatrix.loc[index, "Total"]


Function "train" took 18.95 seconds to complete
Function "train" took 19.44 seconds to complete
Function "train" took 18.66 seconds to complete
Function "train" took 19.30 seconds to complete
Function "train" took 17.15 seconds to complete
Function "train" took 19.62 seconds to complete
Function "train" took 19.58 seconds to complete
Function "train" took 12.82 seconds to complete
Function "train" took 16.19 seconds to complete


In [151]:
# Mean error over the folds; divide by the number of collected errors rather
# than a hard-coded fold count so the value stays right if the fold count changes.
sum(errors) / len(errors)

0.012318401937046008

### Naive Bayes

In [152]:
# Fit Naive Bayes on the training split (train() also receives every class
# label found in the data), then classify the test rows without their label.
nBayes = nb.NaiveBayes(target)
allGroups = reducedData[target].unique()
nBayes.train(training, allGroups)
nbPred = nBayes.classify(test.drop(columns=[target]))

Function "train" took 4.55 seconds to complete
Function "classify" took 16.83 seconds to complete


In [153]:
# Score the Naive Bayes predictions against the true labels of the test split.
nbLabels = reducedData[target].unique()  # every class present in the data, computed once
nbMatrix = ut.buildConfusionMatrix(nbPred, test[target], nbLabels)
nbPrecisions, nbRecalls = ut.getPrecisionsAndRecalls(nbMatrix, nbLabels)
nbFScores = ut.computeFScores(nbPrecisions, nbRecalls)
# Use `target` rather than a second hard-coded "Group" so the cell follows the configured label.
"Error: {0:.2f}%".format(ut.computeError(nbPred, test[target]) * 100)

'Error: 20.74%'

In [158]:
# Display the Naive Bayes F-scores produced by ut.computeFScores.
nbFScores

(0.6197844536770711,
 {'Legendary': 0.8611111111111112,
  'Ordinary': 0.8791946308724832,
  'Ultra Beast': 0.11904761904761907})

In [None]:
# Per-fold results of the Naive Bayes cross-validation.
nberrors = []
nbmatrices = []
nbprecisions = []
nbrecalls = []
nbfScores = []

# The set of class labels does not change between folds, so compute it once.
kLabels = reducedData[target].unique()

for kTraining, kTest in zip(kTrainings, kTests):
    # A fresh model per fold keeps the folds independent of each other and
    # leaves `nBayes` (fitted on the whole training split) untouched.
    kBayes = nb.NaiveBayes(target)
    kBayes.train(kTraining, kLabels)
    kPred = kBayes.classify(kTest.drop([target], axis=1))

    kMatrix = ut.buildConfusionMatrix(kPred, kTest[target], kLabels)
    kPrecisions, kRecalls = ut.getPrecisionsAndRecalls(kMatrix, kLabels)
    kFScores = ut.computeFScores(kPrecisions, kRecalls)
    error = ut.computeError(kPred, kTest[target])

    nberrors.append(error)
    nbmatrices.append(kMatrix)
    nbprecisions.append(kPrecisions)
    nbrecalls.append(kRecalls)
    nbfScores.append(kFScores)

In [185]:
#from sklearn.model_selection import KFold
# For each plain (unstratified) 2-fold split, record the Group of the first
# row selected for training.
kf = KFold(n_splits=2)
s = {training.iloc[trainIdx[0], :]["Group"] for trainIdx, _ in kf.split(training)}
s

{'Legendary', 'Ordinary'}

### Playground

In [207]:
from sklearn.model_selection import StratifiedKFold

In [191]:
# Hand-picked slices of the raw frame: index labels up to and including 190
# for training, labels from 850 onwards for testing (.loc slices by label).
# NOTE(review): this rebinds `test`, replacing the split returned by
# ut.splitData earlier; evaluation cells re-run after this one would score
# against this slice instead.
train = data.loc[:190, :]
test = data.loc[850:, :]

In [198]:
# Retrain `dtree` on the playground slice and classify the playground test
# rows with their label column removed.
dtree.train(train)
playgroundFeatures = test.drop(columns=["Group"])
pred = dtree.classify(playgroundFeatures)

Function "train" took 10.08 seconds to complete


In [235]:
# Confusion matrix and per-class precision/recall for the playground predictions.
# One label array, taken from `target`, feeds both helpers so they cannot disagree.
playgroundLabels = data[target].unique()
m = ut.buildConfusionMatrix(pred, test[target], playgroundLabels)
p, r = ut.getPrecisionsAndRecalls(m, playgroundLabels)

In [231]:
# Stratified 6-fold splitter, used by the inspection loop in the next cell.
skf = StratifiedKFold(n_splits=6)

In [None]:
# For every stratified split, print the class balance of the training part
# followed by its "Ultra Beast" rows.
groupColumn = training["Group"]
for trainIdx, _ in skf.split(training, groupColumn):
    foldTraining = training.iloc[trainIdx]
    print("***Training:\n", foldTraining["Group"].value_counts())
    print(foldTraining[foldTraining["Group"] == "Ultra Beast"])
    print()

In [262]:
# Call the private sampling helper directly to inspect what it produces.
# NOTE(review): the per-group counts recorded as this cell's output are
# presumably printed from inside the helper; confirm in utils._kFoldSample.
u = ut._kFoldSample(10, data, "Group")

{'Ordinary': 83, 'Legendary': 9, 'Ultra Beast': 1}


In [None]:
# Print the Group distribution of each of the 10 samples and gather the row
# labels of all samples into one list.
l = []
for foldNumber in range(10):
    foldSample = u[foldNumber]
    print(foldSample["Group"].value_counts())
    l.extend(foldSample.index)

In [274]:
# Show the first test fold of a 10-fold split of the full data set.
# NOTE(review): the extra `True, "Group"` arguments presumably request folds
# stratified by Group; confirm in utils.kFoldCrossValidation.
ut.kFoldCrossValidation(10, data, True, "Group")[1][0]
    


Unnamed: 0,Generation,Pokedex#,Name,Category,Type 1,Type 2,Ability 1,Ability 2,Ability 3,MaleRatio,...,EggGroup 1,EggGroup 2,HP,Attack,Defense,Sp.Attack,Sp.Defense,Speed,Total,Group
703,5.0,608.0,Lampent,Lamp Pokemon,Ghost,Fire,Infiltrator,Flame Body,Flash Fire,50.0,...,Amorphous,,60.0,40.0,60.0,95.0,60.0,55.0,370,Ordinary
83,6.0,65.0,Mega Alakazam,Psi Pokemon,Psychic,,Trace,,,75.0,...,Human-Like,,55.0,50.0,65.0,175.0,105.0,150.0,600,Ordinary
401,3.0,348.0,Armaldo,Plate Pokemon,Rock,Bug,Battle Armor,Swift Swim,,87.5,...,Water 3,,75.0,125.0,100.0,70.0,80.0,45.0,495,Ordinary
852,7.0,732.0,Trumbeak,Bugle Beak Pokemon,Normal,Flying,Keen Eye,Skill Link,Pickup,50.0,...,Flying,,55.0,85.0,50.0,40.0,50.0,75.0,355,Ordinary
420,6.0,362.0,Mega Glalie,Face Pokemon,Ice,,Refrigerate,,,50.0,...,Mineral,Fairy,80.0,120.0,80.0,120.0,80.0,100.0,580,Ordinary
525,6.0,448.0,Mega Lucario,Aura Pokemon,Fighting,Steel,Adaptability,,,87.5,...,Field,Human-Like,70.0,145.0,88.0,140.0,70.0,112.0,625,Ordinary
112,7.0,88.0,Alolan Grimer,Sludge Pokemon,Poison,Dark,Power of Alchemy,Poison Touch,Gluttony,50.0,...,Amorphous,,80.0,80.0,50.0,40.0,50.0,25.0,325,Ordinary
861,7.0,741.0,Oricorio Pom-Pom Style,Dancing Pokemon,Electric,Flying,Dancer,,,25.0,...,Flying,,75.0,70.0,70.0,98.0,70.0,93.0,476,Ordinary
372,3.0,321.0,Wailord,Float Whale Pokemon,Water,,Oblivious,Pressure,Water Veil,50.0,...,Field,Water 2,170.0,90.0,45.0,90.0,45.0,60.0,500,Ordinary
772,6.0,669.0,Flabebe Blue Flower,Single Bloom Pokemon,Fairy,,Symbiosis,Flower Veil,,0.0,...,Fairy,,44.0,38.0,39.0,61.0,79.0,42.0,303,Ordinary


### Plot Error Rate

In [None]:
# Measure the decision tree's error rate for several split ratios.
# Uses the module-qualified helpers (ut.*, dt.*) and the call shapes used by
# the rest of the notebook, and fold-local names so the `dt` module and the
# main `training` / `test` split are not overwritten.
ratios = [0.2, 0.4, 0.6, 0.8]
errorRateData = data.drop(DropColumns, axis=1)  # same feature set as the main experiment
x = []  # training-set size for each ratio
y = []  # error rate for each ratio

for ratio in ratios:
    print("************ Split ratio: ", ratio)
    ratioTraining, ratioTest = ut.splitData("Group", errorRateData, ratio)
    ratioTree = dt.DecisionTree("Group")
    ratioTree.train(ratioTraining)
    predictions = ratioTree.classify(ratioTest.drop(["Group"], axis=1))
    error = ut.computeError(predictions, ratioTest["Group"])

    x.append(len(ratioTraining))
    y.append(error)

In [None]:
# Error rate against training-set size (x and y are filled by the previous cell).
plt.plot(x, y, 'bo-', label="With categorical and continuous features")
plt.xlabel("Training Set size")
plt.ylabel("Error Rate")
plt.legend(loc="best")

In [None]:
# Get data profile
# NOTE(review): this cell needs `pdp` (the pandas_profiling import at the top
# of the notebook is commented out) and `dataWithDummies` to be defined by
# earlier cells; confirm both before running.
profile = pdp.ProfileReport(dataWithDummies)
profile.to_file("Profile.html")
# Drop the reference to the report, then force a garbage-collection pass.
profile = None
gc.collect()