# Using Decision Tree & Naive Bayes to classify Legendary Pokemon

## Import libraries

In [None]:
import gc
import math
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import pandas_profiling as pdp
# from sklearn.feature_extraction import FeatureHasher
# from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import decisionTree as dt
import naiveBayes as nb
import utils as ut

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
%reload_ext autoreload

In [None]:
# Notebook-wide display defaults
pd.options.display.max_columns = 600   # let wide frames show up to 600 columns
plt.rc("figure", figsize=(11, 6))      # default figure size (width, height) in inches

## Define constants and functions

In [None]:
# Constants

# Hex color for each Pokemon type name; looked up by getColorList.
TypeColorMappings = {
    "Water": "#6890F0",
    "Fire": "#F08030",
    "Grass": "#78C850",
    "Dark": "#705848",
    "Electric": "#F8D030",
    "Flying": "#A890F0",
    "Normal": "#A8A878",
    "Fighting": "#C03028",
    "Poison": "#A040A0",
    "Ground": "#E0C068",
    "Psychic": "#F85888",
    "Rock": "#B8A038",
    "Ice": "#98D8D8",
    "Bug": "#A8B820",
    "Dragon": "#7038F8",
    "Ghost": "#705898",
    "Steel": "#B8B8D0",
    "Fairy": "#EE99AC",
}

# Columns removed from the data before modelling.
# A wider list was used at some point and is kept here for reference:
# DropColumns = ["Pokedex#", "Name", "Type 1", "Type 2", "Generation", "Ability 1", "Ability 2", "Ability 3",
#                "EggGroup 1", "EggGroup 2", "Category", "Height (m)", "Weight (kg)"]
DropColumns = [
    "Pokedex#",
    "Name",
    "Generation",
    "Category",
]

def getColorList(typeCounts, colorMappings=None):
    """Return the plot color of every Pokemon type found in ``typeCounts.index``.

    Parameters
    ----------
    typeCounts : pd.Series
        Series whose index holds type names (e.g. the result of ``value_counts()``).
    colorMappings : dict, optional
        Mapping of type name -> color. When omitted, the module-level
        ``TypeColorMappings`` is used.

    Returns
    -------
    list
        One color per index entry, in index order.

    Raises
    ------
    AssertionError
        If ``typeCounts`` is not a pandas Series (or a subclass of it).
    KeyError
        If an index entry has no color in the mapping.
    """
    # isinstance (rather than an exact type comparison) also accepts Series subclasses.
    assert isinstance(typeCounts, pd.Series), "Argument must be a Series object"
    if colorMappings is None:
        colorMappings = TypeColorMappings
    return [colorMappings[pokemonType] for pokemonType in typeCounts.index]

def getDistinctValues(dataFrame, columnName, sep):
    """Collect the distinct tokens stored in a delimited string column.

    Every distinct value of ``dataFrame[columnName]`` (as listed by
    ``value_counts()``, which leaves out missing values by default) is split
    on ``sep``; the union of all resulting pieces is returned as a set.
    """
    distinctEntries = dataFrame[columnName].value_counts().index
    return {token for entry in distinctEntries for token in entry.split(sep)}

## Load the data from file

In [None]:
# Load the Pokemon data (tab-separated, first row is the header).
# The file name is a plain relative path so it resolves on every platform;
# a '.\'-prefixed path is only understood on Windows.
fileName = "Pokemon_Cleaned.tsv"
# Force the textual columns to be read as strings.
columnTypes = {"Name": str, "Category": str, "Type 1": str, "Type 2": str,
               "Ability 1": str, "Ability 2": str, "Ability 3": str, "Group": str}
data = pd.read_csv(fileName, header=0, sep='\t', dtype=columnTypes)
data.head()

## Plot graphs to visualize and understand the data

In [None]:
# Bar chart of Group occurrences to see how the classes are distributed
groupCounts = data["Group"].value_counts()
groupFig, groupAxes = plt.subplots(nrows=1, ncols=1)
groupAxe = groupCounts.plot(kind="bar", title="Group", ax=groupAxes)
groupAxe.set(xlabel="Group Types", ylabel="Count")

In [None]:
# Box plot of the "Total" column, one box per "Group" value
data.boxplot(column="Total", by="Group")

<p>According to the boxplot above, we can see that despite being small in number, Legendary Pokemon have the highest Total stats compared to the other 2 groups.</p>

## PCA

In [None]:
# Remove the columns listed in DropColumns before any modelling.
# (One-hot encoding and feature scaling were sketched here earlier but are disabled.)
reducedData = data.drop(columns=DropColumns)

In [None]:
# PCA needs a purely numeric feature matrix: one-hot encode every non-numeric
# feature column of reducedData and keep the class label ("Group") aside.
featureFrame = reducedData.drop(columns=["Group"])
categoricalColumns = list(featureFrame.select_dtypes(exclude=["number"]).columns)
dataWithDummies = pd.get_dummies(reducedData, columns=categoricalColumns, dtype=float)

x = dataWithDummies.drop(columns=["Group"])
y = dataWithDummies["Group"]

# Keep as many principal components as are needed to explain 95% of the variance.
# NOTE(review): the features are not standardized before PCA — confirm whether scaling is wanted.
pca = PCA(0.95)
principalComponents = pca.fit_transform(x)
numComponents = principalComponents.shape[1]
# Re-use the source index so the label column lines up row by row in the concat below.
principalDf = pd.DataFrame(data=principalComponents, index=dataWithDummies.index,
                           columns=["pca %d" % i for i in range(1, numComponents + 1)])
pcaData = pd.concat([principalDf, dataWithDummies[["Group"]]], axis=1)

# Random half of the projected rows.
xTrain = pcaData.sample(frac=0.5)

In [None]:
# # Plot Type 1 and Type 2 occurences to see the distributions
# typeFig, typeAxes = plt.subplots(nrows=3, ncols=1)
# typeFig.subplots_adjust(top=3)

# type1Counts = data["Type 1"].value_counts()
# type2Counts = data["Type 2"].value_counts().drop("None")
# typesCounts = type1Counts.add(type2Counts, fill_value=0)

# type1Counts.plot(title="Type 1 Occurrences", kind="bar", ax=typeAxes[0], color=getColorList(type1Counts))
# type2Counts.plot(title="Type 2 Occurrences", kind="bar", ax=typeAxes[1], color=getColorList(type2Counts))
# typesCounts.plot(title="Type 1 + 2 Occurrences", kind="bar", ax=typeAxes[2], color=getColorList(typesCounts))

# fh = FeatureHasher(n_features=2, input_type="string")
# hashedFeature1 = fh.fit_transform(data["Type 1"])
# hashedFeature2 = fh.fit_transform(data["Type 2"])

# x = pd.concat([data[["Name", "Type 1", "Type 2", "Category"]], pd.DataFrame(hashedFeature1.toarray())], axis=1)
# x = pd.concat([x, pd.DataFrame(hashedFeature2.toarray())], axis=1)

## Training & Test Data

In [None]:
# Build the modelling frame, split it, and derive cross-validation folds from the training part.
target = "Group"
reducedData = data.drop(DropColumns, axis=1)
# NOTE(review): 0.60 is presumably the training fraction — confirm against ut.splitData.
training, test = ut.splitData(target, reducedData, 0.60)
# k = row count of the rarest Group value in the training split
# (passed on to ut.kFoldCrossValidation, presumably as the number of folds).
k = min(training["Group"].value_counts())
# NOTE(review): the meaning of the positional True flag is not visible here — confirm against ut.kFoldCrossValidation.
kTrainings, kTests = ut.kFoldCrossValidation(k, training, True, target)

## Decision Tree

In [None]:
# Train a decision tree on the training split, then classify the held-out rows.
# NOTE(review): the meaning of the second constructor argument (5) is not visible here — confirm in decisionTree.
dtree = dt.DecisionTree(target, 5)
dtree.train(training)
dtTestFeatures = test.drop(columns=[target])   # feature columns only, label removed
dtPred = dtree.classify(dtTestFeatures)

In [None]:
# Display the value returned by countTreeDepth() for the current tree.
dtree.countTreeDepth()

In [None]:
# Display the value returned by countLeafNodes() for the current tree.
dtree.countLeafNodes()

In [None]:
# Evaluate the decision-tree predictions against the labels of the held-out test split.
# The label list handed to the helpers comes from the whole reducedData frame, not only from the test rows.
dtMatrix = ut.buildConfusionMatrix(dtPred, test[target], reducedData[target].unique())
dtPrecisions, dtRecalls = ut.getPrecisionsAndRecalls(dtMatrix, reducedData[target].unique())
dtFScores = ut.computeFScores(dtPrecisions, dtRecalls)
# Cell output: the value from ut.computeError, multiplied by 100 and shown with two decimals.
"Error {0:.2f}%".format(ut.computeError(dtPred, test["Group"]) * 100)

In [None]:
# Per-fold results for the decision tree; each list gets one entry per cross-validation fold.
errors = []
matrices = []
precisions = []
recalls = []
fScores = []

# NOTE(review): the same dtree instance is re-trained on every fold; this assumes train()
# discards the previously learned tree — confirm in decisionTree.
# After the loop, dtree holds whatever the last fold's train() left behind.
for kTraining, kTest in zip(kTrainings, kTests):
    dtree.train(kTraining)
    # Classify the fold's test rows with the target column removed.
    kPred = dtree.classify(kTest.drop([target], axis=1))
    
    # The label list comes from the whole reducedData frame, not only from this fold.
    kMatrix = ut.buildConfusionMatrix(kPred, kTest[target], reducedData[target].unique())
    kPrecisions, kRecalls = ut.getPrecisionsAndRecalls(kMatrix, reducedData[target].unique())
    kFScores = ut.computeFScores(kPrecisions, kRecalls)
    error = ut.computeError(kPred, kTest["Group"])
    
    errors.append(error)
    matrices.append(kMatrix)
    precisions.append(kPrecisions)
    recalls.append(kRecalls)
    fScores.append(kFScores)

## Naive Bayes

In [None]:
# Train Naive Bayes on the training split (with the label values found in the
# whole reducedData frame), then classify the held-out rows.
nBayes = nb.NaiveBayes(target)
nBayes.train(training, reducedData[target].unique())
nbTestFeatures = test.drop(columns=[target])   # feature columns only, label removed
nbPred = nBayes.classify(nbTestFeatures)

In [None]:
# Evaluate the Naive Bayes predictions against the labels of the held-out test split.
# The label list handed to the helpers comes from the whole reducedData frame, not only from the test rows.
nbMatrix = ut.buildConfusionMatrix(nbPred, test[target], reducedData[target].unique())
nbPrecisions, nbRecalls = ut.getPrecisionsAndRecalls(nbMatrix, reducedData[target].unique())
nbFScores = ut.computeFScores(nbPrecisions, nbRecalls)
# Both results are indexed by label name in later cells (e.g. nbSens["Ordinary"]).
nbSens, nbSpec = ut.getSensitivityAndSpecifiicy(nbMatrix, reducedData[target].unique())
# Cell output: the value from ut.computeError, multiplied by 100 and shown with two decimals.
"Error: {0:.2f}%".format(ut.computeError(nbPred, test["Group"]) * 100)

In [None]:
# Scratch check: the most frequent Group value among the rows with index labels 0 through 1
# (.loc slices are inclusive; value_counts() lists the most frequent value first).
data.loc[0:1,"Group"].value_counts().index[0]

In [None]:
# Per-fold results for Naive Bayes; each list gets one entry per cross-validation fold.
nberrors = []
nbmatrices = []
nbprecisions = []
nbrecalls = []
nbfScores = []

# NOTE(review): the same nBayes instance is re-trained on every fold; this assumes train()
# discards previously learned state — confirm in naiveBayes.
for kTraining, kTest in zip(kTrainings, kTests):
    nBayes.train(kTraining,  reducedData[target].unique())
    # Classify the fold's test rows with the target column removed.
    kPred = nBayes.classify(kTest.drop([target], axis=1))
    
    # The label list comes from the whole reducedData frame, not only from this fold.
    kMatrix = ut.buildConfusionMatrix(kPred, kTest[target], reducedData[target].unique())
    kPrecisions, kRecalls = ut.getPrecisionsAndRecalls(kMatrix, reducedData[target].unique())
    kFScores = ut.computeFScores(kPrecisions, kRecalls)
    error = ut.computeError(kPred, kTest["Group"])
    
    nberrors.append(error)
    nbmatrices.append(kMatrix)
    nbprecisions.append(kPrecisions)
    nbrecalls.append(kRecalls)
    nbfScores.append(kFScores)

### Evaluation Metric

In [None]:
# Repeat a split / train / classify cycle k times with Naive Bayes and collect the
# sensitivity and specificity results of every run.
# NOTE(review): this cell re-binds k (set earlier from the class counts) as well as nBayes,
# training, test, nbPred, nbMatrix, nbSens and nbSpec — cells run afterwards see these values.
k = 10
nBayes = nb.NaiveBayes(target)
sens = []
specs = []

for i in range(k):
    # A new split of reducedData is drawn on every iteration.
    training, test = ut.splitData(target, reducedData, 0.60)    
    nBayes.train(training, reducedData[target].unique())
    nbPred = nBayes.classify(test.drop([target], axis=1))
    
    nbMatrix = ut.buildConfusionMatrix(nbPred, test[target], reducedData[target].unique())
    nbSens, nbSpec = ut.getSensitivityAndSpecifiicy(nbMatrix, reducedData[target].unique())
    
    sens.append(nbSens)
    specs.append(nbSpec)

In [None]:
# Sensitivity (in run order) and specificity (sorted ascending) of the "Ordinary" class.
# Iterate over the collected results themselves rather than range(k): k is re-bound by
# several cells, so indexing with it can go out of range or silently truncate.
ordinarySens = [runSens["Ordinary"] for runSens in sens]
ordinarySpecs = sorted(runSpec["Ordinary"] for runSpec in specs)

In [None]:
# (sensitivity, specificity) pair of class t for every run, ordered by specificity.
# The runs are paired with zip instead of range(k): k is re-bound by several cells,
# so indexing with it can go out of range or silently truncate.
t = "Legendary"
s = sorted(((runSens[t], runSpec[t]) for runSens, runSpec in zip(sens, specs)),
           key=lambda pair: pair[1])

In [None]:
# Plot sensitivity against (1 - specificity) for the pairs collected in s.
# The legend names the class held in t, so it always matches the plotted data.
falsePositiveRates = [1 - specificity for _, specificity in s]
sensitivities = [sensitivity for sensitivity, _ in s]
plt.plot(falsePositiveRates, sensitivities, 'bo-', label="%s ROC Curve" % t)
plt.xlabel("1 - Specificity")
plt.ylabel("Sensitivity")
plt.legend(loc="best")

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# One more split / train / classify / evaluate pass with Naive Bayes.
# NOTE(review): re-binds training, test, nbPred, nbMatrix, nbSens and nbSpec; the cells below read these.
training, test = ut.splitData(target, reducedData, 0.60)    
nBayes.train(training, reducedData[target].unique())
nbPred = nBayes.classify(test.drop([target], axis=1))

nbMatrix = ut.buildConfusionMatrix(nbPred, test[target], reducedData[target].unique())
nbSens, nbSpec = ut.getSensitivityAndSpecifiicy(nbMatrix, reducedData[target].unique())

In [None]:
# Binary view of the problem: 1 for "Ordinary", 0 for every other group.
p = [1 if label == "Ordinary" else 0 for label in nbPred.values]   # predicted
a = [1 if label == "Ordinary" else 0 for label in test["Group"]]   # actual

false_positive_rate, true_positive_rate, thresholds = roc_curve(a, p)
roc_auc = auc(false_positive_rate, true_positive_rate)

# ROC line with the AUC in the legend, plus the dashed diagonal as a reference.
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:

# roc_curve requires y_score in addition to y_true. The classifier output is a hard
# label, so score each prediction 1 when it is "Ordinary" and 0 otherwise.
ordinaryScores = [1 if label == "Ordinary" else 0 for label in nbPred.values]
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_true=test["Group"], y_score=ordinaryScores, pos_label="Ordinary")
roc_auc = auc(false_positive_rate, true_positive_rate)

In [None]:
# 1 - specificity of the "Ordinary" class for the most recent Naive Bayes evaluation.
1-nbSpec["Ordinary"]

In [None]:
# Sensitivity of the "Ordinary" class for the most recent Naive Bayes evaluation.
nbSens["Ordinary"]

### Playground

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Playground split by index label: rows up to label 190 for training and rows from
# label 850 onward for testing (.loc slices by label and includes both end points).
# NOTE(review): this slices the raw `data` frame (DropColumns are still present) and
# re-binds `test`, which other cells also use.
train = data.loc[:190, :]
test = data.loc[850:, :]

In [None]:
# Re-train the existing dtree instance on the playground training rows and classify
# the playground test rows with the "Group" column removed.
dtree.train(train)
pred = dtree.classify(test.drop(["Group"], axis=1))

In [None]:
# Confusion matrix, then precision and recall, for the playground predictions.
# NOTE(review): `p` is also assigned by the ROC cell; this re-binds it.
m = ut.buildConfusionMatrix(pred, test["Group"], data["Group"].unique())
p, r = ut.getPrecisionsAndRecalls(m, data[target].unique())

In [None]:
# scikit-learn stratified splitter with 6 folds, used by the next cell for comparison.
skf = StratifiedKFold(n_splits=6)

In [None]:
# For every stratified fold, show the Group counts of its training rows and list
# the "Ultra Beast" rows that ended up in it.
for trainPositions, _ in skf.split(training, training["Group"]):
    foldTraining = training.iloc[trainPositions]
    isUltraBeast = foldTraining["Group"] == "Ultra Beast"
    print("***Training:\n", foldTraining["Group"].value_counts())
    print(foldTraining[isUltraBeast])
    print()

In [None]:
# Call the private helper ut._kFoldSample directly on the raw data.
# NOTE(review): the result is indexed 0..9 by the next cell, so it presumably holds 10 folds — confirm in utils.
u = ut._kFoldSample(10, data, "Group")

In [None]:
# Print the Group counts of each of the 10 samples in u and gather all their
# index labels into one flat list.
l = []
for foldNumber in range(10):
    fold = u[foldNumber]
    print(fold["Group"].value_counts())
    l.extend(list(fold.index))

In [None]:
# Scratch check: print the items of Naive Bayes folds 3 and 2 side by side
# (presumably label/precision pairs — confirm against ut.getPrecisionsAndRecalls).
for (a,b),(c,d) in zip(nbprecisions[3].items(), nbprecisions[2].items()):
    print(a, b, "-----", c, d)

In [None]:
# Display the items stored for Naive Bayes fold 2.
nbprecisions[2].items()

### Plot Error Rate

In [None]:
def plotPrecisionRecall(precisions, recalls):
    """Plot one precision-vs-recall point per class label.

    Parameters
    ----------
    precisions : dict
        Mapping of class label -> precision.
    recalls : dict
        Mapping of class label -> recall; must have the same keys as ``precisions``.

    Returns
    -------
    None
        The figure is drawn on a new matplotlib Axes. Nothing is drawn when
        both mappings are empty.

    Raises
    ------
    AssertionError
        If the two mappings differ in length or in keys.
    """
    assert len(precisions) == len(recalls), "Length of precisions and recalls must match"
    assert precisions.keys() == recalls.keys(), "Keys in precisions and recalls must match"

    # Nothing to draw for empty input.
    if not precisions:
        return

    fig, ax = plt.subplots()
    for label in precisions.keys():
        # One marker per class: x = recall, y = precision.
        ax.plot(recalls[label], precisions[label], "o", label=label)
    ax.set(title="Precision vs. Recall", xlabel="Recall", ylabel="Precision")
    ax.legend(loc="best")

In [None]:
# Measure the decision-tree error for several split ratios.
# The tree, split and error calls go through the dt / ut modules with the same call
# shapes used by the other cells of this notebook. The tree gets its own name so the
# imported `dt` module is not shadowed.
ratios = [0.2, 0.4, 0.6, 0.8]
ratioTree = dt.DecisionTree(target, 5)
x = []   # training-set sizes
y = []   # matching error values

for ratio in ratios:
    print("************ Split ratio: ", ratio)
    # NOTE(review): splits reducedData (DropColumns removed), like the other
    # training cells — confirm that is the intended input.
    ratioTraining, ratioTest = ut.splitData(target, reducedData, ratio)
    ratioTree.train(ratioTraining)
    predictions = ratioTree.classify(ratioTest.drop([target], axis=1))
    error = ut.computeError(predictions, ratioTest[target])

    x.append(len(ratioTraining))
    y.append(error)

In [None]:
# Error rate as a function of the training-set size (x and y come from the split-ratio loop).
plt.plot(x, y, 'bo-', label="With categorical and continuous features")
plt.xlabel("Training Set size")
plt.ylabel("Error Rate")
plt.legend(loc="best")

In [None]:
# Get data profile
# NOTE(review): `pdp` is not bound by any active import visible in this notebook (the
# pandas_profiling import line is commented out), so this cell raises NameError as written.
profile = pdp.ProfileReport(dataWithDummies)
profile.to_file("Profile.html")
# Drop the reference to the report and run a garbage-collection pass.
profile = None
gc.collect()