# Using Decision Tree & Naive Bayes to classify Legendary Pokemon

## Import libraries

In [126]:
import gc
import time
import math
import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
# from sklearn.feature_extraction import FeatureHasher
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
from decisionTree import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Set packages options
# np.set_printoptions(threshold=np.nan)
pd.set_option("display.max_columns", 600)
plt.rcParams["figure.figsize"] = (11, 6)

## Define constants and functions

In [3]:
# Constants
TypeColorMappings = {"Water": "#6890F0", "Fire": "#F08030", "Grass": "#78C850",
                     "Dark": "#705848", "Electric": "#F8D030", "Flying": "#A890F0",
                     "Normal": "#A8A878", "Fighting": "#C03028", "Poison": "#A040A0",
                     "Ground": "#E0C068", "Psychic": "#F85888", "Rock": "#B8A038", 
                     "Ice": "#98D8D8", "Bug": "#A8B820", "Dragon": "#7038F8", 
                     "Ghost": "#705898", "Steel": "#B8B8D0", "Fairy": "#EE99AC"}

DropColumns = ["Pokedex#", "Name", "Type 1", "Type 2", "Generation", "Ability 1", "Ability 2", "Ability 3", 
               "EggGroup 1", "EggGroup 2", "Category", "Height (m)", "Weight (kg)"]

def getColorList(typeCounts):
    assert type(typeCounts) == pd.core.series.Series, "Argument must be a Series object"
    return [TypeColorMappings[pokemonType] for pokemonType in typeCounts.index]

def getDistinctValues(dataFrame, columnName, sep):
    result = []
    for value in dataFrame[columnName].value_counts().index:
        result += value.split(sep)
    return set(result)

## Load the data from file

In [4]:
# Load the Pokemon data
fileName = r'.\Pokemon_Cleaned.tsv'
columnTypes = {"Name": str, "Category": str, "Type 1": str, "Type 2": str, 
               "Ability 1": str, "Ability 2": str, "Ability 3": str, "Group": str}
data = pd.read_csv(fileName, header=0, sep='\t', dtype=columnTypes).fillna("None")
data.head()

Unnamed: 0,Generation,Pokedex#,Name,Category,Type 1,Type 2,Ability 1,Ability 2,Ability 3,MaleRatio,FemaleRatio,Height (m),Weight (kg),EggGroup 1,EggGroup 2,HP,Attack,Defense,Sp.Attack,Sp.Defense,Speed,Total,Group
0,1.0,1.0,Bulbasaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,12.5,0.7,6.9,Monster,Grass,45.0,49.0,49.0,65.0,65.0,45.0,318,Ordinary
1,1.0,2.0,Ivysaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,12.5,1.0,13.0,Monster,Grass,60.0,62.0,63.0,80.0,80.0,60.0,405,Ordinary
2,6.0,3.0,Mega Venusaur,Seed Pokemon,Grass,Poison,Thick Fat,,,87.5,12.5,2.0,100.0,Monster,Grass,80.0,100.0,123.0,122.0,120.0,80.0,625,Ordinary
3,1.0,3.0,Venusaur,Seed Pokemon,Grass,Poison,Chlorophyll,Overgrow,,87.5,12.5,2.0,100.0,Monster,Grass,80.0,82.0,83.0,100.0,100.0,80.0,525,Ordinary
4,1.0,4.0,Charmander,Lizard Pokemon,Fire,,Blaze,Solar Power,,87.5,12.5,0.6,8.5,Monster,Dragon,39.0,52.0,43.0,60.0,50.0,65.0,309,Ordinary


## Plot graphs to visualize and understand the data

In [None]:
# Plot Group occurences to see the distributions
groupFig, groupAxes = plt.subplots(nrows=1, ncols=1)
groupCounts = data["Group"].value_counts()
groupAxe = groupCounts.plot(title="Group", kind="bar")
groupAxe.set(xlabel="Group Types", ylabel="Count")

In [None]:
# Box plot Total & Group
data.boxplot("Total", "Group")

<p> According to the boxplot above, we can see that despite having a small quanity in size, Legendary Pokemon has the highest Total stats compared to the other 2 groups.</p>

## PCA

In [None]:
reducedData = data.drop(DropColumns, axis=1)
# dummies = pd.get_dummies(data[["Type 1", "Type 2", "Ability 1", "Ability 2", "Ability 3", "EggGroup 1", "EggGroup 2"]])
# dataWithDummies = pd.concat([dummies, reducedData], axis=1, join_axes=[reducedData.index])

# x = dataWithDummies.loc[:, dataWithDummies.columns != "Group"]
# y = dataWithDummies.loc[:, "Group"]

# Scale the x data
#x = StandardScaler().fit_transform(x)

In [None]:
pca = PCA(0.95)
principalComponents = pca.fit_transform(x)
width, height = principalComponents.shape
principalDf = pd.DataFrame(data=principalComponents, columns=["pca %d" % i for i in range(1, height + 1)])
pcaData = pd.concat([principalDf, dataWithDummies[["Group"]]], axis=1)

xTrain = pcaData.sample(frac=0.5)

In [None]:
# # Plot Type 1 and Type 2 occurences to see the distributions
# typeFig, typeAxes = plt.subplots(nrows=3, ncols=1)
# typeFig.subplots_adjust(top=3)

# type1Counts = data["Type 1"].value_counts()
# type2Counts = data["Type 2"].value_counts().drop("None")
# typesCounts = type1Counts.add(type2Counts, fill_value=0)

# type1Counts.plot(title="Type 1 Occurrences", kind="bar", ax=typeAxes[0], color=getColorList(type1Counts))
# type2Counts.plot(title="Type 2 Occurrences", kind="bar", ax=typeAxes[1], color=getColorList(type2Counts))
# typesCounts.plot(title="Type 1 + 2 Occurrences", kind="bar", ax=typeAxes[2], color=getColorList(typesCounts))

# fh = FeatureHasher(n_features=2, input_type="string")
# hashedFeature1 = fh.fit_transform(data["Type 1"])
# hashedFeature2 = fh.fit_transform(data["Type 2"])

# x = pd.concat([data[["Name", "Type 1", "Type 2", "Category"]], pd.DataFrame(hashedFeature1.toarray())], axis=1)
# x = pd.concat([x, pd.DataFrame(hashedFeature2.toarray())], axis=1)

## Decision Tree

In [129]:
def elapsedTime(func):
    def wrapper(*args, **kwargs):
        timeStart = time.time()
        result = func(*args, **kwargs)
        elapsedTime = time.time() - timeStart
        print("Elapsed time:", elapsedTime, "seconds")
        return result
    return wrapper

@elapsedTime
def splitData(dataFrame, trainingRatio):
    ''' '''
    trainingSize = math.floor(len(dataFrame) * trainingRatio)
    testSize = len(dataFrame) - trainingSize
    training = dataFrame.sample(n=trainingSize, replace=False)
    test = dataFrame.drop(training.index)
    return training, test

def computeError(predictions, actuals):
    assert len(predictions) == len(actuals), "Number of predictions and actuals must match"
    assert type(predictions) == type(actuals), "Type of predictions and actuals must match"
    misClassified = 0
    for i in predictions.index:
        if (predictions[i] != actuals[i]):
            misClassified += 1
    return 1 - ((len(actuals) - misClassified) / len(actuals))

def buildConfusionMatrix(predictions, actuals, features):
    assert len(predictions) == len(actuals), "Number of predictions and actuals must match"
    assert type(predictions) == type(actuals), "Type of predictions and actuals must match"
    table = {}
    features.sort()
    
    # Initialize the table with column header and cell values with 0
    for feature in features:
        table[feature] = [0 for i in range(len(features))]
    
    matrix = pd.DataFrame(data=table, index=features)
    
    # Count the misclassifications
    for i in predictions.index:
        if (predictions[i] == actuals[i]):
            matrix.loc[predictions[i], predictions[i]] += 1
        else:
            matrix.loc[actuals[i], predictions[i]] += 1
        
    # Rename column names and row indeces for clarity
    renamedColumns = {}
    renamedRows = {}
    for feature in features:
        renamedColumns[feature] = "Predicted " + feature
        renamedRows[feature] = "Actual " + feature
    matrix.rename(columns=renamedColumns, index=renamedRows, inplace=True)
    
    # Add Total column and Total index
    matrix["Total"] = pd.Series([0 for i in range(len(features))], index=matrix.index)
    matrix.loc["Total"] = [0 for i in range(len(matrix.columns))]
    
    # Sum the Total values
    for feature in features:
        matrix.loc["Total", "Predicted " + feature] = matrix["Predicted " + feature].sum()
        
    for i in matrix.index:
        matrix.loc[i, "Total"] = matrix.loc[i].sum()
    
    return matrix

def getPrecisionsAndRecalls(confusionMatrix, features):
    features.sort()
    precisions = {}
    recalls = {}
    
    for feature in features:
        index = "Actual " + feature
        column = "Predicted " + feature
        
        precision = confusionMatrix.loc[index, column] / confusionMatrix.loc["Total", column]
        precisions[feature] = precision
        recall = confusionMatrix.loc[index, column] / confusionMatrix.loc[index, "Total"]
        recalls[feature] = recall
        
    return precisions, recalls

def kFoldSample(k, dataFrame):
    subsetSize = int(len(dataFrame) / k)
    subsetSizes = (subsetSize, subsetSize + len(dataFrame) % k)    
    samples = []

    for i in range(k):
        randomSample = None
        if (i < k - 1):
            randomSample = dataFrame.sample(n=subsetSizes[0], replace=False)
        else:
            randomSample = dataFrame.sample(n=subsetSizes[1], replace=False)
        dataFrame = dataFrame.drop(randomSample.index)
        samples.append(randomSample)
    return samples

def kFoldCrossValidation(k, dataFrame, model):
    samples = kFoldSample(k, dataFrame)
    accuracies = []
    
    for i in range(len(samples)):
        test = samples[i]
        training = dataFrame.drop(test.index)
        model.train(training)
        predictions = model.classify(test)
        accuracies.append(1 - computeError(predictions, test[model.targetFeature]))
    return accuracies

### Confusion Matrix

In [130]:
training, test = splitData(data, 0.66)
dt = DecisionTree("Group")
dt.train(training)
# errors = kFoldCrossValidation(10, training, dt)
predictions = dt.classify(test)

Elapsed time: 0.0 seconds
Wating for threads to complete
Best feature: Ability 1 Best gain: 0.18404848484848482
Best feature: None Best gain: 0.0
Best feature: Defense Best gain: 0.18718655082291435
Best feature: None Best gain: 0.0
Best feature: FemaleRatio Best gain: 0.23964497041420116
Best feature: None Best gain: 0.0
Best feature: Generation Best gain: 0.375
Best feature: None Best gain: 0.0
Best feature: None Best gain: 0.0
Best feature: Name Best gain: 0.0036427292825645134
Best feature: None Best gain: 0.0
Best feature: Name Best gain: 0.0036629043681236815
Best feature: None Best gain: 0.0
Best feature: Name Best gain: 0.0036832287142059056
Best feature: None Best gain: 0.0
Best feature: None Best gain: 0.0


In [123]:
errors

[0.016129032258064502,
 0.0,
 0.016129032258064502,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.016129032258064502,
 0.0]

In [None]:
matrix = buildConfusionMatrix(predictions, test["Group"], test["Group"].unique())
matrix

In [None]:
getPrecisionsAndRecalls(matrix, test["Group"].unique())

### Plot Error Rate

In [None]:
ratios = [0.2, 0.4, 0.6, 0.8]
dt = DecisionTree("Group")
x = []
y = []

for ratio in ratios:
    print("************ Split ratio: ", ratio)
    training, test = splitData(data, ratio)
    node = dt.train(training)
    predictions = dt.classify(test, node)
    error = computeError(predictions, test[dt.targetFeature])
    
    x.append(len(training))
    y.append(error)

In [None]:
plt.plot(x, y, 'bo-', label="With categorical and continuous features")
plt.xlabel("Trainging Set size")
plt.ylabel("Error Rate")
plt.legend(loc="best")

In [None]:
# Get data profile
profile = pdp.ProfileReport(dataWithDummies)
profile.to_file("Profile.html")
profile = None
gc.collect()

In [106]:
from multiprocessing.pool import ThreadPool
import time

In [114]:
class Foo:
    def foo(self, r):
        for i in range(r):
            print("Thread id:", threading.current_thread(), i)
            time.sleep(1)
    
    def start(self):

        pool = ThreadPool(processes=2)
        t1 = pool.apply_async(self.foo, (5,))
        t2 = pool.apply_async(self.foo, (8,))

        t1.wait()
        t2.wait()
        print("Complete", t1.get(), t2.get())

In [116]:
foo = Foo()
foo.start()

Thread id:Thread id: <DummyProcess(Thread-52, started daemon 7500)> 0 <DummyProcess(Thread-53, started daemon 6920)> 0

Thread id: <DummyProcess(Thread-53, started daemon 6920)> 1
Thread id: <DummyProcess(Thread-52, started daemon 7500)> 1
Thread id: <DummyProcess(Thread-53, started daemon 6920)> 2
Thread id: <DummyProcess(Thread-52, started daemon 7500)> 2
Thread id: <DummyProcess(Thread-53, started daemon 6920)> 3
Thread id: <DummyProcess(Thread-52, started daemon 7500)> 3
Thread id: <DummyProcess(Thread-53, started daemon 6920)> 4
Thread id: <DummyProcess(Thread-52, started daemon 7500)> 4
Thread id: <DummyProcess(Thread-53, started daemon 6920)> 5
Thread id: <DummyProcess(Thread-53, started daemon 6920)> 6
Thread id: <DummyProcess(Thread-53, started daemon 6920)> 7
Complete None None
