In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Let pandas use up to 130 characters per line when printing frames.
pd.set_option("display.width", 130)


In [2]:
# Pokemon dataset
pk = pd.read_csv("/home/marcos/Dropbox/NEMO/datasets/pokemon-challenge/pokemon.csv", index_col=0)

# Primary type as a one-column frame, plus the columns from position 3 up to
# (but excluding) the last one, which are used as the per-pokemon features.
pkType = pk["Type 1"].to_frame()
pkNum = pk.iloc[:, 3:-1]

combats = pd.read_csv("/home/marcos/Dropbox/NEMO/datasets/pokemon-challenge/combats.csv")

# Attach the feature columns of each fighter in turn. The second merge meets
# the column names added by the first one, so pandas suffixes them _x / _y.
for fighter in ("First_pokemon", "Second_pokemon"):
    combats = combats.merge(pkNum, how="inner", left_on=combats[fighter], right_index=True)

# Label: 1 when the first pokemon is the winner, 0 otherwise.
combats["First_win"] = combats["Winner"].eq(combats["First_pokemon"]).astype(int)
# Discard the three leading columns so only features and the label remain.
combats = combats.iloc[:, 3:]

# Features are every column except the last; the label is the last column.
X = combats.iloc[:, :-1]
y = combats.iloc[:, -1]

print(X.head())


       HP_x  Attack_x  Defense_x  Sp. Atk_x  Sp. Def_x  Speed_x  Generation_x  HP_y  Attack_y  Defense_y  Sp. Atk_y  Sp. Def_y  \
0        50        64         50         45         50       41             2    70        70         40         60         40   
14235    70        80         50         35         35       35             1    70        70         40         60         40   
21465   110        78         73         76         71       60             3    70        70         40         60         40   
31953    50        65        107        105        107       86             4    70        70         40         60         40   
40327    73       115         60         60         60       90             3    70        70         40         60         40   

       Speed_y  Generation_y  
0           60             3  
14235       60             3  
21465       60             3  
31953       60             3  
40327       60             3  


In [3]:
# Only the first 1000 rows are used. The split is seeded so that a fresh
# "Restart & Run All" reproduces the same train/test partition (and therefore
# the same accuracy figures) instead of a different random split each run.
X_train, X_test, y_train, y_test = train_test_split(X.iloc[:1000], y.iloc[:1000], random_state=42)


In [4]:
test = pd.read_csv("/home/marcos/Dropbox/NEMO/datasets/pokemon-challenge/tests.csv")

# Attach the feature columns of the first and then the second pokemon.
test = pd.merge(test, pkNum, "inner", left_on=test["First_pokemon"], right_index=True)
test = pd.merge(test, pkNum, "inner", left_on=test["Second_pokemon"], right_index=True)

# Drop the two id columns by name and keep every feature column.
# The previous positional slice took its upper bound from the already-trimmed
# `combats` frame, whose width differs from this one, and so cut off the last
# feature column.
test = test.drop(["First_pokemon", "Second_pokemon"], axis=1)

# print(test.head())


In [5]:
class MyNaiveBayes:
    """Gaussian naive Bayes classifier written with NumPy and pandas.

    ``fit`` groups the training rows by label and stores, for every label,
    the mean and sample standard deviation of each feature. ``predict``
    scores a row against each label with the product of the per-feature
    Gaussian densities and returns the best-scoring label.

    NOTE(review): class priors are not part of the score, so every class is
    implicitly treated as equally likely -- confirm this is intended.
    """

    def __init__(self):
        # Training features / labels as given to the most recent fit().
        self.X_train = pd.DataFrame()
        # Explicit dtype: an empty Series without one warns on some pandas versions.
        self.y_train = pd.Series(dtype=float)
        # label -> list of training rows (one pandas Series per row).
        self.separated = {}
        # label -> list of (mean, std) tuples, one tuple per feature.
        self.mean_std_class = {}

    # Separate rows by label
    def separate_class(self):
        """Rebuild ``self.separated`` from the stored training data.

        Rows and labels are paired by position, not by index label. The
        mapping is rebuilt from scratch so repeated calls (and repeated
        ``fit`` calls) never accumulate rows or labels from earlier runs.
        """
        X = self.X_train
        y = self.y_train

        grouped = {}
        for i in range(len(X)):
            grouped.setdefault(y.iloc[i], []).append(X.iloc[i])
        self.separated = grouped

    @staticmethod
    def std(x):
        """Return the sample standard deviation (n - 1 denominator) of ``x``.

        With fewer than two values the n - 1 denominator is not positive, so
        0.0 is returned instead of dividing by zero.
        """
        values = np.asarray(x, dtype=float)
        if values.size < 2:
            return 0.0
        return np.std(values, ddof=1)

    # Calculate mean and std for the features of each class
    def calc_mean_std_class(self):
        """Rebuild ``self.mean_std_class`` from ``self.separated``.

        ``zip(*rows)`` transposes the list of rows into per-feature columns.
        """
        self.mean_std_class = {
            label: [(np.average(col), self.std(col)) for col in zip(*rows)]
            for label, rows in self.separated.items()
        }

    @staticmethod
    def calc_prob(x, mean, std, min_std=1e-9):
        """Gaussian probability density of ``x`` for the given mean and std.

        ``std`` is floored at ``min_std`` so that a zero-variance feature
        yields a finite, sharply peaked density instead of a division by
        zero (which would turn the whole class score into NaN).
        """
        std = np.maximum(std, min_std)
        exp = np.exp(-((x - mean) ** 2 / (2 * std ** 2)))
        return (1 / (np.sqrt(2 * np.pi) * std)) * exp

    # Calculate probability of belonging to one class
    def calc_prob_class(self, X_test):
        """Score a single sample against every fitted class.

        Parameters:
            X_test: one sample; converted with ``pd.Series`` and read by
                position, one value per fitted feature.

        Returns:
            list of ``(label, score)`` tuples, where ``score`` is the product
            of the per-feature Gaussian densities for that label.
        """
        X_test = pd.Series(X_test)
        prob = []
        for label, mean_std in self.mean_std_class.items():
            p = 1
            for i, (mean, std) in enumerate(mean_std):
                p *= self.calc_prob(X_test.iloc[i], mean, std)
            prob.append((label, p))
        return prob

    # Train classifier
    def fit(self, X_train, y_train):
        """Store the training data and compute the per-class statistics.

        Safe to call more than once: every call replaces the state left by
        the previous one.
        """
        self.X_train = pd.DataFrame(X_train)
        self.y_train = pd.Series(y_train)

        self.separate_class()
        self.calc_mean_std_class()

    def predict(self, X_test):
        """Return a list with the best-scoring label for each row of ``X_test``.

        Raises:
            ValueError: if the classifier has not been fitted.
        """
        if not self.mean_std_class:
            raise ValueError("MyNaiveBayes is not fitted yet; call fit() before predict().")

        X_test = pd.DataFrame(X_test)
        best_label = []
        for row in X_test.values:
            prob_class = self.calc_prob_class(row)
            best_label.append(max(prob_class, key=lambda item: item[1])[0])
        return best_label


In [6]:
# Hand-written classifier: train it, then label the held-out rows.
my_clf = MyNaiveBayes()
my_clf.fit(X_train, y_train)
predictions = my_clf.predict(X_test)

# scikit-learn reference model trained on the same split.
clf = GaussianNB()
clf.fit(X_train, y_train)

# CHECK ACCURACY
my_accuracy = accuracy_score(y_test, predictions)
sk_accuracy = clf.score(X_test, y_test)
print("Accuracy MyNaiveBayes:", my_accuracy)
print("Accuracy sklearn:", sk_accuracy)


Accuracy MyNaiveBayes: 0.828
Accuracy sklearn: 0.828
