In [30]:
import pandas as pd
from heapq import *
import operator
from scipy.spatial import distance

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Let pandas use up to 130 characters per line when rendering frames as text.
pd.set_option("display.width", 130)


In [31]:
# Pokemon dataset
# NOTE(review): the CSV locations are absolute paths on the author's machine;
# point them at your own copy of the data before running this cell.
pk = pd.read_csv("/home/marcos/Dropbox/NEMO/datasets/pokemon-challenge/pokemon.csv", index_col=0)

pkType = pd.DataFrame(pk["Type 1"])
# Positional slice: every column from the fourth up to, but excluding, the last.
# Presumably these are the numeric stat columns -- verify against the CSV header.
pkNum = pk.iloc[:, 3:len(pk.columns) - 1]

combats = pd.read_csv("/home/marcos/Dropbox/NEMO/datasets/pokemon-challenge/combats.csv")

# Attach the stats of both fighters by looking their ids up in pkNum's index.
# The key is given as a column *label*: passing a Series instead makes pandas
# treat it as an anonymous array key, which can add a synthetic "key_0" column
# and shift every positional slice that follows.
# Stat columns that clash in the second merge get pandas' default _x/_y suffixes.
combats = pd.merge(combats, pkNum, how="inner", left_on="First_pokemon", right_index=True)
combats = pd.merge(combats, pkNum, how="inner", left_on="Second_pokemon", right_index=True)

# Target: 1 when the first pokemon won the combat, 0 otherwise.
combats["First_win"] = (combats["Winner"] == combats["First_pokemon"]).astype(int)

# Drop the identifier columns by name so the result does not depend on the
# column order produced by the merges.
combats = combats.drop(columns=["First_pokemon", "Second_pokemon", "Winner"])

# Features are everything except the target column.
X = combats.drop(columns=["First_win"])
y = combats["First_win"]

# print(pkNum.head())
# print(combats.head())
# print(X.head())
# print(y.head())


In [32]:
# Only the first N_SAMPLES combats are used.
# NOTE(review): presumably to keep the pure-Python KNN below fast -- confirm.
N_SAMPLES = 100
# Fixed seed so the split, and therefore every accuracy printed below, is
# reproducible across kernel restarts.
SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X.iloc[:N_SAMPLES], y.iloc[:N_SAMPLES], random_state=SEED
)


In [33]:
# NOTE(review): absolute path on the author's machine; adjust before running elsewhere.
test = pd.read_csv("/home/marcos/Dropbox/NEMO/datasets/pokemon-challenge/tests.csv")

# Attach the stats of both fighters, joining on the id column labels against
# pkNum's index (label keys avoid the synthetic key column pandas may add for
# array keys).
test = pd.merge(test, pkNum, how="inner", left_on="First_pokemon", right_index=True)
test = pd.merge(test, pkNum, how="inner", left_on="Second_pokemon", right_index=True)

# Keep only the stat columns. The previous positional slice took its upper bound
# from `combats`, a different and already-trimmed frame, so it did not line up
# with this frame's own width.
test = test.drop(columns=["First_pokemon", "Second_pokemon"])

# print(test.head())


In [38]:
class MyKNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only stores the training data; all the work happens in ``predict``,
    which compares every query row against every stored training row.
    """

    def __init__(self, K=5):
        """Create an unfitted classifier.

        Args:
            K: number of neighbours that take part in the vote (at least 1).

        Raises:
            ValueError: if ``K`` is smaller than 1.
        """
        if K < 1:
            raise ValueError("K must be at least 1, got %r" % (K,))
        self.K = K
        self.X_train = pd.DataFrame()
        # Explicit dtype: a bare pd.Series() triggers a pandas deprecation warning.
        self.y_train = pd.Series(dtype=object)

    def calc_K_closest(self, X_test):
        """Return the K training samples closest to one query point.

        Args:
            X_test: a single feature vector (1-D sequence/array).

        Returns:
            A list of ``(distance, label)`` tuples sorted by increasing
            distance; equal distances are ordered by label. The list holds
            fewer than K entries when fewer than K training samples are stored.
        """
        X = self.X_train
        y = self.y_train

        # Iterate over the raw arrays: building a Series per row with .iloc
        # is far slower and yields the same numbers.
        candidates = (
            (distance.euclidean(X_test, row), label)
            for row, label in zip(X.values, y.values)
        )
        # nsmallest gives the same ordering as heap-pushing everything and
        # popping K times, without failing when K exceeds the sample count.
        return nsmallest(self.K, candidates)

    @staticmethod
    def better_closest(K_closest):
        """Pick the majority label among the given neighbours.

        Args:
            K_closest: iterable of ``(distance, label)`` tuples, nearest first.

        Returns:
            The most frequent label. On a tie, the label that appears first
            in ``K_closest`` wins.

        Raises:
            IndexError: if ``K_closest`` is empty.
        """
        labels = [c[1] for c in K_closest]
        if not labels:
            raise IndexError("cannot vote on an empty neighbour list")

        # Single counting pass; dict insertion order records first appearance.
        count = {}
        for label in labels:
            count[label] = count.get(label, 0) + 1
        # max() returns the first maximal key, i.e. the earliest-seen label on ties.
        return max(count, key=count.get)

    # Train classifier
    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Args:
            X_train: 2-D feature data (anything ``pd.DataFrame`` accepts).
            y_train: 1-D labels (anything ``pd.Series`` accepts), one per row.

        Raises:
            ValueError: if the number of rows and labels differ.
        """
        X_train = pd.DataFrame(X_train)
        y_train = pd.Series(y_train)
        if len(X_train) != len(y_train):
            raise ValueError(
                "X_train has %d rows but y_train has %d labels"
                % (len(X_train), len(y_train))
            )
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: 2-D feature data (anything ``pd.DataFrame`` accepts).

        Returns:
            A list with one predicted label per input row, in input order.
        """
        X_test = pd.DataFrame(X_test)
        return [
            self.better_closest(self.calc_K_closest(row))
            for row in X_test.values
        ]


In [39]:
# Compare the hand-written classifier with scikit-learn's implementation on the
# same split, for every neighbourhood size from 2 to 10 inclusive.
for k in range(2, 11):
    # Hand-written KNN: store the training split, then label the test split.
    my_clf = MyKNN(K=k)
    my_clf.fit(X_train, y_train)
    predictions = my_clf.predict(X_test)

    # Reference model fitted on exactly the same data (fit returns the estimator).
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Score both models before reporting them one after the other.
    my_accuracy = accuracy_score(y_test, predictions)
    sk_accuracy = clf.score(X_test, y_test)
    print("K=", k, "Accuracy MyKNN:", my_accuracy)
    print("K=", k, "Accuracy sklearn:", sk_accuracy)


K= 2 Accuracy MyKNN: 0.96
K= 2 Accuracy sklearn: 0.96


K= 3 Accuracy MyKNN: 0.88
K= 3 Accuracy sklearn: 0.88


K= 4 Accuracy MyKNN: 0.92
K= 4 Accuracy sklearn: 0.92


K= 5 Accuracy MyKNN: 0.88
K= 5 Accuracy sklearn: 0.88


K= 6 Accuracy MyKNN: 0.96
K= 6 Accuracy sklearn: 0.96


K= 7 Accuracy MyKNN: 0.92
K= 7 Accuracy sklearn: 0.92


K= 8 Accuracy MyKNN: 0.96
K= 8 Accuracy sklearn: 0.96


K= 9 Accuracy MyKNN: 0.96
K= 9 Accuracy sklearn: 0.96


K= 10 Accuracy MyKNN: 0.96
K= 10 Accuracy sklearn: 0.96
