In [1]:
import csv

In [2]:
pokeData = []
Atrbts = ['Name', 'Type1', 'Type2', 'HP' ,'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed', 'Generation', 'Legendary']
with open('pokemon.csv') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)
    for row in f_csv:
        Dict = {}
        for i in range(len(Atrbts)):
            if Atrbts[i] in ['HP' ,'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed', 'Generation']:
                Dict[Atrbts[i]] = int(row[i + 1])
            elif Atrbts[i] == 'Legendary':
                Dict[Atrbts[i]] = bool(row[i + 1])
            else:
                Dict[Atrbts[i]] = row[i + 1]
        pokeData.append(Dict)

In [3]:
combats = []
with open('combats.csv') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)
    for row in f_csv:
        combats.append([int(row[0]) - 1, int(row[1]) - 1, int(row[2]) - 1])     

In [4]:
import numpy
import scipy.optimize
import random
from math import exp
from math import log
from sklearn import svm

In [5]:
def feat_abs(data):
    
    pokeFir = data[0]
    pokeSec = data[1]
        
    features = ['HP' ,'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed']
    re = []
    
    for f in features:
        re.append(pokeData[pokeFir][f] - pokeData[pokeSec][f])
    return re

In [6]:
def feat_rel(data):
    
    pokeFir = data[0]
    pokeSec = data[1]
        
    features = ['HP' ,'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed']
    re = []
    
    for f in features:
        re.append((pokeData[pokeFir][f] - pokeData[pokeSec][f]) / pokeData[pokeSec][f])
    return re

In [7]:
def inner(x,y):
    return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
    return 1.0 / (1 + exp(-x))

In [8]:
##################################################
# Logistic regression by gradient ascent         #
##################################################

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
    loglikelihood = 0
    for i in range(len(X)):
        logit = inner(X[i], theta)
        loglikelihood -= log(1 + exp(-logit))
        if not y[i]:
            loglikelihood -= logit
    for k in range(len(theta)):
        loglikelihood -= lam * theta[k]*theta[k]
  # for debugging
  # print("ll =" + str(loglikelihood))
    return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    dl = [0]*len(theta)
    for i in range(len(X)):
        logit = inner(X[i], theta)
        for k in range(len(theta)):
            dl[k] += X[i][k] * (1 - sigmoid(logit))
            if not y[i]:
                dl[k] -= X[i][k]
    for k in range(len(theta)):
        dl[k] -= lam*2*theta[k]
    return numpy.array([-x for x in dl])

In [9]:
##################################################
# Train                                          #
##################################################

def train(lam, X_train, y_train):
    theta_f, _, _ = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X_train[0]), fprime, pgtol = 10, args = (X_train, y_train, lam))
    return theta_f

In [10]:
##################################################
# Predict                                        #
##################################################

def performance(theta, X, y):
    scores = [inner(theta,x) for x in X]
    predictions = [s > 0 for s in scores]
    correct = [(a==b) for (a,b) in zip(predictions,y)]
    acc = sum(correct) * 1.0 / len(correct)
    return acc

In [20]:
def logistic(trainData, testData, feat):
    X_train = []
    y_train = []
    for d in trainData:
        X_train.append(feat(d))
        if d[2] == d[0]:
            y_train.append(1)
        else:
            y_train.append(0)
    
    X_test = []
    y_test = []
    for d in testData:
        X_test.append(feat(d))
        if d[2] == d[0]:
            y_test.append(1)
        else:
            y_test.append(0)
        
    theta_f = train(1, X_train, y_train)
    
    print('classifier = logistic\t training accuracy = ' + str(performance(theta_f, X_train, y_train)))
    print('classifier = logistic\taccuracy = ' + str(performance(theta_f, X_test, y_test)))

In [21]:
def svmClassifier(kerneltype, trainData, testData, feat):
    X_train = []
    y_train = []
    for d in trainData:
        X_train.append(feat(d))
        if d[2] == d[0]:
            y_train.append(1)
        else:
            y_train.append(0)
    
    X_test = []
    y_test = []
    for d in testData:
        X_test.append(feat(d))
        if d[2] == d[0]:
            y_test.append(1)
        else:
            y_test.append(0)
        
    clf = svm.SVC(kernel = kerneltype)
    clf.fit(X_train, y_train)
    
    predictions = clf.predict(X_train)
    accuracy = [(a == b) for (a, b) in zip(y_train, predictions)]
    accuracy = sum(accuracy) / len(accuracy)
    print('classifier = svm '+ kerneltype + ' training' + '\taccuracy = ' + str(accuracy))
    
    predictions = clf.predict(X_test)
    accuracy = [(a == b) for (a, b) in zip(y_test, predictions)]
    #print(predictions[:20])
    accuracy = sum(accuracy) / len(accuracy)
    
    print('classifier = svm '+ kerneltype + '\taccuracy = ' + str(accuracy))

In [30]:
from sklearn import neighbors  
import numpy as np 

def knnClassifier(trainData, testData, feat):
    X_train = []
    y_train = []
    for d in trainData:
        X_train.append(feat(d))
        if d[2] == d[0]:
            y_train.append(1)
        else:
            y_train.append(0)
    
    X_test = []
    y_test = []
    for d in testData:
        X_test.append(feat(d))
        if d[2] == d[0]:
            y_test.append(1)
        else:
            y_test.append(0)
        
    knn = neighbors.KNeighborsClassifier()  
    knn.fit(X_train, y_train)
    
    predictions = knn.predict(X_train)
    accuracy = [(a == b) for (a, b) in zip(y_train, predictions)]
    accuracy = sum(accuracy) / len(accuracy)
    print('classifier = knn\t training accuracy = ' + str(accuracy))
    
    predictions = knn.predict(X_test)
    accuracy = [(a == b) for (a, b) in zip(y_test, predictions)]
    accuracy = sum(accuracy) / len(accuracy)
    
    print('classifier = knn\taccuracy = ' + str(accuracy))

In [23]:
from sklearn.naive_bayes import GaussianNB  
     
def nb(trainData, testData, feat):
    X_train = []
    y_train = []
    for d in trainData:
        X_train.append(feat(d))
        if d[2] == d[0]:
            y_train.append(1)
        else:
            y_train.append(0)
    
    X_test = []
    y_test = []
    for d in testData:
        X_test.append(feat(d))
        if d[2] == d[0]:
            y_test.append(1)
        else:
            y_test.append(0)
            
    clf = GaussianNB() 
    clf.fit(X_train, y_train)
    
    predictions = clf.predict(X_train)
    accuracy = [(a == b) for (a, b) in zip(y_train, predictions)]
    accuracy = sum(accuracy) / len(accuracy)
    print('classifier = nb\t training accuracy = ' + str(accuracy))
    
    predictions = clf.predict(X_test) 
    accuracy = [(a == b) for (a, b) in zip(y_test, predictions)]
    accuracy = sum(accuracy) / len(accuracy)
    
    print('classifier = nb\taccuracy = ' + str(accuracy))  

In [45]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix
     
def rf(trainData, testData, feat):
    X_train = []
    y_train = []
    for d in trainData:
        X_train.append(feat(d))
        if d[2] == d[0]:
            y_train.append(1)
        else:
            y_train.append(0)
    
    X_test = []
    y_test = []
    for d in testData:
        X_test.append(feat(d))
        if d[2] == d[0]:
            y_test.append(1)
        else:
            y_test.append(0)
            
    clf = RandomForestClassifier() 
    clf.fit(X_train, y_train)
    
    predictions = clf.predict(X_train)
    accuracy = [(a == b) for (a, b) in zip(y_train, predictions)]
    accuracy = sum(accuracy) / len(accuracy)
    print('classifier = rf\t training accuracy = ' + str(accuracy))
    
    predictions = clf.predict(X_test) 
    accuracy = [(a == b) for (a, b) in zip(y_test, predictions)]
    accuracy = sum(accuracy) / len(accuracy)
    
    print('classifier = rf\taccuracy = ' + str(accuracy)) 

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix
     
def dt(crit, maxDepth, trainData, testData, feat):
    X_train = []
    y_train = []
    for d in trainData:
        X_train.append(feat(d))
        if d[2] == d[0]:
            y_train.append(1)
        else:
            y_train.append(0)
    
    X_test = []
    y_test = []
    for d in testData:
        X_test.append(feat(d))
        if d[2] == d[0]:
            y_test.append(1)
        else:
            y_test.append(0)
            
    clf = DecisionTreeClassifier(criterion = crit, random_state = 100, max_depth = maxDepth) 
    clf.fit(X_train, y_train)
    
    predictions = clf.predict(X_train)
    accuracy = [(a == b) for (a, b) in zip(y_train, predictions)]
    accuracy = sum(accuracy) / len(accuracy)
    print('classifier = dt\t training accuracy = ' + str(accuracy))
    
    predictions = clf.predict(X_test) 
    accuracy = [(a == b) for (a, b) in zip(y_test, predictions)]
    accuracy = sum(accuracy) / len(accuracy)
    
    print('classifier = dt\taccuracy = ' + str(accuracy)) 

In [24]:
type1 = set()
type2 = set()
for d in pokeData:
    type1.add(d['Type1'])
    type2.add(d['Type2'])

print(type1)
print(type2)

{'Steel', 'Grass', 'Ghost', 'Flying', 'Rock', 'Bug', 'Fire', 'Fighting', 'Ice', 'Fairy', 'Dark', 'Psychic', 'Normal', 'Water', 'Poison', 'Electric', 'Dragon', 'Ground'}
{'', 'Steel', 'Grass', 'Ghost', 'Flying', 'Ice', 'Rock', 'Fighting', 'Fire', 'Bug', 'Fairy', 'Dark', 'Psychic', 'Normal', 'Water', 'Poison', 'Electric', 'Dragon', 'Ground'}


In [25]:
# Encode 'Type1' and 'Type2'
type1 = list(type1)
type1Id = dict(zip(type1, range(len(type1))))

type2 = list(type2)
type2Id = dict(zip(type2, range(len(type2))))

In [26]:
# Use all features provided in the data
# feature = (poke1_type1, poke2_type1, poke1_iflen, poke2_iflen, diff_HP, diff_Attack,...)
def feat_com(data):
    
    pokeFir = data[0]
    pokeSec = data[1]
     
    re = [type1Id[pokeData[pokeFir]['Type1']]]
    re.append(type1Id[pokeData[pokeSec]['Type1']])
    re.append(pokeData[pokeFir]['Legendary'])
    re.append(pokeData[pokeSec]['Legendary'])
    
    features = ['HP', 'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed']   
    for f in features:
        re.append((pokeData[pokeFir][f] - pokeData[pokeSec][f])/ pokeData[pokeSec][f])
        
    return re

In [27]:
# Size of training set
trainPercent = 0.8

trainData = combats[:int(len(combats) * trainPercent)]
testData = combats[int(len(combats) * trainPercent):]

In [28]:
logistic(trainData, testData, feat_abs)
logistic(trainData, testData, feat_rel)
logistic(trainData, testData, feat_com)

classifier = logistic	 training accuracy = 0.889675
classifier = logistic	accuracy = 0.8939
classifier = logistic	 training accuracy = 0.8883
classifier = logistic	accuracy = 0.8944
classifier = logistic	 training accuracy = 0.85055
classifier = logistic	accuracy = 0.8555


In [31]:
knnClassifier(trainData, testData, feat_abs)
knnClassifier(trainData, testData, feat_rel)
knnClassifier(trainData, testData, feat_com)

classifier = knn	 training accuracy = 0.930925
classifier = knn	accuracy = 0.9078
classifier = knn	 training accuracy = 0.934175
classifier = knn	accuracy = 0.9085
classifier = knn	 training accuracy = 0.91145
classifier = knn	accuracy = 0.867


In [33]:
nb(trainData, testData, feat_abs)
nb(trainData, testData, feat_rel)
nb(trainData, testData, feat_com)

classifier = nb	 training accuracy = 0.81505
classifier = nb	accuracy = 0.8214
classifier = nb	 training accuracy = 0.670875
classifier = nb	accuracy = 0.6697
classifier = nb	 training accuracy = 0.672
classifier = nb	accuracy = 0.6737


In [46]:
rf(trainData, testData, feat_abs)
rf(trainData, testData, feat_rel)
rf(trainData, testData, feat_com)

classifier = rf	 training accuracy = 0.994925
classifier = rf	accuracy = 0.9461
classifier = rf	 training accuracy = 0.995175
classifier = rf	accuracy = 0.9548
classifier = rf	 training accuracy = 0.9968
classifier = rf	accuracy = 0.9612


In [50]:
dt('gini', 5, trainData, testData, feat_abs)
dt('gini', 5, trainData, testData, feat_rel)
dt('gini', 5, trainData, testData, feat_com)

classifier = rf	 training accuracy = 0.94435
classifier = rf	accuracy = 0.9446
classifier = rf	 training accuracy = 0.951575
classifier = rf	accuracy = 0.9527
classifier = rf	 training accuracy = 0.951625
classifier = rf	accuracy = 0.9526


In [34]:
svmClassifier('rbf', trainData, testData, feat_abs)
svmClassifier('rbf', trainData, testData, feat_rel)
svmClassifier('rbf', trainData, testData, feat_com)

classifier = svm rbf training	accuracy = 0.999875
classifier = svm rbf	accuracy = 0.5595
classifier = svm rbf training	accuracy = 0.9282
classifier = svm rbf	accuracy = 0.9295
classifier = svm rbf training	accuracy = 0.919375
classifier = svm rbf	accuracy = 0.9099


In [37]:
from time import clock
start = clock()
logistic(trainData, testData, feat_rel)
finish = clock()
print('logistic: time = ' + str(finish - start))

start = clock()
knnClassifier(trainData, testData, feat_rel)
finish = clock()
print('knn: time = ' + str(finish - start))

start = clock()
nb(trainData, testData, feat_rel)
finish = clock()
print('nb: time = ' + str(finish - start))

start = clock()
svmClassifier('rbf', trainData, testData, feat_rel)
finish = clock()
print('svm: time = ' + str(finish - start))

classifier = logistic	 training accuracy = 0.8883
classifier = logistic	accuracy = 0.8944
logistic: time = 6.621151999999995
classifier = knn	 training accuracy = 0.934175
classifier = knn	accuracy = 0.9085
knn: time = 1.8680870000000027
classifier = nb	 training accuracy = 0.670875
classifier = nb	accuracy = 0.6697
nb: time = 0.2963629999999853
classifier = svm rbf training	accuracy = 0.9282
classifier = svm rbf	accuracy = 0.9295
svm: time = 25.234667


In [51]:
start = clock()
rf(trainData, testData, feat_rel)
finish = clock()
print('random forest: time = ' + str(finish - start))

start = clock()
dt('gini', 5, trainData, testData, feat_rel)
finish = clock()
print('decision tree: time = ' + str(finish - start))

classifier = rf	 training accuracy = 0.995225
classifier = rf	accuracy = 0.9554
random forest: time = 0.960160999999971
classifier = rf	 training accuracy = 0.951575
classifier = rf	accuracy = 0.9527
decision tree: time = 0.33588399999996454


In [43]:
y_train = []
for d in trainData:
    if d[2] == d[0]:
        y_train.append(1)
    else:
        y_train.append(0)
print('train set')
print('total: ' + str(len(y_train)))
print('positive: ' + str(sum(y_train)))
print('negative: ' + str(len(y_train) - sum(y_train)))

train set
total: 40000
positive: 18841
negative: 21159


In [44]:
y_test = []
for d in testData:
    if d[2] == d[0]:
        y_test.append(1)
    else:
        y_test.append(0)
print('train set')
print('total: ' + str(len(y_test)))
print('positive: ' + str(sum(y_test)))
print('negative: ' + str(len(y_test) - sum(y_test)))

train set
total: 10000
positive: 4760
negative: 5240
