# Project Introduction
Pokemon series created by Nintendo is one of the most popular game series in the world. People can capture Pokemon in the game world and train them to combat with others' Pokemon. There are thousands of Pokemon with totally different properties and of course, some of them are stronger than the others.
The main purpose of this project is training a model to predict the winner of combats between Pokemons. We downloaded the Pokemon data from https://www.kaggle.com/abcsds/pokemon which include about 800 Pokemon and their properties. We also got combat data from  include the result of 50000 combats in the game world.

There are two parts of this project. 
* The first part is a warm-up. In this part, we will train a model to predict if a Pokemon is 'Legendary' according to its properties by SVM.
* In the second part, we would like to construct a model to predict the combat result as we mentioned above. We will try different ways to train the data in order to make the predicting accuracy as high as possible.

## Data Specification

In [16]:
# import packages
import pandas as pd
import numpy as np
from numpy import argmax
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn import linear_model
import numpy.polynomial.polynomial as poly
from keras.models import Model, Sequential
from keras.layers import Dense, Activation
import keras.backend as K
from keras import optimizers

In [2]:
# read pokemon data from dataset
pk = pd.read_csv('Pokemon.csv', na_values = ['', ' ', 'NaN', np.nan], index_col = 0)
pk = np.array(pk)[:,1:]

In [3]:
# Data preprocessing
# merge two type into one
type_merge = []
for pair in zip(pk[:,0], pk[:,1]):
    pair = set(pair)
    type_merge.append(pair)
values = np.array(type_merge)
# transform types into one hot coding
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
onehot_encoder = OneHotEncoder(sparse = False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
pk = np.delete(pk, [0,1], 1)
pk = np.hstack((onehot_encoded, pk))
# transform bool to int
lb = LabelBinarizer()
pk[:,-1] = np.transpose(lb.fit_transform(pk[:,-1].tolist()))

In [4]:
# Take the first 160 rows (20%) to test
Xts = pk[:160,:-1]
Xtr = pk[161:,:-1]
yts = pk[:160,-1]
ytr = pk[161:,-1]

In [5]:
svc = svm.SVC(probability = False, kernel = "rbf", C = 100, gamma = 1, verbose = False)
svc.fit(Xtr, ytr.astype(float))
yhat = svc.predict(Xts)
acc = np.mean(yhat == yts)
print('Accuracy = {0:f}'.format(acc))

Accuracy = 0.981250


In [6]:
# read combats data from dataset
cb = pd.read_csv('combats.csv', na_values = ['', ' ', 'NaN', np.nan])
pk = pd.read_csv('pokemon_new.csv', na_values = ['', ' ', 'NaN', np.nan])
pk = np.array(pk)
type_merge = []
for pair in zip(pk[:,2], pk[:,3]):
    pair = set(pair)
    type_merge.append(pair)
values = np.array(type_merge)
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
onehot_encoder = OneHotEncoder(sparse = False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
onehot_len = onehot_encoded.shape[1]
pk = np.delete(pk, [1,2,3], 1)
pk = np.insert(pk, [1], onehot_encoded, axis = 1)
pk = np.delete(pk, -1, 1) # delete the last column which represent whether it is legendary

cb = np.array(cb)
Xtr_cb = cb[:cb.shape[0]*4//5]
Xts_cb = cb[cb.shape[0]*4//5+1:]
ytr = []
yts = []
# If the first Pokemon win, label y as 0, if the second Pokemon win label y as 1
for pair in Xtr_cb:
    if pair[2] == pair[0]: ytr.append(0)
    else: ytr.append(1)
for pair in Xts_cb:
    if pair[2] == pair[0]: yts.append(0)
    else: yts.append(1)
ytr = np.array(ytr)
yts = np.array(yts)

In [7]:
Xtr_new = np.zeros((Xtr_cb.shape[0], pk.shape[1]-1+onehot_len))
Xts_new = np.zeros((Xts_cb.shape[0], pk.shape[1]-1+onehot_len))
for i, pair in enumerate(Xtr_cb):
    Xtr_new[i] = np.concatenate((pk[pair[0]-1,1:onehot_len+1], pk[pair[1]-1,1:onehot_len+1], pk[pair[0]-1,onehot_len+1:]-pk[pair[1]-1,onehot_len+1:]))
for i, pair in enumerate(Xts_cb):
    Xts_new[i] = np.concatenate((pk[pair[0]-1,1:onehot_len+1], pk[pair[1]-1,1:onehot_len+1], pk[pair[0]-1,onehot_len+1:]-pk[pair[1]-1,onehot_len+1:]))
logreg = linear_model.LogisticRegression()
logreg.fit(Xtr_new, ytr)
yhat = logreg.predict(Xts_new)
acc = np.mean(yhat == yts)
print("Accuracy = {0:f}".format(acc))

Accuracy = 0.889589


In [8]:
svc_new = svm.SVC(probability = False, kernel = "rbf", C = 2.8, gamma = .0073, verbose = 10)
svc_new.fit(Xtr_new, ytr.astype(float))
yhat_svc = svc_new.predict(Xts)
acc = np.mean(yhat_svc == yts)
print('Accuracy = {0:f}'.format(acc))

[LibSVM]

ValueError: X.shape[1] = 529 should be equal to 1049, the number of features at training time

In [25]:
Xtr_scale = (Xtr_new-np.mean(Xtr_new,axis=0))/np.std(Xtr_new,axis=0)
Xts_scale = (Xts_new-np.mean(Xtr_new,axis=0))/np.std(Xtr_new,axis=0)
K.clear_session()
nin = Xtr_new.shape[1]
nh = 256
nout = int(np.max(ytr)+1)
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=Xtr_new.shape[1]))
model.add(Dense(1, activation='softmax', name='output'))
model.summary()

  if __name__ == '__main__':


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                33600     
_________________________________________________________________
output (Dense)               (None, 1)                 33        
Total params: 33,633
Trainable params: 33,633
Non-trainable params: 0
_________________________________________________________________


  from ipykernel import kernelapp as app


In [26]:
opt = optimizers.Adam(lr=0.001)
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
batch_size = 100
model.fit(Xtr_scale, ytr, epochs=10, batch_size=batch_size, validation_data=(Xts_scale,yts))

Train on 40000 samples, validate on 9999 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c79403c748>