# Project Introduction
Pokemon series created by Nintendo is one of the most popular game series in the world. People can capture Pokemon in the game world and train them to combat with others' Pokemon. There are thousands of Pokemon with totally different properties and of course, some of them are stronger than the others.
The main purpose of this project is training a model to predict the winner of combats between Pokemons. We downloaded the Pokemon data and combat data from https://www.kaggle.com/sekarmg/pokemon/data which include 800 Pokemon's information with their properties and 50000 combat results.

There are two parts of this project. 
* The first part is a warm-up. In this part, we will train a model to predict if a Pokemon is 'Legendary' according to its properties by SVM.
* In the second part, we would like to construct a model to predict the combat result as we mentioned above. We will try different ways to train the data in order to make the predicting accuracy as high as possible.

## Data Specification

In [1]:
# import packages
import pandas as pd
import numpy as np
from numpy import argmax
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn import linear_model
import numpy.polynomial.polynomial as poly

In [2]:
# read pokemon data from dataset
pk = pd.read_csv('Pokemon.csv', na_values = ['', ' ', 'NaN', np.nan], index_col = 0)
pk = np.array(pk)[:,1:]

In [3]:
# Data preprocessing
# merge two type into one
type_merge = []
for pair in zip(pk[:,0], pk[:,1]):
    pair = set(pair)
    type_merge.append(pair)
values = np.array(type_merge)
# transform types into one hot coding
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
onehot_encoder = OneHotEncoder(sparse = False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
pk = np.delete(pk, [0,1], 1)
pk = np.hstack((onehot_encoded, pk))
# transform bool to int
lb = LabelBinarizer()
pk[:,-1] = np.transpose(lb.fit_transform(pk[:,-1].tolist()))

In [4]:
# Take the first 160 rows (20%) to test
Xts = pk[:160,:-1]
Xtr = pk[161:,:-1]
yts = pk[:160,-1]
ytr = pk[161:,-1]

In [5]:
svc = svm.SVC(probability = False, kernel = "rbf", C = 100, gamma = 1, verbose = False)
svc.fit(Xtr, ytr.astype(float))
yhat = svc.predict(Xts)
acc = np.mean(yhat == yts)
print('Accuracy = {0:f}'.format(acc))

Accuracy = 0.981250


In [6]:
# read combats data from dataset
cb = pd.read_csv('combats.csv', na_values = ['', ' ', 'NaN', np.nan])
pk = pd.read_csv('pokemon_new.csv', na_values = ['', ' ', 'NaN', np.nan])
pk = np.array(pk)
type_merge = []
for pair in zip(pk[:,2], pk[:,3]):
    pair = set(pair)
    type_merge.append(pair)
values = np.array(type_merge)
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
onehot_encoder = OneHotEncoder(sparse = False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
onehot_len = onehot_encoded.shape[1]
pk = np.delete(pk, [1,2,3], 1)
pk = np.insert(pk, [1], onehot_encoded, axis = 1)
pk = np.delete(pk, -1, 1) # Delete the Legendary column
pk = np.delete(pk, -1, 1) # Delete the Generation column

cb = np.array(cb)
Xtr_cb = cb[:cb.shape[0]*4//5]
Xts_cb = cb[cb.shape[0]*4//5+1:]
ytr = []
yts = []
# If the first Pokemon win, label y as 1, if the second Pokemon win label y as 0
for pair in Xtr_cb:
    if pair[2] == pair[0]: ytr.append(1)
    else: ytr.append(0)
for pair in Xts_cb:
    if pair[2] == pair[0]: yts.append(1)
    else: yts.append(0)
ytr = np.array(ytr)
yts = np.array(yts)

In [7]:
Xtr_new = np.zeros((Xtr_cb.shape[0], pk.shape[1]-1+onehot_len))
Xts_new = np.zeros((Xts_cb.shape[0], pk.shape[1]-1+onehot_len))
for i, pair in enumerate(Xtr_cb):
    Xtr_new[i] = np.concatenate((pk[pair[0]-1,1:onehot_len+1], pk[pair[1]-1,1:onehot_len+1], pk[pair[0]-1,onehot_len+1:]-pk[pair[1]-1,onehot_len+1:]))
for i, pair in enumerate(Xts_cb):
    Xts_new[i] = np.concatenate((pk[pair[0]-1,1:onehot_len+1], pk[pair[1]-1,1:onehot_len+1], pk[pair[0]-1,onehot_len+1:]-pk[pair[1]-1,onehot_len+1:]))
print(Xtr_new.shape)
print(onehot_len)
logreg = linear_model.LogisticRegression()
logreg.fit(Xtr_new, ytr)
yhat = logreg.predict(Xts_new)
acc = np.mean(yhat == yts)
print("Accuracy = {0:f}".format(acc))

(40000, 1048)
521
Accuracy = 0.889289


In [8]:
svc = svm.SVC(probability = False, kernel = "linear", C = .0073, gamma = 2.8, verbose = False)
svc.fit(Xtr_new, ytr.astype(float))
yhat = svc.predict(Xts_new)
acc = np.mean(yhat == yts)
print('Accuracy = {0:f}'.format(acc))

Accuracy = 0.915292
