In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer, LabelBinarizer
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin, BaseEstimator
import numpy as np
import re
from sklearn.feature_selection import SelectKBest

# Data Cleaning

In [2]:
data = pd.read_csv("data/pokemon.csv", keep_default_na = False)
train = pd.read_csv("data/train.csv")
test = pd.read_csv("test.csv")

In [3]:
data.head()

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,5,Charmander,Fire,,39,52,43,60,50,65,1,False
4,6,Charmeleon,Fire,,58,64,58,80,65,80,1,False


We are going to combine types columns into 1 column so that we can use **MultiLabelBinarizer** later

In [4]:
data["type"] = [[data["Type 1"][i], data["Type 2"][i]]
                for i in range(0, len(data))]

In [5]:
data.drop(["Type 1", "Type 2", "Generation", "Legendary"], axis = 1, inplace=True)

In [6]:
data.head()

Unnamed: 0,#,Name,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,type
0,1,Bulbasaur,45,49,49,65,65,45,"[Grass, Poison]"
1,2,Ivysaur,60,62,63,80,80,60,"[Grass, Poison]"
2,3,Venusaur,80,82,83,100,100,80,"[Grass, Poison]"
3,5,Charmander,39,52,43,60,50,65,"[Fire, ]"
4,6,Charmeleon,58,64,58,80,65,80,"[Fire, ]"


Let's look at our training dataset

In [7]:
train.head()

Unnamed: 0,First_pokemon,Second_pokemon,Winner
0,5,49,5
1,119,5,5
2,130,52,130
3,123,139,139
4,166,108,166


In [8]:
train.shape

(1402, 3)

To make things easier to predict, we will change the **Winner** column to 1 if **First_pokemon** wins, else 0. This eventually becomes a **Classification** problem.

In [9]:
train["Winner"] = [1 if train["First_pokemon"][i] == train["Winner"][i] else 0 for i in range(0,len(train))]

We are going to generate more rows which are just the reverse of **First_pokemon** and **Second_pokemon** column the original data. Hence, the values in the **Winner** column will also change.

In [10]:
train_reverse = train.copy()

In [11]:
train_reverse[["First_pokemon","Second_pokemon"]] = train_reverse[["Second_pokemon","First_pokemon"]] 

In [12]:
train_reverse["Winner"] = 1 - train["Winner"]

In [13]:
train = pd.concat([train, train_reverse], axis = 0)

In [14]:
train.head()

Unnamed: 0,First_pokemon,Second_pokemon,Winner
0,5,49,1
1,119,5,0
2,130,52,1
3,123,139,0
4,166,108,1


In [15]:
train.shape

(2804, 3)

The next thing that we need to do is to map all the stats corresponing to **First_pokemon** and **Second_pokemon** to a same DataFrame

In [16]:
first_step = pd.merge(train,data,left_on="First_pokemon", right_on="#", how = "left")

In [17]:
second_step = pd.merge(first_step,data,left_on="Second_pokemon", right_on="#", how = "left")

In [18]:
second_step.drop(["First_pokemon","Second_pokemon","#_x", "Name_x", "#_y", "Name_y"], axis = 1, inplace=True)

In [19]:
train = second_step.copy()

In [20]:
train

Unnamed: 0,Winner,HP_x,Attack_x,Defense_x,Sp. Atk_x,Sp. Def_x,Speed_x,type_x,HP_y,Attack_y,Defense_y,Sp. Atk_y,Sp. Def_y,Speed_y,type_y
0,1,39,52,43,60,50,65,"[Fire, ]",45,50,55,75,65,30,"[Grass, Poison]"
1,0,65,90,120,85,70,60,"[Poison, ]",39,52,43,60,50,65,"[Fire, ]"
2,1,30,45,55,70,55,85,"[Water, ]",35,70,55,45,55,25,"[Bug, Grass]"
3,0,65,55,115,100,40,60,"[Grass, ]",75,100,95,40,70,110,"[Normal, ]"
4,1,100,100,100,100,100,100,"[Psychic, ]",55,130,115,50,50,75,"[Water, ]"
5,1,80,82,78,95,80,85,"[Water, ]",70,45,48,60,65,35,"[Fairy, ]"
6,1,70,110,80,55,80,105,"[Bug, Flying]",46,57,40,40,40,50,"[Poison, ]"
7,1,90,92,87,75,85,76,"[Poison, Ground]",60,62,63,80,80,60,"[Grass, Poison]"
8,1,50,105,79,35,110,76,"[Fighting, ]",70,80,50,35,35,35,"[Fighting, ]"
9,1,52,65,55,58,62,60,"[Normal, Flying]",90,55,75,60,75,30,"[Normal, ]"


# Modelling

In [21]:
# Train test splitting
X = train.drop(["Winner"], axis=1)
y = train["Winner"]
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

In [22]:
X_train.head()

Unnamed: 0,HP_x,Attack_x,Defense_x,Sp. Atk_x,Sp. Def_x,Speed_x,type_x,HP_y,Attack_y,Defense_y,Sp. Atk_y,Sp. Def_y,Speed_y,type_y
611,30,45,55,70,55,85,"[Water, ]",60,65,60,130,75,110,"[Ghost, Poison]"
530,45,49,49,65,65,45,"[Grass, Poison]",55,81,60,50,70,97,"[Normal, ]"
1497,95,70,73,95,90,60,"[Fairy, ]",91,134,95,100,100,80,"[Dragon, Flying]"
49,130,65,60,110,95,65,"[Water, ]",60,40,80,60,45,40,"[Grass, Psychic]"
1309,38,41,40,50,65,65,"[Fire, ]",105,130,120,45,45,40,"[Ground, Rock]"


In [23]:
# Getting category columns and numerical columns
category_col = ["type_x", "type_y"]
numeric_col = set(X_train.columns) - set(category_col)

For category columns, we are going to perform **MultiLabelBinarizer**, while for numerical columns, we are going to perform **Standard Scaler**

In [24]:
category_col = [(i, MultiLabelBinarizer())  for i in category_col]
numeric_col = [([i], StandardScaler())  for i in numeric_col]

In [25]:
mapper = DataFrameMapper(category_col + numeric_col , df_out=True)

In [26]:
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)

In [27]:
# function to create a dataframe having the training, testing and cross validation score for the models we want
def metric_df(*argv):
    model= {re.sub(r'\([^)]*\)', '', str(i)) : [accuracy_score(y_train, i.fit(Z_train,y_train).predict(Z_train)), 
                                                cross_val_score(i,Z_train,y_train,cv=20).mean(),
                                                accuracy_score(y_test, i.fit(Z_train,y_train).predict(Z_test))]
            for i in argv}
    return pd.DataFrame(model, index=["Accuracy Score Train", "CV Score","Accuracy Score Test"]).T

In [28]:
metric_df(DecisionTreeClassifier(), ExtraTreeClassifier(), AdaBoostClassifier(n_estimators = 100), 
          BaggingClassifier(n_estimators = 100), GradientBoostingClassifier(n_estimators = 100), 
          RandomForestClassifier(n_estimators = 100), LogisticRegression(solver = "lbfgs"))

Unnamed: 0,Accuracy Score Train,CV Score,Accuracy Score Test
DecisionTreeClassifier,1.0,0.901988,0.900143
ExtraTreeClassifier,1.0,0.779271,0.81883
AdaBoostClassifier,0.840704,0.810153,0.837375
BaggingClassifier,1.0,0.933868,0.934379
GradientBoostingClassifier,0.943414,0.89865,0.917261
RandomForestClassifier,1.0,0.891955,0.911555
LogisticRegression,0.863528,0.853508,0.871612


As we can see, even thought Bagging Classifier is totally ovefit in the training set, it still performs really well on the cross validation and the testing. Hence, we are going to use this model to predict our data

In [29]:
first_step = pd.merge(test,data,left_on="First_pokemon", right_on="#", how = "left")
second_step = pd.merge(first_step,data,left_on="Second_pokemon", right_on="#", how = "left")
second_step.drop(["First_pokemon","Second_pokemon","#_x", "Name_x", "#_y", "Name_y"], axis = 1, inplace=True)
test_transform = second_step.copy()

In [30]:
model = BaggingClassifier()

In [31]:
pipe = make_pipeline(mapper,model)

In [32]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('dataframemapper',
                 DataFrameMapper(default=False, df_out=True,
                                 features=[('type_x',
                                            MultiLabelBinarizer(classes=None,
                                                                sparse_output=False)),
                                           ('type_y',
                                            MultiLabelBinarizer(classes=None,
                                                                sparse_output=False)),
                                           (['Sp. Atk_y'],
                                            StandardScaler(copy=True,
                                                           with_mean=True,
                                                           with_std=True)),
                                           (['Attack_y'],
                                            StandardScaler(copy=True,
                                   

In [36]:
predict = pipe.predict(test_transform)

In [37]:
test["Winner"] = [test["First_pokemon"][index] if values == 1 else test["Second_pokemon"][index] 
                  for index,values in enumerate(predict)]

In [38]:
test.to_csv("../teams/test_minh.csv")

In [39]:
import pickle

In [40]:
pickle.dump(pipe, open("pipe.pkl", "wb"))
pickle.dump(data, open("data.pkl", "wb"))