In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier 
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score

In [2]:
# Shared cross-validation splitter: `n` shuffled folds with a fixed seed,
# so every experiment below is scored on the same fold assignment.
n = 10
k10 = KFold(n_splits=n, shuffle=True, random_state=42)

In [3]:
# Every CSV lives under one dataset folder; derive each path from that root
# so relocating the data only requires changing DATA_DIR.
DATA_DIR = 'data/pokemon-challenge-dataset'

train_file = DATA_DIR + '/train.csv'    # battles used for fitting / cross-validation
test_file = DATA_DIR + '/test.csv'      # battles passed to get_data as the second split
info_file = DATA_DIR + '/pokemon.csv'   # per-pokemon attribute table, indexed by '#'

In [4]:
def get_data(file):
    """Load a battle file and attach both combatants' attributes to each row.

    Parameters
    ----------
    file : str
        Path to a battle CSV. The code reads the columns ``First_pokemon``,
        ``Second_pokemon`` and ``Winner`` and assumes they are the first three
        columns of the file (they are dropped by position).

    Returns
    -------
    X : pandas.DataFrame
        The attributes of the first pokemon (columns suffixed ``_1``) followed
        by those of the second pokemon (suffixed ``_2``).
    labels : pandas.Series
        1 where ``First_pokemon`` equals ``Winner``, otherwise 0.
    """
    # Ids are read as strings so they match the string-typed '#' index below.
    battles = pd.read_csv(file, dtype=str)
    pokemon = pd.read_csv(info_file, dtype={'#': str, 'Legendary': int}).set_index("#")

    # Fill missing names with "???". Assign the column back instead of calling
    # fillna(inplace=True) on an attribute-accessed column: that form is
    # deprecated and does not modify the frame under pandas Copy-on-Write.
    pokemon['Name'] = pokemon['Name'].fillna("???")

    # Target: did the first pokemon win?
    labels = (battles['First_pokemon'] == battles['Winner']).astype(int)

    # First merge adds un-suffixed attribute columns for the first pokemon;
    # the second merge collides with them, so the suffixes turn the pairs
    # into <col>_1 (first pokemon) and <col>_2 (second pokemon).
    merged = pd.merge(battles, pokemon, left_on='First_pokemon',
                      right_index=True, how='left')
    merged = pd.merge(merged, pokemon, left_on='Second_pokemon',
                      right_index=True, how='left', suffixes=('_1', '_2'))

    # Drop the three leading id columns [First_pokemon, Second_pokemon, Winner].
    return merged.iloc[:, 3:], labels

def get_cv_scores(model, X, y, cv, score_func):
    """Cross-validate ``model`` by hand and return one score per fold.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
        on every fold.
    X : pandas.DataFrame
        Feature rows, indexed positionally with ``.iloc``.
    y : pandas.Series
        Targets aligned with ``X``, indexed positionally with ``.iloc``.
    cv : object
        Splitter whose ``split(X, y)`` yields ``(train_idx, valid_idx)`` pairs.
    score_func : callable
        Called as ``score_func(y_true, y_pred)`` for each fold.

    Returns
    -------
    numpy.ndarray
        The fold scores, in the order the splitter produced the folds.
    """
    def _score_fold(train_rows, valid_rows):
        # Fit on the training rows, then score predictions on the held-out rows.
        model.fit(X.iloc[train_rows, :], y.iloc[train_rows])
        predicted = model.predict(X.iloc[valid_rows, :])
        return score_func(y.iloc[valid_rows], predicted)

    return np.array([_score_fold(tr, va) for tr, va in cv.split(X, y)])

In [5]:
# Build the merged feature frame and the win labels (1 = first pokemon won)
# for each of the two battle files.
X_train, y_train = get_data(train_file)
X_test, y_test = get_data(test_file)

In [6]:
X_train.head()

Unnamed: 0,Name_1,Type 1_1,Type 2_1,HP_1,Attack_1,Defense_1,Sp. Atk_1,Sp. Def_1,Speed_1,Generation_1,...,Type 1_2,Type 2_2,HP_2,Attack_2,Defense_2,Sp. Atk_2,Sp. Def_2,Speed_2,Generation_2,Legendary_2
0,Drapion,Poison,Dark,70,90,110,60,75,95,4,...,Water,Electric,125,58,58,76,76,67,2,0
1,Wailord,Water,,170,90,45,90,45,60,3,...,Ice,Flying,45,55,45,65,45,75,2,0
2,Magmar,Fire,,65,95,57,100,85,93,1,...,Fighting,,45,85,50,55,50,65,5,0
3,Doduo,Normal,Flying,35,85,45,35,35,75,1,...,Fire,,58,64,58,80,65,80,1,0
4,Shedinja,Bug,Ghost,1,90,45,30,30,40,3,...,Bug,Ground,31,45,90,30,30,40,3,0


In [7]:
# define group of columns for particular preprocessing step

# The six battle stats, without the per-pokemon suffix.
stat_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']

# Column suffixes produced by the merges in get_data:
# '_1' = first pokemon, '_2' = second pokemon.
_sides = ('_1', '_2')

# Numeric inputs: the stats plus Generation/Legendary, all '_1' columns
# first and then all '_2' columns.
num_cols = [base + side
            for side in _sides
            for base in stat_cols + ['Generation', 'Legendary']]

# Type columns, first pokemon's pair followed by the second pokemon's pair.
cat_cols = [kind + side for side in _sides for kind in ('Type 1', 'Type 2')]

# Stat columns fed to the difference step: first half '_1', second half '_2'.
diff_cols = [stat + side for side in _sides for stat in stat_cols]

In [8]:
# Hand-written type match-up chart: maps each type name to two lists of type
# names, 'strong' and 'weak'. The loop that fills type_table scores every
# 'strong' pair as +1 and every 'weak' pair as -1 for the listing type.
# presumably 'strong' = attacks are super effective against, 'weak' = attacks
# are not very effective against — TODO confirm against the source chart.
type_chart = {
    'Normal': {'strong': [], 
               'weak': "Rock, Ghost, Steel".split(', ')},
    'Flying': {'strong': "Fighting, Bug, Grass".split(', '), 
                 'weak': "Rock, Steel, Electric".split(', ')},
    'Fighting': {'strong': "Normal, Rock, Steel, Ice, Dark".split(', '), 
               'weak': "Flying, Poison, Psychic, Bug, Ghost, Fairy".split(', ')},
    'Poison': {'strong': "Grass, Fairy".split(', '), 
               'weak': "Poison, Ground, Rock, Ghost, Steel".split(', ')},
    'Ground': {'strong': "Poison, Rock, Steel, Fire, Electric".split(', '), 
               'weak': "Flying, Bug, Grass".split(', ')},
    'Rock': {'strong': "Flying, Bug, Fire, Ice".split(', '), 
             'weak': "Fighting, Ground, Steel".split(', ')},
    'Bug': {'strong': "Grass, Psychic, Dark".split(', '), 
            'weak': "Fighting, Flying, Poison, Ghost, Steel, Fire, Fairy".split(', ')},
    'Ghost': {'strong': "Ghost, Psychic".split(', '), 
              'weak': "Normal, Dark".split(', ')},
    'Steel': {'strong': "Rock, Ice, Fairy".split(', '), 
              'weak': "Steel, Fire, Water, Electric".split(', ')},
    'Fire': {'strong': "Bug, Steel, Grass, Ice".split(', '), 
             'weak': "Rock, Fire, Water, Dragon".split(', ')},
    'Water': {'strong': "Ground, Rock, Fire".split(', '), 
              'weak': "Water, Grass, Dragon".split(', ')},
    'Grass': {'strong': "Ground, Rock, Water".split(', '), 
              'weak': "Flying, Poison, Bug, Steel, Fire, Grass, Dragon".split(', ')},
    'Electric': {'strong': "Flying, Water".split(', '), 
                 'weak': "Ground, Grass, Electric, Dragon".split(', ')},
    'Psychic': {'strong': ['Fighting','Poison'], 
                'weak': ['Psychic','Dark','Steel']},
    'Ice': {'strong': ['Grass','Ground','Flying','Dragon'], 
            'weak': ['Steel','Ice','Fire','Water']},
    'Dragon': {'strong': ['Dragon'], 
               'weak': ['Steel','Fairy']},
    'Fairy': {'strong': ['Fighting','Dark','Dragon'], 
              'weak': ['Steel','Fire','Poison']},
    'Dark': {'strong': ['Ghost','Psychic'], 
             'weak': ['Fighting','Dark','Fairy']},
}

# Square lookup table with type names on both axes, initialised to 0.
type_table = pd.DataFrame(data=0, columns=type_chart.keys(), index=type_chart.keys())

# fill value in type chart: row=First_pokemon, col=Second_pokemon
# Every listed pair is written in both directions with opposite signs, so a
# later chart entry overwrites whatever an earlier entry stored for that pair
# (the final value depends on the iteration order of type_chart).
# NOTE(review): when a type lists itself (e.g. 'Ghost' under its own 'strong',
# 'Poison' under its own 'weak') both assignments hit the same cell, so the
# mirrored write flips the sign just stored (Ghost/Ghost ends at -1,
# Poison/Poison at +1) — confirm this is the intended value.
for me, chart in type_chart.items():
    
    # strong
    for other in chart['strong']:
        type_table.loc[me, other] = 1       # row type beats column type
        type_table.loc[other, me] = -1      # mirrored entry, opposite sign
    # weak
    for other in chart['weak']:
        type_table.loc[me, other] = -1      # row type loses to column type
        type_table.loc[other, me] = 1       # mirrored entry, opposite sign

In [84]:
type_table

Unnamed: 0,Normal,Flying,Fighting,Poison,Ground,Rock,Bug,Ghost,Steel,Fire,Water,Grass,Electric,Psychic,Ice,Dragon,Fairy,Dark
Normal,0,0,-1,0,0,-1,0,1,-1,0,0,0,0,0,0,0,0,0
Flying,0,0,1,0,1,-1,1,0,-1,0,0,1,-1,0,-1,0,0,0
Fighting,1,-1,0,-1,0,1,1,-1,1,0,0,0,0,-1,1,0,-1,1
Poison,0,0,1,1,-1,-1,1,-1,-1,0,0,1,0,-1,0,0,1,0
Ground,0,-1,0,1,0,1,-1,0,1,1,-1,-1,1,0,-1,0,0,0
Rock,1,1,-1,1,-1,0,1,0,-1,1,-1,-1,0,0,1,0,0,0
Bug,0,-1,-1,-1,1,-1,0,-1,-1,-1,0,1,0,1,0,0,-1,1
Ghost,-1,0,1,1,0,0,1,-1,0,0,0,0,0,1,0,0,0,-1
Steel,1,1,-1,1,-1,1,1,0,1,-1,-1,1,-1,1,1,1,1,0
Fire,0,0,0,0,-1,-1,1,0,1,1,-1,1,0,0,1,-1,1,0


### Pipeline

In [27]:
# selector[cat_cols] -> CatDummy
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer

class CatDummy(BaseEstimator, TransformerMixin):
    '''Placeholder transformer that currently returns its input unchanged.

    NOTE(review): presumably intended to become a one-hot encoder that merges
    the type_1 and type_2 features into a single dummy encoding per pokemon;
    no encoding is implemented here.
    '''
    
    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self
    
    def transform(self, X, y=None):
        # Identity: X is handed back as-is.
        return X
    
class CombineTypeTransformer(BaseEstimator, TransformerMixin):
    '''Join a pokemon's two type columns into one whitespace-separated string.'''

    def fit(self, X, y=None):
        '''Stateless step; returns the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Concatenate column 0 and column 1 of ``X`` with a single space.

        ``X`` must be a 2-d array with exactly two columns that supports
        ``X[:, i]`` indexing and element-wise ``+`` with a string.
        The result is 1-d (one combined string per row), not a column vector.
        '''
        column_count = X.shape[1]
        assert column_count==2, 'expected X has 2 columns of string'
        primary, secondary = X[:, 0], X[:, 1]
        return primary + " " + secondary

class StatDiffTransformer(BaseEstimator, TransformerMixin):
    '''Subtract the second half of the columns from the first half.'''

    def fit(self, X, y=None):
        '''Stateless step; returns the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Return ``X[:, :mid] - X[:, mid:]`` with ``mid = n_columns // 2``.

        The caller is expected to order the columns so that the first half
        belongs to the first pokemon and the second half to the second one.
        '''
        half = X.shape[1] // 2
        first_side, second_side = X[:, :half], X[:, half:]
        return first_side - second_side
    
class StrongWeakTransformer(BaseEstimator, TransformerMixin):
    '''Score the first pokemon's type match-up against the second pokemon.

    Reads the module-level ``type_table`` (rows = first pokemon's type,
    columns = second pokemon's type).
    '''

    def fit(self, X, y=None):
        '''Stateless step; returns the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Sum the ``type_table`` entries over every (first type, second type) pair.

        Parameters
        ----------
        X : DataFrame or 2-d array-like
            Each row holds the first pokemon's types in its first two
            positions and the second pokemon's types in the remaining ones.
            Missing types (null values) are skipped.

        Returns
        -------
        numpy.ndarray of shape (n_rows, 1)
            Per-row total of the looked-up table values.

        Raises
        ------
        KeyError
            If a non-null type name is not a label of ``type_table``.
        '''
        # np.asarray accepts both DataFrames and plain arrays; object dtype
        # keeps the strings and NaN markers untouched.
        rows = np.asarray(X, dtype=object)

        totals = []
        for row in rows:
            first, second = row[:2], row[2:]
            # Drop missing secondary types before pairing.
            first = first[~pd.isnull(first)]
            second = second[~pd.isnull(second)]

            total = 0
            for name_1 in first:
                for name_2 in second:
                    # .at is the scalar accessor: same value as .loc, less overhead.
                    total += type_table.at[name_1, name_2]
            totals.append(total)
        return np.array(totals).reshape(-1, 1)


# pokemon type preprocessing
# Both pokemon go through the same three steps, so the steps are built by
# small factories and instantiated once per pokemon.

def _make_type_select_imputer(type_columns):
    '''Select the given type columns and replace missing values with ''.'''
    return ColumnTransformer(
        transformers=[
            ('impute', SimpleImputer(strategy='constant', fill_value=''), type_columns),
        ],
        remainder='drop',
    )


def _make_type_prep(select_imputer):
    '''Chain: impute -> join the two types into one string -> token counts.'''
    return Pipeline(steps=[
        ('clean', select_imputer),
        ('combine', CombineTypeTransformer()),
        ('dummy', CountVectorizer()),
    ])


# cat_cols[:2] are the first pokemon's type columns, cat_cols[2:] the second's.
first_type_select_imputer = _make_type_select_imputer(cat_cols[:2])
second_type_select_imputer = _make_type_select_imputer(cat_cols[2:])

first_type_prep = _make_type_prep(first_type_select_imputer)
second_type_prep = _make_type_prep(second_type_select_imputer)

# numeric columns preprocessing: select num_cols and impute missing values
# with SimpleImputer's default strategy; all other columns are dropped.
num_prep = ColumnTransformer([
    ('impute', SimpleImputer(), num_cols),
])

# diff stat: select the stat columns of both pokemon (missing -> 0), then
# subtract the second pokemon's stats from the first pokemon's.
diff_selector = ColumnTransformer([
    ('impute', SimpleImputer(strategy='constant', fill_value=0), diff_cols),
])
diff_prep = Pipeline([
    ('selector', diff_selector),
    ('diff', StatDiffTransformer()),
])

# feature engineering -> strong/weak score computed from the four type columns
strength_prep = ColumnTransformer(
    [('strongweak', StrongWeakTransformer(), cat_cols)],
    remainder='drop',
)

# plain dummy type: one-hot encode each of the four type columns separately.
# Missing types are first replaced by the explicit category 'UNK'.
cat_selector = ColumnTransformer(transformers=[
    ('selector', SimpleImputer(strategy='constant', fill_value='UNK'), cat_cols)
], remainder='drop')

# handle_unknown='ignore': a type value that was not present when the encoder
# was fitted (e.g. only occurs in a validation fold or in the test set) is
# encoded as all zeros instead of raising at transform time.
plain_dummy_type_prep = Pipeline(steps=[
    ('selector', cat_selector),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Stat differences to feed the model (first pokemon minus second pokemon):
# HP_1 - HP_2
# Attack_1 - Defense_2
# Defense_1 - Attack_2
# Sp. Atk_1 - Sp. Def_2
# Sp. Def_1 - Sp. Atk_2
# Speed_1 - Speed_2

# select 2 columns [pokemon_1, pokemon_2]
# create diff [pokemon_1 - pokemon_2]
class PairStatDiffTransformer(BaseEstimator, TransformerMixin):
    '''Difference of exactly two selected columns: column 0 minus column 1.'''

    def fit(self, X, y=None):
        '''Stateless step; returns the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Return ``X[:, 0] - X[:, 1]`` as a single-column 2-d array.'''
        pair = np.array(X)
        left, right = pair[:, 0], pair[:, 1]
        # reshape to (n_rows, 1) so the result can be stacked with other features
        return (left - right).reshape(-1, 1)
    
def make_pair_diff(col_1, col_2):
    '''Build a step that outputs the single feature ``col_1 - col_2``.

    Parameters
    ----------
    col_1, col_2 : str
        Column names to select; ``col_2`` is subtracted from ``col_1``.

    Returns
    -------
    ColumnTransformer
        Selects the two columns, applies PairStatDiffTransformer and drops
        every other column.
    '''
    return ColumnTransformer(
        transformers=[('pair', PairStatDiffTransformer(), [col_1, col_2])],
        remainder='drop',
    )

## Full feature set: each branch transforms the same input frame and
## FeatureUnion stacks the branch outputs side by side.

# Cross-stat differences as (step name, first pokemon column, second pokemon column).
_pair_diff_specs = [
    ('atk_def_diff', 'Attack_1', 'Defense_2'),
    ('def_atk_diff', 'Defense_1', 'Attack_2'),
    ('spAtk_spDef_diff', 'Sp. Atk_1', 'Sp. Def_2'),
    ('spDef_spAtk_diff', 'Sp. Def_1', 'Sp. Atk_2'),
]

_base_branches = [
    # currently disabled branches:
    #('first_type_prep', first_type_prep),
    #('second_type_prep', second_type_prep),
    ('numeric_prep', num_prep),
    ('diff_prep', diff_prep),
    ('strength_prep', strength_prep),
    ('plain_dummy', plain_dummy_type_prep),
]

all_prep = FeatureUnion(transformer_list=_base_branches + [
    (step_name, make_pair_diff(first_col, second_col))
    for step_name, first_col, second_col in _pair_diff_specs
])

# Model under test: the feature union followed by gradient boosting.
# random_state is pinned (same seed as the KFold splitter k10) so that
# re-running a cross-validation cell reproduces its recorded score.
test_pipe = Pipeline(steps=[
    ('prep', all_prep),
    ('clf', GradientBoostingClassifier(random_state=42))
])

In [11]:
# helper function

def check_cv_score(pipeline, X, y, cv, scoring, name="", n_jobs=4):
    """Cross-validate ``pipeline`` and print the mean and std of the fold scores.

    Parameters
    ----------
    pipeline : estimator
        Passed to ``cross_val_score`` as the estimator.
    X, y : array-like
        Features and targets passed to ``cross_val_score``.
    cv : splitter or int
        Passed to ``cross_val_score`` as ``cv``.
    scoring : str or callable
        Passed to ``cross_val_score`` as ``scoring``.
    name : str, optional
        Free-text label printed next to the result.
    n_jobs : int, optional
        Number of parallel workers passed to ``cross_val_score``. Defaults to 4,
        the value that used to be hard-coded.

    Returns
    -------
    None
        The summary is printed only.
    """
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)
    print("cv-score: {}\n mean={:.4f}, std={:.4f}".format(name, scores.mean(), scores.std()))

In [56]:
# type dummy
# NOTE(review): this call is textually identical to the "num + type dummy"
# cell, yet the recorded means differ, so test_pipe was presumably redefined
# between the two runs — confirm which feature set produced this score.
scores = cross_val_score(test_pipe, X_train, y_train, cv=k10, scoring='accuracy', n_jobs=4)

In [55]:
scores.mean()

0.6582

In [59]:
# num + type dummy
# NOTE(review): same call as the "type dummy" cell; the label presumably
# describes the branches enabled in all_prep when this cell was run, which
# the current test_pipe definition may no longer match — verify before
# comparing scores.
scores = cross_val_score(test_pipe, X_train, y_train, cv=k10, scoring='accuracy', n_jobs=4)

In [60]:
scores.mean()

0.9260999999999999

In [65]:
check_cv_score(test_pipe, X_train, y_train, k10, 'accuracy', name="numeric + dummy_type + gb")

cv-score: numeric + dummy_type + gb
 mean=0.9261, std=0.0031


In [79]:
check_cv_score(test_pipe, X_train, y_train, k10, 'accuracy', name="numeric + dummy_type + stat_diff + gb")

cv-score: numeric + dummy_type + stat_diff + gb
 mean=0.9554, std=0.0023


In [105]:
check_cv_score(test_pipe, X_train, y_train, k10, 'accuracy', 
               name="numeric + dummy_type + stat_diff + strength/weakness + gb")

cv-score: numeric + dummy_type + stat_diff + strength/weakness + gb
 mean=0.9583, std=0.0029


In [26]:
check_cv_score(test_pipe, X_train, y_train, k10, 'accuracy', 
               name="numeric + dummy_type + diff(stat) + strength/weakness + diff(atk,def) + gb")

cv-score: numeric + dummy_type + diff(stat) + strength/weakness + diff(atk,def) + gb
 mean=0.9630, std=0.0025


In [28]:
check_cv_score(test_pipe, X_train, y_train, k10, 'accuracy', 
               name="add diff(atk,def), diff(def,atk), diff(spAtk,spDef), diff(spDef,spAtk) + gb")

cv-score: add diff(atk,def), diff(def,atk), diff(spAtk,spDef), diff(spDef,spAtk) + gb
 mean=0.9631, std=0.0024
