In [1]:
import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn_pandas import DataFrameMapper

from sklearn.preprocessing import LabelBinarizer, StandardScaler

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import cross_val_score

In [2]:
# Read in the "pokemon.csv" and "train.csv" files and assign them to appropriately named pandas dataframes.

pokemon = pd.read_csv("./data/pokemon.csv")

train = pd.read_csv("./data/train.csv")

In [3]:
# Take a look on pokemon dataframe

pokemon.head()

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,5,Charmander,Fire,,39,52,43,60,50,65,1,False
4,6,Charmeleon,Fire,,58,64,58,80,65,80,1,False


In [4]:
# Filling null values in "Type 2" column with "Unknown" value

pokemon['Type 2'].fillna("Unknown", inplace = True)

# Rename "#" column

pokemon.rename(columns={'#':'index'}, inplace=True)

In [5]:
# Checking if there are any null values 

pokemon.isnull().sum()

index         0
Name          0
Type 1        0
Type 2        0
HP            0
Attack        0
Defense       0
Sp. Atk       0
Sp. Def       0
Speed         0
Generation    0
Legendary     0
dtype: int64

In [6]:
pokemon.head()

Unnamed: 0,index,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,5,Charmander,Fire,Unknown,39,52,43,60,50,65,1,False
4,6,Charmeleon,Fire,Unknown,58,64,58,80,65,80,1,False


In [7]:
# Create a list of pokemon names and a dataframe containing pokemon names and index values

poke_name = pokemon.Name.unique()

map_df = pokemon[["Name","index"]]

In [8]:
# Dropping "Generation", "Name", and "legendary" columns

pokemon.drop(['Generation', 'Name', 'Legendary'], axis = 1, inplace = True)

pokemon.head()

Unnamed: 0,index,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Grass,Poison,45,49,49,65,65,45
1,2,Grass,Poison,60,62,63,80,80,60
2,3,Grass,Poison,80,82,83,100,100,80
3,5,Fire,Unknown,39,52,43,60,50,65
4,6,Fire,Unknown,58,64,58,80,65,80


In [9]:
# Converting "Winner" values numerically

convert_bool = train['Winner'] == train.First_pokemon.values

convert_bool.to_frame()

train['Winner'] = convert_bool

train = train*1

train.head()

Unnamed: 0,First_pokemon,Second_pokemon,Winner
0,5,49,1
1,119,5,0
2,130,52,1
3,123,139,0
4,166,108,1


In [10]:
# Merging "pokemon" and "train" dataframe based on "First_pokemon" column

# Dropping unnecessary columns

# Rename meaningful column names

first_pokemon = pd.merge(train, pokemon, left_on = 'First_pokemon', right_on = 'index', how = 'left')

first_pokemon.drop(['Second_pokemon', 'Winner', 'index'], axis = 1, inplace = True)

first_pokemon.rename(columns={'Type 1':'type1_1', 'Type 2': 'type1_2', 'HP': 'hp_1', 'Attack': 'attack_1',
                              'Defense': 'defense_1', 'Sp. Atk' : 'sp.atk_1', 'Sp. Def': 'sp.def_1',
                              'Speed': 'speed_1'}, inplace=True)

first_pokemon.head()

Unnamed: 0,First_pokemon,type1_1,type1_2,hp_1,attack_1,defense_1,sp.atk_1,sp.def_1,speed_1
0,5,Fire,Unknown,39,52,43,60,50,65
1,119,Poison,Unknown,65,90,120,85,70,60
2,130,Water,Unknown,30,45,55,70,55,85
3,123,Grass,Unknown,65,55,115,100,40,60
4,166,Psychic,Unknown,100,100,100,100,100,100


In [11]:
# Merging "pokemon" and "train" dataframe based on "Second_pokemon" column

# Dropping unnecessary columns

# Rename meaningful column names

second_pokemon = pd.merge(train, pokemon, left_on = "Second_pokemon", right_on = 'index', how = 'left')

second_pokemon.drop(['First_pokemon', 'index', 'Winner'], axis = 1, inplace = True)

second_pokemon.rename(columns={'Type 1':'type2_1', 'Type 2': 'type2_2', 'HP': 'hp_2', 'Attack': 'attack_2',
                              'Defense': 'defense_2', 'Sp. Atk' : 'sp.atk_2', 'Sp. Def': 'sp.def_2',
                              'Speed': 'speed_2'}, inplace=True)

second_pokemon.head()

Unnamed: 0,Second_pokemon,type2_1,type2_2,hp_2,attack_2,defense_2,sp.atk_2,sp.def_2,speed_2
0,49,Grass,Poison,45,50,55,75,65,30
1,5,Fire,Unknown,39,52,43,60,50,65
2,52,Bug,Grass,35,70,55,45,55,25
3,139,Normal,Unknown,75,100,95,40,70,110
4,108,Water,Unknown,55,130,115,50,50,75


In [12]:
# Creating a new dataframe "data" for pre-processing

data = pd.concat([first_pokemon, second_pokemon], axis=1)

data.drop(['First_pokemon', 'Second_pokemon'], axis = 1, inplace = True)

data.head()

Unnamed: 0,type1_1,type1_2,hp_1,attack_1,defense_1,sp.atk_1,sp.def_1,speed_1,type2_1,type2_2,hp_2,attack_2,defense_2,sp.atk_2,sp.def_2,speed_2
0,Fire,Unknown,39,52,43,60,50,65,Grass,Poison,45,50,55,75,65,30
1,Poison,Unknown,65,90,120,85,70,60,Fire,Unknown,39,52,43,60,50,65
2,Water,Unknown,30,45,55,70,55,85,Bug,Grass,35,70,55,45,55,25
3,Grass,Unknown,65,55,115,100,40,60,Normal,Unknown,75,100,95,40,70,110
4,Psychic,Unknown,100,100,100,100,100,100,Water,Unknown,55,130,115,50,50,75


In [13]:
train.shape

(1402, 3)

In [14]:
data.shape

(1402, 16)

In [15]:
data.head()

Unnamed: 0,type1_1,type1_2,hp_1,attack_1,defense_1,sp.atk_1,sp.def_1,speed_1,type2_1,type2_2,hp_2,attack_2,defense_2,sp.atk_2,sp.def_2,speed_2
0,Fire,Unknown,39,52,43,60,50,65,Grass,Poison,45,50,55,75,65,30
1,Poison,Unknown,65,90,120,85,70,60,Fire,Unknown,39,52,43,60,50,65
2,Water,Unknown,30,45,55,70,55,85,Bug,Grass,35,70,55,45,55,25
3,Grass,Unknown,65,55,115,100,40,60,Normal,Unknown,75,100,95,40,70,110
4,Psychic,Unknown,100,100,100,100,100,100,Water,Unknown,55,130,115,50,50,75


In [16]:
# THE MAPPER

mapper = DataFrameMapper([
    ('type1_1', LabelBinarizer()),
    ('type1_2', LabelBinarizer()),
    (['hp_1'], StandardScaler()),
    (['attack_1'], StandardScaler()),
    (['defense_1'], StandardScaler()),
    (['sp.atk_1'], StandardScaler()),
    (['sp.def_1'], StandardScaler()),
    (['speed_1'], StandardScaler()),
    ('type2_1', LabelBinarizer()),
    ('type2_2', LabelBinarizer()),
    (['hp_2'], StandardScaler()),
    (['attack_2'], StandardScaler()),
    (['defense_2'], StandardScaler()),
    (['sp.atk_2'], StandardScaler()),
    (['sp.def_2'], StandardScaler()),
    (['speed_2'], StandardScaler())
], df_out = True)

data_trans = mapper.fit_transform(data)

In [17]:
# indentify target and train test split

X = data_trans

y = train.Winner

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.20)

In [18]:
data.head()

Unnamed: 0,type1_1,type1_2,hp_1,attack_1,defense_1,sp.atk_1,sp.def_1,speed_1,type2_1,type2_2,hp_2,attack_2,defense_2,sp.atk_2,sp.def_2,speed_2
0,Fire,Unknown,39,52,43,60,50,65,Grass,Poison,45,50,55,75,65,30
1,Poison,Unknown,65,90,120,85,70,60,Fire,Unknown,39,52,43,60,50,65
2,Water,Unknown,30,45,55,70,55,85,Bug,Grass,35,70,55,45,55,25
3,Grass,Unknown,65,55,115,100,40,60,Normal,Unknown,75,100,95,40,70,110
4,Psychic,Unknown,100,100,100,100,100,100,Water,Unknown,55,130,115,50,50,75


# Modeling

##  Bagging Classifier

In [19]:
from sklearn.ensemble import BaggingClassifier

bag = BaggingClassifier()

bag.fit(X_train, y_train)

bag.score(X_train, y_train)

0.9937555753791257

In [20]:
scores_bag = cross_val_score(bag, X_test, y_test, cv=5)  

np.mean(scores_bag)

0.8294953292321715

## AdaBoost

In [21]:
from sklearn.ensemble import AdaBoostClassifier

abc = AdaBoostClassifier(n_estimators=50, learning_rate=1)

abc.fit(X_train, y_train)

abc.score(X_train, y_train)

0.8626226583407671

In [22]:
scores_abc = cross_val_score(abc, X_test, y_test, cv=5)  

np.mean(scores_abc)

0.7832478924584187

## Gradient Boosting Classifier

In [23]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()

gbc.fit(X_train, y_train)

gbc.score(X_train, y_train)

0.9634255129348795

In [24]:
scores_gbc = cross_val_score(gbc, X_test, y_test, cv=5)  

np.mean(scores_gbc)

0.8293700159489633

## Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_jobs=2, random_state=0)

clf.fit(X_train, y_train)

clf.score(X_train, y_train)



0.9928635147190009

In [26]:
scores_clf= cross_val_score(clf, X_test, y_test, cv=5)  

np.mean(scores_clf)

0.7688972431077694

## Decision Tree Classifier

In [27]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()

dtc.fit(X_train, y_train)

dtc.score(X_train, y_train)

1.0

In [28]:
scores_dtc = cross_val_score(dtc, X_test, y_test, cv=5)  

np.mean(scores_dtc)

0.7866381863750285

# Choosing the right model

In [29]:
import pickle

from sklearn.pipeline import Pipeline

pipe = Pipeline([("mapper", mapper),("model", bag)])

# serializing model to a file

pickle.dump(pipe, open("model.pkl", "wb"))

pickle.dump(poke_name, open("poke_name.pkl", "wb"))

pickle.dump(pokemon, open("pokemon.pkl", "wb"))

pickle.dump(map_df, open("map_df.pkl", "wb"))

In [30]:
# Prediction for X_test

bag.predict(X_test)

array([0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1])

In [31]:
# Read in the "test.csv"

test = pd.read_csv("data/test.csv")

test.head()

Unnamed: 0,First_pokemon,Second_pokemon
0,94,141
1,104,124
2,107,113
3,74,52
4,91,70


In [32]:
# We do the same thing as "train" dataset

first_pokemon_test = pd.merge(test, pokemon, left_on = 'First_pokemon', right_on = 'index', how = 'left')

first_pokemon_test.drop(['Second_pokemon', 'index'], axis = 1, inplace = True)

first_pokemon_test.rename(columns={'Type 1':'type1_1', 'Type 2': 'type1_2', 'HP': 'hp_1', 'Attack': 'attack_1',
                              'Defense': 'defense_1', 'Sp. Atk' : 'sp.atk_1', 'Sp. Def': 'sp.def_1',
                              'Speed': 'speed_1'}, inplace=True)

first_pokemon_test.head()

Unnamed: 0,First_pokemon,type1_1,type1_2,hp_1,attack_1,defense_1,sp.atk_1,sp.def_1,speed_1
0,94,Water,Unknown,65,45,55,45,70,45
1,104,Rock,Ground,35,45,160,30,45,70
2,107,Water,Unknown,30,105,90,25,25,50
3,74,Fighting,Unknown,80,100,70,50,60,45
4,91,Normal,Flying,52,65,55,58,62,60


In [33]:
second_pokemon_test = pd.merge(test, pokemon, left_on = "Second_pokemon", right_on = 'index', how = 'left')

second_pokemon_test.drop(['First_pokemon', 'index'], axis = 1, inplace = True)

second_pokemon_test.rename(columns={'Type 1':'type2_1', 'Type 2': 'type2_2', 'HP': 'hp_2', 'Attack': 'attack_2',
                              'Defense': 'defense_2', 'Sp. Atk' : 'sp.atk_2', 'Sp. Def': 'sp.def_2',
                              'Speed': 'speed_2'}, inplace=True)

second_pokemon_test.head()

Unnamed: 0,Second_pokemon,type2_1,type2_2,hp_2,attack_2,defense_2,sp.atk_2,sp.def_2,speed_2
0,141,Water,Flying,95,125,79,60,100,81
1,124,Normal,Unknown,105,95,80,40,80,90
2,113,Ground,Unknown,50,50,95,40,50,35
3,52,Bug,Grass,35,70,55,45,55,25
4,70,Psychic,Unknown,40,35,30,120,70,105


In [34]:
# New dataframe "final"

final = pd.concat([first_pokemon_test, second_pokemon_test], axis=1)

final.drop(['First_pokemon', 'Second_pokemon'], axis = 1, inplace = True)

final.head()

Unnamed: 0,type1_1,type1_2,hp_1,attack_1,defense_1,sp.atk_1,sp.def_1,speed_1,type2_1,type2_2,hp_2,attack_2,defense_2,sp.atk_2,sp.def_2,speed_2
0,Water,Unknown,65,45,55,45,70,45,Water,Flying,95,125,79,60,100,81
1,Rock,Ground,35,45,160,30,45,70,Normal,Unknown,105,95,80,40,80,90
2,Water,Unknown,30,105,90,25,25,50,Ground,Unknown,50,50,95,40,50,35
3,Fighting,Unknown,80,100,70,50,60,45,Bug,Grass,35,70,55,45,55,25
4,Normal,Flying,52,65,55,58,62,60,Psychic,Unknown,40,35,30,120,70,105


In [35]:
final.shape

(248, 16)

In [36]:
# Predictions for "test" dataset

prediction = pipe.predict(final)

prediction

array([0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0])

In [37]:
# Adding new column "prediction" 

test['predicion'] = prediction

# Function to get Winner values

def label_winner (test):
    
    if test['predicion'] == 1 :
        return test['First_pokemon']
    
    if test['predicion'] == 0 :
        return test['Second_pokemon']


test['Winner'] = test.apply (lambda test: label_winner(test), axis=1)

test.drop(['predicion'], axis = 1, inplace = True)

test.head()

Unnamed: 0,First_pokemon,Second_pokemon,Winner
0,94,141,141
1,104,124,124
2,107,113,107
3,74,52,74
4,91,70,70


In [38]:
# Exporting to csv file

test.to_csv('test_lucas.csv')