In [1]:
import pandas as pd

In [2]:
battles = pd.read_csv('pokemans/combats.csv')
pokemon = pd.read_csv('pokemans/pokemon.csv')

In [4]:
pokemon

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,4,Mega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,5,Charmander,Fire,,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...
795,796,Diancie,Rock,Fairy,50,100,150,100,150,50,6,True
796,797,Mega Diancie,Rock,Fairy,50,160,110,160,110,110,6,True
797,798,Hoopa Confined,Psychic,Ghost,80,110,60,150,130,70,6,True
798,799,Hoopa Unbound,Psychic,Dark,80,160,60,170,130,80,6,True


In [6]:
types = list(pokemon["Type 1"].unique())

In [7]:
def make_features(row):
    first = row['First_pokemon']
    second = row['Second_pokemon']
    winner = int(row['Winner']==first)
    first_stats = pokemon.iloc[first-1]
    second_stats = pokemon.iloc[second-1]

    types_first = [first_stats["Type 1"], first_stats["Type 2"]]
    types_second = [second_stats["Type 1"], second_stats["Type 2"]]
    types_first = [1 if type in types_first else 0 for type in types]
    types_second = [1 if type in types_second else 0 for type in types]

    #calculate difference in battle stats for each battle
    stats_first = list(first_stats[first_stats.index[4:]])
    stats_second = list(second_stats[second_stats.index[4:]])
    battle_stats = zip(stats_first, stats_second)

    battle_stats_diff = [int(x)-int(y) for x,y in battle_stats]

    features = types_first+types_second+battle_stats_diff+[winner]
    return features


column_names = [f'first_{type}' for type in types]
column_names.extend([f'second_{type}' for type in types])
column_names.extend([f'diff_{stat}' for stat in pokemon.columns[4:]])
column_names.append('Winner')

battles[column_names] = battles.apply(make_features, axis=1, result_type='expand')

In [8]:
battles

Unnamed: 0,First_pokemon,Second_pokemon,Winner,first_Grass,first_Fire,first_Water,first_Bug,first_Normal,first_Poison,first_Electric,...,second_Steel,second_Flying,diff_HP,diff_Attack,diff_Defense,diff_Sp. Atk,diff_Sp. Def,diff_Speed,diff_Generation,diff_Legendary
0,266,298,0,0,0,0,0,0,0,0,...,0,0,-20,-6,10,-15,10,-19,-1,0
1,702,701,0,1,0,0,0,0,0,0,...,0,0,0,-39,-18,18,39,0,0,0
2,191,668,0,0,0,0,0,0,0,0,...,0,0,-20,-35,10,-45,10,0,-3,0
3,237,683,0,0,1,0,0,0,0,0,...,0,0,-37,-80,-50,10,-50,-28,-3,0
4,151,231,1,0,0,1,0,0,0,0,...,0,0,50,50,-105,105,-160,50,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,707,126,1,0,1,0,0,0,0,0,...,0,0,70,80,30,80,95,30,4,1
49996,589,664,1,0,0,0,0,0,0,0,...,0,0,25,30,0,-15,5,8,0,0
49997,303,368,0,0,0,1,0,0,0,0,...,0,0,-13,-65,40,25,10,-25,0,0
49998,109,89,1,0,0,0,0,0,0,1,...,1,0,15,-5,-20,-40,0,55,0,0


In [17]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(battles, test_size=.25, shuffle=True)

In [18]:
train_x = train_df.drop(columns='First_pokemon Second_pokemon Winner'.split()).to_numpy()
train_y = list(train_df['Winner'])

test_x = test_df.drop(columns='First_pokemon Second_pokemon Winner'.split()).to_numpy()
test_y = list(test_df['Winner'])

In [8]:
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import  DecisionTreeClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost.sklearn import XGBClassifier




#only fit on the train set to avoid data leak!
#dont want to scale dummy variables only battle stats
scaler = StandardScaler().fit(train_x[:, -6:])
scaled_features_train_x = scaler.transform(train_x[:, -6:])
scaled_features_test_x = scaler.transform(test_x[:, -6:])

scaled_train_x = np.concatenate((train_x[:, :-6], scaled_features_train_x),axis=1)
scaled_test_x = np.concatenate((test_x[:, :-6], scaled_features_test_x),axis=1)

clf_log_reg = LogisticRegression(max_iter=1000).fit(train_x, train_y)
clf_dec_tree = DecisionTreeClassifier().fit(train_x, train_y)
clf_forest = RandomForestClassifier().fit(train_x, train_y)
clf_gnb = GaussianNB().fit(train_x, train_y)
clf_xgb = XGBClassifier.fit(train_x, train_y)
clf_svm = svm.SVC().fit(scaled_train_x, train_y)
clf_sgd = SGDClassifier().fit(scaled_train_x, train_y)
clf_knn = KNeighborsClassifier().fit(scaled_train_x, train_y)

In [27]:
print(clf_log_reg.score(test_x, test_y))
print(clf_dec_tree.score(test_x, test_y))
print(clf_forest.score(test_x, test_y))
print(clf_xgb.score(train_x, train_y))
print(clf_gnb.score(test_x, test_y))
print(clf_svm.score(scaled_test_x, test_y))
print(clf_sgd.score(scaled_test_x, test_y))
print(clf_knn.score(scaled_test_x, test_y))

0.88904
0.94536
0.95672
0.9780266666666667
0.76064
0.866
0.88168
0.7628


In [21]:
#xgboost is the best! (no suprise)
from xgboost.sklearn import  XGBClassifier
from sklearn.metrics import accuracy_score
from hyperopt import hp, STATUS_OK, Trials, fmin, tpe
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'seed': 0
    }

def objective(space):
    clf= XGBClassifier(
                    max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))

    clf.fit(train_x, train_y)


    accuracy = clf.score(test_x, test_y)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [22]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

print("The best hyperparameters are : ","\n")
print(best_hyperparams)

100%|██████████| 100/100 [01:06<00:00,  1.51trial/s, best loss: -0.98532]
The best hyperparameters are :  

{'colsample_bytree': 0.5183105533989817, 'gamma': 8.436753703597821, 'max_depth': 15.0, 'min_child_weight': 6.0, 'reg_alpha': 179.0, 'reg_lambda': 0.5828202883351563}


In [24]:
for hyp in ['max_depth','reg_alpha', 'min_child_weight', 'colsample_bytree']:
    best_hyperparams[hyp] = int(best_hyperparams[hyp])
final_clf = XGBClassifier(**best_hyperparams).fit(train_x, train_y)
print(final_clf.score(test_x,test_y))

0.98532


In [46]:
def print_battle_log(row):
    first = pokemon.iloc[row['First_pokemon']-1]['Name']
    second = pokemon.iloc[row['Second_pokemon']-1]['Name']
    test = row[3:]
    pred = final_clf.predict([test])
    if pred == 1:
        pred = 'Ash wins'
    else:
        pred = 'Red wins'
    winner = row['Winner']
    if winner == 1:
        actual = 'Ash wins'
    else:
        actual = 'Red wins'

    prob = final_clf.predict_proba([test])
    print(f'Ash throws out {first}! His opponent Red uses {second}!')
    print(f'\tXGBoost predicts that {pred} with {round(max(prob[0]),2)} probability!')
    print(f'\tActual result: {actual}\n')


battle_sample = battles.sample(100)

for idx, row in battle_sample.iterrows():
    print_battle_log(row)

Ash throws out Flygon! His opponent Red uses Hitmonchan!
	XGBoost predicts that Ash wins with 0.9900000095367432 probability!
	Actual result: Ash wins

Ash throws out Pangoro! His opponent Red uses Miltank!
	XGBoost predicts that Red wins with 0.9200000166893005 probability!
	Actual result: Red wins

Ash throws out Scizor! His opponent Red uses Munchlax!
	XGBoost predicts that Ash wins with 0.9900000095367432 probability!
	Actual result: Ash wins

Ash throws out Lampent! His opponent Red uses Poochyena!
	XGBoost predicts that Ash wins with 0.9300000071525574 probability!
	Actual result: Ash wins

Ash throws out Feraligatr! His opponent Red uses Bronzong!
	XGBoost predicts that Ash wins with 0.9599999785423279 probability!
	Actual result: Ash wins

Ash throws out Gurdurr! His opponent Red uses Thundurus Incarnate Forme!
	XGBoost predicts that Red wins with 1.0 probability!
	Actual result: Red wins

Ash throws out Smoochum! His opponent Red uses Mega Kangaskhan!
	XGBoost predicts that Re