In [61]:
from fun import allowed_by_robots_txt, get_pokedex

# fundamental modules
import pandas as pd
import numpy as np

# web scraping modules
import requests
import bs4

# visual modules
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning modules
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

import re

%matplotlib inline

In [62]:
# Per-generation stats pages on pokemondb.net (generations 1 through 5).
gen1_url = "https://pokemondb.net/pokedex/stats/gen1"
gen2_url = "https://pokemondb.net/pokedex/stats/gen2"
gen3_url = "https://pokemondb.net/pokedex/stats/gen3"
gen4_url = "https://pokemondb.net/pokedex/stats/gen4"
gen5_url = "https://pokemondb.net/pokedex/stats/gen5"

# Run the robots.txt helper on each URL, in generation order, before scraping.
gen1_allowed, gen2_allowed, gen3_allowed, gen4_allowed, gen5_allowed = map(
    allowed_by_robots_txt,
    (gen1_url, gen2_url, gen3_url, gen4_url, gen5_url),
)

print(
    f'Gen 1 scrapable: {gen1_allowed}\n'
    f'Gen 2 scrapable: {gen2_allowed}\n'
    f'Gen 3 scrapable: {gen3_allowed} \n'
    f'Gen 4 scrapable: {gen4_allowed}\n'
    f'Gen 5 scrapable: {gen5_allowed}'
)

Gen 1 scrapable: True
Gen 2 scrapable: True
Gen 3 scrapable: True 
Gen 4 scrapable: True
Gen 5 scrapable: True


In [63]:
# Fetch one table per generation page, in generation order.
gen1, gen2, gen3, gen4, gen5 = (
    get_pokedex(page_url)
    for page_url in (gen1_url, gen2_url, gen3_url, gen4_url, gen5_url)
)

# Stack the five tables row-wise and renumber the index from 0.
df = pd.concat([gen1, gen2, gen3, gen4, gen5], ignore_index=True)

In [64]:
# Preview the concatenated frame (pandas elides the middle rows in the display).
df

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,0001,Bulbasaur,Grass Poison,318,45,49,49,65,65,45
1,0002,Ivysaur,Grass Poison,405,60,62,63,80,80,60
2,0003,Venusaur,Grass Poison,525,80,82,83,100,100,80
3,0004,Charmander,Fire,309,39,52,43,60,50,65
4,0005,Charmeleon,Fire,405,58,64,58,80,65,80
...,...,...,...,...,...,...,...,...,...,...
670,0647,Keldeo Ordinary Form,Water Fighting,580,91,72,90,129,90,108
671,0647,Keldeo Resolute Form,Water Fighting,580,91,72,90,129,90,108
672,0648,Meloetta Aria Forme,Normal Psychic,600,100,77,77,128,128,90
673,0648,Meloetta Pirouette Forme,Normal Fighting,600,100,128,90,77,77,128


In [65]:
# Several rows can share one '#' value (e.g. alternate forms such as
# 'Keldeo Ordinary Form' / 'Keldeo Resolute Form'); keep only the first row
# seen for each '#' and renumber the index.
repeated_number = df.duplicated(subset='#', keep='first')
df = df[~repeated_number].reset_index(drop=True)
df.tail()

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
644,645,Landorus Incarnate Forme,Ground Flying,600,89,125,90,115,80,101
645,646,Kyurem,Dragon Ice,660,125,130,90,130,90,95
646,647,Keldeo Ordinary Form,Water Fighting,580,91,72,90,129,90,108
647,648,Meloetta Aria Forme,Normal Psychic,600,100,77,77,128,128,90
648,649,Genesect,Bug Steel,600,71,120,95,120,95,99


In [66]:
# Count rows per Type string; dual types show up as two space-separated
# names (e.g. 'Dragon Ice'), so each combination is its own category here.
df['Type'].value_counts()

Normal              58
Water               53
Psychic             30
Grass               29
Fire                26
                    ..
Poison Bug           1
Psychic Fighting     1
Ice Ghost            1
Electric Ghost       1
Dragon Ice           1
Name: Type, Length: 125, dtype: int64

In [67]:
# Restrict the data to single-typed rows whose Type is frequent enough to
# be used as a class label.
MIN_TYPE_COUNT = 20  # a Type needs at least this many rows to be kept

# .assign returns a new frame rather than writing into `df`, so re-running
# this cell on the already-filtered frame does not raise pandas'
# SettingWithCopyWarning.
df = df.assign(Type=df['Type'].str.strip())

# Dual types are stored as two space-separated names (e.g. 'Grass Poison');
# regex=False makes the space a plain substring match.
df = df[~df['Type'].str.contains(' ', regex=False)]

# Count after the dual-type rows are gone, then keep only the common types.
value_counts = df['Type'].value_counts()
df = df[df['Type'].isin(value_counts.index[value_counts >= MIN_TYPE_COUNT])]

In [68]:
# Re-check the class balance after filtering down to the frequent single types.
df['Type'].value_counts()

Normal      58
Water       53
Psychic     30
Grass       29
Fire        26
Electric    26
Name: Type, dtype: int64

### Preprocessing

In [69]:
# '#' and 'Name' are not needed for predicting Type, so remove them.
# They are dropped by label rather than by position: a positional slice
# (iloc[:, 2:]) would strip two more columns every time this cell is re-run.
# errors='ignore' makes a second run a no-op instead of a KeyError.
df = df.drop(columns=['#', 'Name'], errors='ignore')
df

Unnamed: 0,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
3,Fire,309,39,52,43,60,50,65
4,Fire,405,58,64,58,80,65,80
6,Water,314,44,48,65,50,64,43
7,Water,405,59,63,80,65,80,58
8,Water,530,79,83,100,85,105,78
...,...,...,...,...,...,...,...,...
603,Electric,515,85,115,80,105,80,50
604,Psychic,335,55,55,55,85,55,30
605,Psychic,485,75,75,75,125,95,40
625,Normal,490,95,110,95,40,95,55


In [71]:
# Columns handed to each transformer.
# NOTE(review): 'HP' is not listed for scaling. With ColumnTransformer's
# default remainder='drop' it is discarded from the output -- confirm that
# this is intended.
onehot_columns = ['Type']
scaled_columns = ['Total', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']

# One-hot encode Type (dense output) and standardise the listed stat columns.
preproc = ColumnTransformer(
    transformers=[
        ('categorical_col', OneHotEncoder(sparse_output=False), onehot_columns),
        ('stdscaler', StandardScaler(), scaled_columns),
    ],
)

In [72]:
# NOTE(review): the transformers are fitted on every row of `df`, i.e. before
# the train/test splits performed in later cells.
transformed_data = preproc.fit_transform(df)

# Rebuild readable column labels in the order the ColumnTransformer emits its
# output: the one-hot Type columns first, then the scaled stat columns.
onehot_step = preproc.named_transformers_['categorical_col']
scaler_step = preproc.named_transformers_['stdscaler']

onehot_labels = onehot_step.get_feature_names_out(['Type']).tolist()
scaled_labels = scaler_step.get_feature_names_out(
    ['Total', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
).tolist()

transformed_columns = onehot_labels + scaled_labels
df_transformed = pd.DataFrame(transformed_data, columns=transformed_columns)

In [76]:
# Baseline: unpenalised multinomial logistic regression on a 70/30 split.

# The first 6 columns of df_transformed are the one-hot Type indicators
# (i.e. the target), so only the columns after them are used as features.
X = df_transformed[df_transformed.columns[6:]]
y = df['Type'].astype('category')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# penalty=None replaces the string 'none', which scikit-learn deprecated in
# 1.2 and removed in 1.4. This notebook already requires >=1.2 because it
# uses OneHotEncoder(sparse_output=...), so None is always accepted here.
model = LogisticRegression(penalty=None,
                           fit_intercept=True,
                           solver='newton-cg',
                           verbose=1,
                           random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# zero_division=1 reports precision 1.0 for a class that is never predicted,
# so read the per-class precision together with recall.
classification_rep = classification_report(y_test, y_pred, zero_division=1)
print("Classification Report:", classification_rep)

Accuracy: 0.34328358208955223
Classification Report:               precision    recall  f1-score   support

    Electric       0.20      0.08      0.11        13
        Fire       0.11      0.17      0.13         6
       Grass       1.00      0.00      0.00        10
      Normal       0.47      0.50      0.48        14
     Psychic       0.50      0.67      0.57         9
       Water       0.31      0.53      0.39        15

    accuracy                           0.34        67
   macro avg       0.43      0.32      0.28        67
weighted avg       0.43      0.34      0.30        67



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [77]:
# Decision tree on the same 70/30 split.
# Features are every column after the first 6 (the one-hot Type indicators).
X = df_transformed.iloc[:, 6:]
y = df['Type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

tree_classifier = DecisionTreeClassifier(random_state=42)
tree_classifier.fit(X_train, y_train)
y_pred = tree_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

classification_rep = classification_report(
    y_test, y_pred, zero_division=1
)
print("Classification Report:", classification_rep)

Accuracy: 0.22388059701492538
Classification Report:               precision    recall  f1-score   support

    Electric       0.17      0.08      0.11        13
        Fire       0.06      0.17      0.09         6
       Grass       0.00      0.00      0.00        10
      Normal       0.35      0.50      0.41        14
     Psychic       0.43      0.33      0.38         9
       Water       0.27      0.20      0.23        15

    accuracy                           0.22        67
   macro avg       0.21      0.21      0.20        67
weighted avg       0.23      0.22      0.22        67



In [78]:
# Random forest, scored with shuffled 5-fold cross-validation over all rows.
# No separate train/test split is made here: cross_val_score does its own
# splitting, so a hold-out split would go unused in this cell.
X = df_transformed[df_transformed.columns[6:]]
y = df['Type']

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

cross_val_scores = cross_val_score(rf_classifier, X, y, cv=kf, scoring='accuracy')

# Use a dedicated loop name so the notebook-level `accuracy` metric from the
# other model cells is not overwritten by the last fold's score.
for fold, fold_accuracy in enumerate(cross_val_scores, start=1):
    print(f"Fold {fold}: Accuracy = {fold_accuracy:.2f}")

print(f"Mean Accuracy: {cross_val_scores.mean():.2f}")

Fold 1: Accuracy = 0.31
Fold 2: Accuracy = 0.36
Fold 3: Accuracy = 0.41
Fold 4: Accuracy = 0.30
Fold 5: Accuracy = 0.39
Mean Accuracy: 0.35


In [79]:
# RBF-kernel support vector classifier on the same 70/30 split.
# Features are every column after the first 6 (the one-hot Type indicators).
X = df_transformed.iloc[:, 6:]
y = df['Type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

svm_classifier = SVC(kernel='rbf', random_state=42)
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

classification_rep = classification_report(
    y_test, y_pred, zero_division=1
)
print("Classification Report:", classification_rep)

Accuracy: 0.34328358208955223
Classification Report:               precision    recall  f1-score   support

    Electric       0.00      0.00      0.00        13
        Fire       0.14      0.17      0.15         6
       Grass       1.00      0.00      0.00        10
      Normal       0.43      0.64      0.51        14
     Psychic       0.70      0.78      0.74         9
       Water       0.21      0.40      0.28        15

    accuracy                           0.34        67
   macro avg       0.41      0.33      0.28        67
weighted avg       0.39      0.34      0.28        67



In [80]:
# k-nearest-neighbours (k = 13) on the same 70/30 split.
# Features are every column after the first 6 (the one-hot Type indicators).
X = df_transformed.iloc[:, 6:]
y = df['Type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

knn_classifier = KNeighborsClassifier(n_neighbors=13)
knn_classifier.fit(X_train, y_train)
y_pred = knn_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

classification_rep = classification_report(
    y_test, y_pred, zero_division=1
)
print("Classification Report:", classification_rep)

Accuracy: 0.29850746268656714
Classification Report:               precision    recall  f1-score   support

    Electric       0.00      0.00      0.00        13
        Fire       0.14      0.33      0.20         6
       Grass       0.50      0.10      0.17        10
      Normal       0.42      0.57      0.48        14
     Psychic       1.00      0.11      0.20         9
       Water       0.27      0.53      0.36        15

    accuracy                           0.30        67
   macro avg       0.39      0.27      0.23        67
weighted avg       0.37      0.30      0.25        67



In [81]:
# Gradient boosting (100 estimators) on the same 70/30 split.
# Features are every column after the first 6 (the one-hot Type indicators).
X = df_transformed.iloc[:, 6:]
y = df['Type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_classifier.fit(X_train, y_train)
y_pred = gb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

classification_rep = classification_report(
    y_test, y_pred, zero_division=1
)
print("Classification Report:", classification_rep)

Accuracy: 0.26865671641791045
Classification Report:               precision    recall  f1-score   support

    Electric       0.22      0.15      0.18        13
        Fire       0.11      0.17      0.13         6
       Grass       0.00      0.00      0.00        10
      Normal       0.36      0.57      0.44        14
     Psychic       0.50      0.44      0.47         9
       Water       0.18      0.20      0.19        15

    accuracy                           0.27        67
   macro avg       0.23      0.26      0.24        67
weighted avg       0.24      0.27      0.25        67

