<a href="https://colab.research.google.com/github/nick-kann/Xatu-AI/blob/main/BuildDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import sqlite3
import json
import pandas as pd

# **Creating the Dataset**:

The focus will be on games in the Gen 5 OU format, since it has the smallest variety of Pokémon, which helps reduce model dimensionality. Only the top 5000 games by Elo (rating >= 1250) will be included, as higher-Elo players typically use logic when selecting their leading Pokémon. In contrast, lower-Elo players often choose the same Pokémon repeatedly or pick randomly, which complicates the model’s learning process. Data will be obtained by making HTTP GET requests to the Pokémon Showdown server.

**Note:** the cells below currently query the `gen9ou` format and filter at a rating of 1600, so the format and threshold described here do not match the code as written.

In [7]:
import requests
from IPython.display import clear_output

base_url = "https://replay.pokemonshowdown.com/search.json?format=gen9ou"

# Seconds to wait on a single request before giving up; without a timeout a
# stalled connection would hang this cell indefinitely.
REQUEST_TIMEOUT = 30

all_data = []
last_uploadtime = None
total_fetched = 0

# Page through the replay search results. The first request has no cursor;
# every later request passes `before=<uploadtime of the last entry on the
# previous page>` so the server returns the next batch.
while True:
    if last_uploadtime is None:
        url = base_url
    else:
        url = f"{base_url}&before={last_uploadtime}"

    response = requests.get(url, timeout=REQUEST_TIMEOUT)

    if response.status_code != 200:
        print("Error fetching data:", response.status_code)
        break

    data = response.json()

    # An empty page means there is nothing left to fetch.
    if not data:
        break

    next_uploadtime = data[-1]["uploadtime"]

    # If the cursor did not move, the next request would be identical to this
    # one and the loop would never terminate, so stop here.
    if next_uploadtime == last_uploadtime:
        break

    all_data.extend(data)
    last_uploadtime = next_uploadtime

    total_fetched = len(all_data)
    # "\r" rewrites the same output line instead of printing one line per page.
    print(f"\rTotal replays fetched: {total_fetched}", end='')


print(f"\nTotal replays fetched: {total_fetched}")


Total replays fetched: 635070
Total replays fetched: 635070


In [8]:
# Collect every fetched search result into a single DataFrame.
df = pd.DataFrame(all_data)
df

Unnamed: 0,uploadtime,id,format,players,rating,private,password
0,1727636955,gen9ou-2212314009,[Gen 9] OU,"[Rj yoip, oufcourse]",1790.0,0,
1,1727636955,gen9ou-2212312220,[Gen 9] OU,"[gilberr123, Doctor VIbez]",1190.0,0,
2,1727636955,gen9ou-2212312100,[Gen 9] OU,"[Nikator98, Gr8 crit m8]",1522.0,0,
3,1727636955,gen9ou-2212312029,[Gen 9] OU,"[airman99, noahthebadplayer]",1045.0,0,
4,1727636817,gen9ou-2212313153,[Gen 9] OU,"[Haou29, Delibird=Santa]",1127.0,0,
...,...,...,...,...,...,...,...
635065,1669316114,smogtours-gen9ou-662498,[Gen 9] OU,"[Charmflash, HarryBW247]",,0,
635066,1669315924,smogtours-gen9ou-662497,[Gen 9] OU,"[Charmflash, HarryBW247]",,0,
635067,1669313957,smogtours-gen9ou-662495,[Gen 9] OU,"[Vileman, BeatsBlack]",,0,
635068,1669313259,smogtours-gen9ou-662491,[Gen 9] OU,"[Vileman, BeatsBlack]",,0,


In [15]:
# Drop replays that have no rating value (in the preview above, e.g. the
# `smogtours-` games show an empty rating).
df = df.dropna(subset=['rating'])
df

Unnamed: 0,uploadtime,id,format,players,rating,private,password
0,1727636955,gen9ou-2212314009,[Gen 9] OU,"[Rj yoip, oufcourse]",1790.0,0,
1,1727636955,gen9ou-2212312220,[Gen 9] OU,"[gilberr123, Doctor VIbez]",1190.0,0,
2,1727636955,gen9ou-2212312100,[Gen 9] OU,"[Nikator98, Gr8 crit m8]",1522.0,0,
3,1727636955,gen9ou-2212312029,[Gen 9] OU,"[airman99, noahthebadplayer]",1045.0,0,
4,1727636817,gen9ou-2212313153,[Gen 9] OU,"[Haou29, Delibird=Santa]",1127.0,0,
...,...,...,...,...,...,...,...
629219,1701532085,gen9ou-2003211714,[Gen 9] OU,"[mywifenkids, i am ass2]",1435.0,0,
629220,1701532074,gen9ou-2003211704,[Gen 9] OU,"[ortegajd, Seltzer Time]",1359.0,0,
629221,1701532061,gen9ou-2003211656,[Gen 9] OU,"[Ehdhdhdh, alle43]",1457.0,0,
629222,1701532057,gen9ou-2003211428,[Gen 9] OU,"[Sknmdeelectricidad, Adel19]",1654.0,0,


In [20]:
# Keep only the games whose rating is 1600 or higher.
df_high_elo = df[df['rating'] >= 1600]
df_high_elo

Unnamed: 0,uploadtime,id,format,players,rating,private,password
0,1727636955,gen9ou-2212314009,[Gen 9] OU,"[Rj yoip, oufcourse]",1790.0,0,
11,1727636550,gen9ou-2212311598,[Gen 9] OU,"[gewwge, SMGs]",1720.0,0,
17,1727636402,gen9ou-2212307086,[Gen 9] OU,"[illuzionist low, Weed-le Plug]",1641.0,0,
21,1727636263,gen9ou-2212307285,[Gen 9] OU,"[besudo, ballz2012]",1651.0,0,
38,1727635849,gen9ou-2212300433,[Gen 9] OU,"[suhayb1910, fallen_gengar0]",1632.0,0,
...,...,...,...,...,...,...,...
629151,1701532579,gen9ou-2003212810,[Gen 9] OU,"[ruebs, Hoot-hoot Shiny]",1703.0,0,
629158,1701532548,gen9ou-2003213133,[Gen 9] OU,"[alvar03, Kurosu eX]",1658.0,0,
629175,1701532412,gen9ou-2003212100,[Gen 9] OU,"[TrepYT, StazMTA]",1621.0,0,
629195,1701532248,gen9ou-2003211245,[Gen 9] OU,"[Msousagamer, repete64]",1678.0,0,


**With 5000 high-elo replays collected, the next step is to obtain the specific game-data for each replay.**

In [None]:
# Download the full replay JSON (which contains the battle log under "log")
# for each high-elo game selected above.
game_logs = []
replay_ids = df_high_elo['id']
total_games = len(replay_ids)
i = 0
for replay_id in replay_ids:
    i += 1
    url = f"https://replay.pokemonshowdown.com/{replay_id}.json"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        # A single failed or malformed replay should not discard the
        # thousands already downloaded; report it and move on.
        print(f"\nSkipping {replay_id}: {error}")
        continue
    game_logs.append(data)
    # "\r" keeps the progress counter on a single output line.
    print(f"\r{i}/{total_games} games processed", end='')

73306/77732 games processed

In [2]:
# Sanity check: number of replay logs held in memory.
len(game_logs)

NameError: name 'game_logs' is not defined

**Now that all the games are processed, a function is needed to extract each player's team and leading Pokémon from the raw data.**

In [1]:
import re

def extract_teams(battle_log: str):
    """Extract both players' teams and leading Pokémon from a battle log.

    Args:
        battle_log: The raw replay log text, one protocol message per line.

    Returns:
        A ``(teams, leading_pokemon)`` tuple:

        * ``teams`` maps ``"p1"`` / ``"p2"`` to the set of species names
          announced on that player's ``|poke|`` lines.
        * ``leading_pokemon`` maps ``"p1"`` / ``"p2"`` to the species of that
          player's first ``|switch|`` line, or ``None`` if none was found.
    """
    teams = {"p1": set(), "p2": set()}
    leading_pokemon = {"p1": None, "p2": None}

    # Both patterns are anchored to the start of a line so that text such as
    # "poke|p1|..." typed inside a chat message is not mistaken for a real
    # protocol message. The species is everything up to the first "," or "|".
    poke_pattern = r'^\|poke\|(p1|p2)\|([^|,]+)'
    for player, pokemon in re.findall(poke_pattern, battle_log, flags=re.MULTILINE):
        # strip() removes a trailing newline when the species ends the line.
        teams[player].add(pokemon.strip())

    # Pattern to find the leading Pokémon for each player: the first switch
    # recorded for slot "p1a" / "p2a".
    switch_pattern = r'^\|switch\|(p1a|p2a): [^|]+\|([^|,]+)'
    for match in re.finditer(switch_pattern, battle_log, flags=re.MULTILINE):
        slot, pokemon = match.groups()
        player = slot[:2]  # "p1a" -> "p1"
        if leading_pokemon[player] is None:
            leading_pokemon[player] = pokemon.strip()
        # Stop scanning as soon as both leads are known.
        if None not in leading_pokemon.values():
            break

    return teams, leading_pokemon

In [None]:
# Parse every downloaded replay's "log" text into a (teams, leading_pokemon) pair.
game_teams = [extract_teams(game['log']) for game in game_logs]

In [None]:
import csv

# Number of team slots per player; must match the p?_poke1..p?_poke6 header columns.
TEAM_SIZE = 6

# Write one row per game: a running id, six Pokémon per player, then each
# player's leading Pokémon.
with open('/content/dataset.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["id", "p1_poke1", "p1_poke2", "p1_poke3", "p1_poke4",
                     "p1_poke5", "p1_poke6", "p2_poke1", "p2_poke2", "p2_poke3",
                     "p2_poke4", "p2_poke5", "p2_poke6", "p1_choice", "p2_choice"])
    for row_id, (teams, choices) in enumerate(game_teams, start=1):
        row = [row_id]
        for player in ("p1", "p2"):
            # Teams are sets, so sort them for a reproducible column order.
            # Pad (or trim) to exactly TEAM_SIZE entries so a short team can
            # never shift the following values into the wrong columns.
            members = sorted(teams[player])[:TEAM_SIZE]
            row.extend(members + [''] * (TEAM_SIZE - len(members)))
        # A missing lead is None, which csv writes as an empty field.
        row.extend(choices[player] for player in ("p1", "p2"))
        writer.writerow(row)

In [None]:
# Colab-only: hand the generated CSV to the browser as a file download.
from google.colab import files

files.download('/content/dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>