<a href="https://colab.research.google.com/github/nick-kann/Xatu-AI/blob/main/BuildDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
import sqlite3
import json
import pandas as pd

# **Creating the Dataset**:

The focus will be on games in the Gen 9 OU format, since it is the most popular format in which each player can see the opponent's entire team before choosing a leading Pokémon. Only high-elo games will be used in the data, as higher-elo players typically apply more logic when selecting their leading Pokémon. In contrast, lower-elo players often choose the same Pokémon repeatedly or pick randomly, which complicates the model's learning process. The top 500 players in Gen 9 OU are typically above 1650 elo, but to obtain a few more data points, the elo cutoff is set at 1600. The data is obtained by making HTTP GET requests to the Pokémon Showdown replay server.

In [3]:
import requests
from IPython.display import clear_output

base_url = "https://replay.pokemonshowdown.com/search.json?format=gen9ou"

# Seconds to wait on a single request before giving up, so that one stalled
# connection cannot hang the whole crawl indefinitely.
REQUEST_TIMEOUT = 30

all_data = []
last_uploadtime = None
total_fetched = 0

# Walk backwards through the replay index. Each request passes the upload time
# of the last record received so far as the `before` cursor; the loop ends on
# the first empty page or on the first failed request. Whatever was collected
# up to that point stays in `all_data`.
# A single Session is reused so the connection is kept alive between pages.
with requests.Session() as session:
    while True:
        if last_uploadtime is None:
            url = base_url
        else:
            url = f"{base_url}&before={last_uploadtime}"

        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Timeouts / connection errors: report and stop, mirroring the
            # handling of a non-200 status below.
            print("\nError fetching data:", exc)
            break

        if response.status_code != 200:
            print("Error fetching data:", response.status_code)
            break

        data = response.json()

        if not data:
            break

        all_data.extend(data)

        # Cursor for the next page.
        last_uploadtime = data[-1]["uploadtime"]

        total_fetched = len(all_data)
        print(f"\rTotal replays fetched: {total_fetched}", end='')


print(f"\nTotal replays fetched: {total_fetched}")


Total replays fetched: 637620
Total replays fetched: 637620


In [4]:
# Turn the list of replay-search records collected above into a DataFrame
# (one row per record) and display it.
df = pd.DataFrame(all_data)
df

Unnamed: 0,uploadtime,id,format,players,rating,private,password
0,1727719185,gen9ou-2212980451,[Gen 9] OU,"[gostojao0705, Thehombre]",1258.0,0,
1,1727719150,gen9ou-2212971406,[Gen 9] OU,"[sssensory, Samarand]",1284.0,0,
2,1727719149,gen9ou-2212975665,[Gen 9] OU,"[Ahsan-219, NeoNaruto✈️_✈️]",1746.0,0,
3,1727719149,gen9ou-2212973032,[Gen 9] OU,"[Banditlegend42, idontwearsweaters]",1097.0,0,
4,1727719149,gen9ou-2212971574,[Gen 9] OU,"[Bolanero, lynchfanaccount]",1386.0,0,
...,...,...,...,...,...,...,...
637615,1669316114,smogtours-gen9ou-662498,[Gen 9] OU,"[Charmflash, HarryBW247]",,0,
637616,1669315924,smogtours-gen9ou-662497,[Gen 9] OU,"[Charmflash, HarryBW247]",,0,
637617,1669313957,smogtours-gen9ou-662495,[Gen 9] OU,"[Vileman, BeatsBlack]",,0,
637618,1669313259,smogtours-gen9ou-662491,[Gen 9] OU,"[Vileman, BeatsBlack]",,0,


In [5]:
# Keep only the replays that carry an elo rating; rows whose rating is missing are discarded
df = df[df['rating'].notna()]
df

Unnamed: 0,uploadtime,id,format,players,rating,private,password
0,1727719185,gen9ou-2212980451,[Gen 9] OU,"[gostojao0705, Thehombre]",1258.0,0,
1,1727719150,gen9ou-2212971406,[Gen 9] OU,"[sssensory, Samarand]",1284.0,0,
2,1727719149,gen9ou-2212975665,[Gen 9] OU,"[Ahsan-219, NeoNaruto✈️_✈️]",1746.0,0,
3,1727719149,gen9ou-2212973032,[Gen 9] OU,"[Banditlegend42, idontwearsweaters]",1097.0,0,
4,1727719149,gen9ou-2212971574,[Gen 9] OU,"[Bolanero, lynchfanaccount]",1386.0,0,
...,...,...,...,...,...,...,...
631769,1701532085,gen9ou-2003211714,[Gen 9] OU,"[mywifenkids, i am ass2]",1435.0,0,
631770,1701532074,gen9ou-2003211704,[Gen 9] OU,"[ortegajd, Seltzer Time]",1359.0,0,
631771,1701532061,gen9ou-2003211656,[Gen 9] OU,"[Ehdhdhdh, alle43]",1457.0,0,
631772,1701532057,gen9ou-2003211428,[Gen 9] OU,"[Sknmdeelectricidad, Adel19]",1654.0,0,


In [27]:
# Restrict the dataframe to games rated 1600 elo or higher
df_high_elo = df.loc[df['rating'] >= 1600]
df_high_elo

Unnamed: 0,uploadtime,id,format,players,rating,private,password
2,1727719149,gen9ou-2212975665,[Gen 9] OU,"[Ahsan-219, NeoNaruto✈️_✈️]",1746.0,0,
7,1727719010,gen9ou-2212974011,[Gen 9] OU,"[Klopple, Light SV]",1798.0,0,
12,1727718870,gen9ou-2212972756,[Gen 9] OU,"[LaBaleada, Ahsan-219]",1667.0,0,
19,1727718591,gen9ou-2212969784,[Gen 9] OU,"[butterycrap, Ahsan-219]",1816.0,0,
22,1727718558,gen9ou-2212972925,[Gen 9] OU,"[Light SV, RegularGreg]",1775.0,0,
...,...,...,...,...,...,...,...
631701,1701532579,gen9ou-2003212810,[Gen 9] OU,"[ruebs, Hoot-hoot Shiny]",1703.0,0,
631708,1701532548,gen9ou-2003213133,[Gen 9] OU,"[alvar03, Kurosu eX]",1658.0,0,
631725,1701532412,gen9ou-2003212100,[Gen 9] OU,"[TrepYT, StazMTA]",1621.0,0,
631745,1701532248,gen9ou-2003211245,[Gen 9] OU,"[Msousagamer, repete64]",1678.0,0,


**With the high-elo replays collected, the next step is to obtain the specific game-data for each replay.**

In [28]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to fetch a single game log
def fetch_game_log(game_id, timeout=30):
    """Download the JSON document for one replay and return it decoded.

    Args:
        game_id: Replay identifier, inserted verbatim into the replay URL
            (e.g. ``"gen9ou-2212975665"``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On timeouts and other transport failures.
        ValueError: If the response body is not valid JSON.
    """
    url = f"https://replay.pokemonshowdown.com/{game_id}.json"
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error rather than an opaque JSON decode error
    # when the server returns an error page.
    response.raise_for_status()
    return response.json()

game_logs = []
# Ids whose download raised, kept so they can be inspected or retried later.
failed_ids = []
ids = df_high_elo['id'].tolist()

n = len(ids)

# Use ThreadPoolExecutor to parallelize calls
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its replay id so a failure can be attributed.
    future_to_id = {
        executor.submit(fetch_game_log, game_id): game_id for game_id in ids
    }

    # Results are collected in completion order, not in the order of `ids`.
    for i, future in enumerate(as_completed(future_to_id)):
        try:
            game_logs.append(future.result())
        except (requests.RequestException, ValueError):
            # Transport/HTTP failures and undecodable bodies (JSON decode
            # errors are ValueErrors): skip this replay instead of aborting
            # the whole run on a single bad download.
            failed_ids.append(future_to_id[future])
        print(f"\r{i + 1}/{n} games processed", end='')

print("\nAll games processed.")
if failed_ids:
    print(f"{len(failed_ids)} replays could not be downloaded, e.g. {failed_ids[:10]}")

78061/78061 games processed
All games processed.


In [31]:
len(game_logs)

78061

**Now that all the games are processed, a function is needed to extract each player's team and leading Pokémon from the raw data.**

In [45]:
import re

def extract_teams(battle_log: str):
    """Extract both players' rosters and opening Pokémon from a raw battle log.

    Args:
        battle_log: The replay's log text, containing ``poke|p1|...`` /
            ``poke|p2|...`` entries and ``switch|p1a: ...|...`` /
            ``switch|p2a: ...|...`` entries.

    Returns:
        A ``(teams, leading_pokemon)`` tuple. ``teams`` maps ``"p1"`` and
        ``"p2"`` to the set of names found in that player's ``poke`` entries.
        ``leading_pokemon`` maps ``"p1"`` and ``"p2"`` to the name in that
        player's first ``switch`` entry, or ``None`` if there is none.
    """
    def _clean(raw_name):
        # Trim surrounding whitespace (e.g. a captured newline) and map the
        # "Zamazenta-*" spelling used in game logs to plain "Zamazenta".
        return re.sub(r'Zamazenta-\*', 'Zamazenta', raw_name.strip())

    # Full rosters: every `poke` entry names one team member; the capture
    # stops at the first comma or pipe, dropping details such as level.
    teams = {"p1": set(), "p2": set()}
    for side, species in re.findall(r'poke\|(p1|p2)\|([^|,]+)', battle_log):
        teams[side].add(_clean(species))

    # Leads: the first `switch` entry seen for each player's active slot.
    leading_pokemon = {"p1": None, "p2": None}
    for slot, species in re.findall(r'switch\|(p1a|p2a): [^|]+\|([^|,]+)', battle_log):
        if None not in leading_pokemon.values():
            # Both leads are known; later switches are irrelevant.
            break
        side = slot[:2]  # "p1a" -> "p1", "p2a" -> "p2"
        if leading_pokemon[side] is None:
            leading_pokemon[side] = _clean(species)

    return teams, leading_pokemon

In [46]:
# Run extract_teams over the 'log' field of every downloaded replay; each entry
# of game_teams is the (teams, leading_pokemon) pair returned for one game.
game_teams = [extract_teams(game['log']) for game in game_logs]

In [47]:
import csv

# Number of p*_poke columns the header reserves for each player.
TEAM_SIZE = 6

# Write one CSV row per game: a running id, six roster columns per player,
# then each player's leading Pokémon.
# The encoding is pinned so names with non-ASCII characters are written the
# same way regardless of the platform's default encoding.
with open('/content/dataset.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["id", "p1_poke1", "p1_poke2", "p1_poke3", "p1_poke4",
                     "p1_poke5", "p1_poke6", "p2_poke1", "p2_poke2", "p2_poke3",
                     "p2_poke4", "p2_poke5", "p2_poke6", "p1_choice", "p2_choice"])
    for row_id, (teams, choices) in enumerate(game_teams, start=1):
        row = [row_id]
        for side in ("p1", "p2"):
            # Rosters are sets, whose iteration order is arbitrary; sorting
            # makes the column order reproducible between runs.
            members = sorted(teams[side])
            # Pad short rosters with empty cells so the second player's
            # columns and the choice columns stay aligned with the header.
            members += [""] * (TEAM_SIZE - len(members))
            row.extend(members)
        # A missing lead (None) is written by csv as an empty cell.
        row.extend(choices[side] for side in ("p1", "p2"))
        writer.writerow(row)

In [48]:
# Colab-only helper module: this import is only available inside Google Colab.
from google.colab import files

# Hand the CSV written above to google.colab's download helper.
files.download('/content/dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>