First we install vLLM and the other dependencies. Note that you'll have to restart the session afterwards.

In [None]:
# Print the current working directory.
!pwd

In [None]:
# !git clone https://github.com/Lux-AI-Challenge/Lux-Design-S3
# %cd Lux-Design-S3
# NOTE(review): "src" is a relative path and the clone/%cd lines above are
# commented out, so this presumably expects the notebook to already be running
# from inside a Lux-Design-S3 checkout — confirm.
!pip install -e src

In [None]:
# Upgrade the packaging tools quietly, then install the environment package,
# transformers and vllm.
!pip install --upgrade pip setuptools -q
!pip install luxai-s3 transformers vllm

In [None]:
# Install the training libraries (trl, datasets) quietly.
!pip install trl datasets -q

In [None]:
# Print the current working directory again.
!pwd

In [None]:
# !git clone https://github.com/Lux-AI-Challenge/Lux-Design-S3
# NOTE(review): the clone above is commented out, so this rename only works if
# a Lux-Design-S3 directory already exists here. The next cell imports from
# "lux1.src.luxai_s3.env", which does not match the "lux_ai_env" name used
# here — verify which directory name is intended.
!mv Lux-Design-S3 lux_ai_env

In [None]:
# self_play_generator.py
import os

from lux1.src.luxai_s3.env import LuxAIS3Env
import json
import random

def generate_self_play_data(num_episodes=100):
    """Roll out random-vs-random episodes and record what happened.

    A single ``LuxAIS3Env`` is created, reset once with ``42`` and then reset
    again (without arguments) at the start of every episode. On each step the
    rendered state is stored, every unit of both teams receives a uniformly
    random move plus a sap target equal to its own position, and the reward
    returned by ``env.step`` is stored.

    Args:
        num_episodes: Number of episodes to play.

    Returns:
        A list with one dict per episode, each holding the parallel lists
        ``"states"``, ``"actions"`` and ``"rewards"``.
    """
    move_choices = ["center", "up", "right", "down", "left"]
    team_names = ("player_0", "player_1")

    env = LuxAIS3Env()
    env.reset(42)

    episodes = []
    for _ in range(num_episodes):
        env.reset()
        states, joint_actions, rewards = [], [], []

        finished = False
        while not finished:
            states.append(env.render())

            # One action map per team, keyed by unit id.
            actions = {name: {} for name in team_names}
            for name in team_names:
                team_actions = actions[name]
                for unit in env.state.units[name].values():
                    team_actions[unit.unit_id] = {
                        "move": random.choice(move_choices),
                        "sap": (unit.pos.x, unit.pos.y),
                    }
            joint_actions.append(actions)

            # Advance the environment; only the reward and done flag are kept.
            _, step_reward, finished, _ = env.step(actions)
            rewards.append(step_reward)

        episodes.append({
            "states": states,
            "actions": joint_actions,
            "rewards": rewards,
        })

    return episodes

# Generate 100 self-play episodes and persist them as JSON
data = generate_self_play_data(num_episodes=100)
with open("self_play_dataset.json", mode="w") as f:
    json.dump(data, fp=f)

Next we define the reward function that scores player_0's actions for a given game state:

In [None]:
# reward_functions.py
import json

def _team_score(team):
    """Return a team's score from either a dict entry or a ``.score`` attribute."""
    # States loaded back from JSON hold plain dicts, while live environment
    # objects may expose the score as an attribute; accept both.
    if isinstance(team, dict):
        return team["score"]
    return team.score


def compute_rewards(state, actions):
    """Compute a shaped scalar reward from player_0's point of view.

    The reward is the sum of:

    * relic lead: ``(relics_0 - relics_1) * 2.0``
    * energy lead: ``(total_energy_0 - total_energy_1) * 0.1``
    * ``0.5`` for every player_0 action that carries a ``"sap"`` key
    * ``+1.0`` / ``-1.0`` when ``state["collision_outcome"]`` names
      player_0 / player_1
    * once ``state["real_env_steps"] >= 100``: ``+10.0`` if player_0's score
      is strictly higher, otherwise ``-10.0`` (a draw counts as a loss)

    Args:
        state: Mapping with the keys ``"relics_collected"``, ``"units"``,
            ``"collision_outcome"`` and ``"real_env_steps"``; ``"teams"`` is
            only read on the terminal branch. Team entries may be dicts with
            a ``"score"`` key or objects with a ``score`` attribute.
        actions: Mapping whose ``"player_0"`` entry maps unit ids to action
            dicts.

    Returns:
        The accumulated reward as a float.

    Raises:
        KeyError: If a required key is missing from ``state`` or ``actions``.
    """
    reward = 0.0

    # Relic collection
    relics = state["relics_collected"]
    reward += (relics["player_0"] - relics["player_1"]) * 2.0

    # Energy preservation
    units = state["units"]
    energy0 = sum(u["energy"] for u in units["player_0"].values())
    energy1 = sum(u["energy"] for u in units["player_1"].values())
    reward += (energy0 - energy1) * 0.1

    # Sapping: counts every action that carries a "sap" key, whether or not
    # the sap had any effect.
    sapped_units = sum(1 for action in actions["player_0"].values() if "sap" in action)
    reward += sapped_units * 0.5

    # Collision advantage
    collision_winner = state["collision_outcome"]
    if collision_winner == "player_0":
        reward += 1.0
    elif collision_winner == "player_1":
        reward -= 1.0

    # Terminal match reward
    if state["real_env_steps"] >= 100:
        teams = state["teams"]
        if _team_score(teams["player_0"]) > _team_score(teams["player_1"]):
            reward += 10.0
        else:
            reward -= 10.0

    return reward

In [None]:
# train_grpo.py
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
import json

# Load the self-play episodes written by the data-generation cell
with open("self_play_dataset.json", "r") as f:
    dataset = json.load(f)

# Prepare training data: one row per recorded step.
grpo_dataset = []
for episode in dataset:
    # NOTE: the reward recorded from the environment is unpacked but not used;
    # the stored "reward" is recomputed with compute_rewards instead.
    for state, action, reward in zip(episode["states"], episode["actions"], episode["rewards"]):
        grpo_dataset.append({
            # reward_func recovers the state by splitting the prompt on the
            # <state>...</state> tags, so the prompt must contain them.
            "prompt": f"<state>{json.dumps(state)}</state>",
            # Only player_0's actions are used as the reference completion.
            "completion": json.dumps(action["player_0"]),
            "reward": compute_rewards(state, action)
        })

# Training configuration
training_args = GRPOConfig(
    output_dir="outputs/lux_ai_agent",
    learning_rate=3e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_prompt_length=512,
    max_completion_length=256,
    report_to="none"
)

# Initialize model and tokenizer
# NOTE(review): the device is hard-coded to "cuda", so this cell presumably
# requires a GPU runtime — confirm.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct").to("cuda")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Define reward function
def reward_func(prompts, completions, invalid_reward=-1.0, **kwargs):
    """Score every sampled completion with ``compute_rewards``.

    The GRPO trainer calls reward functions with keyword arguments
    (``prompts``, ``completions`` and further per-sample columns) and expects
    one float per completion, so this function accepts ``**kwargs`` and
    returns a list.

    Args:
        prompts: One prompt per completion. A single string is also accepted
            and is reused for every completion. Each prompt holds the game
            state as JSON, optionally wrapped in ``<state>...</state>`` tags.
        completions: Sampled completions, either plain strings or chat-style
            messages (a dict, or a list of dicts, with a ``"content"`` key).
        invalid_reward: Reward given to a completion that is not a JSON
            object.
        **kwargs: Extra columns passed by the trainer; ignored.

    Returns:
        A list of floats, one reward per completion.

    Raises:
        json.JSONDecodeError: If a prompt does not contain a valid JSON state.
    """
    def _as_text(item):
        # Chat-style samples carry their text under "content".
        if isinstance(item, str):
            return item
        if isinstance(item, dict):
            return item["content"]
        return item[-1]["content"]

    def _parse_state(prompt_text):
        # Prompts may carry the state inside <state> tags or as bare JSON.
        if "<state>" in prompt_text:
            prompt_text = prompt_text.split("<state>", 1)[1].split("</state>", 1)[0]
        return json.loads(prompt_text)

    if isinstance(prompts, str):
        prompts = [prompts] * len(completions)

    rewards = []
    for prompt, completion in zip(prompts, completions):
        try:
            action = json.loads(_as_text(completion))
        except ValueError:
            # Malformed JSON from the model is penalised instead of crashing
            # the training loop.
            rewards.append(invalid_reward)
            continue
        if not isinstance(action, dict):
            rewards.append(invalid_reward)
            continue

        state = _parse_state(_as_text(prompt))
        # compute_rewards indexes actions["player_0"]; a bare unit->action map
        # is treated as player_0's actions.
        actions = action if "player_0" in action else {"player_0": action}
        rewards.append(float(compute_rewards(state, actions)))

    return rewards

# Build the GRPO trainer from the objects prepared above, then run training
trainer = GRPOTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=grpo_dataset,
    reward_funcs=[reward_func],
)

trainer.train()

We now wrap the trained model in an agent class that generates actions from a game state:

In [None]:
# agent.py
from transformers import AutoModelForCausalLM, AutoTokenizer
import json

class LuxAILLM:
    """Generate Lux AI actions with the causal LM stored in ``outputs/lux_ai_agent``."""

    def __init__(self, model_type="cpu"):
        """Load the tokenizer and model from ``outputs/lux_ai_agent``.

        Args:
            model_type: ``"cuda"`` moves the model to the GPU; any other value
                keeps it on the CPU.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("outputs/lux_ai_agent")
        device = "cuda" if model_type == "cuda" else "cpu"
        self.model = AutoModelForCausalLM.from_pretrained("outputs/lux_ai_agent").to(device)

    @staticmethod
    def _build_prompt(game_state):
        """Render the prompt for ``game_state``, ending on an open ``<action>`` tag."""
        if isinstance(game_state, str):
            state_text = game_state
        else:
            try:
                # JSON rather than a Python repr, so the state text uses the
                # same syntax the action answer is parsed with.
                state_text = json.dumps(game_state)
            except TypeError:
                # Not JSON-serialisable: fall back to the plain string form.
                state_text = str(game_state)

        prompt = """
        <rules>
        You are playing Lux AI Season 3. Units can move in 5 directions, sap enemy units, and collect relics. Collisions destroy units. Maintain energy and maximize relic collection.
        </rules>
        <state>
        {game_state}
        </state>
        <think>
        Planning optimal moves...
        </think>
        <action>
        """.format(game_state=state_text)
        # The <action> tag is left open so the model writes the answer itself;
        # a pre-filled, closed "<action>{}</action>" would be parsed as the
        # answer instead.
        return prompt.rstrip() + "\n"

    @staticmethod
    def _extract_actions(completion):
        """Parse the JSON action object out of the generated text.

        Everything up to the first ``</action>`` is considered; if the model
        repeated the opening tag, only the text after it is parsed.

        Raises:
            json.JSONDecodeError: If the extracted text is not valid JSON.
        """
        head = completion.partition("</action>")[0]
        body = head.rpartition("<action>")[2]
        return json.loads(body)

    def generate_actions(self, game_state):
        """Sample a completion for ``game_state`` and return the parsed actions.

        Args:
            game_state: The game state, as a string or a JSON-serialisable
                object.

        Returns:
            The decoded JSON value found in the model's ``<action>`` section.

        Raises:
            json.JSONDecodeError: If the model output contains no valid JSON
                action.
        """
        prompt = self._build_prompt(game_state)

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        # temperature only takes effect when sampling is enabled.
        outputs = self.model.generate(
            **inputs, max_new_tokens=512, do_sample=True, temperature=0.7
        )

        # The returned sequence starts with the prompt tokens; decode only the
        # newly generated part so the prompt template is never parsed.
        prompt_length = inputs["input_ids"].shape[1]
        completion = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        return self._extract_actions(completion)

# Example usage: build a small hand-written state and print the generated actions
agent = LuxAILLM()
game_state = dict(
    units=dict(
        player_0=[dict(id="u0", pos=(3, 4), energy=80)],
        player_1=[dict(id="u1", pos=(8, 9), energy=65)],
    ),
    relics=[dict(pos=(5, 6))],
    energy_nodes=[dict(pos=(12, 10))],
)
actions = agent.generate_actions(game_state)
print(actions)

Finally, we evaluate the trained agent against a random baseline:

In [None]:
# evaluate_agent.py
from luxai_s3.env import LuxAI_S3
from agent import LuxAILLM

def evaluate_agent(llm_agent, baseline_agent, episodes=100):
    """Play ``episodes`` games of the LLM agent against a random opponent.

    player_0 is driven by ``llm_agent.generate_actions(state)``; player_1
    gives every unit a uniformly random move and a sap target equal to the
    unit's own position. An episode counts as a win when player_0's team
    score is strictly higher once the episode is done.

    Args:
        llm_agent: Object exposing ``generate_actions(state)``.
        baseline_agent: Currently unused; the opponent is always the random
            policy described above.
        episodes: Number of episodes to play.

    Returns:
        The fraction of episodes won by player_0, or ``0.0`` when
        ``episodes`` is not positive.
    """
    # Imported locally because this cell does not import ``random`` itself.
    import random

    # Avoid a ZeroDivisionError (and needless environment setup) when there
    # is nothing to play.
    if episodes <= 0:
        return 0.0

    env = LuxAI_S3(env_cfg="env.cfg", seed=42)
    moves = ["center", "up", "right", "down", "left"]
    wins = 0

    for _ in range(episodes):
        env.reset()
        done = False

        while not done:
            state = env.render()

            # LLM agent actions
            llm_actions = llm_agent.generate_actions(state)

            # Baseline agent actions (random policy)
            baseline_actions = {}
            for unit in env.state.units["player_1"].values():
                move = random.choice(moves)
                baseline_actions[unit.unit_id] = {
                    "move": move,
                    "sap": (unit.pos.x, unit.pos.y)
                }

            # Step environment
            obs, reward, done, info = env.step({
                "player_0": llm_actions,
                "player_1": baseline_actions
            })

        if env.state.teams["player_0"].score > env.state.teams["player_1"].score:
            wins += 1

    return wins / episodes

# Instantiate the agent with its defaults, play 10 episodes and report the win rate
llm_agent = LuxAILLM()
win_rate = evaluate_agent(llm_agent=llm_agent, baseline_agent=None, episodes=10)
print(f"Win rate: {win_rate:.2f}")