In [1]:
!pip install git+https://github.com/glmcdona/LuxPythonEnvGym.git
!cp -r ../input/lux-ai-2021/* .

Collecting git+https://github.com/glmcdona/LuxPythonEnvGym.git
  Cloning https://github.com/glmcdona/LuxPythonEnvGym.git to /tmp/pip-req-build-ceitcg3w
  Running command git clone -q https://github.com/glmcdona/LuxPythonEnvGym.git /tmp/pip-req-build-ceitcg3w
  Resolved https://github.com/glmcdona/LuxPythonEnvGym.git to commit 55e8ddc15012fd55f17b23aa2e73c919467e43e3
Collecting stable_baselines3==1.2.1a2
  Downloading stable_baselines3-1.2.1a2-py3-none-any.whl (173 kB)
[K     |████████████████████████████████| 173 kB 523 kB/s 
Collecting torch>=1.8.1
  Downloading torch-1.10.0-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
[K     |████████████████████████████████| 881.9 MB 13 kB/s 
Building wheels for collected packages: luxai2021
  Building wheel for luxai2021 (setup.py) ... [?25l- \ done
[?25h  Created wheel for luxai2021: filename=luxai2021-0.1.0-py3-none-any.whl size=46760 sha256=e3fb5f1c41c4e12eba09d9433f63c6d058a0521a40c69e458274a7726e1a1503
  Stored in directory

In [2]:
import numpy as np
import math
import json
from pathlib import Path
import os
import random
from tqdm import tqdm
import torch
from torch import nn
import torch.nn.functional as F
# from lux.game import Game
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.model_selection import train_test_split
import torchvision.models as models
from torch.autograd import Variable
from collections import OrderedDict, namedtuple, namedtuple, deque
import itertools
from shutil import copyfile
from kaggle_environments import make
from lux.game_constants import GAME_CONSTANTS
from lux.constants import Constants
from luxai2021.game.game import Game
from lux.game import Game as LuxGame
from luxai2021.game.actions import *
from luxai2021.game.constants import LuxMatchConfigs_Default
import copy

HPC = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
print("CUDA AVAILABLE: %s" % torch.cuda.is_available())


Loading environment football failed: No module named 'gfootball'
CUDA AVAILABLE: True


In [3]:
def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

seed = 17
seed_everything(seed)

In [4]:
class Reward():

    def __init__(self, game, team):
        """
        This funciton is called at the start of each game. Use this to
        reset and initialize per game. Note that self.team may have
        been changed since last game. The game map has been created
        and starting units placed.

        Args:
            game ([type]): Game.
        """
        super().__init__()
        self.team = team
        self.last_generated_fuel = game.stats["teamStats"][self.team]["fuelGenerated"]
        self.last_resources_collected = copy.deepcopy(game.stats["teamStats"][self.team]["resourcesCollected"])
        # if self.stats != None:
        #     self.stats_last_game = self.stats
        self.stats = {
            "rew/r_total": 0,
            "rew/r_wood": 0,
            "rew/r_coal": 0,
            "rew/r_uranium": 0,
            "rew/r_research": 0,
            "rew/r_city_tiles_end": 0,
            "rew/r_fuel_collected": 0,
            "rew/r_units": 0,
            "rew/r_city_tiles": 0,
            "game/turns": 0,
            "game/research": 0,
            "game/unit_count": 0,
            "game/cart_count": 0,
            "game/city_count": 0,
            "game/city_tiles": 0,
            "game/wood_rate_mined": 0,
            "game/coal_rate_mined": 0,
            "game/uranium_rate_mined": 0,
        }
        self.is_last_turn = False

        # Calculate starting map resources
        type_map = {
            Constants.RESOURCE_TYPES.WOOD: "WOOD",
            Constants.RESOURCE_TYPES.COAL: "COAL",
            Constants.RESOURCE_TYPES.URANIUM: "URANIUM",
        }

        self.fuel_collected_last = 0
        self.fuel_start = {}
        self.fuel_last = {}
        for type, type_upper in type_map.items():
            self.fuel_start[type] = 0
            self.fuel_last[type] = 0
            for c in game.map.resources_by_type[type]:
                self.fuel_start[type] += c.resource.amount * game.configs["parameters"]["RESOURCE_TO_FUEL_RATE"][
                    type_upper]

        self.research_last = 0
        self.units_last = 0
        self.city_tiles_last = 0

    def get_reward(self, game, is_game_finished, is_new_turn, is_game_error):
        """
        Returns the reward function for this step of the game.
        """
        if is_game_error:
            # Game environment step failed, assign a game lost reward to not incentivise this
            print("Game failed due to error")
            return -1.0

        if not is_new_turn and not is_game_finished:
            # Only apply rewards at the start of each turn
            return 0

        # Get some basic stats
        unit_count = len(game.state["teamStates"][self.team % 2]["units"])
        cart_count = 0
        for id, u in game.state["teamStates"][self.team % 2]["units"].items():
            if u.type == Constants.UNIT_TYPES.CART:
                cart_count += 1

        unit_count_opponent = len(game.state["teamStates"][(self.team + 1) % 2]["units"])
        research = min(game.state["teamStates"][self.team]["researchPoints"], 200.0)  # Cap research points at 200
        city_count = 0
        city_count_opponent = 0
        city_tile_count = 0
        city_tile_count_opponent = 0
        for city in game.cities.values():
            if city.team == self.team:
                city_count += 1
            else:
                city_count_opponent += 1

            for cell in city.city_cells:
                if city.team == self.team:
                    city_tile_count += 1
                else:
                    city_tile_count_opponent += 1

        # Basic stats
        self.stats["game/research"] = research
        self.stats["game/city_tiles"] = city_tile_count
        self.stats["game/city_count"] = city_count
        self.stats["game/unit_count"] = unit_count
        self.stats["game/cart_count"] = cart_count
        self.stats["game/turns"] = game.state["turn"]

        rewards = {}

        # Give up to 1.0 reward for each resource based on % of total mined.
        type_map = {
            Constants.RESOURCE_TYPES.WOOD: "WOOD",
            Constants.RESOURCE_TYPES.COAL: "COAL",
            Constants.RESOURCE_TYPES.URANIUM: "URANIUM",
        }
        fuel_now = {}
        for type, type_upper in type_map.items():
            fuel_now = game.stats["teamStats"][self.team]["resourcesCollected"][type] * \
                       game.configs["parameters"]["RESOURCE_TO_FUEL_RATE"][type_upper]
            rewards["rew/r_%s" % type] = (fuel_now - self.fuel_last[type]) / self.fuel_start[type]
            self.stats["game/%s_rate_mined" % type] = fuel_now / self.fuel_start[type]
            self.fuel_last[type] = fuel_now

        # Give more incentive for coal and uranium
        rewards["rew/r_%s" % Constants.RESOURCE_TYPES.COAL] *= 2
        rewards["rew/r_%s" % Constants.RESOURCE_TYPES.URANIUM] *= 4

        # Give a reward based on amount of fuel collected. 1.0 reward for each 20K fuel gathered.
        fuel_collected = game.stats["teamStats"][self.team]["fuelGenerated"]
        rewards["rew/r_fuel_collected"] = ((fuel_collected - self.fuel_collected_last) / 20000)
        self.fuel_collected_last = fuel_collected

        # Give a reward for unit creation/death. 0.05 reward per unit.
        rewards["rew/r_units"] = (unit_count - self.units_last) * 0.05
        self.units_last = unit_count

        # Give a reward for unit creation/death. 0.1 reward per city.
        rewards["rew/r_city_tiles"] = (city_tile_count - self.city_tiles_last) * 0.1
        self.city_tiles_last = city_tile_count

        # Tiny reward for research to help. Up to 0.5 reward for this.
        rewards["rew/r_research"] = (research - self.research_last) / (200 * 2)
        self.research_last = research

        # Give a reward up to around 50.0 based on number of city tiles at the end of the game
        rewards["rew/r_city_tiles_end"] = 0
        if is_game_finished:
            self.is_last_turn = True
            rewards["rew/r_city_tiles_end"] = city_tile_count

        # Update the stats and total reward
        reward = 0
        for name, value in rewards.items():
            self.stats[name] += value
            reward += value
        self.stats["rew/r_total"] += reward

        # Print the final game stats sometimes
        if is_game_finished and random.random() <= 0.15:
            stats_string = []
            for key, value in self.stats.items():
                stats_string.append("%s=%.2f" % (key, value))
            print(",".join(stats_string))

        return reward

# Preprocessing

In [5]:
def make_input(obs, unit_id):
    width, height = obs['width'], obs['height']
    x_shift = (32 - width) // 2
    y_shift = (32 - height) // 2
    cities = {}
    
    b = np.zeros((20, 32, 32), dtype=np.float32)
    
    for update in obs['updates']:
        strs = update.split(' ')
        input_identifier = strs[0]
        
        if input_identifier == 'u':
            x = int(strs[4]) + x_shift
            y = int(strs[5]) + y_shift
            wood = int(strs[7])
            coal = int(strs[8])
            uranium = int(strs[9])
            if unit_id == strs[3]:
                # Position and Cargo
                b[:2, x, y] = (
                    1,
                    (wood + coal + uranium) / 100
                )
            else:
                # Units
                team = int(strs[2])
                cooldown = float(strs[6])
                idx = 2 + (team - obs['player']) % 2 * 3
                b[idx:idx + 3, x, y] = (
                    1,
                    cooldown / 6,
                    (wood + coal + uranium) / 100
                )
        elif input_identifier == 'ct':
            # CityTiles
            team = int(strs[1])
            city_id = strs[2]
            x = int(strs[3]) + x_shift
            y = int(strs[4]) + y_shift
            idx = 8 + (team - obs['player']) % 2 * 2
            b[idx:idx + 2, x, y] = (
                1,
                cities[city_id]
            )
        elif input_identifier == 'r':
            # Resources
            r_type = strs[1]
            x = int(strs[2]) + x_shift
            y = int(strs[3]) + y_shift
            amt = int(float(strs[4]))
            b[{'wood': 12, 'coal': 13, 'uranium': 14}[r_type], x, y] = amt / 800
        elif input_identifier == 'rp':
            # Research Points
            team = int(strs[1])
            rp = int(strs[2])
            b[15 + (team - obs['player']) % 2, :] = min(rp, 200) / 200
        elif input_identifier == 'c':
            # Cities
            city_id = strs[2]
            fuel = float(strs[3])
            lightupkeep = float(strs[4])
            cities[city_id] = min(fuel / lightupkeep, 10) / 10
    
    # Day/Night Cycle
    b[17, :] = obs['step'] % 40 / 40
    # Turns
    b[18, :] = obs['step'] / 360
    # Map Size
    b[19, x_shift:32 - x_shift, y_shift:32 - y_shift] = 1

    return torch.tensor(b,dtype=torch.float, device=device).reshape([1, 20, 32, 32])

def to_label(action):
    strs = action.split(' ')
    unit_id = strs[1]
    if strs[0] == 'm':
        label = {'c': None, 'n': 0, 's': 1, 'w': 2, 'e': 3}[strs[2]]
    elif strs[0] == 'bcity':
        label = 4
    else:
        label = None
    return unit_id, label


def depleted_resources(obs):
    for u in obs['updates']:
        if u.split(' ')[0] == 'r':
            return False
    return True


def create_dataset_from_json(episode_dir,num_samples=10000, team_name='Toad Brigade',): 
    obses = {}
    samples = []
    append = samples.append
    counter = 0
    episodes = [path for path in Path(episode_dir).glob('*.json') if 'output' not in path.name]
    for i, filepath in enumerate(tqdm(episodes)): 
        with open(filepath) as f:
            json_load = json.load(f)

        ep_id = json_load['info']['EpisodeId']
        index = np.argmax([r or 0 for r in json_load['rewards']])
        if json_load['info']['TeamNames'][index] != team_name:
            continue
        config = LuxMatchConfigs_Default
        config['seed'] = json_load['configuration']['seed']
        game = Game(config)
        team = json_load['steps'][0][0]['observation']['player']
        rew = Reward(game,team)
        game.process_updates(json_load['steps'][0][0]['observation']['updates'])
        _ = rew.get_reward(game, False, True, False)
        for i in range(len(json_load['steps'])-1):

            if counter >= num_samples: break

            actions = json_load['steps'][i+1][index]['action']
            # reward = json_load['steps'][i+1][index]['reward']
            obs = json_load['steps'][i][0]['observation']
            obs_new = json_load['steps'][i+1][0]['observation']
            done = not json_load['steps'][i+1][index]['status'] == 'ACTIVE'
            game.process_updates(obs_new['updates'])
            reward = rew.get_reward(game, done, True, False)


            if depleted_resources(obs):
                break

            obs['player'] = index
            obs = dict([
                    (k,v) for k,v in obs.items()
                    if k in ['step', 'updates', 'player', 'width', 'height']
            ])
            obs_id = f'{ep_id}_{i}'
            obses[obs_id] = obs

            for action in actions:
                unit_id, label = to_label(action)
                if label is not None:
                    counter +=1
                    reward = torch.tensor([reward],device=device)
                    label = torch.tensor([[label]], device=device)
                    append((make_input(obs,unit_id), label, make_input(obs_new,unit_id), reward, True))

    return samples

    
def update_params(model, new_params, tau):
    params = model.state_dict()
    for k in params.keys():
        params[k] = (1-tau) * params[k] + tau * new_params[k]
    return params

# Training

In [6]:
# Input for Neural Network
class LuxDataset(Dataset):
    def __init__(self, obses, samples):
        self.obses = obses
        self.samples = samples
        
    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        obs_id, unit_id, action = self.samples[idx]
        obs = self.obses[obs_id]
        state = make_input(obs, unit_id)
        
        return state, action

In [7]:
class ResNeXt(nn.Module):
    def __init__(self, num_blocks, cardinality, bottleneck_width, expansion=2, num_classes=5):
        super(ResNeXt, self).__init__()
        self.cardinality = cardinality
        self.bottleneck_width = bottleneck_width
        self.in_planes = 64
        self.expansion = expansion
        
        self.conv0 = nn.Conv2d(20, self.in_planes, kernel_size=3, stride=1, padding=1)
        self.bn0 = nn.BatchNorm2d(self.in_planes)
        self.pool0 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1=self._make_layer(num_blocks[0],1)
        self.layer2=self._make_layer(num_blocks[1],2)
        self.layer3=self._make_layer(num_blocks[2],2)
        self.layer4=self._make_layer(num_blocks[3],2)
        self.linear = nn.Linear(self.cardinality * self.bottleneck_width, num_classes)

    def forward(self, x):
        out = F.relu(self.bn0(self.conv0(x)))
        # out = self.pool0(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

    def _make_layer(self, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(BasicBlock_C(self.in_planes, self.bottleneck_width, self.cardinality, stride, self.expansion))
            self.in_planes = self.expansion * self.bottleneck_width * self.cardinality
        self.bottleneck_width *= 2
        return nn.Sequential(*layers)
    

class ResBottleBlock(nn.Module):
    
    def __init__(self, in_planes, bottleneck_width=4, stride=1, expansion=1):
        super(ResBottleBlock, self).__init__()
        self.conv0=nn.Conv2d(in_planes,bottleneck_width,1,stride=1,bias=False)
        self.bn0 = nn.BatchNorm2d(bottleneck_width)
        self.conv1=nn.Conv2d(bottleneck_width,bottleneck_width,3,stride=stride,padding=1,bias=False)
        self.bn1=nn.BatchNorm2d(bottleneck_width)
        self.conv2=nn.Conv2d(bottleneck_width,expansion*in_planes,1,bias=False)
        self.bn2=nn.BatchNorm2d(expansion*in_planes)
        
        self.shortcut=nn.Sequential()
        if stride!=1 or expansion!=1:
            self.shortcut=nn.Sequential(
                nn.Conv2d(in_planes,in_planes*expansion,1,stride=stride,bias=False)
            )

    def forward(self, x):
        out = F.relu(self.bn0(self.conv0(x)))
        out = F.relu(self.bn1(self.conv1(out)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

class BasicBlock_C(nn.Module):
    """
    increasing cardinality is a more effective way of 
    gaining accuracy than going deeper or wider
    """

    def __init__(self, in_planes, bottleneck_width=4, cardinality=32, stride=1, expansion=2):
        super(BasicBlock_C, self).__init__()
        inner_width = cardinality * bottleneck_width
        self.expansion = expansion
        self.basic = nn.Sequential(OrderedDict(
            [
                ('conv1_0', nn.Conv2d(in_planes, inner_width, 1, stride=1, bias=False)),
                ('bn1', nn.BatchNorm2d(inner_width)),
                ('act0', nn.ReLU()),
                ('conv3_0', nn.Conv2d(inner_width, inner_width, 3, stride=stride, padding=1, groups=cardinality, bias=False)),
                ('bn2', nn.BatchNorm2d(inner_width)),
                ('act1', nn.ReLU()),
                ('conv1_1', nn.Conv2d(inner_width, inner_width * self.expansion, 1, stride=1, bias=False)),
                ('bn3', nn.BatchNorm2d(inner_width * self.expansion))
            ]
        ))
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != inner_width * self.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, inner_width * self.expansion, 1, stride=stride, bias=False)
            )
        self.bn0 = nn.BatchNorm2d(self.expansion * inner_width)

    def forward(self, x):
        out = self.basic(x)
        out += self.shortcut(x)
        out = F.relu(self.bn0(out))
        return out


def resnext26_4x32d():
    return ResNeXt(num_blocks=[2, 2, 2, 2], cardinality=4, bottleneck_width=32)



    
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'demonstration'))

        
class ReplayMemory(object):

    # capacity == -1 means unlimited capacity
    def __init__(self, capacity=-1):
        self.capacity = capacity
        self.memory = []
        self.position = 0
        if HPC:
            self.episode_dir = './lux-episodes'
        else:
            self.episode_dir = '../input/lux-ai-episodes'

    def push(self, trans):
        if len(self.memory) < self.capacity or self.capacity < 0:
            self.memory.append(None)
        self.memory[self.position] = trans
        self.position = self.position + 1
        if self.capacity > 0:
            self.position = self.position % self.capacity

    def prefil(self):
        samples = create_dataset_from_json(self.episode_dir, self.capacity)
        for sample in samples:
            self.push(sample)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


In [8]:

def in_city(pos, game_state):    
    try:
        city = game_state.map.get_cell_by_pos(pos).citytile
        return city is not None and city.team == game_state.id
    except:
        return False


def call_func(obj, method, args=[]):
    return getattr(obj, method)(*args)


unit_actions = [('move', 'n'), ('move', 's'), ('move', 'w'), ('move', 'e'), ('build_city',)]

def get_action(policy, unit, dest, game_state):
    act = unit_actions[policy]
    pos = unit.pos.translate(act[-1], 1) or unit.pos
    if pos not in dest or in_city(pos, game_state):
        return call_func(unit, *act), pos

    return unit.move('c'), unit.pos

def select_action(state, model, EPS):
    sample = random.random()
    if sample < EPS:
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
             
            return model(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(5)]], device=device, dtype=torch.long)

def JE(samples, policy_net, margin=0.8):
        loss = torch.tensor(0.0, device=device)
        count = 0  # number of demo
        for i in range(len(samples)):
            if samples.demonstration[i] is False:
                continue
            preds = policy_net(samples.state[i]).squeeze()
        
            # get the probability for action aE based on state s
            QE = preds[samples.action[i]]
            
            A2, A1 = torch.argsort(preds)[-2:]  # action with largest and second largest Q
            
            maxA = A2 if (A1 == samples.action[i]).all() else A1
            Q = preds[maxA]
            if (Q + margin) < QE:
                continue
            else:
                loss += (Q - QE[0][0])
                count += 1
        return loss / count if count != 0 else loss
    
def optimize_model(policy_net, target_net, BATCH_SIZE, optimizer, lambda1=1, lambda2=10, GAMMA=0.99, margin=0.8):
#     if len(memory) < BATCH_SIZE:
#         return

    demo_samples = int(BATCH_SIZE * 0.85)
    demo_trans = []
    if demo_samples > 0:
        demo_trans = memory_demos.sample(demo_samples)
    agent_trans = memory.sample(BATCH_SIZE - demo_samples)
    transitions = demo_trans + agent_trans
    batch = Transition(*zip(*transitions))
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.


    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    q_vals = policy_net(state_batch)
    state_action_values = q_vals.gather(1, action_batch)
#     print('without')
#     print(state_action_values)
#     print('action shape')
#     print(action_batch.shape)
#     print('with')
#     print(state_action_values)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was fil.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
#     print(state_action_values)


    # bellman_error = expected_state_action_values.unsqueeze(1) - state_action_values
    # clipped_bellman_error = bellman_error.clamp(-1, 1)
    # d_error = clipped_bellman_error * -1.0


    q_loss = F.mse_loss(state_action_values, expected_state_action_values.unsqueeze(1), size_average=False)
    # n_step_loss = F.mse_loss(state_action_values, n_reward_batch, size_average=False)

    num_actions = q_vals.size(1)
    margins = (torch.ones(num_actions, num_actions) - torch.eye(num_actions)) * margin
    batch_margins = margins[action_batch.data.squeeze().cpu()]
    q_vals = q_vals + Variable(batch_margins).type(dtype)
    supervised_loss = (q_vals.max(1)[0].unsqueeze(1) - state_action_values).pow(2)[:demo_samples].sum()
#     supervised_loss = JE(batch, policy_net)
    loss = q_loss + supervised_loss

    # Compute Huber loss
    # for i in range(BATCH_SIZE):
    #     error = torch.abs(state_action_values.squeeze(1)[i] - expected_state_action_values[i])
    #     memory.update(idxs[i], error)
    
    # Not sure if we need to unsqueeze here
#     print("first")
#     print(state_action_values.squeeze(1).get_device())
#     print('Seccond')
#     print(expected_state_action_values.get_device())
#     print('third')


    # Jtd_loss = loss_mse(state_action_values.squeeze(1), expected_state_action_values, IS*0+1)
    # JE_loss = JE(batch, policy_net)
    
    # J_loss = Jtd_loss + lambda2 * JE_loss


    
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
#     torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 100)
#     torch.nn.utils.clip_grad_norm
    
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
        
    optimizer.step()
    return loss.data.item(),q_loss.data.item(), supervised_loss.data.item()

def train_model_RL(policy_net, target_net, optimizer,memory, memory_demos, num_episodes =100, BATCH_SIZE = 128,GAMMA = 0.999, EPS = 0.9,TARGET_UPDATE=5,TAU =0.01 ):
    best_reward = 0
    rewards, losses, q_losses, supervised_losses =  [], [], [], []
    if HPC:
            os.chdir("./lux-ai-il-ensemble-of-models")
    else:
            os.chdir("/kaggle/input/lux-ai-with-il-ensemble-of-models/")
    BATCH_SIZE_NEW = BATCH_SIZE
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env = make('lux_ai_2021', configuration={'loglevel': 1, 'annotations': True}, debug=True)

        trainer = env.train([None, 'agent.py'])
        obs = trainer.reset()
#         print(s)
#         obs = s[0]["observation"]
        episode_reward, episode_loss, episdoe_q_loss, episode_supervised_loss = 0,0,0,0
        
#         if BATCH_SIZE_NEW < 128:
#             BATCH_SIZE_NEW += int(BATCH_SIZE_NEW / 3)
#             if BATCH_SIZE_NEW < 32:
#                 BATCH_SIZE_NEW = 32
#             print("Batch Size: %s" % BATCH_SIZE_NEW)
        trans = []
        for t in itertools.count():
            if obs["step"] == 0:

                config = LuxMatchConfigs_Default
                config['seed'] = env.configuration['seed']
                game = Game(configs=config)
                game.process_updates(obs["updates"])

                game_state = LuxGame()
                game_state._initialize(obs["updates"])
                game_state._update(obs["updates"][2:])
                game_state.id = obs.player
                rew = Reward(game, obs.player)
                _ = reward = rew.get_reward(game, False, True, False)
                # game.id
                cts = 0
                # units = 1
            else:
                # game.process_updates(obs["updates"])
                game_state._update(obs["updates"])
            player = game_state.players[obs.player]


            # Select and perform an action
            actions_w = []
            actions_w_raw = {}
            dest = []
#             print(game_state.turn)
            for unit in player.units:
                if unit.can_act() and (obs['step'] % 40 < 30 or not in_city(unit.pos, game_state)):
                    state = make_input(obs, unit.id)
                    policy = select_action(state, policy_net, EPS)
                    actions_w_raw[unit.id] = policy
                    
                    action, pos = get_action(policy.cpu().item(), unit, dest, game_state)
                    actions_w.append(action)
                    dest.append(pos)
                    
            actions_c = []              
            unit_count = len(player.units)
            for city in player.cities.values():
                for city_tile in city.citytiles:
                    if city_tile.can_act():
                        if unit_count < player.city_tile_count:
                            actions_c.append(city_tile.build_worker())
                            unit_count += 1
                        elif not player.researched_uranium():
                            actions_c.append(city_tile.research())
                            player.research_points += 1
#             print(actions_w + actions_c)
            new_obs, _r, done, info = trainer.step(actions_w + actions_c)
            game.process_updates(new_obs["updates"])

            reward = rew.get_reward(game, done, True, False)
#             print('reward: %s, done: %s, info: %s step %s' %(reward, done, info, game_state.turn))
#             new_obs = env.step([actions_w + actions_c , opponent_actions])
            episode_reward = reward
#             reward = np.clip(reward, -1.0, 1.0)
            reward = torch.tensor([reward], device=device)
            # Observe new state

            # Store the transition in memory
            for key, value in actions_w_raw.items():
                old_state = make_input(obs, key)
                state_new = make_input(new_obs, key)

                memory.push((state_new, value, old_state, reward, False))

            # Move to the next state
            obs = new_obs
            loss = None
            # Perform one step of the optimization (on the policy network)
            if len(memory) > BATCH_SIZE_NEW:
                loss, q_loss, supervised_loss = optimize_model(policy_net=policy_net, target_net=target_net, BATCH_SIZE=BATCH_SIZE_NEW, optimizer=optimizer, GAMMA=GAMMA)
            if loss != None: 
                episode_loss += loss
                episdoe_q_loss += q_loss
                episode_supervised_loss += supervised_loss
            if done:
                break
            rewards.append(episode_reward)
            losses.append(episode_loss)
            q_losses.append(episdoe_q_loss)
            supervised_losses.append(episode_supervised_loss)
#         Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
            target_net.eval()
#         new_params = update_params(target_net, policy_net.state_dict(), TAU)
#         target_net.load_state_dict(new_params)
        
        avg_reward = np.mean(rewards)
        print("Episode: %s Loss: %s Q_loss: %s Supervised Loss: %s Reward: %s"% (i_episode, np.mean(losses), np.mean(q_losses), np.mean(supervised_losses), episode_reward))
        if episode_reward > best_reward:
            print("saving... best reward %s, episode_reward %s" % (best_reward, episode_reward))
            best_reward = episode_reward
            os.chdir("/kaggle/working")
            traced = torch.jit.trace(policy_net.cpu(), torch.rand(1, 20, 32, 32))
            traced.save('model.pth')
            policy_net.to(device)
            

In [9]:
if HPC:
    path = './imi-weights-best'
else:
    path = '/kaggle/input/imi-weights-best'

# model = resnext26_4x32d()
# model_target = resnext26_4x32d()
# path = '/kaggle_simulations/agent' if os.path.exists('/kaggle_simulations') else '.'




model_target = torch.jit.load(f'{path}/model.pth')
model = torch.jit.load(f'{path}/model.pth')
model_target.to(device)
model.to(device)
model_target.eval()


num_episode = 60
BATCH_SIZE = 32
GAMMA = 0.999
EPS = 0.9
TARGET_UPDATE = 5
TAU = 0.01
memory = ReplayMemory(2500)
memory_demos = ReplayMemory(2500)
memory_demos.prefil()
# loss_mse = WeightedMSE()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

100%|██████████| 125/125 [02:43<00:00,  1.31s/it]


In [10]:
# train_model_imi(model, dataloaders_dict, criterion,optimizer, scheduler, num_epochs=30)
train_model_RL(model, model_target, optimizer,memory, memory_demos, num_episode, BATCH_SIZE,GAMMA, EPS,TARGET_UPDATE, TAU)



Episode: 0 Loss: 27245.360810643782 Q_loss: 25890.60367586354 Supervised Loss: 1354.7571257225402 Reward: 20.0
saving... best reward 0, episode_reward 20.0


  "The input to trace is already a ScriptModule, tracing it is a no-op. Returning the object as is."


Episode: 1 Loss: 16104.961282170914 Q_loss: 14981.740053356192 Supervised Loss: 1123.2212309014042 Reward: 79.0
saving... best reward 20.0, episode_reward 79.0


  "The input to trace is already a ScriptModule, tracing it is a no-op. Returning the object as is."


Episode: 2 Loss: 12942.366498219877 Q_loss: 12033.411083394913 Supervised Loss: 908.9553880540432 Reward: 26.0




Episode: 3 Loss: 10356.28216230711 Q_loss: 9592.707089183215 Supervised Loss: 763.5750524868849 Reward: 88.0
saving... best reward 79.0, episode_reward 88.0


  "The input to trace is already a ScriptModule, tracing it is a no-op. Returning the object as is."


Episode: 4 Loss: 11189.019205583274 Q_loss: 10533.672599842295 Supervised Loss: 655.3466017705706 Reward: 41.0




rew/r_total=67.35,rew/r_wood=0.00,rew/r_coal=0.00,rew/r_uranium=0.00,rew/r_research=0.50,rew/r_city_tiles_end=59.00,rew/r_fuel_collected=0.00,rew/r_units=1.95,rew/r_city_tiles=5.90,game/turns=0.00,game/research=200.00,game/unit_count=39.00,game/cart_count=0.00,game/city_count=40.00,game/city_tiles=59.00,game/wood_rate_mined=0.00,game/coal_rate_mined=0.00,game/uranium_rate_mined=0.00
Episode: 5 Loss: 9581.107475932336 Q_loss: 9007.323056442674 Supervised Loss: 573.7844163324006 Reward: 59.0




Episode: 6 Loss: 9012.557744735832 Q_loss: 8496.044277453157 Supervised Loss: 516.5134535139069 Reward: 51.0




rew/r_total=13.64,rew/r_wood=0.00,rew/r_coal=0.00,rew/r_uranium=0.00,rew/r_research=0.09,rew/r_city_tiles_end=12.00,rew/r_fuel_collected=0.00,rew/r_units=0.35,rew/r_city_tiles=1.20,game/turns=0.00,game/research=36.00,game/unit_count=7.00,game/cart_count=0.00,game/city_count=9.00,game/city_tiles=12.00,game/wood_rate_mined=0.00,game/coal_rate_mined=0.00,game/uranium_rate_mined=0.00
Episode: 7 Loss: 8361.765122409377 Q_loss: 7882.949249443936 Supervised Loss: 478.8158584963 Reward: 12.0




Episode: 8 Loss: 7487.041604897923 Q_loss: 7051.726030108776 Supervised Loss: 435.3155619364272 Reward: 49.0




Episode: 9 Loss: 7156.366671816581 Q_loss: 6747.878764906007 Supervised Loss: 408.4878968008277 Reward: 11.0




Episode: 10 Loss: 6795.417923893312 Q_loss: 6420.373904706274 Supervised Loss: 375.04399862495586 Reward: 19.0




rew/r_total=5.63,rew/r_wood=0.00,rew/r_coal=0.00,rew/r_uranium=0.00,rew/r_research=0.03,rew/r_city_tiles_end=5.00,rew/r_fuel_collected=0.00,rew/r_units=0.10,rew/r_city_tiles=0.50,game/turns=0.00,game/research=13.00,game/unit_count=2.00,game/cart_count=0.00,game/city_count=5.00,game/city_tiles=5.00,game/wood_rate_mined=0.00,game/coal_rate_mined=0.00,game/uranium_rate_mined=0.00
Episode: 11 Loss: 6441.197464429031 Q_loss: 6085.372604668486 Supervised Loss: 355.8248402602647 Reward: 5.0




Episode: 12 Loss: 5958.534258441142 Q_loss: 5626.973555250083 Supervised Loss: 331.5606851220742 Reward: 31.0025




rew/r_total=90.95,rew/r_wood=0.00,rew/r_coal=0.00,rew/r_uranium=0.00,rew/r_research=0.50,rew/r_city_tiles_end=78.00,rew/r_fuel_collected=0.00,rew/r_units=4.65,rew/r_city_tiles=7.80,game/turns=0.00,game/research=200.00,game/unit_count=93.00,game/cart_count=0.00,game/city_count=54.00,game/city_tiles=78.00,game/wood_rate_mined=0.00,game/coal_rate_mined=0.00,game/uranium_rate_mined=0.00
Episode: 13 Loss: 5665.8798813534895 Q_loss: 5355.608474687267 Supervised Loss: 310.27138834504757 Reward: 78.0




rew/r_total=10.37,rew/r_wood=0.00,rew/r_coal=0.00,rew/r_uranium=0.00,rew/r_research=0.13,rew/r_city_tiles_end=9.00,rew/r_fuel_collected=0.00,rew/r_units=0.35,rew/r_city_tiles=0.90,game/turns=0.00,game/research=50.00,game/unit_count=7.00,game/cart_count=0.00,game/city_count=8.00,game/city_tiles=9.00,game/wood_rate_mined=0.00,game/coal_rate_mined=0.00,game/uranium_rate_mined=0.00
Episode: 14 Loss: 5282.0999809034665 Q_loss: 4992.152379473966 Supervised Loss: 289.9475844684673 Reward: 9.0




Episode: 15 Loss: 4946.925795721346 Q_loss: 4674.575553283291 Supervised Loss: 272.35022658557364 Reward: 30.0




rew/r_total=13.22,rew/r_wood=0.00,rew/r_coal=0.00,rew/r_uranium=0.00,rew/r_research=0.12,rew/r_city_tiles_end=11.00,rew/r_fuel_collected=0.00,rew/r_units=1.00,rew/r_city_tiles=1.10,game/turns=0.00,game/research=47.00,game/unit_count=20.00,game/cart_count=0.00,game/city_count=9.00,game/city_tiles=11.00,game/wood_rate_mined=0.00,game/coal_rate_mined=0.00,game/uranium_rate_mined=0.00
Episode: 16 Loss: 4872.816671467758 Q_loss: 4609.5314812927245 Supervised Loss: 263.28517543810705 Reward: 11.0




Episode: 17 Loss: 4865.285790021435 Q_loss: 4613.60063311114 Supervised Loss: 251.68514401954505 Reward: 5.0




Episode: 18 Loss: 4717.8198320318015 Q_loss: 4473.38100898859 Supervised Loss: 244.43881055596356 Reward: 38.0




Episode: 19 Loss: 4529.226146573404 Q_loss: 4293.901697646163 Supervised Loss: 235.32443701305652 Reward: 21.0




Episode: 20 Loss: 4356.036962826492 Q_loss: 4129.243518470736 Supervised Loss: 226.79343292532457 Reward: 8.0




Episode: 21 Loss: 4225.031631216307 Q_loss: 4003.6903303787467 Supervised Loss: 221.34128975245025 Reward: 17.0




Episode: 22 Loss: 4137.976016411531 Q_loss: 3920.848042004442 Supervised Loss: 217.12796355509465 Reward: 4.0




Episode: 23 Loss: 3956.21450292438 Q_loss: 3748.0839348551576 Supervised Loss: 208.13055775555932 Reward: 22.005




Episode: 24 Loss: 3937.4236258218734 Q_loss: 3730.2779380438897 Supervised Loss: 207.14567751368656 Reward: 1.0




Episode: 25 Loss: 3840.7871120274067 Q_loss: 3638.470071241227 Supervised Loss: 202.31703077714442 Reward: 11.0




Episode: 26 Loss: 3713.598071922793 Q_loss: 3518.595186258362 Supervised Loss: 195.00287594547845 Reward: 15.0




Episode: 27 Loss: 3632.1601954882694 Q_loss: 3441.251995548449 Supervised Loss: 190.90819045872564 Reward: 3.0




Episode: 28 Loss: 3585.660406814552 Q_loss: 3397.262439888492 Supervised Loss: 188.39795754554848 Reward: 4.0




Episode: 29 Loss: 3449.111988798755 Q_loss: 3267.541637770552 Supervised Loss: 181.57034200155098 Reward: 31.0




Episode: 30 Loss: 3369.602377961088 Q_loss: 3192.76824915599 Supervised Loss: 176.83412036878772 Reward: 39.0




Episode: 31 Loss: 3273.3469391340554 Q_loss: 3100.328633263635 Supervised Loss: 173.0182975865666 Reward: 22.0




Episode: 32 Loss: 3166.166568344749 Q_loss: 2997.9291356266995 Supervised Loss: 168.23742468859558 Reward: 36.0




Episode: 33 Loss: 3123.0580956073127 Q_loss: 2957.1804443621845 Supervised Loss: 165.87764328874601 Reward: 7.0




Episode: 34 Loss: 3021.9004500006977 Q_loss: 2861.1781790263017 Supervised Loss: 160.72226328340537 Reward: 43.0




Episode: 35 Loss: 2969.209713422118 Q_loss: 2811.1467497310327 Supervised Loss: 158.06295613452647 Reward: 6.0




Episode: 36 Loss: 2929.3520556724816 Q_loss: 2773.384810946406 Supervised Loss: 155.96723730556346 Reward: 11.0




Episode: 37 Loss: 2844.648416885326 Q_loss: 2692.8328194110154 Supervised Loss: 151.81559031542074 Reward: 50.0




rew/r_total=40.55,rew/r_wood=0.00,rew/r_coal=0.00,rew/r_uranium=0.00,rew/r_research=0.35,rew/r_city_tiles_end=35.00,rew/r_fuel_collected=0.00,rew/r_units=1.70,rew/r_city_tiles=3.50,game/turns=0.00,game/research=140.00,game/unit_count=34.00,game/cart_count=0.00,game/city_count=31.00,game/city_tiles=35.00,game/wood_rate_mined=0.00,game/coal_rate_mined=0.00,game/uranium_rate_mined=0.00
Episode: 38 Loss: 2761.6540782458487 Q_loss: 2614.081958225409 Supervised Loss: 147.57211309271588 Reward: 35.0




Episode: 39 Loss: 2709.1429100449595 Q_loss: 2564.2842816710063 Supervised Loss: 144.85862158271593 Reward: 14.0




Episode: 40 Loss: 2631.7369963538818 Q_loss: 2490.923290040377 Supervised Loss: 140.81369972528847 Reward: 38.0025




Episode: 41 Loss: 2650.9912482911386 Q_loss: 2511.426165330198 Supervised Loss: 139.56507555235353 Reward: 34.005




Episode: 42 Loss: 2611.847969088027 Q_loss: 2475.1284234053514 Supervised Loss: 136.71953895654116 Reward: 38.0




Episode: 43 Loss: 2597.7926900111847 Q_loss: 2461.7950642198853 Supervised Loss: 135.9976191018122 Reward: 2.0




Episode: 44 Loss: 2639.983820889202 Q_loss: 2504.6310820634876 Supervised Loss: 135.35273358428836 Reward: 7.0




Episode: 45 Loss: 2640.440372619789 Q_loss: 2508.0795314746115 Supervised Loss: 132.3608361577964 Reward: 25.0




Episode: 46 Loss: 2613.900726704743 Q_loss: 2483.2813659648373 Supervised Loss: 130.61935493115712 Reward: 24.0




Episode: 47 Loss: 2575.473209895472 Q_loss: 2447.0941511717383 Supervised Loss: 128.37905267409442 Reward: 26.0




Episode: 48 Loss: 2516.884437119708 Q_loss: 2391.420332925331 Supervised Loss: 125.46409829168893 Reward: 25.0




Episode: 49 Loss: 2467.782605439191 Q_loss: 2344.9054334380958 Supervised Loss: 122.87716624103282 Reward: 13.0




Episode: 50 Loss: 2428.8964755171937 Q_loss: 2308.49513487872 Supervised Loss: 120.401335630874 Reward: 39.0




Episode: 51 Loss: 2395.467997366755 Q_loss: 2276.4781859833665 Supervised Loss: 118.98980644639767 Reward: 33.0




Episode: 52 Loss: 2357.1481300016762 Q_loss: 2239.9274454980196 Supervised Loss: 117.22067964774871 Reward: 14.0




Episode: 53 Loss: 2345.6760803335783 Q_loss: 2230.40790492969 Supervised Loss: 115.26817067944185 Reward: 11.0




Episode: 54 Loss: 2301.957034068106 Q_loss: 2188.3802200620335 Supervised Loss: 113.57680939564665 Reward: 17.0




Episode: 55 Loss: 2276.2904308362645 Q_loss: 2163.9468725056354 Supervised Loss: 112.34355377431348 Reward: 5.0




Episode: 56 Loss: 2251.276124900163 Q_loss: 2140.025001003405 Supervised Loss: 111.25111942960531 Reward: 11.0




Episode: 57 Loss: 2213.2147278894763 Q_loss: 2103.027982771978 Supervised Loss: 110.18674072915383 Reward: 11.0




Episode: 58 Loss: 2190.1118074707833 Q_loss: 2081.0751732269923 Supervised Loss: 109.03662990154885 Reward: 11.0




Episode: 59 Loss: 2157.4720156682374 Q_loss: 2049.9005215972793 Supervised Loss: 107.57148973790753 Reward: 22.0


# Submission

In [11]:
%%writefile agent.py
import os
import numpy as np
import torch
from lux.game import Game


path = '/kaggle_simulations/agent' if os.path.exists('/kaggle_simulations') else '.'
model = torch.jit.load(f'{path}/model.pth')
model.eval()


def make_input(obs, unit_id):
    width, height = obs['width'], obs['height']
    x_shift = (32 - width) // 2
    y_shift = (32 - height) // 2
    cities = {}
    
    b = np.zeros((20, 32, 32), dtype=np.float32)
    
    for update in obs['updates']:
        strs = update.split(' ')
        input_identifier = strs[0]
        
        if input_identifier == 'u':
            x = int(strs[4]) + x_shift
            y = int(strs[5]) + y_shift
            wood = int(strs[7])
            coal = int(strs[8])
            uranium = int(strs[9])
            if unit_id == strs[3]:
                # Position and Cargo
                b[:2, x, y] = (
                    1,
                    (wood + coal + uranium) / 100
                )
            else:
                # Units
                team = int(strs[2])
                cooldown = float(strs[6])
                idx = 2 + (team - obs['player']) % 2 * 3
                b[idx:idx + 3, x, y] = (
                    1,
                    cooldown / 6,
                    (wood + coal + uranium) / 100
                )
        elif input_identifier == 'ct':
            # CityTiles
            team = int(strs[1])
            city_id = strs[2]
            x = int(strs[3]) + x_shift
            y = int(strs[4]) + y_shift
            idx = 8 + (team - obs['player']) % 2 * 2
            b[idx:idx + 2, x, y] = (
                1,
                cities[city_id]
            )
        elif input_identifier == 'r':
            # Resources
            r_type = strs[1]
            x = int(strs[2]) + x_shift
            y = int(strs[3]) + y_shift
            amt = int(float(strs[4]))
            b[{'wood': 12, 'coal': 13, 'uranium': 14}[r_type], x, y] = amt / 800
        elif input_identifier == 'rp':
            # Research Points
            team = int(strs[1])
            rp = int(strs[2])
            b[15 + (team - obs['player']) % 2, :] = min(rp, 200) / 200
        elif input_identifier == 'c':
            # Cities
            city_id = strs[2]
            fuel = float(strs[3])
            lightupkeep = float(strs[4])
            cities[city_id] = min(fuel / lightupkeep, 10) / 10
    
    # Day/Night Cycle
    b[17, :] = obs['step'] % 40 / 40
    # Turns
    b[18, :] = obs['step'] / 360
    # Map Size
    b[19, x_shift:32 - x_shift, y_shift:32 - y_shift] = 1

    return b


game_state = None
def get_game_state(observation):
    global game_state
    
    if observation["step"] == 0:
        game_state = Game()
        game_state._initialize(observation["updates"])
        game_state._update(observation["updates"][2:])
        game_state.id = observation["player"]
    else:
        game_state._update(observation["updates"])
    return game_state


def in_city(pos):    
    try:
        city = game_state.map.get_cell_by_pos(pos).citytile
        return city is not None and city.team == game_state.id
    except:
        return False


def call_func(obj, method, args=[]):
    return getattr(obj, method)(*args)


unit_actions = [('move', 'n'), ('move', 's'), ('move', 'w'), ('move', 'e'), ('build_city',)]
def get_action(policy, unit, dest):
    for label in np.argsort(policy)[::-1]:
        act = unit_actions[label]
        pos = unit.pos.translate(act[-1], 1) or unit.pos
        if pos not in dest or in_city(pos):
            return call_func(unit, *act), pos 
            
    return unit.move('c'), unit.pos


def agent(observation, configuration):
    global game_state
    
    game_state = get_game_state(observation)    
    player = game_state.players[observation.player]
    actions = []
    
    # City Actions
    unit_count = len(player.units)
    for city in player.cities.values():
        for city_tile in city.citytiles:
            if city_tile.can_act():
                if unit_count < player.city_tile_count: 
                    actions.append(city_tile.build_worker())
                    unit_count += 1
                elif not player.researched_uranium():
                    actions.append(city_tile.research())
                    player.research_points += 1
    # Worker Actions
    dest = []
    for unit in player.units:
        if unit.can_act() and (game_state.turn % 40 < 30 or not in_city(unit.pos)):
            state = make_input(observation, unit.id)
            with torch.no_grad():
                p = model(torch.from_numpy(state).unsqueeze(0))

            policy = p.squeeze(0).numpy()

            action, pos = get_action(policy, unit, dest)
            actions.append(action)
            dest.append(pos)

    return actions

Overwriting agent.py


In [12]:
from kaggle_environments import make

env = make("lux_ai_2021", configuration={"width": 24, "height": 24, "loglevel": 2, "annotations": True}, debug=False)
steps = env.run(['agent.py', 'agent.py'])
env.render(mode="ipython", width=1200, height=800)

In [13]:
!tar -czf submission.tar.gz *