-
Notifications
You must be signed in to change notification settings - Fork 1
/
agent.py
73 lines (64 loc) · 2.55 KB
/
agent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
""" Greedy AI. You SHOULD beat this AI!
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import os
import numpy as np
import time
from ...constants import *
from ...board import Board
from ...game import Checkers
from ..agent import Agent
from ..Random.agent import RandomAgent
class GreedyAgent(Agent):
    """Agent that plays the move with the best average simulated reward.

    For every legal move the agent runs ``num_simulation`` rollouts on a
    scratch ``Checkers`` environment.  Each rollout plays the candidate move
    first and then lets a ``RandomAgent`` pick the following moves, for at
    most ``future_count`` environment steps in total.  The rewards reported
    by the environment are weighted (see ``compute_reward``), summed, and
    averaged over the rollouts; the move with the highest average is played,
    with ties broken uniformly at random.
    """

    # (reward key, weight) pairs, applied in this order by compute_reward.
    # A draw is worth nothing, but its key is still read from the reward dict.
    _REWARD_WEIGHTS = (
        ('capture-man', 1.),
        ('capture-king', 5.),
        ('promotion', 3.),
        ('win', 100.),
        ('draw', 0.),
    )

    def __init__(self, player, rule, future_count=4, num_simulation=5):
        """Set up the agent and the random agent used inside rollouts.

        Args:
            player: Player this agent moves for; forwarded to ``Agent``.
            rule: Rule set; forwarded to ``Agent`` and ``RandomAgent`` here
                and (through ``self.rule``) to every scratch ``Checkers``.
            future_count: Maximum number of environment steps per rollout,
                counting the candidate move itself.
            num_simulation: Number of rollouts per candidate move.  The
                summed score is divided by this value, so it must not be 0.
        """
        base_name = 'Greedy'
        self.future_count = future_count
        self.num_simulation = num_simulation
        # The rollout agent is asked to move for whichever side is to play,
        # so the player passed here is only a placeholder.
        self.sub_agent = RandomAgent(1, rule)
        super(GreedyAgent, self).__init__(base_name, player, rule)

    def compute_reward(self, last_player, rew):
        """Collapse a reward dict into one signed scalar.

        Args:
            last_player: Player who made the move that produced ``rew``.
            rew: Mapping with the keys ``'capture-man'``, ``'capture-king'``,
                ``'promotion'``, ``'win'`` and ``'draw'``; each value must be
                convertible with ``float()``.

        Returns:
            The weighted sum of the reward entries, negated when the move
            was made by a player other than ``self.player``.

        Raises:
            KeyError: If ``rew`` lacks one of the keys listed above.
        """
        total = 0.
        for key, weight in self._REWARD_WEIGHTS:
            total += float(rew[key]) * weight
        # Anything the other side gains counts against this agent.
        if self.player != last_player:
            total = -total
        return total

    def _rollout_rewards(self, board, info, action):
        """Yield the signed reward of each step of one simulated rollout.

        A fresh ``Checkers`` environment is reset to a copy of ``board`` with
        ``self.player`` to move, ``action`` is played first, and the random
        sub-agent chooses every later move.  The rollout stops after
        ``self.future_count`` steps or as soon as the environment reports a
        positive ``done`` value, whichever comes first.
        """
        temp_env = Checkers(self.rule)
        # Each rollout gets its own copy so simulations cannot affect each
        # other through the board array.
        temp_env.reset(self.player, np.copy(board))
        temp_env.move_count = info['move-count']
        for _step in range(self.future_count):
            last_player = temp_env.player
            (_unused, temp_obs, temp_moves,
             temp_rew, temp_done, temp_info) = temp_env.step(action)
            yield self.compute_reward(last_player, temp_rew)
            if temp_done > 0:
                return
            action = self.sub_agent.act(temp_obs, temp_moves, temp_info)

    def act(self, obs, moves, info):
        """Choose the legal move with the highest average rollout score.

        Args:
            obs: Board observation; anything ``np.copy`` accepts.  Cells
                equal to ``BLIND`` are replaced by ``EMPTY`` before
                simulating.
            moves: Iterable of ``(from_pos, legal_moves)`` pairs, where
                ``legal_moves`` is an iterable of destination positions.
            info: Mapping that provides ``'move-count'``, which is copied
                onto every scratch environment.

        Returns:
            A ``(from_pos, to_pos)`` tuple.  When several moves share the
            best score, one of them is picked uniformly at random.

        Raises:
            ValueError: If ``moves`` contains no legal move at all.
        """
        # The masking does not depend on the candidate move, so do it once.
        board = np.copy(obs)
        board[board == BLIND] = EMPTY

        actions = []
        scores = []
        for from_pos, legal_moves in moves:
            for to_pos in legal_moves:
                candidate = (from_pos, to_pos)
                score = 0.
                for _sim in range(self.num_simulation):
                    for reward in self._rollout_rewards(board, info, candidate):
                        score += reward
                score /= self.num_simulation
                actions.append(candidate)
                scores.append(score)

        if not actions:
            # np.max() on an empty array raises a far less helpful ValueError.
            raise ValueError('GreedyAgent.act() was given no legal moves')

        scores = np.array(scores)
        max_indices = np.where(scores == np.max(scores))[0]
        # random_sample() lies in [0, 1), so the index is always in range.
        random_index = int(np.random.random_sample() * len(max_indices))
        return actions[max_indices[random_index]]

    def consume(self, rew):
        """Ignore the reward; this agent does not learn from feedback."""
        pass