In [64]:
import numpy
from collections import defaultdict

# Ice Puzzle

![title](snow_1.png)

In [13]:
#Install packages
import numpy as np
import random

In [14]:
class Environment:
    def __init__(self,start,goal,length=10,height=5,punishment=-5,reward=100):
        # Define environment size
        self.length=length
        self.height=height
        self.Map=np.array(np.zeros((height,length)))

        # Define Items on the Map
        self.start=start #Starting Point
        self.goal=goal
        self.punishment=punishment
        self.reward=reward

        # Define Variables for Training
        self.actions_map = {"Up": 0,"Down": 1,"Left": 2,"Right": 3}
        self.Q=None
        self.transitions = dict()
        self.R = dict()
        self.S = set()
        self.episode_actions=[]

        # Define the next state because this map cannot utilise simple directions
        self.transitions=None

    def GiveTransition(self,loc,act,destination):
        # Variables:
        # 1) loc: Coordinate in the form of 'x,y' INCLUDING the apostrophes, 
        # which converts to its corresponding index through self.s_dict
        # 2) acts: Possible actions in the form of ['Up','Down,'Left','Right'] where any of the directions 
        # can be removed to indicate the action is not possible
        # 3) destinations: End coordinate after taking the action defined above in the form of ['x,y']
        # Shapes of acts and destinations must be the same

        self.transitions[loc,self.actions_map[act]] = destination
        self.R[loc,self.actions_map[act]] = 0
        self.S.insert(loc)

    def CreateQMat(self):
        self.Q=np.zeros((len(self.St),len(self.actions)))

    #Assign reward value of punishment value
    def GiveValue(self,loc,acts,value=None): 
        if value is None:
            value = self.reward
        self.R[loc,self.actions.index(acts)]=value

    def TrainEpisode(self,alpha,gamma,epsilon,max_step,default_start=True): 
        #...
        # Training Episodes using Code from INM707 Lab 4 as reference
        #...
        if(default_start==True):
            curr_s=self.start
        else:
            curr_s=random.randint(0,len(self.S)-1)
        # print("Starting state is '{}'".format(list(self.S_dict.keys())[curr_s]))
        self.episode_actions=[]
        for step in range(max_step):

            # Define actions for both exploring and exploiting policies
            open_actions = np.where(~np.isnan(self.R[curr_s]))[0]
            # print([self.actions[a] for a in open_actions])

            open_q = [self.Q[curr_s,a] for a in open_actions]

            best_act = open_actions[np.where(open_q == np.max(open_q))[0]]
            best_act_q = [self.Q[curr_s,x] for x in best_act]

            # print(best_act)

            # Pick Action based on policy
            if np.random.uniform() < epsilon:
                a = np.random.choice(open_actions)
            else:
                a = np.random.choice(best_act)

            # Update Environment States
            r = self.R[curr_s,a]
            s_old = curr_s
            curr_s = int(self.transitions[curr_s,a])

            self.episode_actions.append("{}, {}".format(self.S[s_old],self.actions[a]))
            # print("New state is '{}'".format(list(self.S_dict.keys())[int(curr_s)]))
            # print((self.Q[curr_s]))
            q_updated =  self.Q[s_old,a] + alpha*(self.R[s_old,a] + gamma*np.max(self.Q[curr_s]) - self.Q[s_old,a])
            self.Q[s_old,a] = q_updated

            # print('Q matrix updated: \n\n {}'.format(self.Q.round(0)))

            if curr_s == self.goal:
                # print("Goal state '{}' reached. Ending episode.".format(self.goal))
                break

        return self.episode_actions

In [144]:
import numpy
from collections import defaultdict

class Environment:
    dirs = {
        0: (-1,  0),
        1: ( 1,  0),
        2: ( 0, -1),
        3: ( 0,  1),
    }

    def __init__(self, map):
        self.parseMap(map)
        self.reward_val = 100
        self.Q = numpy.full((self.N, self.M, len(self.dirs)), numpy.nan)
        for (ty, tx), tds in self.transitions.items():
            for t in tds.keys():
                self.Q[ty, tx, t] = 0

    def parseMap(self, map):
        self.map = map
        self.start = None
        self.end = None
        self.rewards = set()
        self.N = len(map)
        self.M = len(map[0])

        self.transitions = defaultdict(lambda: dict())

        for y, row in enumerate(self.map):
            for x, tile in enumerate(row):
                self.processTile(y, x, tile)

        if self.start is None:
            raise ValueError('No start')
        if self.end is None:
            raise ValueError('No end')

    def canStep(self, y, x):
        return y >= 0 and y < self.N and x >= 0 and x < self.M and self.map[y][x] != 'x'

    def processTile(self, y, x, tile):
        if tile == 's':
            if self.start is not None:
                raise ValueError('Several starts')
            self.start = (y, x)

        if tile in ('e', 'R'):
            if self.end is not None:
                raise ValueError('Several ends')
            self.end = (y, x)

        if tile == 'x':
            return

        assert self.canStep(y, x)
        for dir, (dy, dx) in self.dirs.items():
            cy, cx = y, x
            while self.canStep(cy, cx):
                cy += dy
                cx += dx
            cy -= dy
            cx -= dx

            assert dir not in self.transitions[(y, x)]
            self.transitions[(y, x)][dir] = (cy, cx)

    def trainEpisode(self, alpha, gamma, epsilon, max_steps):
        s = self.start

        open_actions = self.transitions[s].keys()
        best_actions = [x for x in self.dirs.keys() if self.Q[s][x] == numpy.max(self.Q[s])]
        # open_values = numpy.array(self.transition[s].values())
        for step in range(max_steps):
            if numpy.random.uniform() < epsilon:
                a = numpy.random.choice(open_actions)
            else:
                try:
                    a = numpy.random.choice(best_actions)
                except ValueError:
                    print(f'At state {s}')
                    assert False

            ns = self.transitions[s][a]
            r = self.reward_val if ns == self.end else 0
            self.Q[s][a] += alpha * (r + gamma * numpy.max(self.Q[ns]) - self.Q[s][a])

            s = ns
            if s == self.end:
                break

In [145]:
e = Environment(map)
e.trainEpisode(alpha = 1, gamma = 0.7, epsilon = 0.8, max_steps = 500)

ValueError: a must be 1-dimensional or an integer

In [115]:
e.transitions[(3, 0)]

{'Up': (3, 0), 'Down': (4, 0), 'Left': (3, 0), 'Right': (3, 6)}

In [132]:
numpy.argmax(e.Q[3, 0])

0

In [33]:
Q = numpy.array(
    [
        [ 1, 2, 3, 4 ],
        [ 5, 6, 7, 8 ],
        [ 9, 1, 2, 3 ],
    ]
)

In [41]:
numpy.unravel_index(Q.argmax(), Q.shape)

(2, 0)

In [56]:
a = numpy.array([(0, 1), (0, 2), (1, 2)])

In [57]:
type(a)

numpy.ndarray

In [58]:
Q[a[:, 0], a[:, 1]]

array([2, 3, 7])

In [60]:
a[numpy.argmax(Q[a[:, 0], a[:, 1]])]

array([1, 2])

In [51]:
Q[[(0, 1), (1, 2)]]

array([[[1, 2, 3, 4],
        [5, 6, 7, 8]],

       [[5, 6, 7, 8],
        [9, 1, 2, 3]]])

# Create Map

In [3]:
#Define Totems (Tiles that cant be stood on), Ice (Tiles that can only be slid past), Treasure(Tile containing reward) and Start(Starting Point)
Totems=[(0,0),(1,0),(2,0),(0,2),(4,1),(4,5),(3,7),(0,9),(2,9),(3,9),(4,9)]
Ice=[(2,1),(2,2),(3,2),(1,3),(2,3),(3,3),(1,4),(2,4),(3,4),(0,5),(1,5),(2,5),(3,5),(2,7),(2,8),(1,6),(2,6),(0,7),(1,7),(4,7),(1,8),(3,8)]
Treasure=[(1,9)]
Start='3,0'
End = '1,9']
Start='3,0'
End = '1,9'

Map = IceEnv.CreateMap()
print(Map)

#Create State Matrix

[[ nan   0.  nan   0.   0.  -1.   0.  -1.   0.  nan]
 [ nan   0.   0.  -1.  -1.  -1.  -1.  -1.  -1. 100.]
 [ nan  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  nan]
 [  0.   0.  -1.  -1.  -1.  -1.   0.  nan  -1.  nan]
 [  0.  nan   0.   0.   0.  nan   0.  -1.   0.  nan]]


# Read map from file

In [27]:
lines = open('snow_map').readlines()

# Remove first line
lines = lines[1:]

map = []
for line in lines:
    map.append(line[1:-1])

In [28]:
print(' ' + ''.join([str(x) for x in range(10)]))
print('\n'.join([f'{x}{y}' for x, y in enumerate(map)]))

 0123456789
0x x      x
1x        R
2x        x
3s      x x
4 x   x   x


In [16]:
N = len(map)
M = len(map[0])

IceEnv = Environment(start = '3,0', goal = '1,9', length = M, height = N)

dirs = {
    'Up':    (-1,  0),
    'Down':  ( 1,  0),
    'Left':  ( 0, -1),
    'Right': ( 0,  1),
}
for i in range(N):
    for j in range(M):
        if map[i][j] == 'x':
            continue

        for dir, (dy, dx) in dirs.items():
            y = i
            x = j
            while y >= 0 and y < N and x >= 0 and x < M and map[y][x] != 'x':
                y += dy
                x += dx

            y -= dy
            x -= dx
            if y != i or x != j:
                IceEnv.GiveTransition(f'{i},{j}', dir, f'{y},{x}')
                if map[y][x] == 'r':
                    IceEnv.GiveValue(f'{i},{j}', dir)

TypeError: 'NoneType' object does not support item assignment

In [43]:
dirs

{'Up': (-1, 0), 'Down': (1, 0), 'Left': (0, -1), 'Right': (0, 1)}

# Begin Reinforcement Learning

## Initialize State

In [6]:
import random

# Initialize Training
alpha = 1
gamma = 0.7
epsilon = 0.8
max_step = 500
decay=0.95

IceEnv.CreateQMat()

In [7]:
for i in range(100):
    actionlist=IceEnv.TrainEpisode(alpha,gamma,epsilon,max_step)
    epsilon*=decay

print(IceEnv.Q)
print(epsilon)
print(IceEnv.S_dict)
print('{} \n'.format([a for a in actionlist]))
# print('{} \n'.format([a for a in actionlist]))

[[  0.        24.01       0.         0.      ]
 [  0.        34.3        0.        16.807   ]
 [  0.         0.         0.        16.807   ]
 [  0.        11.7649    24.01      16.807   ]
 [  0.        11.7649    24.01       0.      ]
 [  0.         5.764801   0.       100.      ]
 [  0.        49.        70.       100.      ]
 [  0.         0.         0.         0.      ]
 [  0.         5.764801   0.        11.7649  ]
 [  0.         0.         8.23543    0.      ]
 [  8.23543   11.7649    16.807      0.      ]
 [  8.23543    0.         0.         0.      ]
 [ 70.         0.         0.        34.3     ]
 [ 24.01       0.        49.        34.3     ]
 [ 11.7649     0.        49.         0.      ]
 [ 16.807      0.         0.        11.7649  ]
 [ 16.807      0.        11.7649     0.      ]]
0.004736423376267195
{'0,1': 0, '0,3': 1, '0,4': 2, '0,6': 3, '0,8': 4, '1,1': 5, '1,2': 6, '1,9': 7, '3,0': 8, '3,1': 9, '3,6': 10, '4,0': 11, '4,2': 12, '4,3': 13, '4,4': 14, '4,6': 15, '4,8': 16}
[