# Prediction: Monte Carlo (MC) Learning

## Import the libraries

In [None]:
import random
import numpy as np

## Environment Class

In [None]:
class GridWorld:
    """Grid environment whose position (x, y) is clamped to the range 0..3.

    ``x`` is changed by up/down moves and ``y`` by left/right moves. The
    episode starts at (0, 0) and is finished once the position is (3, 3).
    """

    def __init__(self):
        self.x, self.y = 0, 0

    def step(self, a):
        """Apply action ``a`` and return ``((x, y), reward, done)``.

        Action codes: 0 = left, 1 = up, 2 = right, 3 = down. Any other value
        leaves the position unchanged.
        """
        # Handlers are listed in action-code order, so enumerate() pairs each
        # one with its code.
        movers = (self.move_left, self.move_up, self.move_right, self.move_down)
        for code, mover in enumerate(movers):
            if a == code:
                mover()
                break

        # The reward is fixed at -1 for every step; `done` reports whether
        # the episode has ended.
        return (self.x, self.y), -1, self.is_done()

    def move_right(self):
        """Increase y by one, never going past 3."""
        self.y = min(self.y + 1, 3)

    def move_left(self):
        """Decrease y by one, never going below 0."""
        self.y = max(self.y - 1, 0)

    def move_up(self):
        """Decrease x by one, never going below 0."""
        self.x = max(self.x - 1, 0)

    def move_down(self):
        """Increase x by one, never going past 3."""
        self.x = min(self.x + 1, 3)

    def is_done(self):
        """Return True when the position is the terminal cell (3, 3)."""
        return self.x == 3 and self.y == 3

    def get_state(self):
        """Return the current position as an ``(x, y)`` tuple."""
        return (self.x, self.y)

    def reset(self):
        """Move back to (0, 0) and return that position."""
        self.x, self.y = 0, 0
        return (self.x, self.y)

## Agent Class

In [None]:
class Agent:
    """Agent that follows a fixed, uniformly random policy."""

    def __init__(self):
        pass

    def select_action(self):
        """Return an action in 0..3, each with probability 0.25.

        One uniform sample is drawn and compared against the cumulative
        thresholds 0.25, 0.5 and 0.75.
        """
        sample = random.random()
        for action, upper_bound in enumerate((0.25, 0.5, 0.75)):
            if sample < upper_bound:
                return action
        # The sample is in [0.75, 1.0).
        return 3

## main

In [None]:
def main(num_episodes=50000, gamma=1.0, alpha=0.0001):
    """Estimate state values under the random policy with Monte Carlo prediction.

    Runs ``num_episodes`` episodes of ``Agent`` acting in ``GridWorld``. After
    each episode, the value table is updated for every recorded state with
    the incremental rule ``V(s) <- V(s) + alpha * (G(s) - V(s))``.

    Args:
        num_episodes: Number of episodes to simulate.
        gamma: Discount factor used when accumulating the return.
        alpha: Step size of the incremental value update.

    Returns:
        The 4x4 value table (a list of four rows), which is also printed
        row by row.
    """
    env = GridWorld()   # environment
    agent = Agent()     # agent following the fixed random policy
    data = [[0, 0, 0, 0] for _ in range(4)]  # value table, one entry per cell

    for _ in range(num_episodes):
        # Start every episode from the initial state, without relying on the
        # state left behind by the constructor or by the previous episode.
        env.reset()
        done = False
        history = []
        while not done:
            action = agent.select_action()
            (x, y), reward, done = env.step(action)
            # Record the state reached by the step together with its reward.
            history.append((x, y, reward))

        # Update the table once the episode has finished, walking the visited
        # states from last to first so the return can be built incrementally.
        cum_reward = 0
        for x, y, reward in reversed(history):
            # MC learning (partial/incremental update version):
            # V(s) = V(s) + alpha * (G(s) - V(s)).
            # At this point cum_reward holds the return collected after
            # reaching (x, y), so the terminal state is updated towards 0.
            data[x][y] += alpha * (cum_reward - data[x][y])
            # G_t = R_{t+1} + gamma * G_{t+1}
            cum_reward = reward + gamma * cum_reward

    # Show the learned table once training has finished.
    for row in data:
        print(row)
    return data

In [None]:
# Run the experiment only when this file is executed directly.
if __name__ == "__main__":
    main()

[-59.847571592710175, -57.77331755635565, -54.583169306377016, -52.395830346176865]
[-58.0530879165677, -55.67840800790157, -50.83721376607114, -45.64505278108109]
[-54.72056737582497, -50.655481424172, -41.48242941538533, -29.776456575519916]
[-52.39294297718623, -46.10997554814433, -30.460820994312577, 0.0]
