## 인공지능 과제 2

In [21]:
import util

class BlackjackMDP(util.MDP):
    """Peeking-blackjack game expressed as an MDP.

    A state is the tuple ``(total, peeked_index, deck)``:

    * ``total``: sum of the values of the cards taken so far.
    * ``peeked_index``: index (not value) of the next card when the previous
      action was 'Peek'; None otherwise.
    * ``deck``: tuple holding the remaining count of each card type, or None
      once the game is over (after quitting, busting, or taking the last card).
    """

    def __init__(self, cardValues, multiplicity, threshold, peekCost):
        """
        cardValues: array of card values for each card type
        multiplicity: number of each card type
        threshold: maximum total before going bust
        peekCost: how much it costs to peek at the next card
        """
        self.cardValues = cardValues
        self.multiplicity = multiplicity
        self.threshold = threshold
        self.peekCost = peekCost

    def startState(self):
        """Return the start state: empty hand, nothing peeked, full deck."""
        # total, next card (if any), multiplicity for each card
        return (0, None, (self.multiplicity,) * len(self.cardValues))

    def actions(self, state):
        """Return the actions available from |state|.

        The same three actions are returned for every state; end states are
        handled in succAndProbReward, which returns no edges for them.
        """
        return ['Take', 'Peek', 'Quit']

    def succAndProbReward(self, state, action):
        """Return the (newState, prob, reward) edges leaving |state| under |action|.

        * Terminal states (deck is None) have no outgoing edges.
        * 'Take' draws a card: the peeked card with probability 1 if one was
          peeked, otherwise each remaining card type with probability
          proportional to its count. Busting ends the game with reward 0;
          taking the last card ends the game with the hand total as reward.
        * 'Peek' costs peekCost and reveals the next card's index; it yields
          no edges when a card has already been peeked.
        * 'Quit' ends the game with the current hand total as reward.

        Zero-probability transitions are never included.

        Raises:
            ValueError: if |action| is not 'Take', 'Peek' or 'Quit' (only
                checked for non-terminal states).
        """
        succ_prob_reward_list = []
        card_sum, peek_idx, deck = state  # card_sum = the sum of taken cards' values

        if deck is None:
            # Terminal state: nothing can follow it, whatever the action.
            return succ_prob_reward_list

        if action == 'Take':
            num_all_cards = sum(deck)  # total number of cards still in the deck

            def get_succ_reward(idx):
                """Return (successor state, reward) for taking card type |idx|."""
                new_card_sum = card_sum + self.cardValues[idx]
                if new_card_sum > self.threshold:
                    # Bust: the game ends and the hand is worth nothing.
                    new_deck = None
                    reward = 0
                elif num_all_cards > 1:
                    # Cards remain after this draw: remove one instance of the taken card.
                    new_deck = list(deck)
                    new_deck[idx] -= 1
                    new_deck = tuple(new_deck)
                    reward = 0
                else:
                    # That was the last card: the game ends and pays out the hand.
                    new_deck = None
                    reward = new_card_sum
                return (new_card_sum, None, new_deck), reward

            if peek_idx is not None:
                # The previous action was 'Peek', so the drawn card is already known.
                succ, reward = get_succ_reward(peek_idx)
                succ_prob_reward_list.append((succ, 1, reward))
            else:
                for idx, num in enumerate(deck):
                    if num == 0:
                        continue  # zero-probability transitions are omitted
                    succ, reward = get_succ_reward(idx)
                    # Reuse the total computed above instead of re-summing the deck per card.
                    prob = num / num_all_cards
                    succ_prob_reward_list.append((succ, prob, reward))

        elif action == 'Peek':
            # Peeking twice in a row is not allowed: no edges in that case.
            if peek_idx is None:
                num_all_cards = sum(deck)
                for idx, num in enumerate(deck):
                    if num == 0:
                        continue
                    prob = num / num_all_cards
                    # Hand and deck are unchanged; only the peeked index is recorded.
                    succ_prob_reward_list.append(((card_sum, idx, deck), prob, -self.peekCost))

        elif action == 'Quit':
            succ_prob_reward_list.append(((card_sum, None, None), 1, card_sum))

        else:
            raise ValueError("Undefined action '{}'".format(action))

        return succ_prob_reward_list

    def discount(self):
        """Return the discount factor (1: future rewards are not discounted)."""
        return 1


if __name__ == '__main__':
    # Small peeking-blackjack instance: two card types (values 1 and 5),
    # two copies of each, bust above 10, peeking costs 1.
    mdp = BlackjackMDP(
        cardValues=[1, 5],
        multiplicity=2,
        threshold=10,
        peekCost=1,
    )

    # Solve it with value iteration, passing verbose=0.
    algorithm = util.ValueIteration()
    algorithm.solve(mdp, verbose=0)

    # Print the resulting policy, one line per state.
    for s in algorithm.pi:
        print(f'pi({s}) = {algorithm.pi[s]}')
    
    


pi((6, None, (1, 1))) = Quit
pi((1, 1, (1, 2))) = Take
pi((6, None, None)) = Take
pi((0, 0, (2, 2))) = Take
pi((5, None, (2, 1))) = Take
pi((7, None, None)) = Take
pi((2, None, (0, 2))) = Take
pi((6, 1, (1, 1))) = Quit
pi((2, 1, (0, 2))) = Take
pi((10, None, (2, 0))) = Quit
pi((7, None, (0, 1))) = Quit
pi((5, None, None)) = Take
pi((1, 0, (1, 2))) = Take
pi((0, None, (2, 2))) = Take
pi((5, 1, (2, 1))) = Take
pi((0, None, None)) = Take
pi((7, 1, (0, 1))) = Quit
pi((1, None, None)) = Take
pi((1, None, (1, 2))) = Take
pi((12, None, None)) = Take
pi((6, 0, (1, 1))) = Take
pi((2, None, None)) = Take
pi((0, 1, (2, 2))) = Take
pi((11, None, None)) = Take
pi((10, None, None)) = Take
pi((5, 0, (2, 1))) = Take
pi((10, 0, (2, 0))) = Quit


In [20]:
import util

# Placeholder constant; nothing in the visible code of this cell refers to it.
# NOTE(review): presumably left over from the assignment template - confirm before removing.
_FILL_IN_ = None


class BlackjackMDP(util.MDP):
    """MDP for blackjack with an optional paid 'Peek' at the next card.

    States are tuples ``(total, peeked, deck)``: the hand total, the index of
    the peeked card (None if the last action was not 'Peek'), and a tuple of
    per-card-type counts. ``deck`` is None in terminal states.
    """

    def __init__(self, cardValues, multiplicity, threshold, peekCost):
        """
        cardValues: array of card values for each card type
        multiplicity: number of each card type
        threshold: maximum total before going bust
        peekCost: how much it costs to peek at the next card
        """
        self.cardValues = cardValues
        self.multiplicity = multiplicity
        self.threshold = threshold
        self.peekCost = peekCost

    def startState(self):
        """Return the initial state: total 0, no peeked card, every card type at full count."""
        full_deck = (self.multiplicity,) * len(self.cardValues)
        return (0, None, full_deck)

    def actions(self, state):
        """Return the action names; the list is the same for every state."""
        return ['Take', 'Peek', 'Quit']

    def succAndProbReward(self, state, action):
        """Return a list of (newState, prob, reward) edges out of |state| for |action|.

        A terminal successor (after quitting, busting or emptying the deck) is
        signalled by a None deck. Transitions with probability 0 are left out.
        An action other than 'Take', 'Peek' or 'Quit' raises ValueError unless
        |state| is already terminal, in which case the result is simply empty.
        """
        total, peeked, deck = state

        # Terminal states have no successors, whatever the action.
        if deck is None:
            return []

        if action == 'Take':
            cards_left = sum(deck)

            def draw(card):
                """Return (successor, reward) after taking one card of type |card|."""
                new_total = total + self.cardValues[card]
                busted = new_total > self.threshold
                if busted or cards_left <= 1:
                    # Game over: a bust pays nothing, the last card pays the hand total.
                    payout = 0 if busted else new_total
                    return (new_total, None, None), payout
                remaining = list(deck)
                remaining[card] -= 1
                return (new_total, None, tuple(remaining)), 0

            if peeked is not None:
                # The card was revealed by the previous 'Peek': deterministic draw.
                succ, reward = draw(peeked)
                return [(succ, 1, reward)]

            edges = []
            for card, count in enumerate(deck):
                if count:
                    succ, reward = draw(card)
                    edges.append((succ, float(count) / cards_left, reward))
            return edges

        if action == 'Peek':
            # A second consecutive peek is not possible: no edges.
            if peeked is not None:
                return []
            cards_left = sum(deck)
            return [
                ((total, card, deck), float(count) / cards_left, -self.peekCost)
                for card, count in enumerate(deck)
                if count
            ]

        if action == 'Quit':
            return [((total, None, None), 1, total)]

        raise ValueError("Undefined action '{}'".format(action))

    def discount(self):
        """Return the discount factor, which is 1 for this MDP."""
        return 1


if __name__ == '__main__':
    # Build the small example game: card values 1 and 5, two of each,
    # threshold 10, peek cost 1.
    mdp = BlackjackMDP(
        cardValues=[1, 5],
        multiplicity=2,
        threshold=10,
        peekCost=1,
    )

    # Run value iteration on it, passing verbose=0.
    algorithm = util.ValueIteration()
    algorithm.solve(mdp, verbose=0)

    # Dump the computed policy, one state per line.
    for s in algorithm.pi:
        print(f'pi({s}) = {algorithm.pi[s]}')
    
    


pi((6, None, (1, 1))) = Quit
pi((1, 1, (1, 2))) = Take
pi((6, None, None)) = Take
pi((0, 0, (2, 2))) = Take
pi((5, None, (2, 1))) = Take
pi((7, None, None)) = Take
pi((2, None, (0, 2))) = Take
pi((6, 1, (1, 1))) = Quit
pi((2, 1, (0, 2))) = Take
pi((10, None, (2, 0))) = Quit
pi((7, None, (0, 1))) = Quit
pi((5, None, None)) = Take
pi((1, 0, (1, 2))) = Take
pi((0, None, (2, 2))) = Take
pi((5, 1, (2, 1))) = Take
pi((0, None, None)) = Take
pi((7, 1, (0, 1))) = Quit
pi((1, None, None)) = Take
pi((1, None, (1, 2))) = Take
pi((12, None, None)) = Take
pi((6, 0, (1, 1))) = Take
pi((2, None, None)) = Take
pi((0, 1, (2, 2))) = Take
pi((11, None, None)) = Take
pi((10, None, None)) = Take
pi((5, 0, (2, 1))) = Take
pi((10, 0, (2, 0))) = Quit


In [14]:

from submission import *
from util import *


def main():
    """Run the assignment's checks for problems A, C, D, E and F, printing results.

    Each section builds an MDP and/or learning algorithm from the star-imported
    `submission` / `util` names, runs it, and prints the observed values next
    to the expected ones where an expected value is given. Nothing is
    returned. A NotImplementedError raised anywhere in the sequence stops the
    run and prints a short notice instead of a traceback.
    """
    try:
        # ----- Problem A: value iteration on the example MDP -----
        print('\n========== Problem A ==========')
        mdp = ExampleMDP()
        algorithm = ValueIteration()
        algorithm.solve(mdp, 20, verbose=True) # when epsilon=20, the algorithm repeats 2 iterations
        for i in [-2, -1, 0, 1, 2]:
            print("Value of the state '%d' : %f"%(i, algorithm.V[i]))

        # Policy is only printed for the three inner states.
        for i in [-1, 0, 1]:
            print("Policy at the state '%d' : %s"%(i, algorithm.pi[i]))

        # ----- Problem C: BlackjackMDP transitions and value iteration -----
        print('\n========== Problem C ==========')
        mdp1 = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
        startState = mdp1.startState()
        # preBustState is only referenced by the disabled c1 tests below.
        preBustState = (6, None, (1, 1))
        postBustState = (11, None, None)  # terminal state (deck is None)

        # mdp2 / preEmptyState are likewise only referenced by the disabled c1 tests.
        mdp2 = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)
        preEmptyState = (11, None, (1,0))

        # Test c1 (vanilla 'Take' / 'Quit' transitions) is currently disabled;
        # the commented-out code is kept verbatim.
#         print('\n---------- Test c1 ----------')
#         # Make sure the succAndProbReward function is implemented correctly.

#         vanilla_tests = [
#             ([((1, None, (1, 2)), 0.5, 0), ((5, None, (2, 1)), 0.5, 0)], mdp1, startState, 'Take'),
#             ([((0, None, None), 1, 0)], mdp1, startState, 'Quit'),
#             ([((7, None, (0, 1)), 0.5, 0), ((11, None, None), 0.5, 0)], mdp1, preBustState, 'Take'),
#             ([], mdp1, postBustState, 'Take'),
#             ([], mdp1, postBustState, 'Quit'),
#             ([((12, None, None), 1., 12)], mdp2, preEmptyState, 'Take'),
#         ]

#         print('Vanilla Blackjack')
#         for no, (answer, mdp, state, action) in enumerate(vanilla_tests):
#             print('No %d'%(no+1), end=' ')
#             if answer != mdp.succAndProbReward(state, action):
#                 print('=> wrong')
#             else:
#                 print('=> right')
#             print('- state: {}, action: {}'.format(state, action))
#             print('- true answer =', answer)
#             print('- your answer =', mdp.succAndProbReward(state, action))

        # Test c2: transitions involving 'Peek'.
        # Each entry is (expected edge list, mdp, state, action).
        print('\n---------- Test c2 ----------')
        peek_tests = [
            ([((0, 0, (2, 2)), 0.5, -1), ((0, 1, (2, 2)), 0.5, -1)], mdp1, startState, 'Peek'),
            ([((1 , None, (1, 2) ), 1, 0)] , mdp1, (0, 0, (2, 2)), 'Take'),
            ([], mdp1, postBustState, 'Peek'),
            ]

        print('Peeking Blackjack')
        for no, (answer, mdp, state, action) in enumerate(peek_tests):
            print('No %d'%(no+1), end=' ')
            # Exact list equality: order, probabilities and rewards must all match.
            if answer != mdp.succAndProbReward(state, action):
                print('=> wrong')
            else:
                print('=> right')
            print('- state: {}, action: {}'.format(state, action))
            print('- true answer =', answer)
            print('- your answer = ', mdp.succAndProbReward(state, action))

        # Test c3: solve mdp1 and print every value and policy entry, then the
        # policy at one state without and with a peeked card.
        print('\n---------- Test c3 ----------')
        algorithm = ValueIteration()
        algorithm.solve(mdp1, verbose=True)
        for s in algorithm.V:
            print('V(%s) = %f'%(s, algorithm.V[s]))
        print('------------')
        for s in algorithm.pi:
            print('pi(%s) = %s'%(s, algorithm.pi[s]))
        print('------------')
        print('Q1 (6, None, (1, 1) => %s'%(algorithm.pi[(6, None, (1, 1))]))
        print('Q2 (6, 0, (1, 1) => %s'%(algorithm.pi[(6, 0, (1, 1))]))

        # ----- Problem D: Q-learning updates on the number-line MDP -----
        print('\n========== Problem D ==========')
        # NOTE(review): this needs the name `util` in scope, but the cell only
        # star-imports from `submission` and `util` - confirm one of them exposes it.
        mdp = util.NumberLineMDP()
        rl = QLearningAlgorithm(mdp.actions, mdp.discount(), identityFeatureExtractor, 0)

        # We call this here so that the stepSize will be 1
        rl.numIters = 1

        # Each incorporateFeedback call is followed by prints of
        # "Answer <expected>, Output <rl.getQ(...)>" for selected (state, action) pairs.
        rl.incorporateFeedback(0, 1, 0, 1)
        print('Q-value for (state = 0, action = -1) : Answer %.1f, Output %.1f'%(0, rl.getQ(0, -1)))
        print('Q-value for (state = 0, action =  1) : Answer %.1f, Output %.1f'%(0, rl.getQ(0, 1)))

        rl.incorporateFeedback(1, 1, 1, 2)
        print('Q-value for (state = 0, action = -1) : Answer %.1f, Output %.1f'%(0, rl.getQ(0, -1)))
        print('Q-value for (state = 0, action =  1) : Answer %.1f, Output %.1f'%(0, rl.getQ(0, 1)))
        print('Q-value for (state = 1, action = -1) : Answer %.1f, Output %.1f'%(0, rl.getQ(1, -1)))
        print('Q-value for (state = 1, action =  1) : Answer %.1f, Output %.1f'%(1, rl.getQ(1, 1)))

        rl.incorporateFeedback(2, -1, 1, 1)
        print('Q-value for (state = 2, action = -1) : Answer %.1f, Output %.1f'%(1.9, rl.getQ(2, -1)))
        print('Q-value for (state = 2, action =  1) : Answer %.1f, Output %.1f'%(0, rl.getQ(2, 1)))

        # ----- Problem E: Q-learning vs. value iteration, identity features -----
        print('\n========== Problem E ==========')
        # Small test case
        smallMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
        compareQLandVI(smallMDP, identityFeatureExtractor)

        # Large test case
        largeMDP = BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3, threshold=40, peekCost=1)
        compareQLandVI(largeMDP, identityFeatureExtractor)

        # ----- Problem F: the blackjack-specific feature extractor -----
        print('\n========== Problem F ==========')

        mdp = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
        rl = QLearningAlgorithm(mdp.actions, mdp.discount(), blackjackFeatureExtractor, 0)

        # We call this here so that the stepSize will be 1
        rl.numIters = 1

        # A single update, then Q-values are queried for the updated state and
        # for other (state, action) pairs.
        rl.incorporateFeedback((7, None, (0, 1)), 'Quit', 7, (7, None, None))
        print("Q-value for (state = (7, None, (0, 1)), action = 'Quit') : Answer %.1f, Output %.1f"%(28, rl.getQ((7, None, (0, 1)), 'Quit')))
        print("Q-value for (state = (7, None, (1, 0)), action = 'Quit') : Answer %.1f, Output %.1f"%(7, rl.getQ((7, None, (1, 0)), 'Quit')))
        print("Q-value for (state = (2, None, (0, 2)), action = 'Quit') : Answer %.1f, Output %.1f"%(14, rl.getQ((2, None, (0, 2)), 'Quit')))
        print("Q-value for (state = (2, None, (0, 2)), action = 'Take') : Answer %.1f, Output %.1f"%(0, rl.getQ((2, None, (0, 2)), 'Take')))

        # Large test case
        largeMDP = BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3, threshold=40, peekCost=1)
        # Seeding is disabled here, so the comparison below can vary between runs.
        # random.seed(0)
        # Previously noted result with random.seed(0): 591/2745 states differ
        # (a fraction of about 0.2153, i.e. roughly 21.5%).
        compareQLandVI(largeMDP, blackjackFeatureExtractor)

    except NotImplementedError as err:
        # Only a fixed notice is printed; the exception object itself is not shown.
        print("\nNotImplementedError: you didn't implement the function.")


# Run the full walkthrough only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main()



state 0 ['Left', 'Right'] [10.8, 11.7]
state 1 ['Left', 'Right'] [26.759999999999998, 35.915]
state 2 ['Left', 'Right'] [0, 0]
state -2 ['Left', 'Right'] [0, 0]
state -1 ['Left', 'Right'] [17.69, 16.535]
ValueIteration: 2 iterations
Value of the state '-2' : 0.000000
Value of the state '-1' : 14.000000
Value of the state '0' : 13.450000
Value of the state '1' : 23.000000
Value of the state '2' : 0.000000
Policy at the state '-1' : Left
Policy at the state '0' : Right
Policy at the state '1' : Right


---------- Test c2 ----------
Peeking Blackjack
No 1 => wrong
- state: (0, None, (2, 2)), action: Peek
- true answer = [((0, 0, (2, 2)), 0.5, -1), ((0, 1, (2, 2)), 0.5, -1)]
- your answer =  [(None, None, -1), (None, None, -1)]
No 2 

TypeError: '>' not supported between instances of 'NoneType' and 'int'