# Intel·ligència Artificial

# Q-LEARNING - Joc de Cartes (Manera 1) {Perder}

In [28]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = "Pau Sanchez Valdivieso"
__email__ = "pau@startval.com"
__date__ = "4-1-2017"

In [29]:
# Declaracions

gamma = 1
alpha = 1

states = range(6)

alternate = 'P'
q_values = {(S, A): 0 for S in states for A in ("T", "P")}

In [30]:
def get_next_card():
    """ Vomita la següent carta i crea el generador"""
    
    game_sequence = (2, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2)
    
    for card in game_sequence:
        yield card

In [31]:
def get_action(state):
    """ Retorna l'acció segons la carta """
    
    if state in {0, 1, 2}:
        return 'T'
    if state == 3:
        global alternate
        alternate = 'T' if alternate == 'P' else 'P'
        return alternate
    return 'P'

In [32]:
def get_best_QSA(state):
    """ Retorna el millor futur QSA """   
    
    try:
        keys = filter(lambda x: state in x, q_values)
        return q_values[max(keys, key=lambda x: q_values[x])]
    except:
        return 0

In [33]:
def update(QSA, state, reward):
    """ Retorna el valor d'actualització """
    
    max_QSA = get_best_QSA(state)
    return QSA + alpha * (reward + gamma * max_QSA)

In [34]:
def calc_QSA_values(num_iterations):
    """ Calcula els qValor a cada iteració """

    curr_state = 0
    n_card = get_next_card()

    for i in range(num_iterations):

        action = get_action(curr_state)

        if action == 'T':
            card = n_card.next()
            reward = 0
            next_state = curr_state + card
            q_values[(curr_state, action)] = update(q_values[(curr_state, action)], next_state, reward)

        if action == 'P':
            reward = curr_state * 2 if curr_state == 5 else curr_state
            q_values[(curr_state, action)] = get_best_QSA(next_state) + reward
            next_state = 0

        curr_state = next_state if action == 'T' else 0
            
num_iterations = 15
calc_QSA_values(num_iterations)

In [35]:
def get_ViS_values():
    """ Calcula el Vi(S) i la millor política """
    
    ViS_values = {}
    argmax_policy = dict()
    
    for S in states:
        array = [x for x in q_values.iteritems() if x[0][0] == S]
        
        argmax = max(array, key=lambda x: x[1])
        ViS_values[S] = argmax[1]
        
        if array[0][1] == array[1][1]: 
            argmax_policy[S] = 'T, P'
        else:
            argmax_policy[S] = argmax[0][1]
    
    return ViS_values, argmax_policy

In [36]:
def result():
    """ Crea els resultats per mostrar """
    
    ViS, argmax = get_ViS_values()

    res = "\n{}\nQ-Valors finals\n{}".format(30 * '*', 30 * '*')
    for key, QSA in sorted(q_values.iteritems(), key=lambda x: x[0]):
        res += "\nQ{0} >> {1}".format(key, QSA)

    res += "\n\n{}\nVi(S) final de cada estat\n{}".format(30 * '*', 30 * '*')
    for key, vis in ViS.iteritems():
        res += "\nEstat: {0} >> Vi(S): {1}".format(key, vis)

    res += "\n\n{}\nPolítica final de cada estat\n{}".format(30 * '*', 30 * '*')
    for key, arg in argmax.iteritems():
        res += "\nEstat: {0} >> Política: {1}".format(key, arg)
        
    return res

In [37]:
print result()


******************************
Q-Valors finals
******************************
Q(0, 'P') >> 0
Q(0, 'T') >> 3
Q(1, 'P') >> 0
Q(1, 'T') >> 0
Q(2, 'P') >> 0
Q(2, 'T') >> 4
Q(3, 'P') >> 3
Q(3, 'T') >> 10
Q(4, 'P') >> 8
Q(4, 'T') >> 0
Q(5, 'P') >> 20
Q(5, 'T') >> 0

******************************
Vi(S) final de cada estat
******************************
Estat: 0 >> Vi(S): 3
Estat: 1 >> Vi(S): 0
Estat: 2 >> Vi(S): 4
Estat: 3 >> Vi(S): 10
Estat: 4 >> Vi(S): 8
Estat: 5 >> Vi(S): 20

******************************
Política final de cada estat
******************************
Estat: 0 >> Política: T
Estat: 1 >> Política: T, P
Estat: 2 >> Política: T
Estat: 3 >> Política: T
Estat: 4 >> Política: P
Estat: 5 >> Política: P
