# Optimistic-RL

In [None]:
import random
import numpy as np
from numpy.random import choice

In [None]:
# Problem Definition

# State labels of the MDP. reward() below pays 0 for labels starting with 'P'
# and 10 for labels starting with 'R'.
# NOTE(review): presumably Poor/Rich x Unknown/Famous -- confirm.
states = ('PU', 'PF', 'RU', 'RF')
# Actions available in every state.
actions = ('save_money', 'advertise')

def reward(state):
    '''
    Return the immediate reward of `state`, decided by its first letter.

    Labels starting with 'P' pay 0 and labels starting with 'R' pay 10.
    Any other label yields None.
    '''
    prefix_rewards = (('P', 0), ('R', 10))
    for prefix, payoff in prefix_rewards:
        if state.startswith(prefix):
            return payoff
    return None

# Transition model of the MDP: maps (state, action, resulted_state) to the
# probability of that transition. Built once at import time instead of on
# every call, because transition_function() is evaluated once per state for
# every simulated step.
_TRANSITION_PROBABILITIES = {
    ('PU', 'save_money', 'PU'): 1.0,
    ('PU', 'save_money', 'PF'): 0,
    ('PU', 'save_money', 'RU'): 0,
    ('PU', 'save_money', 'RF'): 0,
    ('PU', 'advertise', 'PU'): 0.5,
    ('PU', 'advertise', 'PF'): 0.5,
    ('PU', 'advertise', 'RU'): 0,
    ('PU', 'advertise', 'RF'): 0,

    ('PF', 'save_money', 'PU'): 0.5,
    ('PF', 'save_money', 'PF'): 0,
    ('PF', 'save_money', 'RU'): 0,
    ('PF', 'save_money', 'RF'): 0.5,
    ('PF', 'advertise', 'PU'): 0,
    ('PF', 'advertise', 'PF'): 1.0,
    ('PF', 'advertise', 'RU'): 0,
    ('PF', 'advertise', 'RF'): 0,

    ('RU', 'save_money', 'PU'): 0.5,
    ('RU', 'save_money', 'PF'): 0,
    ('RU', 'save_money', 'RU'): 0.5,
    ('RU', 'save_money', 'RF'): 0,
    ('RU', 'advertise', 'PU'): 0.5,
    ('RU', 'advertise', 'PF'): 0.5,
    ('RU', 'advertise', 'RU'): 0,
    ('RU', 'advertise', 'RF'): 0,

    ('RF', 'save_money', 'PU'): 0,
    ('RF', 'save_money', 'PF'): 0,
    ('RF', 'save_money', 'RU'): 0.5,
    ('RF', 'save_money', 'RF'): 0.5,
    ('RF', 'advertise', 'PU'): 0,
    ('RF', 'advertise', 'PF'): 1.0,
    ('RF', 'advertise', 'RU'): 0,
    ('RF', 'advertise', 'RF'): 0,
}


def transition_function(state, action, resulted_state):
    '''
    Return the probability of reaching `resulted_state` when `action` is
    taken in `state`.

    Raises KeyError if the (state, action, resulted_state) triple is not in
    the transition table.
    '''
    return _TRANSITION_PROBABILITIES[state, action, resulted_state]


def get_next_state(state, action):
    '''
    Sample the successor of `state` after taking `action`.

    One state is drawn from the module-level `states`, weighted by the
    probabilities that transition_function() assigns to each candidate.
    '''
    weights = []
    for candidate in states:
        weights.append(transition_function(state, action, candidate))
    return choice(states, p=weights)

In [None]:
# Optimistic-RL class implementation

from collections import defaultdict
from copy import deepcopy
import numpy as np

class OptimisticRL:
    '''
    Optimistic model-based reinforcement-learning agent.

    Assumes the transition probabilities P and the rewards R are unknown and
    need to be estimated from experience.  During planning, any
    (state, action) pair that has been tried fewer than N_e times is valued
    at V_max, so the greedy policy selects such a pair whenever one exists.

    States and actions are handled internally as integer indices into the
    sequences given to the constructor.  choose_action(), learn() and
    print_policy() translate to and from the original labels.
    '''

    def __init__(self, states, actions, r_max, gamma=0.9, N_e=3, T=100):
        '''
        states  -- sequence of state labels (must support .index()).
        actions -- sequence of action labels (must support .index()).
        r_max   -- reward assumed for every state until it is observed.
        gamma   -- discount factor; V_max = r_max / (1 - gamma).
        N_e     -- number of tries after which a (state, action) pair is
                   considered explored enough.
        T       -- number of sweeps performed by value_iteration().
        '''
        # mapping states and actions to integers:
        self.original_states = states
        self.original_actions = actions
        self.states = list(range(len(states)))
        self.actions = list(range(len(actions)))

        self.T = T  # number of iterations for VI
        self.N_e = N_e  # explored-enough constant
        # number of times action a has been tried in state s, keyed by (s, a)
        self.N = defaultdict(int)
        self.P = self.init_probs()  # est. current probs, indexed [s][a][s']
        # est. values for all states, keyed by state index like every other
        # table of this class
        self.V = {s: 0 for s in self.states}
        # number of times action a has been tried in state s and led to s'
        self.observed_transitions = np.zeros(self.P.shape)
        self.gamma = gamma  # discount constant
        self.R_max = r_max
        self.V_max = self.R_max / (1 - gamma)
        # est. current rewards; optimistic until a state has been observed
        self.R = {s: self.R_max for s in self.states}
        self.policy = {}  # current known policy: state index -> action index

    def init_probs(self):
        '''
        Return a new (n_states, n_actions, n_states) array in which every
        transition has probability 1 / n_states.
        '''
        n = len(self.states)
        m = len(self.actions)
        return (1 / n) * np.ones((n, m, n))

    def value_iteration(self, optimistic=True):
        '''
        Run T sweeps of value iteration on the current estimates of P and R,
        then store the resulting values in self.V and the greedy policy in
        self.policy (both keyed by state index).

        Values are updated in place during a sweep, so a state evaluated
        later in a sweep already sees the new values of earlier states.

        When `optimistic` is true, the first action of a state that has been
        tried fewer than N_e times is selected and valued at V_max.
        Otherwise the action with the largest expected next-state value is
        selected; ties go to the action with the lowest index.

        With T == 0 the values are all zero and the policy is empty.
        '''
        values = {s: 0 for s in self.states}
        policy = {}
        for _ in range(self.T):
            for s in self.states:
                best_value = float("-inf")  # max_a sum(P(s'|s,a) * V(s'))
                best_action = None  # action attaining best_value
                for a in self.actions:
                    if optimistic and self.N[(s, a)] < self.N_e:
                        # Nothing can beat V_max, so stop searching.
                        best_value = self.V_max
                        best_action = a
                        break
                    expected = sum(
                        self.P[s][a][s2] * values[s2] for s2 in self.states
                    )
                    if expected > best_value:
                        best_value = expected
                        best_action = a
                policy[s] = best_action
                values[s] = self.R[s] + self.gamma * best_value
        self.V = values
        self.policy = policy

    def choose_action(self, state):
        '''
        Return the action label that the current policy assigns to `state`.

        Raises ValueError if `state` is not a known state label and KeyError
        if no policy has been computed for it yet.
        '''
        s = self.original_states.index(state)
        a = self.policy[s]
        return self.original_actions[a]

    def learn(self, s1, a1, reward_state1, s2):
        '''
        Record one observed step: action `a1` taken in state `s1`, which has
        reward `reward_state1`, led to state `s2`.  Updates the reward
        estimate, the visit and transition counts and the probability
        estimates.  All arguments except the reward are original labels.
        '''
        state1 = self.original_states.index(s1)
        action1 = self.original_actions.index(a1)
        state2 = self.original_states.index(s2)
        self.R[state1] = reward_state1
        self.N[(state1, action1)] += 1
        self.observed_transitions[state1][action1][state2] += 1
        self.update_probs()

    def update_probs(self):
        '''
        Re-estimate P from the observed transition counts.

        Each (state, action) row with at least one observation becomes the
        normalised count vector.  Rows without observations keep their
        current estimate, so they remain valid probability distributions.
        '''
        for state1 in self.states:
            for action1 in self.actions:
                counts = self.observed_transitions[state1][action1]
                total = counts.sum()
                if total > 0:
                    self.P[state1][action1] = counts / total

    def print_policy(self):
        '''Print one "<state> <action>" line per state, then a blank line.'''
        for s in self.states:
            a = self.policy[s]
            print(self.original_states[s], self.original_actions[a])
        print()

In [None]:
# Usage - print initial policy

# The agent only needs an upper bound on the reward, not the reward function.
opt_rl = OptimisticRL(states, actions, max(reward(s) for s in states))
curr_state = 'PU'
opt_rl.value_iteration()
print('initial policy: ')
opt_rl.print_policy()

# Usage - print learned policy

n_iter = 100
for j in range(n_iter):
    # Re-plan on the current estimates, act, then learn from the outcome.
    opt_rl.value_iteration()
    a = opt_rl.choose_action(curr_state)
    next_state = get_next_state(curr_state, a)
    opt_rl.learn(s1=curr_state, a1=a, reward_state1=reward(curr_state), s2=next_state)
    curr_state = next_state

print('\nlearned policy: ')
opt_rl.print_policy()
print('\nestimated value: ')
# Left as a bare expression so the notebook displays it as the cell result.
opt_rl.V

initial policy: 
PU save_money
PF save_money
RU save_money
RF save_money


learned policy: 
PU advertise
PF save_money
RU save_money
RF save_money


estimated value: 


{0: 29.15463929286255,
 1: 35.30951072583212,
 2: 44.06945022961627,
 3: 52.059528109155664}