In [25]:
import numpy as np


In [26]:
class Absent_Minded_Driver:
    """Environment for the absent-minded driver decision problem.

    The driver starts at ``Intersection_1``.  At each intersection the action
    "EXIT" leads to a stop state and "CONT" leads onwards (see
    ``causation_dict``).  Both intersections map to the single epistemic
    state "Intersection", so an agent that only observes epistemic states
    cannot tell them apart.  Utility is attached to the stop states only
    (Stop_A: 0, Stop_B: 4, Stop_C: 1).
    """

    def __init__(self):
        self.actions = ["CONT", "EXIT"]
        self.states = ["Intersection_1", "Intersection_2", "Stop_A", "Stop_B", "Stop_C"]
        self.end_states = ["Stop_A", "Stop_B", "Stop_C"]
        self.epistemic_states = ["Intersection", "Stop_A", "Stop_B", "Stop_C"]

        # What the agent observes in each underlying world state.
        self.state_to_epistemic_state_dict = {
            "Intersection_1": "Intersection",
            "Intersection_2": "Intersection",
            "Stop_A": "Stop_A",
            "Stop_B": "Stop_B",
            "Stop_C": "Stop_C",
        }

        # (state, action) -> next state.  Pairs that are not listed leave the
        # state unchanged (see cause()).
        self.causation_dict = {
            ("Intersection_1", "EXIT"): "Stop_A",
            ("Intersection_1", "CONT"): "Intersection_2",
            ("Intersection_2", "EXIT"): "Stop_B",
            ("Intersection_2", "CONT"): "Stop_C",
        }

        # Utility received on arriving in a state; unlisted states yield 0.
        self.utility_dict = {
            "Stop_A": 0,
            "Stop_B": 4,
            "Stop_C": 1,
        }

        self.start_state = "Intersection_1"

        # Begin in a well-defined state so the environment can be queried
        # (epistemic_state(), do()) even before reset() has been called.
        self.finished = False
        self.state = self.start_state

    def reset(self, agent=None):
        """Start a new episode at the start state.

        ``agent`` is accepted so that all decision problems share the same
        ``reset(agent)`` call; this problem does not use it.
        """
        self.finished = False
        self.state = self.start_state

    def do(self, action, distribution=[1]):
        """Apply ``action`` and return ``(epistemic_state, utility)``.

        ``distribution`` is accepted for interface compatibility and is not
        used (and never mutated) here.

        Raises:
            AssertionError: if ``action`` is not one of ``self.actions``.
        """
        if action not in self.actions:
            # Raised explicitly (rather than ``assert False``) so the check
            # is still enforced when Python runs with -O.
            raise AssertionError("unknown action: %r" % (action,))

        self.state = self.cause(self.state, action)
        utility = self.utility(self.state)

        if self.state in self.end_states:
            self.finished = True

        return (self.epistemic_state(), utility)

    def utility(self, state):
        """Return the utility of ``state``, or 0 if it has none assigned."""
        return self.utility_dict.get(state, 0)

    def epistemic_state(self):
        """Return what the agent observes in the current state."""
        return self.state_to_epistemic_state_dict[self.state]

    def cause(self, state, action):
        """Return the successor of ``state`` under ``action``.

        Pairs without an entry in ``causation_dict`` leave the state as is.
        """
        return self.causation_dict.get((state, action), state)

In [27]:
class Evidential_Blackmail:
    """Environment for the evidential blackmail decision problem.

    On reset a crash happens with probability 0.5.  After a crash the episode
    starts in "crash+no blackmail".  Without a crash the agent is queried via
    ``agent.decide("Blackmail")``: if it answers "PAY" the episode starts in
    "no crash+blackmail", otherwise in "no crash+no blackmail".  The agent
    then chooses "PAY" or "DONT", which ends the episode.
    """

    def __init__(self):
        self.actions = ["PAY", "DONT"]

        self.states = ["crash+blackmail", "no crash+blackmail",
                       "crash+no blackmail", "no crash+no blackmail",
                       "crash+pay", "crash+no pay", "no crash+pay", "no crash+no pay"]

        self.end_states = ["crash+pay", "crash+no pay", "no crash+pay", "no crash+no pay"]

        self.epistemic_states = ["Blackmail", "No Blackmail", "END"]

        # The agent only observes whether it was blackmailed, not the crash.
        self.state_to_epistemic_state_dict = {
            "crash+blackmail": "Blackmail",
            "no crash+blackmail": "Blackmail",
            "crash+no blackmail": "No Blackmail",
            "no crash+no blackmail": "No Blackmail",
            "crash+pay": "END",
            "crash+no pay": "END",
            "no crash+pay": "END",
            "no crash+no pay": "END",
        }

        # (state, action) -> next state.  Pairs that are not listed leave the
        # state unchanged (see cause()).
        self.causation_dict = {
            ("crash+blackmail", "PAY"): "crash+pay",
            ("crash+blackmail", "DONT"): "crash+no pay",
            ("no crash+blackmail", "PAY"): "no crash+pay",
            ("no crash+blackmail", "DONT"): "no crash+no pay",
            ("crash+no blackmail", "PAY"): "crash+pay",
            ("crash+no blackmail", "DONT"): "crash+no pay",
            ("no crash+no blackmail", "PAY"): "no crash+pay",
            ("no crash+no blackmail", "DONT"): "no crash+no pay",
        }

        # Utility received on arriving in a state; unlisted states yield 0.
        self.utility_dict = {
            "no crash+no pay": 11,
            "no crash+pay": 10,
            "crash+no pay": 1,
            "crash+pay": 0,
        }

    def reset(self, agent):
        """Start a new episode; must be called before do()/epistemic_state().

        ``agent`` must provide ``decide(epistemic_state)``; it is only queried
        when no crash occurred.
        """
        self.finished = False
        crash = np.random.choice([True, False], 1, p=[0.5, 0.5])[0]

        if crash:
            self.state = "crash+no blackmail"
        elif agent.decide("Blackmail") == "PAY":
            self.state = "no crash+blackmail"
        else:
            self.state = "no crash+no blackmail"

    def do(self, action, distribution=[1]):
        """Apply ``action`` and return ``(epistemic_state, utility)``.

        ``distribution`` is accepted for interface compatibility and is not
        used (and never mutated) here.

        Raises:
            AssertionError: if ``action`` is not one of ``self.actions``.
        """
        if action not in self.actions:
            # Raised explicitly (rather than ``assert False``) so the check
            # is still enforced when Python runs with -O.
            raise AssertionError("unknown action: %r" % (action,))

        self.state = self.cause(self.state, action)
        utility = self.utility(self.state)

        if self.state in self.end_states:
            self.finished = True

        return (self.epistemic_state(), utility)

    def utility(self, state):
        """Return the utility of ``state``, or 0 if it has none assigned."""
        return self.utility_dict.get(state, 0)

    def epistemic_state(self):
        """Return what the agent observes in the current state."""
        return self.state_to_epistemic_state_dict[self.state]

    def cause(self, state, action):
        """Return the successor of ``state`` under ``action``.

        Pairs without an entry in ``causation_dict`` leave the state as is.
        """
        return self.causation_dict.get((state, action), state)
  

In [28]:
def to_probability(exp_util):
    """Turn a sequence of expected utilities into selection probabilities.

    Delegates to ``softmax``, so higher utilities get higher probability.
    """
    probabilities = softmax(exp_util)
    return probabilities

def normalise(x):
    """Return a copy of ``x`` scaled so that its entries sum to 1.

    The input is left untouched.  A list yields a new list; a numpy array
    yields a new float array (integer arrays are converted first, so the
    quotients are not truncated back to integers).

    Raises:
        ZeroDivisionError: for a non-empty list whose entries sum to 0.
    """
    # Compute the total once instead of once per element.
    total = sum(x)

    if isinstance(x, np.ndarray):
        return x.astype(float) / total

    return [value / total for value in x]

def softmax(x, temperature=10.0):
    """Return the softmax of ``x`` after dividing it by ``temperature``.

    Args:
        x: sequence or array of scores.
        temperature: positive scaling factor; larger values flatten the
            resulting distribution.  The default of 10.0 reproduces the
            previously hard-coded scaling.

    Returns:
        numpy array of the same shape as ``x``, normalised along axis 0.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    scaled = np.asarray(x, dtype=float) / temperature
    # Subtracting the maximum leaves the result unchanged but avoids
    # overflow in exp() for large scores.
    e_x = np.exp(scaled - np.max(scaled))
    return e_x / e_x.sum(axis=0)

In [29]:
class Action_Agent:
    """Agent that learns a value per (epistemic state, action) pair.

    Actions are sampled with probabilities given by ``to_probability`` of
    their running-mean utilities, so actions that paid off in the past are
    chosen more often.  After each episode every action taken in it is
    credited with the episode's total utility divided by the number of
    actions taken.
    """

    def __init__(self, decision_problem):
        self.DP = decision_problem
        self.actions = self.DP.actions

        # Every estimate starts at utility 1 with a count of 1, i.e. as if
        # each action had already been observed once with utility 1.
        temp = {action: 1 for action in self.actions}
        self.expected_utility = {es: temp.copy() for es in self.DP.epistemic_states}
        self.times_action_taken = {es: temp.copy() for es in self.DP.epistemic_states}

    def get_action_probabilities(self, epistemic_state):
        """Return selection probabilities, ordered like ``self.actions``."""
        xp = [self.expected_utility[epistemic_state][action] for action in self.actions]
        return to_probability(xp)

    def decide(self, epistemic_state):
        """Sample one action for ``epistemic_state``."""
        action_probabilities = self.get_action_probabilities(epistemic_state)
        return np.random.choice(self.actions, 1, p=action_probabilities)[0]

    def learn_from(self, epistemic_state, action, utility):
        """Fold ``utility`` into the running mean for this state/action."""
        i = self.times_action_taken[epistemic_state][action]
        exp = self.expected_utility[epistemic_state][action]
        self.expected_utility[epistemic_state][action] = (utility + exp * i) / (i + 1.0)
        self.times_action_taken[epistemic_state][action] += 1

    def play(self, iterations):
        """Run ``iterations`` episodes and return each episode's total utility."""
        utility_record = []
        for _ in range(iterations):

            self.DP.reset(self)
            total_utility = 0
            history = []

            while not self.DP.finished:

                epistemic_state = self.DP.epistemic_state()
                action = self.decide(epistemic_state)
                action_probabilities = self.get_action_probabilities(epistemic_state)
                _, new_utility = self.DP.do(action, action_probabilities)

                history.append((epistemic_state, action, new_utility))
                total_utility += new_utility

            # An episode that is already finished after reset() has no
            # actions to credit; skip the update instead of dividing by zero.
            if history:
                average_action_utility = total_utility / len(history)

                for epistemic_state, action, _ in history:
                    self.learn_from(epistemic_state, action, average_action_utility)

            utility_record.append(total_utility)

        return utility_record
                    

In [30]:
class Naive_Action_Agent:
    """Action-value agent that credits only the action immediately
    preceding a reward.

    Each step's utility is folded into the running mean of the action that
    was just taken, right after the step.
    """

    def __init__(self, decision_problem):
        self.DP = decision_problem
        self.actions = decision_problem.actions

        # Every estimate starts at utility 1 with a count of 1.
        states = self.DP.epistemic_states
        self.expected_utility = {es: dict.fromkeys(self.actions, 1) for es in states}
        self.times_action_taken = {es: dict.fromkeys(self.actions, 1) for es in states}

    def get_action_probabilities(self, epistemic_state):
        """Return selection probabilities, ordered like ``self.actions``."""
        estimates = self.expected_utility[epistemic_state]
        return to_probability([estimates[action] for action in self.actions])

    def decide(self, epistemic_state):
        """Sample one action for ``epistemic_state``."""
        weights = self.get_action_probabilities(epistemic_state)
        sampled = np.random.choice(self.actions, 1, p=weights)
        return sampled[0]

    def learn_from(self, epistemic_state, action, utility):
        """Fold ``utility`` into the running mean for this state/action."""
        counts = self.times_action_taken[epistemic_state]
        means = self.expected_utility[epistemic_state]

        count = counts[action]
        previous_mean = means[action]
        means[action] = (utility + previous_mean * count) / (count + 1)
        counts[action] += 1

    def play(self, iterations):
        """Run ``iterations`` episodes and return each episode's total utility."""
        utility_record = []

        for _episode in range(iterations):
            self.DP.reset(self)
            episode_utility = 0

            while not self.DP.finished:
                observed = self.DP.epistemic_state()
                chosen = self.decide(observed)
                weights = self.get_action_probabilities(observed)
                step_result = self.DP.do(chosen, weights)
                step_utility = step_result[1]

                # Learn right away from the reward of this single step.
                self.learn_from(observed, chosen, step_utility)
                episode_utility += step_utility

            utility_record.append(episode_utility)

        return utility_record
                    

In [31]:
class Policy_Agent:
    """Agent that iterates over policies instead of single actions.

    A policy is the probability of taking the first action, drawn from the
    grid 0.0, 0.05, ..., 0.95 and stored as a string key.  Only two possible
    actions are supported at the moment.
    """

    def __init__(self, decision_problem):
        self.DP = decision_problem
        self.actions = decision_problem.actions

        # assumes only two possible actions atm
        self.policies = list(map(str, np.arange(0.0, 1.0, 0.05)))

        # Every estimate starts at utility 1 with a count of 1.
        states = self.DP.epistemic_states
        self.expected_utility = {es: dict.fromkeys(self.policies, 1) for es in states}
        self.times_action_taken = {es: dict.fromkeys(self.policies, 1) for es in states}

    def get_action_probabilities(self, epistemic_state):
        """Return ``[p, 1 - p]`` where ``p`` is the policy-probability-weighted
        mean of the policy grid for ``epistemic_state``."""
        estimates = self.expected_utility[epistemic_state]
        policy_probabilities = to_probability([estimates[policy] for policy in self.policies])
        grid = np.arange(0.0, 1.0, 0.05)
        p = sum(grid * policy_probabilities)

        return [p, 1 - p]

    def decide(self, epistemic_state):
        """Sample one action from the averaged action probabilities.

        play() does not call this itself, but a decision problem may
        (Evidential_Blackmail.reset queries ``agent.decide``).
        """
        mixture = self.get_action_probabilities(epistemic_state)
        sampled = np.random.choice(self.actions, 1, p=mixture)
        return sampled[0]

    def learn_from(self, epistemic_state, policy, utility):
        """Fold ``utility`` into the running mean for this state/policy."""
        counts = self.times_action_taken[epistemic_state]
        means = self.expected_utility[epistemic_state]

        count = counts[policy]
        previous_mean = means[policy]
        means[policy] = (utility + previous_mean * count) / (count + 1)
        counts[policy] += 1

    def choose_policy(self, epistemic_state):
        """Sample a policy key for ``epistemic_state``."""
        estimates = self.expected_utility[epistemic_state]
        policy_probabilities = to_probability([estimates[policy] for policy in self.policies])
        sampled = np.random.choice(self.policies, 1, p=policy_probabilities)

        return sampled[0]

    def play(self, iterations):
        """Run ``iterations`` episodes and return each episode's total utility."""
        utility_record = []

        for _episode in range(iterations):
            self.DP.reset(self)
            episode_utility = 0

            # Fix one policy per epistemic state for the whole episode.
            policy_dict = {}
            for es in self.DP.epistemic_states:
                policy_dict[es] = self.choose_policy(es)

            while not self.DP.finished:
                observed = self.DP.epistemic_state()
                policy = policy_dict[observed]
                p_first = float(policy)
                action = np.random.choice(self.actions, 1, p=[p_first, 1 - p_first])[0]
                step_result = self.DP.do(action, policy)
                episode_utility += step_result[1]

            # Every epistemic state's chosen policy is credited with the
            # episode total, whether or not that state was visited.
            for es in self.DP.epistemic_states:
                self.learn_from(es, policy_dict[es], episode_utility)

            utility_record.append(episode_utility)

        return utility_record
                    

In [32]:
# Five independent runs of Action_Agent on the absent-minded driver problem.
for trial in range(5):
    AMD = Absent_Minded_Driver()
    agent = Action_Agent(AMD)

    # Mean total utility per episode (optimal value: 4/3)
    mean_utility = np.mean(agent.play(10000))
    print(mean_utility)

    # Learned probability of the first action, "CONT" (optimal value: 2/3)
    cont_probability = agent.get_action_probabilities("Intersection")[0]
    print(cont_probability)

    print()

1.2618
0.507997316232

1.2735
0.50791119912

1.2487
0.508196909592

1.2712
0.507851333041

1.2572
0.508404768772



In [33]:
# Five independent runs of Naive_Action_Agent on the absent-minded driver problem.
for trial in range(5):
    AMD = Absent_Minded_Driver()
    agent = Naive_Action_Agent(AMD)

    # Mean total utility per episode (optimal value: 4/3)
    mean_utility = np.mean(agent.play(10000))
    print(mean_utility)

    # Learned probability of the first action, "CONT" (optimal value: 2/3)
    cont_probability = agent.get_action_probabilities("Intersection")[0]
    print(cont_probability)

    print()

1.2227
0.4758811679

1.2642
0.474404392337

1.2359
0.475402064757

1.2214
0.475946006534

1.1979
0.476765568304



In [34]:
# One Policy_Agent is created up front and keeps learning across all five
# rounds below, so its estimates accumulate from round to round.
AMD = Absent_Minded_Driver()
agent = Policy_Agent(AMD)

for round_index in range(5):

    # Mean total utility per episode in this round (optimal value: 4/3)
    mean_utility = np.mean(agent.play(10000))
    print(mean_utility)

    # Learned probability of the first action, "CONT" (optimal value: 2/3)
    cont_probability = agent.get_action_probabilities("Intersection")[0]
    print(cont_probability)

    print()

0.9831
0.484216925736

0.9986
0.484508410309

1.0065
0.484482036277

1.0107
0.484495306922

0.9617
0.484501916129



In [36]:
# Five independent runs of Action_Agent on the evidential blackmail problem.
for trial in range(5):
    EB = Evidential_Blackmail()
    agent = Action_Agent(EB)

    # Mean total utility per episode
    mean_utility = np.mean(agent.play(10000))
    print(mean_utility)

    # Learned probability of the first action, "PAY", in each epistemic state
    pay_if_blackmailed = agent.get_action_probabilities("Blackmail")[0]
    pay_if_not_blackmailed = agent.get_action_probabilities("No Blackmail")[0]
    print(pay_if_blackmailed)
    print(pay_if_not_blackmailed)

    print()

5.5568
0.475014891802
0.476538777514

5.5941
0.475018309396
0.471766838764

5.4328
0.475014897395
0.473873518892

5.5592
0.475017622465
0.476112635645

5.5597
0.475012112494
0.474844463627



In [37]:
# Five independent runs of Naive_Action_Agent on the evidential blackmail problem.
for trial in range(5):
    EB = Evidential_Blackmail()
    agent = Naive_Action_Agent(EB)

    # Mean total utility per episode
    mean_utility = np.mean(agent.play(10000))
    print(mean_utility)

    # Learned probability of the first action, "PAY", in each epistemic state
    pay_if_blackmailed = agent.get_action_probabilities("Blackmail")[0]
    pay_if_not_blackmailed = agent.get_action_probabilities("No Blackmail")[0]
    print(pay_if_blackmailed)
    print(pay_if_not_blackmailed)

    print()
    

5.5456
0.475027877184
0.478972403014

5.5418
0.475024693499
0.470364282871

5.3883
0.47503403463
0.476956420957

5.5375
0.475022993943
0.472022257449

5.5869
0.475003438878
0.476092235975



In [38]:
# Five independent runs of Policy_Agent on the evidential blackmail problem.
for trial in range(5):
    EB = Evidential_Blackmail()
    agent = Policy_Agent(EB)

    # Mean total utility per episode
    mean_utility = np.mean(agent.play(10000))
    print(mean_utility)

    # Learned probability of the first action, "PAY", in each epistemic state
    pay_if_blackmailed = agent.get_action_probabilities("Blackmail")[0]
    pay_if_not_blackmailed = agent.get_action_probabilities("No Blackmail")[0]
    print(pay_if_blackmailed)
    print(pay_if_not_blackmailed)

    print()
    

5.4781
0.474453519622
0.468667271708

5.5445
0.472710655986
0.468298815595

5.6398
0.473266916141
0.466815931958

5.5072
0.473972252931
0.467717416996

5.5111
0.475044743085
0.470855925838

