In [83]:
import numpy as np


In [84]:
class Absent_Minded_Driver:
    """Environment for the absent-minded driver decision problem.

    The driver starts at "Intersection_1". At each intersection it either
    exits ("EXIT") or continues ("CONT"). Both intersections map to the same
    epistemic state "Intersection", so an agent cannot tell them apart.

    Terminal states and their utilities:
        Stop_A (exit at the first intersection)  -> 0
        Stop_B (exit at the second intersection) -> 4
        Stop_C (continue past both)              -> 1
    """

    def __init__(self):
        # Order matters: agents return probabilities in this order.
        self.actions = ["CONT", "EXIT"]
        self.states = ["Intersection_1", "Intersection_2", "Stop_A", "Stop_B", "Stop_C"]
        self.end_states = ["Stop_A", "Stop_B", "Stop_C"]
        self.epistemic_states = ["Intersection", "Stop_A", "Stop_B", "Stop_C"]

        # What the agent observes in each underlying state.
        self.state_to_epistemic_state_dict = {"Intersection_1": "Intersection",
                                              "Intersection_2": "Intersection",
                                              "Stop_A": "Stop_A",
                                              "Stop_B": "Stop_B",
                                              "Stop_C": "Stop_C",
                                              }

        # Deterministic transitions: (state, action) -> next state.
        self.causation_dict = {("Intersection_1", "EXIT"): "Stop_A",
                               ("Intersection_1", "CONT"): "Intersection_2",
                               ("Intersection_2", "EXIT"): "Stop_B",
                               ("Intersection_2", "CONT"): "Stop_C",
                               }

        # Utility received on entering a state; states not listed yield 0.
        self.utility_dict = {"Stop_A": 0,
                             "Stop_B": 4,
                             "Stop_C": 1,
                             }

    def reset_with(self, agent):
        """Start a new episode at the first intersection.

        `agent` is not used here; the parameter is kept so that this class has
        the same `reset_with` signature as the other decision problems in the
        notebook.
        """
        self.finished = False
        self.state = "Intersection_1"

    def run(self, agent, iterations):
        """Play `iterations` complete episodes with `agent`.

        For every step the agent is asked for an action distribution for the
        current epistemic state and an action is sampled from it with
        `np.random.choice`.

        Returns a flat list with one `(epistemic_state, action, utility)`
        tuple per step, where `epistemic_state` is the one observed *before*
        acting and `utility` is the utility of the state reached.
        """
        history = []

        for _ in range(iterations):

            self.reset_with(agent)

            while not self.finished:

                epistemic_state = self.epistemic_state()
                action_distribution = agent.get_action_distribution(epistemic_state)
                action = np.random.choice(self.actions, 1, p=action_distribution)[0]
                _, utility = self.do(action)

                history.append((epistemic_state, action, utility))

        return history

    def do(self, action):
        """Apply `action` to the current state.

        Returns `(new_epistemic_state, utility_of_new_state)` and sets
        `self.finished` once a terminal state has been reached.
        """
        self.state = self.cause(self.state, action)
        utility = self.utility(self.state)

        if self.state in self.end_states:
            self.finished = True

        return (self.epistemic_state(), utility)

    def utility(self, state):
        """Return the utility of `state`, or 0 when the state has no entry."""
        # dict.get keeps the "unknown state -> 0" fallback without a bare
        # `except:` that would also hide unrelated errors.
        return self.utility_dict.get(state, 0)

    def epistemic_state(self):
        """Return what the agent observes in the current underlying state."""
        return self.state_to_epistemic_state_dict[self.state]

    def cause(self, state, action):
        """Return the successor of `state` under `action`.

        Pairs without a transition (e.g. any action in a terminal state)
        leave the state unchanged.
        """
        return self.causation_dict.get((state, action), state)

In [85]:
class Evidential_Blackmail:
    """Environment for the evidential blackmail decision problem.

    Each episode a fair coin decides whether there is a crash. On a crash the
    episode starts in "crash+no blackmail". Otherwise one action is sampled
    from the agent's own distribution for the "Blackmail" epistemic state:
    if that sample is "PAY" the episode starts in "no crash+blackmail",
    else in "no crash+no blackmail". The agent then chooses "PAY" or "DONT"
    seeing only "Blackmail" / "No Blackmail".

    Utilities of the terminal states:
        no crash+no pay -> 11,  no crash+pay -> 10,
        crash+no pay    -> 1,   crash+pay    -> 0
    """

    def __init__(self):
        # Order matters: agents return probabilities in this order.
        self.actions = ["PAY", "DONT"]

        self.states = ["crash+blackmail", "no crash+blackmail",
                       "crash+no blackmail", "no crash+no blackmail",
                       "crash+pay", "crash+no pay", "no crash+pay", "no crash+no pay"]

        self.end_states = ["crash+pay", "crash+no pay", "no crash+pay", "no crash+no pay"]

        self.epistemic_states = ["Blackmail", "No Blackmail", "END"]

        # What the agent observes in each underlying state: it sees whether it
        # is being blackmailed, but not whether there is a crash.
        self.state_to_epistemic_state_dict = {"crash+blackmail": "Blackmail",
                                              "no crash+blackmail": "Blackmail",
                                              "crash+no blackmail": "No Blackmail",
                                              "no crash+no blackmail": "No Blackmail",
                                              "crash+pay": "END",
                                              "crash+no pay": "END",
                                              "no crash+pay": "END",
                                              "no crash+no pay": "END",
                                              }

        # Deterministic transitions: (state, action) -> next state.
        self.causation_dict = {("crash+blackmail", "PAY"): "crash+pay",
                               ("crash+blackmail", "DONT"): "crash+no pay",
                               ("no crash+blackmail", "PAY"): "no crash+pay",
                               ("no crash+blackmail", "DONT"): "no crash+no pay",
                               ("crash+no blackmail", "PAY"): "crash+pay",
                               ("crash+no blackmail", "DONT"): "crash+no pay",
                               ("no crash+no blackmail", "PAY"): "no crash+pay",
                               ("no crash+no blackmail", "DONT"): "no crash+no pay",
                               }

        # Utility received on entering a state; states not listed yield 0.
        self.utility_dict = {"no crash+no pay": 11,
                             "no crash+pay": 10,
                             "crash+no pay": 1,
                             "crash+pay": 0
                             }

    def reset_with(self, agent):
        """Draw the initial state of a new episode.

        The initial state depends on `agent`: when there is no crash, the
        blackmail is sent only if an action sampled from the agent's
        "Blackmail" distribution is "PAY". The state "crash+blackmail" is
        never produced by this method.
        """
        self.finished = False
        crash = np.random.choice([True, False], 1, p=[0.5, 0.5])[0]

        if crash:
            self.state = "crash+no blackmail"

        else:
            # Sample what the agent would do if it were blackmailed.
            action_distribution = agent.get_action_distribution("Blackmail")
            action = np.random.choice(self.actions, 1, p=action_distribution)[0]

            if action == "PAY":
                self.state = "no crash+blackmail"
            else:
                self.state = "no crash+no blackmail"

    def run(self, agent, iterations):
        """Play `iterations` complete episodes with `agent`.

        Returns a flat list with one `(epistemic_state, action, utility)`
        tuple per step, where `epistemic_state` is the one observed *before*
        acting and `utility` is the utility of the state reached.
        """
        history = []

        for _ in range(iterations):

            self.reset_with(agent)

            while not self.finished:

                epistemic_state = self.epistemic_state()
                action_distribution = agent.get_action_distribution(epistemic_state)
                action = np.random.choice(self.actions, 1, p=action_distribution)[0]
                _, utility = self.do(action)

                history.append((epistemic_state, action, utility))

        return history

    def do(self, action):
        """Apply `action` to the current state.

        Returns `(new_epistemic_state, utility_of_new_state)` and sets
        `self.finished` once a terminal state has been reached.

        Raises AssertionError if `action` is not one of `self.actions`.
        """
        if action not in self.actions:
            # Raised explicitly (rather than `assert False`) so the check
            # still fires when Python runs with assertions disabled (-O).
            raise AssertionError("unknown action: {!r}".format(action))

        self.state = self.cause(self.state, action)
        utility = self.utility(self.state)

        if self.state in self.end_states:
            self.finished = True

        return (self.epistemic_state(), utility)

    def utility(self, state):
        """Return the utility of `state`, or 0 when the state has no entry."""
        # dict.get keeps the "unknown state -> 0" fallback without a bare
        # `except:` that would also hide unrelated errors.
        return self.utility_dict.get(state, 0)

    def epistemic_state(self):
        """Return what the agent observes in the current underlying state."""
        return self.state_to_epistemic_state_dict[self.state]

    def cause(self, state, action):
        """Return the successor of `state` under `action`.

        Pairs without a transition (e.g. any action in a terminal state)
        leave the state unchanged.
        """
        return self.causation_dict.get((state, action), state)
  

In [86]:
def normalise(x):
    """Return a copy of `x` with every element divided by the sum of `x`.

    `x` must provide `.copy()`, `len()` and item assignment (e.g. a list or
    a numpy array); the input itself is left unmodified. A non-empty input
    whose elements sum to zero raises ZeroDivisionError for plain Python
    numbers.
    """
    ans = x.copy()
    # The divisor does not change inside the loop, so compute it only once
    # instead of re-summing the whole input for every element.
    total = sum(x)
    for i in range(len(x)):
        ans[i] /= total
    return ans

class Softmax:
    """Exploration scheme that maps scores to probabilities with a softmax.

    Scores are divided by `temperature` before the softmax is taken, so a
    larger temperature gives a flatter distribution.
    """

    def __init__(self, temperature):
        self.temperature = float(temperature)

    def function(self, x):
        """Return the softmax of the scores in `x`, scaled by the temperature."""
        scaled = np.array([score / self.temperature for score in x])
        # Subtracting the maximum leaves the result unchanged but keeps the
        # exponentials from overflowing.
        weights = np.exp(scaled - np.max(scaled))
        return weights / np.sum(weights, axis=0)
    
class Epsilon_Greedy:
    """Epsilon-greedy exploration scheme.

    With probability `epsilon` the returned distribution is uniform over all
    options; otherwise all probability mass is split evenly among the options
    with the highest score.
    """

    def __init__(self, epsilon):
        self.epsilon = float(epsilon)

    def function(self, x):
        """Return a probability for each score in `x`, in the same order.

        The explore/exploit decision is drawn anew on every call, so repeated
        calls with the same `x` may return different distributions.
        """
        exploring = np.random.choice([True, False], 1, p=[self.epsilon, 1-self.epsilon])[0]

        if exploring:
            return [1.0/len(x) for _ in x]

        mx = max(x)
        # Number of options tied for the best score; the original called
        # `filter()` without arguments here, which raises TypeError.
        ties = sum(1 for score in x if score == mx)
        return [1.0/ties if score == mx else 0 for score in x]

In [88]:
class Action_Agent:
    """Agent that keeps a utility estimate for every (epistemic state, action).

    Actions are chosen by passing the current estimates for an epistemic
    state to the exploration scheme. When learning, every entry of a batch
    is credited with the *mean utility of the whole batch*, not with the
    utility recorded for that entry.
    """

    def __init__(self, exploration_scheme, decision_problem):
        """
        exploration_scheme: object with `function(scores)` returning one
            probability per score.
        decision_problem: object exposing `actions` and `epistemic_states`.
        """
        self.actions = decision_problem.actions
        self.epistemic_states = decision_problem.epistemic_states

        self.exploration = exploration_scheme

        # Every estimate starts at 1 and is counted as one prior observation.
        temp = {action: 1 for action in self.actions}
        self.expected_utility = {es: temp.copy() for es in self.epistemic_states}
        self.times_action_taken = {es: temp.copy() for es in self.epistemic_states}

    def get_action_distribution(self, epistemic_state):
        """Return action probabilities for `epistemic_state`, ordered like `self.actions`."""
        xp = [self.expected_utility[epistemic_state][action] for action in self.actions]
        return self.exploration.function(xp)

    def learn_from(self, training_data):
        """Update the estimates from `(epistemic_state, action, utility)` tuples.

        Each listed (state, action) pair has the batch-wide mean utility
        folded into its running average. An empty batch is a no-op.
        """
        # Nothing to learn from; also avoids dividing by zero below.
        if len(training_data) == 0:
            return

        average_utility = sum(entry[2] for entry in training_data)/len(training_data)

        for epistemic_state, action, _ in training_data:

            i = self.times_action_taken[epistemic_state][action]
            exp = self.expected_utility[epistemic_state][action]
            # Incremental mean over i earlier observations plus the new one.
            self.expected_utility[epistemic_state][action] = (average_utility+exp*i)/(i+1.0)
            self.times_action_taken[epistemic_state][action] += 1
                
                    

In [89]:
class Naive_Action_Agent:
    """Agent that credits each action only with the utility recorded for it.

    A running average of utility is kept per (epistemic state, action); the
    utility in a history entry is attributed solely to the action of that
    same entry, i.e. the one immediately preceding the reward.
    """

    def __init__(self, exploration_scheme, decision_problem):
        self.actions = decision_problem.actions
        self.epistemic_states = decision_problem.epistemic_states

        self.exploration = exploration_scheme

        # Estimates and visit counts both start at 1 for every pair.
        self.expected_utility = {}
        self.times_action_taken = {}
        for state in self.epistemic_states:
            self.expected_utility[state] = dict.fromkeys(self.actions, 1)
            self.times_action_taken[state] = dict.fromkeys(self.actions, 1)

    def get_action_distribution(self, epistemic_state):
        """Return action probabilities for `epistemic_state`, ordered like `self.actions`."""
        estimates = self.expected_utility[epistemic_state]
        return self.exploration.function([estimates[action] for action in self.actions])

    def learn_from(self, training_data):
        """Fold each `(epistemic_state, action, utility)` entry into the running averages."""
        for record in training_data:
            state, action, utility = record

            seen = self.times_action_taken[state][action]
            previous = self.expected_utility[state][action]

            self.expected_utility[state][action] = (utility + previous * seen) / (seen + 1)
            self.times_action_taken[state][action] = seen + 1
                    

In [90]:
class Policy_Agent:
    """Agent that learns over whole policies instead of single actions.

    A policy is the probability of taking the first action; the candidates
    are the values of `np.arange(0.0, 1.0, 0.05)`, stored as strings and used
    as dictionary keys. Only two actions are supported at the moment.
    """

    def __init__(self, exploration_scheme, decision_problem):
        self.actions = decision_problem.actions
        self.epistemic_states = decision_problem.epistemic_states

        self.exploration = exploration_scheme

        # Candidate policies, keyed by the string form of the probability.
        self.policies = list(map(str, np.arange(0.0, 1.0, 0.05)))

        # Estimates and counts both start at 1 for every (state, policy) pair.
        self.expected_utility = {}
        self.times_action_taken = {}
        for state in self.epistemic_states:
            self.expected_utility[state] = dict.fromkeys(self.policies, 1)
            self.times_action_taken[state] = dict.fromkeys(self.policies, 1)

        # Placeholder policy, replaced straight away by a sampled one.
        self.policy = {state: [1, 0] for state in self.epistemic_states}
        self.decide_on_policy()

    def get_action_distribution(self, epistemic_state):
        """Return the action probabilities of the currently selected policy."""
        return self.policy[epistemic_state]

    def decide_on_policy(self):
        """Sample a new policy for every epistemic state.

        The exploration scheme turns the utility estimates of all candidate
        policies into sampling probabilities; one candidate is then drawn
        with `np.random.choice`.
        """
        for state in self.epistemic_states:
            estimates = self.expected_utility[state]
            weights = self.exploration.function([estimates[name] for name in self.policies])
            drawn = np.random.choice(self.policies, 1, p=weights)[0]
            first_action_probability = float(drawn)
            self.policy[state] = [first_action_probability, 1 - first_action_probability]

    def learn_from(self, training_data):
        """Fold `(epistemic_state, policy, utility)` entries into the running averages.

        `policy` is converted with `str()` and must then match one of the
        keys in `self.policies`.
        """
        for record in training_data:
            state, policy, utility = record
            key = str(policy)

            seen = self.times_action_taken[state][key]
            previous = self.expected_utility[state][key]

            self.expected_utility[state][key] = (utility + previous * seen) / (seen + 1)
            self.times_action_taken[state][key] = seen + 1
                                 

In [93]:
for i in range(5):
    # Every trial starts from scratch: a new problem instance, a new
    # exploration scheme and an untrained agent.
    AMD = Absent_Minded_Driver()
    softmax = Softmax(100)
    agent = Action_Agent(softmax, AMD)

    total_utility = 0

    for j in range(1000):
        # Play a batch of 100 episodes with the current estimates, then learn from it.
        history = AMD.run(agent, 100)
        agent.learn_from(history)
        total_utility += sum(step[2] for step in history)

    # Utility per episode, over 1000 batches of 100 episodes (optimal value: 4/3).
    print(total_utility / (1000 * 100))

    # Probability of the first action (CONT) at the intersection (optimal value: 2/3).
    print(agent.get_action_distribution("Intersection")[0])

    print()

1.2474
0.500001053042
{'CONT': 0.8321567960066389, 'EXIT': 0.8317355792481576}
{'CONT': 74943, 'EXIT': 74997}

1.24691
0.499999161984
{'CONT': 0.8316734364944711, 'EXIT': 0.8320086427940054}
{'CONT': 74866, 'EXIT': 75034}

1.25273
0.500000971304
{'CONT': 0.834431595445814, 'EXIT': 0.8340430737911079}
{'CONT': 75295, 'EXIT': 74872}

1.24209
0.500001739046
{'CONT': 0.8295660178018505, 'EXIT': 0.8288703994723358}
{'CONT': 74777, 'EXIT': 75016}

1.24756
0.500000946895
{'CONT': 0.8322367279312615, 'EXIT': 0.831857970011479}
{'CONT': 74940, 'EXIT': 75001}



In [77]:
for i in range(5):
    # Every trial starts from scratch: a new problem instance, a new
    # exploration scheme and an untrained agent.
    AMD = Absent_Minded_Driver()
    softmax = Softmax(100)
    agent = Naive_Action_Agent(softmax, AMD)

    total_utility = 0

    for j in range(1000):
        # Play a batch of 100 episodes with the current estimates, then learn from it.
        history = AMD.run(agent, 100)
        agent.learn_from(history)
        total_utility += sum(step[2] for step in history)

    # Utility per episode, over 1000 batches of 100 episodes (optimal value: 4/3).
    print(total_utility / (1000 * 100))

    # Probability of the first action (CONT) at the intersection (optimal value: 2/3).
    print(agent.get_action_distribution("Intersection")[0])

    print()

1.23302
0.475532675082
{'Intersection': {'CONT': 0.32232106238243985, 'EXIT': 1.3017963761640214}, 'Stop_A': {'CONT': 1, 'EXIT': 1}, 'Stop_B': {'CONT': 1, 'EXIT': 1}, 'Stop_C': {'CONT': 1, 'EXIT': 1}}
{'Intersection': {'CONT': 70709, 'EXIT': 77211}, 'Stop_A': {'CONT': 1, 'EXIT': 1}, 'Stop_B': {'CONT': 1, 'EXIT': 1}, 'Stop_C': {'CONT': 1, 'EXIT': 1}}

1.22801
0.475700451986
{'Intersection': {'CONT': 0.3220211268606845, 'EXIT': 1.2947693660150985}, 'Stop_A': {'CONT': 1, 'EXIT': 1}, 'Stop_B': {'CONT': 1, 'EXIT': 1}, 'Stop_C': {'CONT': 1, 'EXIT': 1}}
{'Intersection': {'CONT': 70337, 'EXIT': 77352}, 'Stop_A': {'CONT': 1, 'EXIT': 1}, 'Stop_B': {'CONT': 1, 'EXIT': 1}, 'Stop_C': {'CONT': 1, 'EXIT': 1}}

1.22773
0.47570739576
{'Intersection': {'CONT': 0.32175497537226205, 'EXIT': 1.2942248062015622}, 'Stop_A': {'CONT': 1, 'EXIT': 1}, 'Stop_B': {'CONT': 1, 'EXIT': 1}, 'Stop_C': {'CONT': 1, 'EXIT': 1}}
{'Intersection': {'CONT': 70246, 'EXIT': 77400}, 'Stop_A': {'CONT': 1, 'EXIT': 1}, 'Stop_B': {'

In [None]:
for i in range(10):
    # Every trial starts from scratch: a new problem instance, a new
    # exploration scheme and an untrained agent.
    AMD = Absent_Minded_Driver()
    softmax = Softmax(100)
    agent = Policy_Agent(softmax, AMD)

    total_utility = 0

    for j in range(1000):

        # Sample one policy per epistemic state and keep it for the whole batch.
        agent.decide_on_policy()
        history = AMD.run(agent, 100)

        # Credit each step's utility to the policy in force for its epistemic
        # state, instead of to the individual action taken.
        policy_history = [(state, agent.policy[state][0], reward) for (state, _, reward) in history]
        agent.learn_from(policy_history)
        total_utility += sum(step[2] for step in history)

    # Utility per episode, over 1000 batches of 100 episodes (optimal value: 4/3).
    print(total_utility / (1000 * 100))

    # Probability of the first action (CONT) under the policy sampled for the
    # last batch (optimal value: 2/3).
    print(agent.get_action_distribution("Intersection")[0])

    print()

0.95469
0.95

0.98586
0.05

0.97213
0.7

0.99984
0.0

0.99045
0.4

0.96792
0.65

0.97352
0.15



In [79]:
for i in range(5):
    # Every trial starts from scratch: a new problem instance, a new
    # exploration scheme and an untrained agent.
    EB = Evidential_Blackmail()
    softmax = Softmax(100)
    agent = Action_Agent(softmax, EB)

    total_utility = 0

    for j in range(1000):
        # Play a batch of 100 episodes with the current estimates, then learn from it.
        history = EB.run(agent, 100)
        agent.learn_from(history)
        total_utility += sum(step[2] for step in history)

    # Utility per episode, over 1000 batches of 100 episodes.
    print(total_utility / (1000 * 100))

    # Probability of the first action (PAY) with and without blackmail.
    print(agent.get_action_distribution("Blackmail")[0])
    print(agent.get_action_distribution("No Blackmail")[0])

    print()

5.49815
0.499628970013
0.49976818658
{'Blackmail': {'PAY': 5.5384274061989505, 'DONT': 5.553268608414139}, 'No Blackmail': {'PAY': 5.47740882407633, 'DONT': 5.486681361547453}, 'END': {'PAY': 1, 'DONT': 1}}
{'Blackmail': {'PAY': 12260, 'DONT': 12669}, 'No Blackmail': {'PAY': 37647, 'DONT': 37428}, 'END': {'PAY': 1, 'DONT': 1}}

5.51479
0.499877012448
0.499808345657
{'Blackmail': {'PAY': 5.561841262213704, 'DONT': 5.566760764412052}, 'No Blackmail': {'PAY': 5.494114593090252, 'DONT': 5.501780767181714}, 'END': {'PAY': 1, 'DONT': 1}}
{'Blackmail': {'PAY': 12486, 'DONT': 12611}, 'No Blackmail': {'PAY': 37367, 'DONT': 37540}, 'END': {'PAY': 1, 'DONT': 1}}

5.46644
0.499714577781
0.49997800393
{'Blackmail': {'PAY': 5.503606347170989, 'DONT': 5.515023237179489}, 'No Blackmail': {'PAY': 5.451505564387821, 'DONT': 5.452385407173078}, 'END': {'PAY': 1, 'DONT': 1}}
{'Blackmail': {'PAY': 12478, 'DONT': 12480}, 'No Blackmail': {'PAY': 37740, 'DONT': 37306}, 'END': {'PAY': 1, 'DONT': 1}}

5.5067
0.

In [80]:
for i in range(5):
    # Every trial starts from scratch: a new problem instance, a new
    # exploration scheme and an untrained agent.
    EB = Evidential_Blackmail()
    softmax = Softmax(100)
    agent = Naive_Action_Agent(softmax, EB)

    total_utility = 0

    for j in range(1000):
        # Play a batch of 100 episodes with the current estimates, then learn from it.
        history = EB.run(agent, 100)
        agent.learn_from(history)
        total_utility += sum(step[2] for step in history)

    # Utility per episode, over 1000 batches of 100 episodes.
    print(total_utility / (1000 * 100))

    # Probability of the first action (PAY) with and without blackmail.
    print(agent.get_action_distribution("Blackmail")[0])
    print(agent.get_action_distribution("No Blackmail")[0])

    print()

5.50014
0.475020549189
0.475255040582
{'Blackmail': {'PAY': 9.999191737763812, 'DONT': 10.99920229738355}, 'No Blackmail': {'PAY': 3.4189469606839427, 'DONT': 4.409554615575988}, 'END': {'PAY': 1, 'DONT': 1}}
{'Blackmail': {'PAY': 11135, 'DONT': 12536}, 'No Blackmail': {'PAY': 36143, 'DONT': 40190}, 'END': {'PAY': 1, 'DONT': 1}}

5.52833
0.475020955363
0.475483131417
{'Blackmail': {'PAY': 9.999206559111357, 'DONT': 10.999200831135633}, 'No Blackmail': {'PAY': 3.448046205714928, 'DONT': 4.429508032630981}, 'END': {'PAY': 1, 'DONT': 1}}
{'Blackmail': {'PAY': 11343, 'DONT': 12513}, 'No Blackmail': {'PAY': 36186, 'DONT': 39962}, 'END': {'PAY': 1, 'DONT': 1}}

5.52866
0.475021000668
0.473799154532
{'Blackmail': {'PAY': 9.999204525366812, 'DONT': 10.999196980647248}, 'No Blackmail': {'PAY': 3.418584659752651, 'DONT': 4.467579340385851}, 'END': {'PAY': 1, 'DONT': 1}}
{'Blackmail': {'PAY': 11314, 'DONT': 12453}, 'No Blackmail': {'PAY': 36062, 'DONT': 40175}, 'END': {'PAY': 1, 'DONT': 1}}

5.53

In [81]:
for i in range(5):
    # Every trial starts from scratch: a new problem instance, a new
    # exploration scheme and an untrained agent.
    EB = Evidential_Blackmail()
    softmax = Softmax(100)
    agent = Policy_Agent(softmax, EB)

    total_utility = 0

    for j in range(1000):
        # Sample one policy per epistemic state and keep it for the whole batch.
        agent.decide_on_policy()
        history = EB.run(agent, 100)

        # Credit each step's utility to the policy in force for its epistemic
        # state, instead of to the individual action taken.
        policy_history = [(state, agent.policy[state][0], reward) for (state, _, reward) in history]
        agent.learn_from(policy_history)
        total_utility += sum(step[2] for step in history)

    # Utility per episode, over 1000 batches of 100 episodes.
    print(total_utility / (1000 * 100))

    # Probability of the first action (PAY) under the policies sampled for
    # the last batch, with and without blackmail.
    print(agent.get_action_distribution("Blackmail")[0])
    print(agent.get_action_distribution("No Blackmail")[0])

    print()

5.47789
0.9
0.6
{'Blackmail': {'0.0': 1, '0.05': 10.897637795275589, '0.1': 10.868632707774797, '0.15': 10.808290155440414, '0.2': 10.787698412698402, '0.25': 10.736714975845402, '0.3': 10.670774647887319, '0.35': 10.643914473684207, '0.4': 10.584552845528455, '0.45': 10.520263901979254, '0.5': 10.502534395365682, '0.55': 10.457831325301223, '0.6': 10.406579764121657, '0.65': 10.332359813084091, '0.7': 10.306293706293703, '0.75': 10.254098360655743, '0.8': 10.204826368452036, '0.85': 10.138927738927766, '0.9': 10.094454072790292, '0.95': 10.046277665995984}, 'No Blackmail': {'0.0': 4.421113959450017, '0.05': 4.552469135802466, '0.1': 3.928495575221239, '0.15': 3.7887232663642254, '0.2': 4.143349231584539, '0.25': 4.017539810754679, '0.3': 4.314716981132078, '0.35': 4.249879865449303, '0.4': 3.867139334155363, '0.45': 3.88831118813788, '0.5': 3.96816976127321, '0.55': 3.465994962216625, '0.6': 4.054646068711109, '0.65': 3.8957548354684723, '0.7': 3.803612783696155, '0.75': 3.89328876317

5.49777
0.45
0.55
{'Blackmail': {'0.0': 1, '0.05': 10.878504672897199, '0.1': 10.857142857142858, '0.15': 10.822222222222225, '0.2': 10.789473684210524, '0.25': 10.71870794078062, '0.3': 10.705974842767288, '0.35': 10.68490374873354, '0.4': 10.61442307692308, '0.45': 10.519458544839246, '0.5': 10.516366065464247, '0.55': 10.422202001819826, '0.6': 10.400943396226445, '0.65': 10.355214723926348, '0.7': 10.311827956989248, '0.75': 10.242083758937696, '0.8': 10.200379867046525, '0.85': 10.150317892824695, '0.9': 10.09292452830189, '0.95': 10.042544048130653}, 'No Blackmail': {'0.0': 4.3318613380437165, '0.05': 4.569790141532461, '0.1': 4.138209506229808, '0.15': 4.58945610687023, '0.2': 4.24321880650995, '0.25': 4.146871008939978, '0.3': 4.212439588148781, '0.35': 4.20859419464997, '0.4': 3.70177103099304, '0.45': 3.8010989010988996, '0.5': 4.084601769911506, '0.55': 3.7072558373085625, '0.6': 3.7126334519572954, '0.65': 3.8629639966508513, '0.7': 3.85491905354919, '0.75': 3.4321782178217