<center>
<h2 style="color:blue;font-size:30px;">Artificial Intelligence CS-414</h2>
<h3 style="color:purple">Assignment 4</h3>
 </center>

<br>

<p>Consider Volcano crossing problem discussed in the class. Consider an instance of the problem using different grid size, rewards of end states etc. You are free to define the problem, its states, actions etc. Solve the problem using the following algorithms.</p>
<ul style="color:purple">
<li>Model free Monte Carlo</li>
<li>SARSA</li>
<li>Q-Learning</li>
</ul>
<ol style="color:green">
<li>Run the algorithms using different number of episodes of uniformly random policy and show Q-values and average utility.</li>
<li>Use different slip probabilities ranging from 0.0 to 0.3 and show your results on different algorithms.</li>
<li>Use epsilon greedy algorithms to change generate episode from uniformly random policy for exploration as well as policy that chooses the best action.</li>
<li>Write a 2-3 page report and explain your code and results in it.</li>
Develop a GUI based user friendly application from which user to choose appropriate options e.g slip probability, epsilon value, no of episodes etc.

In [38]:
import numpy as np

<h3 style="color:purple">MODEL FREE MONTE CARLO</h3>

In [39]:
def initialize_environment(grid_size, start_state, safe_end_states, dangerous_end_state, slip_prob):
    grid = np.zeros(grid_size)
    
    for state in safe_end_states:
        if 0 <= state[0] < grid_size[0] and 0 <= state[1] < grid_size[1]:
            grid[state[0], state[1]] = 20

    for state in dangerous_end_state:
        if 0 <= state[0] < grid_size[0] and 0 <= state[1] < grid_size[1]:
            grid[state[0], state[1]] = -50

    specific_reward_location = (2, 0)
    if 0 <= specific_reward_location[0] < grid_size[0] and 0 <= specific_reward_location[1] < grid_size[1]:
        grid[specific_reward_location[0], specific_reward_location[1]] = 2
    
    return grid


In [40]:
def get_next_state(current_state, action, slip_prob, grid):
    directions = {"N": (-1, 0), "E": (0, 1), "S": (1, 0), "W": (0, -1)}
    
    action_str = list(directions.keys())[action]
    
    if np.random.rand() < slip_prob:
        possible_actions = list(directions.keys())
        possible_actions.remove(action_str)
        random_action = np.random.choice(possible_actions)
        next_state = tuple(np.array(current_state) + np.array(directions[random_action]))
    else:
        next_state = tuple(np.array(current_state) + np.array(directions[action_str]))
    
    next_state = (
        max(0, min(next_state[0], grid.shape[0] - 1)),
        max(0, min(next_state[1], grid.shape[1] - 1))
    )
    
    return next_state

def get_reward(state, grid):
    return grid[state]


In [41]:
def epsilon_greedy(Q, state, epsilon, num_actions):
    if np.random.rand() < epsilon:
        return np.random.choice(num_actions)
    else:
        return np.argmax(Q[state])

In [42]:
def monte_carlo(grid, slip_prob, num_episodes, epsilon):
    Q = np.zeros_like(grid)
    returns = np.zeros_like(grid)
    visit_count = np.zeros_like(grid)
    
    for episode in range(num_episodes):
        episode_states = []
        episode_actions = []
        episode_rewards = []
    
        current_state = (2, 1)
    
        while True:
            num_actions = len(Q)
            action = epsilon_greedy(Q, current_state, epsilon, num_actions)
        
            episode_states.append(current_state)
            episode_actions.append(action)
        
            next_state = get_next_state(current_state, action, slip_prob, grid)
        
            reward = get_reward(next_state, grid)
            episode_rewards.append(reward)
        
            current_state = next_state
        
            if next_state in [(0, 3), (2, 3)]:
                break
        
        total_return = 0
        for t in range(len(episode_states) - 1, -1, -1):
            total_return += episode_rewards[t]
            
            if episode_states[t] not in episode_states[:t]:
                state = episode_states[t]
                action = episode_actions[t]
                
                visit_count[state] += 1
                
                Q[state] += (total_return - Q[state]) / visit_count[state]
                
    average_utility = np.mean(Q)
    
    return Q, average_utility


In [43]:
def run_experiments():
    grid_size = (3, 4)
    start_state = (0, 0)
    safe_end_states = [(3, 1), (1, 4)]  
    dangerous_end_state = [(0, 2),(1,2)]  
    num_episodes = 1000  
    slip_probabilities = [0.0, 0.1, 0.2, 0.3]
    epsilon_values = [0.1]  
    
    for slip_prob in slip_probabilities:
        for epsilon in epsilon_values:
            grid = initialize_environment(grid_size, start_state, safe_end_states, dangerous_end_state, slip_prob)
      
            Q_values, avg_utility = monte_carlo(grid, slip_prob, num_episodes, epsilon)
            
            print(f"Results for Slip Probability {slip_prob} and Epsilon {epsilon}:")
            print("Q-values:")
            print(Q_values)
            print(f"Average Utility: {avg_utility}")
            print("\n")

def main():
    run_experiments()

if __name__ == "__main__":
    main()

Results for Slip Probability 0.0 and Epsilon 0.1:
Q-values:
[[    0.         -1528.5026738  -1471.09218437     0.        ]
 [    0.         -1528.32640333 -1484.30656934     0.        ]
 [    0.         -1522.7        -1521.42857143     0.        ]]
Average Utility: -754.6963668552972


Results for Slip Probability 0.1 and Epsilon 0.1:
Q-values:
[[-768.9380531  -799.4261242  -748.39473684    0.        ]
 [-796.53295129 -797.94957983 -752.82689076   -4.72972973]
 [-882.34       -797.284      -826.10810811    0.        ]]
Average Utility: -597.8775144876605


Results for Slip Probability 0.2 and Epsilon 0.1:
Q-values:
[[-546.28884826 -546.02981651 -492.12970711    0.        ]
 [-564.22972973 -547.49780702 -501.18935837  -93.03278689]
 [-543.61111111 -533.09       -438.08294931    0.        ]]
Average Utility: -400.4318428595691


Results for Slip Probability 0.3 and Epsilon 0.1:
Q-values:
[[-410.29276896 -411.72716489 -357.37323177    0.        ]
 [-405.17453799 -407.25784753 -357.478390

<h3 style="color:purple">SARSA</h3>

In [44]:
def sarsa(grid, slip_prob, num_episodes, epsilon, alpha, gamma):
    Q = np.zeros(grid.shape + (4,))
    
    for episode in range(num_episodes):
        current_state = (2, 1)
        current_action = epsilon_greedy(Q, current_state, epsilon, Q.shape[2])
        episode_complete = False
        
        while not episode_complete:
            next_state = get_next_state(current_state, current_action, slip_prob, grid)
            reward = get_reward(next_state, grid)
            next_action = epsilon_greedy(Q, next_state, epsilon, Q.shape[2])
            Q[current_state + (current_action,)] += alpha * (reward + gamma * Q[next_state + (next_action,)] - Q[current_state + (current_action,)])
            current_state = next_state
            current_action = next_action
            episode_complete = next_state in [(0, 3), (2, 3)]
    
    average_utility = np.mean(Q)
    
    return Q, average_utility


In [45]:
def run_sarsa_experiments():
   
    grid_size = (3, 4)
    start_state = (0, 0)
    safe_end_states = [(3, 1), (1, 4)]  
    dangerous_end_state = [(0, 2),(1,2)] 
    num_episodes = 100 
    
    slip_probabilities = [0.0, 0.1, 0.2, 0.3]
    epsilon_values = [0.1] 
    alpha_values = [0.1]
    gamma_values = [0.9] 
    
    for slip_prob in slip_probabilities:
        for epsilon in epsilon_values:
            for alpha in alpha_values:
                for gamma in gamma_values:
                    grid = initialize_environment(grid_size, start_state, safe_end_states, dangerous_end_state, slip_prob)
                    
                    Q_values, avg_utility = sarsa(grid, slip_prob, num_episodes, epsilon, alpha, gamma)

                    print(f"Results for Slip Probability {slip_prob}, Epsilon {epsilon}, Alpha {alpha}, Gamma {gamma}:")
                    print("Q-values:")
                    print(Q_values)
                    print(f"Average Utility: {avg_utility}")
                    print("\n")

def main_sarsa():
    run_sarsa_experiments()

if __name__ == "__main__":
    main_sarsa()


Results for Slip Probability 0.0, Epsilon 0.1, Alpha 0.1, Gamma 0.9:
Q-values:
[[[ 12.16527412  11.74817433  16.32964692  13.96229831]
  [  1.33477965 -23.42795     14.78876515   1.52068924]
  [ -9.5          0.          -3.7227284    0.        ]
  [  0.           0.           0.           0.        ]]

 [[ 14.18013789  15.05652086  18.9782171   16.9665368 ]
  [ 12.35259473 -41.65571166  16.27035895  16.87742052]
  [-23.42795     -0.45        14.31298034   1.52204199]
  [  0.           0.           0.          -9.5       ]]

 [[ 16.97140016  16.33317969  18.81658869  18.92385811]
  [ 14.2833169   13.37740501  16.94554735  18.76229795]
  [-40.92898866   0.           9.70526966  16.80867223]
  [  0.           0.           0.           0.        ]]]
Average Utility: 4.305846748394701


Results for Slip Probability 0.1, Epsilon 0.1, Alpha 0.1, Gamma 0.9:
Q-values:
[[[ 10.68104638   7.63883258  13.62096262   8.94179239]
  [  2.22189822 -32.3395503   -2.99348631   9.84124811]
  [ -9.5       

<h3 style="color:purple">Q LEARNING</h3>

In [46]:
def q_learning(grid, slip_prob, num_episodes, epsilon, alpha, gamma):
    Q = np.zeros(grid.shape + (4,))
    
    for episode in range(num_episodes):
        current_state = (2, 1)
        episode_complete = False
        
        while not episode_complete:
            num_actions = Q.shape[2]
            current_action = epsilon_greedy(Q, current_state, epsilon, num_actions)
            next_state = get_next_state(current_state, current_action, slip_prob, grid)
            reward = get_reward(next_state, grid)
            Q[current_state + (current_action,)] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[current_state + (current_action,)])
            current_state = next_state
            episode_complete = next_state in [(0, 3), (2, 3)]
    
    average_utility = np.mean(Q)
    
    return Q, average_utility


In [47]:
def run_q_learning_experiments():
    grid_size = (3, 4)
    start_state = (0, 0)
    safe_end_states = [(3, 1), (1, 4)] 
    dangerous_end_state = [(0, 2),(1,2)]  
    num_episodes = 100  
    slip_probabilities = [0.0, 0.1, 0.2, 0.3]
    epsilon_values = [0.1] 
    alpha_values = [0.1] 
    gamma_values = [0.9]  
    for slip_prob in slip_probabilities:
        for epsilon in epsilon_values:
            for alpha in alpha_values:
                for gamma in gamma_values:
                   
                    grid = initialize_environment(grid_size, start_state, safe_end_states, dangerous_end_state, slip_prob)
                    
                    Q_values, avg_utility = q_learning(grid, slip_prob, num_episodes, epsilon, alpha, gamma)
     
                    print(f"Results for Slip Probability {slip_prob}, Epsilon {epsilon}, Alpha {alpha}, Gamma {gamma}:")
                    print("Q-values:")
                    print(Q_values)
                    print(f"Average Utility: {avg_utility}")
                    print("\n")

def main_q_learning():
    run_q_learning_experiments()

if __name__ == "__main__":
    main_q_learning()


Results for Slip Probability 0.0, Epsilon 0.1, Alpha 0.1, Gamma 0.9:
Q-values:
[[[ 2.13307281e-01  8.87687091e+00  1.80000000e+00  0.00000000e+00]
  [ 0.00000000e+00 -4.99651901e+01  1.43551345e+01  1.80889154e-02]
  [-9.50000000e+00  0.00000000e+00 -5.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]

 [[ 4.62039381e+00  1.29456668e+01  2.00000000e+01  1.57779233e+01]
  [ 2.77976920e+00 -2.84766395e+01  1.79791216e+01  0.00000000e+00]
  [-5.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]

 [[ 1.80000000e+01  1.80000000e+01  2.00000000e+01  2.00000000e+01]
  [ 1.42475166e+01  0.00000000e+00  1.72876040e+01  2.00000000e+01]
  [-1.35500000e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]]]
Average Utility: 2.4043659880199075


Results for Slip Probability 0.1, Epsilon 0.1, Alpha 0.1, Gam

<hr>
<h3 style="color:purple">GUI</h3>

In [48]:
def run_experiments(slip_prob, epsilon, num_episodes):
    grid_size = (3, 4)
    start_state = (0, 0)
    safe_end_states = [(3, 1), (1, 4)]  
    dangerous_end_state = [(0, 2),(1,2)]  
    
    if not isinstance(slip_prob, (list, tuple)):
        slip_prob = [slip_prob]
    
    if not isinstance(epsilon, (list, tuple)):
        epsilon = [epsilon]
    
    for slip_prob_val in slip_prob:
        for epsilon_val in epsilon:
            grid = initialize_environment(grid_size, start_state, safe_end_states, dangerous_end_state, slip_prob_val)
            
            Q_values, avg_utility = monte_carlo(grid, slip_prob_val, num_episodes, epsilon_val)
            
            print("MONTO CARLO FREE MODEL")
            print(f"Results for Slip Probability {slip_prob_val} and Epsilon {epsilon_val}:")
            print("Q-values:")
            print(Q_values)
            print(f"Average Utility: {avg_utility}")
            print("\n")


In [49]:
def run_sarsa_experiments(slip_probabilities, epsilon_values, num_episodes):
    grid_size = (3, 4)
    start_state = (0, 0)
    safe_end_states = [(3, 1), (1, 4)]  
    dangerous_end_state = [(0, 2),(1,2)]  
   
    alpha_values = [0.1]  
    gamma_values = [0.9]  
    
    if not isinstance(slip_probabilities, (list, tuple)):
        slip_probabilities = [slip_probabilities]
    
    if not isinstance(epsilon_values, (list, tuple)):
        epsilon_values = [epsilon_values]
    
    if not isinstance(alpha_values, (list, tuple)):
        alpha_values = [alpha_values]
    
    if not isinstance(gamma_values, (list, tuple)):
        gamma_values = [gamma_values]
    
    for slip_prob in slip_probabilities:
        for epsilon in epsilon_values:
            for alpha in alpha_values:
                for gamma in gamma_values:
                    grid = initialize_environment(grid_size, start_state, safe_end_states, dangerous_end_state, slip_prob)
              
                    Q_values, avg_utility = sarsa(grid, slip_prob, num_episodes, epsilon, alpha, gamma)
                    
                    print("SARSA")
                    print(f"Results for Slip Probability {slip_prob}, Epsilon {epsilon}, Alpha {alpha}, Gamma {gamma}:")
                    print("Q-values:")
                    print(Q_values)
                    print(f"Average Utility: {avg_utility}")
                    print("\n")



In [50]:
def run_q_learning_experiments(slip_probabilities, epsilon_values, num_episodes):
    grid_size = (3, 4)
    start_state = (0, 0)
    safe_end_states = [(3, 1), (1, 4)]
    dangerous_end_state = [(0, 2),(1,2)]  
    
    alpha_values = [0.1] 
    gamma_values = [0.9]
    
    if not isinstance(slip_probabilities, (list, tuple)):
        slip_probabilities = [slip_probabilities]
    
    if not isinstance(epsilon_values, (list, tuple)):
        epsilon_values = [epsilon_values]
    
    if not isinstance(alpha_values, (list, tuple)):
        alpha_values = [alpha_values]
    
    if not isinstance(gamma_values, (list, tuple)):
        gamma_values = [gamma_values]
    
    for slip_prob in slip_probabilities:
        for epsilon in epsilon_values:
            for alpha in alpha_values:
                for gamma in gamma_values:
                    grid = initialize_environment(grid_size, start_state, safe_end_states, dangerous_end_state, slip_prob)
                    
                    Q_values, avg_utility = q_learning(grid, slip_prob, num_episodes, epsilon, alpha, gamma)

                    print("Q LEARNING")
                    print(f"Results for Slip Probability {slip_prob}, Epsilon {epsilon}, Alpha {alpha}, Gamma {gamma}:")
                    print("Q-values:")
                    print(Q_values)
                    print(f"Average Utility: {avg_utility}")
                    print("\n")

In [51]:
def display_grid(canvas, grid):
    cell_size = 30
    canvas.delete("all")

    for row_index, row in enumerate(grid):
        for col_index, value in enumerate(row):
            x1 = col_index * cell_size
            y1 = row_index * cell_size
            x2 = x1 + cell_size
            y2 = y1 + cell_size

            canvas.create_rectangle(x1, y1, x2, y2, outline="black", fill="white")
            canvas.create_text((x1 + x2) / 2, (y1 + y2) / 2, text=str(value))

In [None]:
import tkinter as tk
from tkinter import ttk
from functools import partial
def on_run_button_click(canvas, slip_prob_entry, epsilon_entry, num_episodes_entry):
    slip_prob = float(slip_prob_entry.get())
    epsilon = float(epsilon_entry.get())
    num_episodes = int(num_episodes_entry.get())
    
    grid_size = (3, 4)
    start_state = (0, 0)
    safe_end_states = [(3, 1), (1, 4)] 
    dangerous_end_state = [(0, 2),(1,2)]  
    
    grid = initialize_environment(grid_size, start_state, safe_end_states, dangerous_end_state, slip_prob)
    display_grid(canvas, grid)

    run_experiments(slip_prob, epsilon, num_episodes)
    run_sarsa_experiments(slip_prob, epsilon, num_episodes)
    run_q_learning_experiments(slip_prob, epsilon, num_episodes)


def create_gui():
    root = tk.Tk()
    root.title("Reinforcement Learning Experiments")

    tk.Label(root, text="Slip Probability:").grid(row=0, column=0, padx=10, pady=5)
    slip_prob_entry = tk.Entry(root)
    slip_prob_entry.grid(row=0, column=1, padx=10, pady=5)

    tk.Label(root, text="Epsilon Value:").grid(row=1, column=0, padx=10, pady=5)
    epsilon_entry = tk.Entry(root)
    epsilon_entry.grid(row=1, column=1, padx=10, pady=5)

    tk.Label(root, text="Number of Episodes:").grid(row=2, column=0, padx=10, pady=5)
    num_episodes_entry = tk.Entry(root)
    num_episodes_entry.grid(row=2, column=1, padx=10, pady=5)

    canvas = tk.Canvas(root, width=120, height=120)
    canvas.grid(row=4, column=0, columnspan=2, pady=10)

    run_button = tk.Button(root, text="Run Experiments", command=partial(on_run_button_click, canvas, slip_prob_entry, epsilon_entry, num_episodes_entry))
    run_button.grid(row=3, column=0, columnspan=2, pady=10)

    root.mainloop()
    
create_gui()    

<h3 style="color:purple">Submitted By</h3>

<ul>
<li> Nasir Hussain</li>
<li> Laiba Masood</li>
</ul>