<a href="https://colab.research.google.com/github/manikanta-eng/Reinforcement-learning/blob/main/lab_02_rml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gymnasium as gym
import numpy as np
from collections import defaultdict

In [2]:
# Shared Blackjack environment, passed to the policy-evaluation and control
# routines in the demo cell at the bottom of the notebook.
# NOTE(review): `sab=True` presumably selects the Sutton & Barto rule variant
# of Blackjack -- confirm against the Gymnasium Blackjack-v1 documentation.
env = gym.make('Blackjack-v1', sab=True)

# Function to create a random policy
def create_random_policy(env):
    """Build a policy that ignores the state and picks an action at random.

    Args:
        env: Environment exposing ``action_space.n``, the number of discrete
            actions.

    Returns:
        A callable ``policy(state)`` that returns an action index drawn with
        ``np.random.choice`` from ``range(env.action_space.n)``.
    """
    def uniform_policy(state):
        # The state argument is accepted only so the callable matches the
        # policy interface; it does not influence the draw.
        num_actions = env.action_space.n
        return np.random.choice(num_actions)

    return uniform_policy

# Function to create a greedy policy based on Q-values
def create_greedy_policy(Q):
    """Build a deterministic policy that always takes the best-valued action.

    Args:
        Q: Mapping from state to an array of per-action value estimates.
            The mapping is captured by reference, so later updates to ``Q``
            are reflected by the returned policy.

    Returns:
        A callable ``policy(state)`` returning ``np.argmax(Q[state])``.
    """
    def greedy_action(state):
        action_values = Q[state]
        return np.argmax(action_values)

    return greedy_action

In [3]:
def mc_policy_evaluation(policy, env, num_episodes, gamma=1.0):
    """Estimate the state-value function of ``policy`` by first-visit Monte Carlo.

    For every episode, the discounted return that follows the *first*
    occurrence of each state is averaged into that state's value estimate.

    Args:
        policy: Callable ``policy(state) -> action``.
        env: Environment whose ``reset()`` returns a tuple with the initial
            state first, and whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        ``defaultdict(float)`` mapping each visited state to the mean of its
        first-visit returns. States never visited read as ``0.0``.
    """
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    V = defaultdict(float)

    for _ in range(num_episodes):
        # Roll out one complete episode under the given policy.
        episode = []
        state = env.reset()[0]
        done = False

        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            # An episode ends on either a terminal state or a truncation.
            done = terminated or truncated
            episode.append((state, action, reward))
            state = next_state

        # Time step at which each state first appears in this episode.
        # (Marking states as "seen" while walking backwards would select the
        # LAST visit instead, which is not first-visit Monte Carlo.)
        first_visit_step = {}
        for step, (visited_state, _action, _reward) in enumerate(episode):
            first_visit_step.setdefault(visited_state, step)

        # Walk backwards so G accumulates the discounted return from each
        # step to the end of the episode.
        G = 0
        for step in range(len(episode) - 1, -1, -1):
            visited_state, _action, reward = episode[step]
            G = gamma * G + reward
            if first_visit_step[visited_state] == step:
                returns_sum[visited_state] += G
                returns_count[visited_state] += 1
                V[visited_state] = (
                    returns_sum[visited_state] / returns_count[visited_state]
                )
    return V

In [4]:
def mc_control_epsilon_greedy(env, num_episodes, gamma=1.0, epsilon=0.1):
    """On-policy first-visit Monte Carlo control with an epsilon-greedy policy.

    Episodes are generated by a policy that is greedy with respect to the
    current ``Q`` with probability ``1 - epsilon`` and uniformly random
    otherwise. After each episode, ``Q[state][action]`` is moved to the running
    sample average of the first-visit returns observed for that pair across
    ALL episodes so far.

    Args:
        env: Environment exposing ``action_space.n``, ``reset()`` (returning a
            tuple with the initial state first) and ``step(action)`` (returning
            ``(next_state, reward, terminated, truncated, info)``).
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        epsilon: Probability of taking a uniformly random action.

    Returns:
        Tuple ``(Q, policy)`` where ``Q`` is a ``defaultdict`` mapping each
        state to a NumPy array of per-action value estimates, and ``policy``
        is the greedy policy over ``Q`` built by ``create_greedy_policy``.
    """
    num_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(num_actions))
    # Number of first-visit returns averaged into each (state, action) pair.
    # This must persist across episodes; counting occurrences inside only the
    # current episode would pin the step size near 1/2 and never converge to
    # a sample average.
    visit_counts = defaultdict(int)

    def policy_fn(state):
        # Explore with probability epsilon, otherwise exploit the current Q.
        if np.random.rand() < epsilon:
            return np.random.choice(num_actions)
        else:
            return np.argmax(Q[state])

    for _ in range(num_episodes):
        # Roll out one complete episode under the current epsilon-greedy policy.
        episode = []
        state = env.reset()[0]
        done = False

        while not done:
            action = policy_fn(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            # An episode ends on either a terminal state or a truncation.
            done = terminated or truncated
            episode.append((state, action, reward))
            state = next_state

        # Time step at which each (state, action) pair first appears.
        first_visit_step = {}
        for step, (s, a, _reward) in enumerate(episode):
            first_visit_step.setdefault((s, a), step)

        # Walk backwards so G accumulates the discounted return from each
        # step to the end of the episode.
        G = 0
        for step in range(len(episode) - 1, -1, -1):
            s, a, reward = episode[step]
            G = gamma * G + reward
            if first_visit_step[(s, a)] == step:
                visit_counts[(s, a)] += 1
                # Incremental mean: Q <- Q + (G - Q) / N
                Q[s][a] += (G - Q[s][a]) / visit_counts[(s, a)]

    return Q, create_greedy_policy(Q)

In [None]:
if __name__ == "__main__":
    # Demo: evaluate a uniformly random policy, then learn a control policy,
    # printing a small sample of each resulting table.

    # Number of entries to print from each (potentially large) table.
    SAMPLE_SIZE = 10

    random_policy = create_random_policy(env)

    print("Evaluating random policy...")
    V = mc_policy_evaluation(random_policy, env, num_episodes=50000)
    print("Value function for random policy (sample):")
    for state, value in list(V.items())[:SAMPLE_SIZE]:
        print(f"State: {state}, Value: {value:.2f}")

    print("\nTraining control policy with epsilon-greedy strategy...")
    Q, greedy_policy = mc_control_epsilon_greedy(env, num_episodes=500000)
    print("Learned Q-values (sample):")
    for state, actions in list(Q.items())[:SAMPLE_SIZE]:
        print(f"State: {state}, Actions: {actions}")

Evaluating random policy...
Value function for random policy (sample):
State: (18, 2, 0), Value: -0.35
State: (20, 2, 0), Value: -0.09
State: (19, 2, 0), Value: -0.22
State: (21, 2, 1), Value: 0.31
State: (17, 4, 0), Value: -0.45
State: (20, 10, 0), Value: -0.24
State: (11, 2, 0), Value: -0.22
State: (18, 10, 0), Value: -0.54
State: (17, 10, 0), Value: -0.60
State: (13, 10, 0), Value: -0.62

Training control policy with epsilon-greedy strategy...
