# Experiment 9

## Problem Statement:

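Implement the Monte Carlo algorithm for estimating state values from sampled episodes, using both the first-visit and every-visit methods.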

## Code

In [2]:
# importing required libraries
import numpy as np

### First Visit

In [15]:
def monte_carlo_first_visit(episodes, gamma=1.0):
    """Estimate state values with first-visit Monte Carlo prediction.

    Each episode is a sequence of ``(state, reward)`` pairs. Within an
    episode, only the return that follows the *first* occurrence of a state
    is recorded; a state's value is the mean of its recorded returns over
    all episodes.

    Args:
        episodes: Iterable of episodes, each an iterable of
            ``(state, reward)`` pairs. States must be hashable. Episodes
            with no steps are skipped.
        gamma: Discount factor applied to future rewards. The default of
            1.0 gives the plain (undiscounted) sum of rewards.

    Returns:
        dict mapping every visited state to its average first-visit return.
        The dict is empty when no episode contains any step.
    """
    returns = {}

    for episode in episodes:
        steps = list(episode)
        if not steps:
            # zip(*[]) produces nothing to unpack, so an empty episode
            # would otherwise raise ValueError.
            continue
        states, rewards = zip(*steps)

        # Earliest time step at which each state appears in this episode.
        # Looking this up replaces a slice-and-scan at every step.
        first_visit = {}
        for t, state in enumerate(states):
            first_visit.setdefault(state, t)

        # Walk backwards so the return from step t is built incrementally
        # from the return of step t + 1.
        total_return = 0
        for t in range(len(states) - 1, -1, -1):
            state = states[t]
            total_return = rewards[t] + gamma * total_return
            if first_visit[state] == t:
                returns.setdefault(state, []).append(total_return)

    # Average once at the end instead of re-summing after every append.
    return {
        state: sum(recorded) / len(recorded)
        for state, recorded in returns.items()
    }


if __name__ == "__main__":
    # Hand-written sample data: every step is a (state, reward) pair.
    episodes = [
        [('A', 3), ('A', 2), ('B', -4), ('A', 4), ('B', -3)],
        [('B', -2), ('A', 3), ('B', -3)],
    ]
    num_episodes = 2  # nothing in this cell reads this value

    state_values = monte_carlo_first_visit(episodes)

    # Echo the input episodes, numbered from 1.
    print("Episodes")
    for number, steps in enumerate(episodes, start=1):
        print("Episode:" + str(number))
        for visited, payoff in steps:
            print(f"State {visited}: Reward {payoff}")
        print("")

    # Report each estimate in the order the estimator returned it.
    print("Estimated state values:")
    for visited, estimate in state_values.items():
        print(f"State {visited}: {estimate}")

Episodes
Episode:1
State A: Reward 3
State A: Reward 2
State B: Reward -4
State A: Reward 4
State B: Reward -3

Episode:2
State B: Reward -2
State A: Reward 3
State B: Reward -3

Estimated state values:
State B: -2.5
State A: 1.0


### Every Visit

In [14]:
# Discount factor read by calculate_return; with 1.0 every power of gamma
# is 1, so rewards are summed without discounting.
gamma = 1.0


def calculate_return(episode, t):
    """Return the discounted sum of rewards from step ``t`` to the episode end.

    Args:
        episode: Indexable sequence of steps; the reward is read from
            position 1 of each step.
        t: Index of the step at which accumulation starts.

    Returns:
        Sum of ``gamma ** k * reward`` over the steps from ``t`` onward,
        where ``k`` counts steps since ``t``. The result is 0 when ``t`` is
        at or beyond the end of the episode.
    """
    discounted_total = 0
    steps_ahead = 0
    while t + steps_ahead < len(episode):
        step_reward = episode[t + steps_ahead][1]
        discounted_total += (gamma ** steps_ahead) * step_reward
        steps_ahead += 1
    return discounted_total


def monte_carlo_every_visit(episodes):
    """Estimate state values with every-visit Monte Carlo prediction.

    Every occurrence of a state in an episode contributes the return that
    follows it, as computed by ``calculate_return``. A state's value is the
    mean of all those contributions across all episodes.

    Args:
        episodes: Iterable of episodes; each episode is a sequence of steps
            whose position 0 holds the state.

    Returns:
        dict mapping each visited state to its average return.
    """
    visit_counts = {}
    return_totals = {}
    averages = {}

    for trajectory in episodes:
        for step_index, step in enumerate(trajectory):
            visited = step[0]

            # Count this visit, then add the return that follows it.
            visit_counts[visited] = visit_counts.get(visited, 0) + 1
            following_return = calculate_return(trajectory, step_index)
            return_totals[visited] = return_totals.get(visited, 0) + following_return

            # Keep the running mean current after every visit.
            averages[visited] = return_totals[visited] / visit_counts[visited]

    return averages


if __name__ == "__main__":
    # Same sample data as the first-visit cell: (state, reward) pairs.
    episodes = [
        [('A', 3), ('A', 2), ('B', -4), ('A', 4), ('B', -3)],
        [('B', -2), ('A', 3), ('B', -3)],
    ]
    state_values = monte_carlo_every_visit(episodes)

    # Echo the input episodes, numbered from 1.
    print("Episodes")
    for number, steps in enumerate(episodes, start=1):
        print("Episode:" + str(number))
        for visited, payoff in steps:
            print(f"State {visited}: Reward {payoff}")
        print("")

    # Report each estimate in the order the estimator returned it.
    print("Estimated state values:")
    for visited, estimate in state_values.items():
        print(f"State {visited}: {estimate}")

Episodes
Episode:1
State A: Reward 3
State A: Reward 2
State B: Reward -4
State A: Reward 4
State B: Reward -3

Episode:2
State B: Reward -2
State A: Reward 3
State B: Reward -3

Estimated state values:
State A: 0.5
State B: -2.75
