# Reinforcement Learning

# Online prediction

This notebook presents the online prediction of a value function by **Monte-Carlo learning** and **TD learning**.

In [1]:
import numpy as np
from matplotlib import pyplot as plt

In [2]:
from model import Maze, Walk, TicTacToe, Nim, ConnectFour
from agent import Agent, OnlinePrediction
from dp import PolicyEvaluation

ModuleNotFoundError: No module named 'model'

## To do

* Complete the ``MCLearning`` class and test it on various environments.
* Complete the ``TDLearning`` class and test it on various environments.
* Compare with the exact solution obtained by Dynamic Programming when available.

In [22]:
import torch
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    def __init__(self, labeled_data, unlabeled_data):
        self.labeled_data = labeled_data
        self.unlabeled_data = unlabeled_data

    def __len__(self):
        # The length is defined by the labeled data
        return len(self.labeled_data)

    def __getitem__(self, idx):
        # Return a tuple of (labeled_data, unlabeled_data)
        unlabeled_idx = idx % len(self.unlabeled_data)
        return self.labeled_data[idx], self.unlabeled_data[unlabeled_idx]

class CustomDataLoader:
    def __init__(self, dataset, batch_size=1, shuffle=True):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.labeled_indexes = list(range(len(dataset.labeled_data)))
        self.unlabeled_indexes = list(range(len(dataset.unlabeled_data)))
        self.unlabeled_pointer = 0

        if self.shuffle:
            torch.manual_seed(0)  # For reproducibility
            self.labeled_indexes = torch.randperm(len(self.labeled_indexes)).tolist()

    def __iter__(self):
        self.iter_labeled = iter(self.labeled_indexes)
        return self

    def __next__(self):
        batch = []
        for _ in range(self.batch_size):
            try:
                labeled_idx = next(self.iter_labeled)
            except StopIteration:
                raise StopIteration

            unlabeled_idx = self.unlabeled_indexes[self.unlabeled_pointer]
            self.unlabeled_pointer = (self.unlabeled_pointer + 1) % len(self.unlabeled_indexes)

            data = self.dataset[labeled_idx]
            batch.append(data)

        return batch

# Example usage
labeled_data = [torch.randn(5) for _ in range(10)]  # Replace this with your labeled data
unlabeled_data = [torch.randn(5) for _ in range(20)]  # Replace this with your unlabeled data
dataset = CustomDataset(labeled_data, unlabeled_data)
loader = CustomDataLoader(dataset, batch_size=32, shuffle=True)

for epoch in range(2):
    print ("hi")
    for labeled, unlabeled in loader:
        print ("hi")
        print (labeled, unlabeled)

hi
hi


In [14]:
import torch
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    def __init__(self, labeled_data, unlabeled_data):
        self.labeled_data = labeled_data
        self.unlabeled_data = unlabeled_data

    def __len__(self):
        return max(len(self.labeled_data), len(self.unlabeled_data))

    def __getitem__(self, idx):
        labeled_idx = idx % len(self.labeled_data) if self.labeled_data else None
        unlabeled_idx = idx % len(self.unlabeled_data) if self.unlabeled_data else None

        labeled_sample = self.labeled_data[labeled_idx] if labeled_idx is not None else None
        unlabeled_sample = self.unlabeled_data[unlabeled_idx] if unlabeled_idx is not None else None

        return labeled_sample, unlabeled_sample

# Example usage
labeled_data = [torch.randn(5) for _ in range(10)]  # Replace this with your labeled data
unlabeled_data = [torch.randn(5) for _ in range(20)]  # Replace this with your unlabeled data

custom_dataset = CustomDataset(labeled_data, unlabeled_data)
custom_loader = DataLoader(custom_dataset, batch_size=2, shuffle=False)

# Training loop
for epoch in range(3):  # Replace 3 with the number of epochs you want
    for labeled_batch, unlabeled_batch in custom_loader:
        # Your training logic here
        print("Epoch {}, Labeled Batch Shape: {}, Unlabeled Batch Shape: {}".format(
            epoch, labeled_batch if labeled_batch is not None else None,
            unlabeled_batch if unlabeled_batch is not None else None))


Epoch 0, Labeled Batch Shape: tensor([[ 0.1704,  1.4061,  0.6540, -0.7445,  0.8497],
        [ 1.2216,  1.9943, -0.4317,  0.0179,  1.0350]]), Unlabeled Batch Shape: tensor([[-0.6894,  1.4095,  0.8840,  0.0268, -0.6692],
        [ 0.2597,  0.7758,  0.3978,  0.5062, -1.4441]])
Epoch 0, Labeled Batch Shape: tensor([[-0.7613, -1.2306,  1.3959,  0.8658,  2.0119],
        [-0.1957, -1.1572, -0.3526,  0.4504,  0.8583]]), Unlabeled Batch Shape: tensor([[ 0.3748,  1.7466, -0.9989, -0.0055, -0.3780],
        [-0.1546,  0.4912, -0.7754, -1.0701,  0.4034]])
Epoch 0, Labeled Batch Shape: tensor([[ 2.3396,  0.9407, -1.4042,  0.3062,  0.3633],
        [ 1.0997, -0.2325,  0.0442,  0.1226, -0.3461]]), Unlabeled Batch Shape: tensor([[ 1.3248, -0.6103,  0.3881,  1.2439, -0.7752],
        [-0.1293, -0.8127, -1.1407,  0.2515,  0.2278]])
Epoch 0, Labeled Batch Shape: tensor([[ 0.1260, -0.2679, -0.9576, -1.6293,  0.4730],
        [-0.4570,  0.7527,  1.0770, -1.6758, -0.6801]]), Unlabeled Batch Shape: tensor(

In [15]:
unlabeled_data

[tensor([-0.6894,  1.4095,  0.8840,  0.0268, -0.6692]),
 tensor([ 0.2597,  0.7758,  0.3978,  0.5062, -1.4441]),
 tensor([ 0.3748,  1.7466, -0.9989, -0.0055, -0.3780]),
 tensor([-0.1546,  0.4912, -0.7754, -1.0701,  0.4034]),
 tensor([ 1.3248, -0.6103,  0.3881,  1.2439, -0.7752]),
 tensor([-0.1293, -0.8127, -1.1407,  0.2515,  0.2278]),
 tensor([ 0.4190, -2.0997,  0.0444,  0.1684, -0.4006]),
 tensor([ 0.4713, -1.1705, -0.6428,  0.4750, -0.6701]),
 tensor([-0.2979,  1.5952,  0.4580,  1.5604,  0.9058]),
 tensor([-0.8698, -0.1078, -0.7981, -0.2681,  0.9069]),
 tensor([ 0.9083, -0.8002,  0.1161, -0.7313,  0.0266]),
 tensor([-0.5755, -0.0980,  0.0679, -0.0417, -0.8069]),
 tensor([ 0.0106, -0.4001,  1.1348, -0.9008,  0.3253]),
 tensor([ 0.4002, -0.2188, -0.8370,  1.7107, -2.2658]),
 tensor([-0.9896, -0.3366, -0.5367, -3.4630,  0.3103]),
 tensor([0.5729, 1.8105, 0.3606, 1.5962, 0.5569]),
 tensor([ 1.0846,  0.8740,  1.5544,  0.3421, -0.2276]),
 tensor([ 0.6083,  1.3213, -2.0395, -0.5397,  0.2655]

## Monte-Carlo learning

In [None]:
class MCLearning(OnlinePrediction):
    """Online prediction by Monte-Carlo."""
        
    def update_values(self):
        """Update the values from an episode."""
        stop, states, rewards = self.get_episode()
        gain = 0
        # backward update
        for state, reward in zip(reversed(states), reversed(rewards)):
            self.add_state(state)
            state_code = self.model.encode(state)
            # number of visits to this state
            self.state_count[state_code] += 1
            # value of this state
            # to be modified
            # begin
            gain = 0
            # end 
            diff = gain - self.state_value[state_code]
            count = self.state_count[state_code]
            self.state_value[state_code] += diff / count

## TD learning

In [None]:
class TDLearning(OnlinePrediction):
    """Online prediction by TD learning."""
        
    def update_values(self):
        """Update values online."""
        self.model.__init__()
        for t in range(self.n_steps):
            state = self.model.state
            self.add_state(state)
            state_code = self.model.encode(state)
            # number of visits to this state
            self.state_count[state_code] += 1
            
            # next state            
            action = self.get_action(state)
            reward, stop = self.model.step(action)
            next_state = self.model.state
            self.add_state(next_state)
            next_state_code = self.model.encode(next_state)
            
            # to be modified
            # begin
            self.state_value[state_code] += 0
            # end
            
            if stop:
                break

## Walk

In [None]:
model = Walk()

In [None]:
algo = MCLearning(model, policy='random', gamma=0.9)

In [None]:
n_episodes = 100
for t in range(n_episodes):
    algo.update_values()

In [None]:
values = algo.get_values()

In [None]:
model.display_values(values)

In [None]:
policy = algo.improve_policy()

In [None]:
model.display_policy(policy)

## Maze

In [None]:
model = Maze()
# set parameters
maze_map = np.load('maze_small.npy')
model.set_parameters(maze_map, (1, 0), [(3, 8)])
# init
model = Maze()

In [None]:
model.display()

In [None]:
algo = MCLearning(model, policy='random')

In [None]:
n_episodes = 1000
for t in range(n_episodes):
    algo.update_values()

In [None]:
values = algo.get_values()
model.display_values(values)

In [None]:
policy = algo.improve_policy()
model.display_policy(policy)

## Games

In [None]:
Game = TicTacToe

In [None]:
# random player
game = Game()
agent = Agent(game)

In [None]:
np.unique(agent.get_gains(), return_counts=True)

In [None]:
# online prediction of the random player
algo = MCLearning(game, policy='random')

In [None]:
# you might adapt the number of games
n_games = 100
for t in range(n_games):
    algo.update_values()

In [None]:
# policy improvement
policy = algo.improve_policy()

In [None]:
# test this new policy
agent = Agent(game, policy)
np.unique(agent.get_gains(), return_counts=True)

In [None]:
# a better adversary
game = TicTacToe(adversary_policy='one_step')
agent = Agent(game)

In [None]:
np.unique(agent.get_gains(), return_counts=True)

In [None]:
# online prediction against this adversary
algo = MCLearning(game, policy='random')

In [None]:
# train and improve your player!

In [None]:
# online prediction of a better player
algo = MCLearning(game, policy='one_step')