In [7]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
import random
import pandas as pd
import yaml
from liftoff import parse_opts
from argparse import Namespace
from experiment_src import *
import numpy as np
import networkx as nx
import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})

root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath("."))))
sys.path.append(root_dir)

from experiment_src import (
    train_net_with_neural_fitting,
    generate_random_policy_transitions,
    generate_transitions_observations,
    TransitionDataset
)
from experiments.experiment_utils import setup_logger, seed_everything
from overfitting.src.policy_iteration import random_policy_evaluation_q_stochastic
from overfitting.src.utils import (
    create_random_policy,
    extract_V_from_Q_for_stochastic_policy,
)
from overfitting.src.visualize import draw_simple_gridworld
from experiment_src import generate_train_test_split_with_valid_path, check_path_existence_to_any_terminal, get_frequency_scaling, normalize_frequencies

import logging

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
def count_occurrences_and_compute_percentage(
    sampled_transitions_list, total_unique_transitions, N
):
    # Count occurrences of each index in the sampled list
    occurrences_count = {}
    for index in sampled_transitions_list:
        occurrences_count[index] = occurrences_count.get(index, 0) + 1

    # Compute the number of indexes that appear at least N times
    at_least_N = sum(1 for count in occurrences_count.values() if count >= N)

    # Compute the percentage relative to the total number of unique transitions
    percentage = (at_least_N / total_unique_transitions) * 100
    return percentage, occurrences_count

In [3]:
rows, cols = 10, 10
start_state = (0, 0)
terminal_states = {(rows - 2, cols - 2): 1.0}
p_success = 1
seed = 3
run_id = 0

num_steps = 40_000
min_samples = 20
# min_samples = 0

# Learning hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.05  # Convergence criterion
tau = 100
batch_size = 32
train_max_iterations = 50
theta = 1e-6

env = make_env(rows, cols, start_state, p_success, terminal_states, seed)

states = list(set([s for s, _ in env.mdp.keys()]))
actions = list(set([a for _, a in env.mdp.keys()]))

In [4]:
tau = 1000
transitions_list = [(key[0], key[1], *value[0]) for key, value in env.mdp.items()]
transitions_train, transitions_val = train_test_split(
    transitions_list, test_size=0.2, random_state=seed
)

random_policy_transitions = generate_transitions_observations(
    transitions_list,
    num_steps,
    tau=tau,
    min_samples=min_samples,
)


### Training
input_size = len(states[0])  # Or another way to represent the size of your input
output_size = len(actions)

# Initialize the DQN
qnet_random_policy = QNET(input_size, output_size)

# loss_record_random_policy = train_net_with_neural_fitting(
#     qnet_random_policy,
#     random_policy_transitions,
#     states,
#     actions,
#     gamma,
#     epsilon,
#     batch_size,
#     train_max_iterations,
#     logger,
# )

In [10]:
tau = 0.1
seed_everything(run_id)
env = make_env(rows, cols, start_state, p_success, terminal_states, run_id)

states = list(set([s for s, _ in env.mdp.keys()]))
actions = list(set([a for _, a in env.mdp.keys()]))
random_policy = create_random_policy(states, actions)
Q = {state: {action: 0 for action in actions} for state in states}
Q_pi_random = random_policy_evaluation_q_stochastic(
    states, actions, random_policy, Q, env.mdp, gamma, epsilon
)

transitions_list = [(key[0], key[1], *value[0]) for key, value in env.mdp.items()]

transitions_train, transitions_val = generate_train_test_split_with_valid_path(
    transitions_list=transitions_list,
    start_state=start_state,
    terminal_states=terminal_states,
    seed=run_id,
)

train_dataset_transitions = generate_transitions_observations(
        transitions_train, num_steps, tau=tau, min_samples=min_samples
    )

get_frequency_scaling(train_dataset_transitions)

normed_dataset = normalize_frequencies(train_dataset_transitions)
get_frequency_scaling(normed_dataset)

# random_policy_transitions = generate_random_policy_transitions(
#     transitions_train, num_steps, env, actions
# )

# seed_everything(seed)

# ### Training
# input_size = len(states[0])  # Or another way to represent the size of your input
# output_size = len(actions)

# # Initialize the DQN
# qnet_random_policy = QNET(input_size, output_size)

# loss_record = train_net_with_neural_fitted_q(
#     qnet_random_policy,
#     random_policy_transitions,
#     Q_pi_random,
#     states,
#     actions,
#     gamma,
#     epsilon,
#     batch_size=batch_size,
#     max_iterations=train_max_iterations,
#     frequency_scaling=False,
#     logger=None,
# )

{((6, 6), <Action.UP: 0>, (5, 6), 0, 0): 1.0912265386294195,
 ((9, 5), <Action.UP: 0>, (8, 5), 0, 0): 0.9307520476545049,
 ((6, 4), <Action.DOWN: 2>, (7, 4), 0, 0): 1.0912265386294195,
 ((7, 2), <Action.LEFT: 3>, (7, 1), 0, 0): 1.1403808872163304,
 ((2, 9), <Action.DOWN: 2>, (3, 9), 0, 0): 1.072731173567904,
 ((5, 0), <Action.LEFT: 3>, (5, 0), 0, 0): 0.9662769349695624,
 ((0, 1), <Action.LEFT: 3>, (0, 0), 0, 0): 1.1007154650522841,
 ((1, 7), <Action.RIGHT: 1>, (1, 8), 0, 0): 1.0375596596804317,
 ((0, 1), <Action.UP: 0>, (0, 1), 0, 0): 0.9737098344693282,
 ((0, 8), <Action.RIGHT: 1>, (0, 9), 0, 0): 0.9376465072667605,
 ((6, 4), <Action.RIGHT: 1>, (6, 5), 0, 0): 1.0548523206751055,
 ((9, 2), <Action.DOWN: 2>, (9, 2), 0, 0): 0.8790436005625879,
 ((5, 4), <Action.RIGHT: 1>, (5, 5), 0, 0): 0.9812579727210284,
 ((3, 1), <Action.LEFT: 3>, (3, 0), 0, 0): 0.9889240506329114,
 ((5, 8), <Action.LEFT: 3>, (5, 7), 0, 0): 0.8729812309035356,
 ((6, 4), <Action.UP: 0>, (5, 4), 0, 0): 0.923958237087683

In [6]:
max_iterations = 10
transitions = random_policy_transitions

net = QNET(input_size, output_size)
if logger is None:
    logger = logging.getLogger(__name__)

net.train()
dataset = TransitionDataset(transitions)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
optimizer = optim.Adam(net.parameters(), lr=0.001)
loss_fn = nn.MSELoss()
loss_record = []

for epoch in range(max_iterations):
    total_loss = 0
    for state, action, next_state, reward, done in dataloader:
        optimizer.zero_grad()
        q_values = net(state)
        next_q_values = net(next_state)
        max_next_q_values = next_q_values.detach().max(1)[0]

        target_q_values_for_actions = reward + gamma * max_next_q_values * (~done)

        action_q_values = q_values.gather(1, action.unsqueeze(-1))

        loss = loss_fn(action_q_values, target_q_values_for_actions.unsqueeze(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        break
    break

NameError: name 'logger' is not defined

In [None]:
action_q_values.shape

torch.Size([32, 1])

In [None]:
q_values.shape

torch.Size([32, 4])

In [None]:
loss

tensor(0.7741, grad_fn=<MseLossBackward0>)

In [None]:
q_values.gather(1, action.unsqueeze(-1)).shape

torch.Size([32, 1])

In [None]:
q_values

tensor([[-1.2938e+00,  1.4427e-01, -3.4196e-01, -4.9531e-01],
        [-5.5965e-01,  1.5732e-01, -1.1921e-01, -1.1361e-01],
        [-9.2899e-01,  1.1963e-01, -6.7880e-02, -8.0473e-01],
        [-1.0815e+00,  1.6027e-01, -2.7088e-01, -4.0533e-01],
        [-9.2247e-01,  1.2536e-01, -1.3494e-01, -6.8017e-01],
        [-1.7271e+00,  1.5155e-01, -5.4521e-01, -1.4430e-01],
        [-1.3387e+00,  2.6558e-01, -4.1004e-01, -3.6590e-02],
        [-1.1518e+00,  2.8918e-01, -3.6429e-01,  4.8384e-03],
        [-1.6530e+00,  3.3030e-01, -5.4903e-01, -2.6282e-02],
        [-1.3390e+00,  1.5814e-01, -3.4963e-01, -6.2659e-01],
        [-1.1150e+00,  1.6729e-01, -2.7006e-01, -5.3678e-01],
        [-8.8873e-01,  1.4126e-01, -2.3497e-01, -1.3536e-01],
        [-6.0195e-01,  1.5063e-01, -9.8310e-04, -4.7670e-01],
        [-5.3520e-01,  1.4904e-01,  7.8122e-03, -3.8799e-01],
        [-1.4364e+00,  1.0457e-01, -3.8646e-01, -4.6702e-01],
        [-5.2487e-01,  1.6767e-01, -2.9157e-02, -2.6788e-01],
        

In [None]:
max_next_q_values * (~done.unsqueeze(1))

NameError: name 'max_next_q_values' is not defined

In [None]:
q_values.gather(1, action.unsqueeze(-1))

tensor([[ 1.3246],
        [-0.7336],
        [ 0.6904],
        [ 0.0315],
        [ 1.4223],
        [ 0.0071],
        [-0.0206],
        [-1.7084],
        [ 0.1359],
        [ 1.1714],
        [ 0.1897],
        [ 0.0509],
        [ 1.7091],
        [ 0.0664],
        [-1.0759],
        [ 0.8194],
        [-1.2782],
        [-0.4596],
        [ 0.6318],
        [-0.0032],
        [-1.5888],
        [ 0.8822],
        [-0.1998],
        [-1.2811],
        [-1.3465],
        [ 0.9563],
        [ 0.4550],
        [ 0.8518],
        [-0.6650],
        [ 1.4859],
        [-1.2025],
        [ 0.2654]], grad_fn=<GatherBackward0>)

In [None]:
action

tensor([0, 1, 3, 2, 3, 2, 2, 1, 2, 0, 2, 2, 0, 2, 1, 3, 1, 1, 2, 2, 1, 3, 1, 1,
        1, 2, 2, 0, 1, 0, 1, 3])

In [None]:
net(state).shape

torch.Size([32, 4])