In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
import random
import pandas as pd
import yaml
from liftoff import parse_opts
from argparse import Namespace
from experiment_src import *
import numpy as np
import networkx as nx
import seaborn as sns
# Global seaborn style for the notebook: the "whitegrid" theme with the axes grid
# switched off.
sns.set_style("whitegrid", {'axes.grid' : False})

# Directory three levels above the resolved working directory (presumably the
# repository root; this depends on where the notebook kernel was started).
root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath("."))))
# Register it on the import path only once: the cell gets re-executed during
# interactive work, and an unconditional append would add one more duplicate
# entry to sys.path on every run.
if root_dir not in sys.path:
    sys.path.append(root_dir)

from experiment_src import (
    train_net_with_value_function_approximation,
    generate_random_policy_transitions,
    generate_transitions_observations,
    TransitionDatasetWithScaling
)
from experiments.experiment_utils import setup_logger, seed_everything
from overfitting.src.policy_iteration import random_policy_evaluation_q_stochastic
from overfitting.src.utils import (
    create_random_policy,
    extract_V_from_Q_for_stochastic_policy,
)
from overfitting.src.visualize import draw_simple_gridworld
from experiment_src import generate_train_test_split_with_valid_path, check_path_existence_to_any_terminal

import logging

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def count_occurrences_and_compute_percentage(
    sampled_transitions_list, total_unique_transitions, N
):
    """Measure how much of the transition space was sampled at least ``N`` times.

    Args:
        sampled_transitions_list: Iterable of hashable transition identifiers,
            one entry per drawn sample (repeated entries are counted).
        total_unique_transitions: Number of distinct transitions used as the
            denominator of the percentage.
        N: Minimum number of occurrences for an identifier to be counted.

    Returns:
        A ``(percentage, occurrences_count)`` tuple. ``percentage`` is the
        number of identifiers occurring at least ``N`` times, divided by
        ``total_unique_transitions`` and scaled to 0-100. ``occurrences_count``
        is a dict mapping every sampled identifier to its number of
        occurrences. When ``total_unique_transitions`` is 0 the percentage is
        reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    # Tally how often each identifier shows up in the sample.
    occurrences_count = {}
    for index in sampled_transitions_list:
        occurrences_count[index] = occurrences_count.get(index, 0) + 1

    # An empty transition space has nothing to cover; avoid dividing by zero.
    if total_unique_transitions == 0:
        return 0.0, occurrences_count

    # Identifiers that reached the required number of samples.
    at_least_N = sum(1 for count in occurrences_count.values() if count >= N)

    percentage = (at_least_N / total_unique_transitions) * 100
    return percentage, occurrences_count

In [3]:
# --- Logging ---------------------------------------------------------------
logger = logging.getLogger(__name__)

# --- Grid-world layout -----------------------------------------------------
rows = 10
cols = 10
start_state = (0, 0)
# One terminal cell, one step in from the bottom-right corner, mapped to 1.0
# (presumably its reward; verify against make_env).
terminal_states = {(rows - 2, cols - 2): 1.0}
p_success = 1
seed = 3

# --- Data collection -------------------------------------------------------
num_steps = 40_000
min_samples = 20
# Alternative setting kept for reference: min_samples = 0

# --- Learning hyperparameters ----------------------------------------------
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.05  # convergence criterion
tau = 100  # re-assigned to 1000 by the sampling cell before it is used
batch_size = 32
train_max_iterations = 50
theta = 1e-6

# --- Environment and its state / action sets -------------------------------
env = make_env(rows, cols, start_state, p_success, terminal_states, seed)

# The MDP is keyed by (state, action) pairs; collect the distinct members of each.
states = list({state for state, _ in env.mdp.keys()})
actions = list({action for _, action in env.mdp.keys()})

In [4]:
# Value of tau handed to the sampler below; it replaces the 100 assigned in the
# configuration cell.
tau = 1000

# Flatten the MDP into tuples: each (state, action) key followed by the unpacked
# first element of its value.
transitions_list = [
    (state, action, *outcomes[0])
    for (state, action), outcomes in env.mdp.items()
]

# 80/20 split of the unique transitions, made repeatable through `seed`.
transitions_train, transitions_val = train_test_split(
    transitions_list, random_state=seed, test_size=0.2
)

# Observations generated from the *full* transition list (not the train split);
# presumably `num_steps` samples, see generate_transitions_observations.
random_policy_transitions = generate_transitions_observations(
    transitions_list, num_steps, tau=tau, min_samples=min_samples
)


### Training
# Network dimensions: length of one state tuple in, number of distinct actions out.
input_size = len(states[0])
output_size = len(actions)

# Q-network instance; its only consumer is the disabled training call below.
qnet_random_policy = QNET(input_size, output_size)

# Disabled: full training run with value-function approximation.
# loss_record_random_policy = train_net_with_value_function_approximation(
#     qnet_random_policy,
#     random_policy_transitions,
#     states,
#     actions,
#     gamma,
#     epsilon,
#     batch_size,
#     train_max_iterations,
#     logger,
# )

In [5]:
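# Disabled draft: inline copy of the body of train_net_with_neural_fitted_q_scaled_loss
# (per-sample loss weighted by inverse transition frequency). Every line in this
# cell is commented out, so nothing here executes.
# # def train_net_with_neural_fitted_q_scaled_loss(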
# #     net,
# #     transitions,
# #     Q_pi_random,
# #     states,
# #     actions,
# #     gamma,
# #     epsilon,
# #     batch_size,
# #     max_iterations,
# #     logger=None,
# # ):

# max_iterations = 10
# transitions = random_policy_transitions

# net = QNET(input_size, output_size)
# if logger is None:
#     logger = logging.getLogger(__name__)

# net.train()

# transitions_for_counting = [
#     (s, a, ns, r, int(d)) for s, a, ns, r, d, _ in transitions
# ]
# transition_counts = Counter(transitions_for_counting)

# # Calculate expected frequency under uniform distribution
# N_total = len(transitions)
# N_unique = len(set(transitions_for_counting))
# expected_frequency = N_total / N_unique

# # Compute scaling factor relative to uniform distribution
# inverse_frequency_scaling = {
#     t: expected_frequency / count for t, count in transition_counts.items()
# }

# dataset = TransitionDatasetWithScaling(transitions, inverse_frequency_scaling)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# optimizer = optim.Adam(net.parameters(), lr=0.001)
# loss_fn = nn.MSELoss(reduction="none")
# loss_record = []

# for epoch in range(max_iterations):
#     total_loss = 0
#     for state, action, next_state, reward, done, scale_factor in dataloader:
#         optimizer.zero_grad()
#         q_values = net(state)
#         next_q_values = net(next_state)
#         max_next_q_values = next_q_values.detach().max(1)[0].unsqueeze(1)
#         target_q_values = reward.unsqueeze(1) + gamma * max_next_q_values * (
#             ~done.unsqueeze(1)
#         )
#         target_q_values = torch.where(
#             done.unsqueeze(1), reward.unsqueeze(1), target_q_values
#         ) # explicit handling of terminal states

#         individual_losses = loss_fn(q_values.gather(1, action.unsqueeze(-1)), target_q_values)
#         scaled_losses = individual_losses.squeeze() * scale_factor.to(q_values.device)
    
#         loss = scaled_losses.mean()
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()

#         break
    
#     loss_record.append((epoch, total_loss, 0))
    
#     break




In [10]:
# Single-step debugging harness for the Q-learning update. It builds the network,
# data loader and optimizer, then runs exactly one optimisation step (both loops
# exit through `break`) so that the tensors of that step stay in the namespace
# for the inspection cells that follow.
max_iterations = 10
transitions = random_policy_transitions

net = QNET(input_size, output_size)
# `logger` is assigned in the configuration cell, so this fallback is not expected
# to trigger there.
if logger is None:
    logger = logging.getLogger(__name__)
    
net.train()
dataset = TransitionDataset(transitions)
# shuffle=True: the mini-batch that gets inspected can differ between runs.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
optimizer = optim.Adam(net.parameters(), lr=0.001)
loss_fn = nn.MSELoss()
# NOTE(review): nothing is appended to this list inside this cell.
loss_record = []

for epoch in range(max_iterations):
    total_loss = 0
    for state, action, next_state, reward, done in dataloader:
        optimizer.zero_grad()
        q_values = net(state)
        next_q_values = net(next_state)
        # Largest next-state Q-value per sample, detached so that no gradient
        # flows through the target.
        max_next_q_values = next_q_values.detach().max(1)[0]
        
        # One-step target: reward + gamma * max next Q, with the bootstrap term
        # multiplied by `~done` (assumes `done` is a boolean tensor marking
        # terminal transitions; TODO confirm against TransitionDataset).
        target_q_values_for_actions = reward + gamma * max_next_q_values * (~done)

        # Q-value of the action stored with each sample, as a single column.
        action_q_values = q_values.gather(1, action.unsqueeze(-1))
         
        # The target gets a trailing dimension to line up with the gathered column.
        loss = loss_fn(action_q_values, target_q_values_for_actions.unsqueeze(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
        # Deliberate: stop after the first mini-batch ...
        break
    # ... and after the first epoch.
    break

In [11]:
# Shape of the per-action Q-values gathered in the training step.
action_q_values.shape

torch.Size([32, 1])

In [15]:
# Shape of the one-step targets built in the training step. The tensor is called
# `target_q_values_for_actions` there; `target_q_values` is only mentioned in the
# commented-out draft cell, so looking it up raised NameError.
target_q_values_for_actions.shape

In [14]:
# Shape of the raw network output for the batch (presumably batch_size rows and
# one column per action).
q_values.shape

torch.Size([32, 4])

In [13]:
# Loss tensor of the single optimisation step run by the training cell.
loss

tensor(0.7741, grad_fn=<MseLossBackward0>)

In [None]:
# Same gather as `action_q_values` in the training cell, recomputed to check its shape.
q_values.gather(1, action.unsqueeze(-1)).shape

torch.Size([32, 1])

In [None]:
# Network outputs for the current batch.
# NOTE(review): this displays the whole tensor; a slice such as q_values[:5]
# would keep the recorded output short.
q_values

tensor([[-1.2938e+00,  1.4427e-01, -3.4196e-01, -4.9531e-01],
        [-5.5965e-01,  1.5732e-01, -1.1921e-01, -1.1361e-01],
        [-9.2899e-01,  1.1963e-01, -6.7880e-02, -8.0473e-01],
        [-1.0815e+00,  1.6027e-01, -2.7088e-01, -4.0533e-01],
        [-9.2247e-01,  1.2536e-01, -1.3494e-01, -6.8017e-01],
        [-1.7271e+00,  1.5155e-01, -5.4521e-01, -1.4430e-01],
        [-1.3387e+00,  2.6558e-01, -4.1004e-01, -3.6590e-02],
        [-1.1518e+00,  2.8918e-01, -3.6429e-01,  4.8384e-03],
        [-1.6530e+00,  3.3030e-01, -5.4903e-01, -2.6282e-02],
        [-1.3390e+00,  1.5814e-01, -3.4963e-01, -6.2659e-01],
        [-1.1150e+00,  1.6729e-01, -2.7006e-01, -5.3678e-01],
        [-8.8873e-01,  1.4126e-01, -2.3497e-01, -1.3536e-01],
        [-6.0195e-01,  1.5063e-01, -9.8310e-04, -4.7670e-01],
        [-5.3520e-01,  1.4904e-01,  7.8122e-03, -3.8799e-01],
        [-1.4364e+00,  1.0457e-01, -3.8646e-01, -4.6702e-01],
        [-5.2487e-01,  1.6767e-01, -2.9157e-02, -2.6788e-01],
        

In [None]:
# Masked bootstrap term, written exactly as the training step computes it.
# There both operands are used without an extra dimension; unsqueezing only
# `done` (as this cell did before) would broadcast the product to a
# batch x batch matrix. This assumes `done` is 1-D like `max_next_q_values`;
# TODO confirm.
# Requires the training cell to have run: `max_next_q_values` is defined there.
max_next_q_values * (~done)

In [None]:
# Q-value of the stored action for every sample in the batch (same expression as
# `action_q_values` in the training cell).
q_values.gather(1, action.unsqueeze(-1))

tensor([[ 1.3246],
        [-0.7336],
        [ 0.6904],
        [ 0.0315],
        [ 1.4223],
        [ 0.0071],
        [-0.0206],
        [-1.7084],
        [ 0.1359],
        [ 1.1714],
        [ 0.1897],
        [ 0.0509],
        [ 1.7091],
        [ 0.0664],
        [-1.0759],
        [ 0.8194],
        [-1.2782],
        [-0.4596],
        [ 0.6318],
        [-0.0032],
        [-1.5888],
        [ 0.8822],
        [-0.1998],
        [-1.2811],
        [-1.3465],
        [ 0.9563],
        [ 0.4550],
        [ 0.8518],
        [-0.6650],
        [ 1.4859],
        [-1.2025],
        [ 0.2654]], grad_fn=<GatherBackward0>)

In [None]:
# Action indices of the current batch; the training step uses them as the gather index.
action

tensor([0, 1, 3, 2, 3, 2, 2, 1, 2, 0, 2, 2, 0, 2, 1, 3, 1, 1, 2, 2, 1, 3, 1, 1,
        1, 2, 2, 0, 1, 0, 1, 3])

In [None]:
# Fresh forward pass over the current batch of states; only the output shape is shown.
net(state).shape

torch.Size([32, 4])