In [1]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import random
import torch
import pandas as pd
import sys
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

def find_root(current_path, marker="fogas_torch"):
    """Return the closest directory at or above ``current_path`` containing ``marker``.

    The search starts at the resolved ``current_path`` itself and then walks
    up through its parents. If no directory on that chain contains an entry
    named ``marker``, the resolved ``current_path`` is returned unchanged.
    """
    start = Path(current_path).resolve()
    lineage = (start, *start.parents)
    return next((folder for folder in lineage if (folder / marker).exists()), start)

# Resolve the project root from the working directory, then derive the
# dataset CSV location used by the collection / FQI cells below.
PROJECT_ROOT = find_root(Path.cwd())
DATASET_PATH = PROJECT_ROOT.joinpath("datasets", "test_fqi.csv")

print(f"Project root found at: {PROJECT_ROOT}")
print(f"Loading dataset from: {DATASET_PATH}")

from fogas_torch import PolicySolver, EnvDataCollector
from fogas_torch.algorithm import (
    FOGASSolverVectorized,
    FOGASOracleSolverVectorized,
    FOGASHyperOptimizer,
    FOGASEvaluator,
)
from fogas.dataset_collection.dataset_analyzer import DatasetAnalyzer
from fogas_torch.fqi.fqi_solver import FQISolver
from fogas_torch.fqi.fqi_evaluator import FQIEvaluator

seed = 42

# Seed Python, NumPy and PyTorch so that repeated runs draw the same numbers.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Project root found at: /shared/home/mauro.diaz/work/FOGAS
Loading dataset from: /shared/home/mauro.diaz/work/FOGAS/datasets/test_fqi.csv
Using device: cuda


## 3×3 grid world (goal at state 8)

In [2]:
# 3x3 grid: nine cells with four moves available in each.
states = torch.arange(9)
actions = torch.arange(4)
N, A = len(states), len(actions)  # number of states, number of actions
gamma = 0.9

x_0 = 0        # fixed initial state
goal_grid = 8  # absorbing terminal state

def phi(x, a):
    """Return the one-hot feature vector of the state-action pair ``(x, a)``.

    ``x`` and ``a`` may be plain integers or 0-d tensors. The result is a
    float64 vector of length ``N * A`` that is zero everywhere except for a
    1.0 at position ``x * A + a``.
    """
    x = x.item() if isinstance(x, torch.Tensor) else x
    a = a.item() if isinstance(a, torch.Tensor) else a

    features = torch.zeros(N * A, dtype=torch.float64)
    features[x * A + a] = 1.0
    return features

# One entry per (state, action) pair, laid out like phi (index x * A + a).
# Every action slot of the goal state is set to 1.0; all other entries stay 0.
# The slice is written in terms of goal_grid and A so it follows those
# constants instead of repeating their literal values.
omega = torch.zeros(N * A, dtype=torch.float64)
omega[goal_grid * A : (goal_grid + 1) * A] = 1.0

# Helpers converting between a flat state index and (row, col)
def to_rc(s):
    """Split a flat state index (int or 0-d tensor) into ``(row, col)`` on a 3-column grid."""
    index = s.item() if isinstance(s, torch.Tensor) else s
    row, col = divmod(index, 3)
    return row, col

def to_s(r, c):
    """Flatten a ``(row, col)`` pair into a state index on a 3-column grid."""
    return 3 * r + c

def next_state(s, a):
    """Return the state reached by taking action ``a`` in state ``s``.

    Actions: 0 = Up, 1 = Down, 2 = Left, 3 = Right. A move that would leave
    the 3x3 grid keeps the agent on the border cell, the goal state is
    absorbing, and any other action value leaves the position unchanged.
    ``s`` and ``a`` may be plain integers or 0-d tensors.
    """
    s = s.item() if isinstance(s, torch.Tensor) else s
    a = a.item() if isinstance(a, torch.Tensor) else a

    # The goal is absorbing: every action stays there.
    if s == goal_grid:
        return goal_grid

    row, col = to_rc(s)

    if a in (0, 1):    # vertical move: Up / Down
        row = max(0, row - 1) if a == 0 else min(2, row + 1)
    elif a in (2, 3):  # horizontal move: Left / Right
        col = max(0, col - 1) if a == 2 else min(2, col + 1)

    return to_s(row, col)

def psi(xp):
    """Return the indicator vector of state-action pairs that lead to ``xp``.

    The result is a float64 vector of length ``N * A`` with a 1.0 at index
    ``x * A + a`` for every pair whose ``next_state(x, a)`` equals ``xp``,
    and 0.0 elsewhere. ``xp`` may be a plain integer or a 0-d tensor.
    """
    target = xp.item() if isinstance(xp, torch.Tensor) else xp

    predecessors = torch.zeros(N * A, dtype=torch.float64)
    for x in states:
        for a in actions:
            if next_state(x, a) != target:
                continue
            # x and a are 0-d tensors here, hence .item() before index arithmetic
            predecessors[x.item() * A + a.item()] = 1.0
    return predecessors

# Build the solver for the plain 3x3 grid from the pieces defined above
mdp = PolicySolver(
    states=states,
    actions=actions,
    phi=phi,
    omega=omega,
    gamma=gamma,
    x0=x_0,
    psi=psi,
)

In [3]:
# Fit FQI on the 3x3 grid dataset and report the resulting policy.
# The CSV location is derived from PROJECT_ROOT rather than an absolute
# home-directory path, so the cell is not tied to a single machine.
# datasets/grid3_problem.csv must already exist; it is not written by any
# cell visible in this notebook.
solver_fqi = FQISolver(
    mdp=mdp,
    csv_path=str(PROJECT_ROOT / "datasets" / "grid3_problem.csv"),
    device=device,
    seed=seed,
    ridge=1e-6,
)
evaluator_fqi = FQIEvaluator(solver_fqi)
pi_fqi = solver_fqi.run(
    K=1000,
    tau=0.1,
    verbose=True,
)

print("\nFQI Policy:")
evaluator_fqi.print_policy()
evaluator_fqi.compare_final_rewards()
evaluator_fqi.print_optimal_path(max_steps=30)

FQI: 100%|████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3503.25it/s, theta_norm=47.6510]



FQI Policy:

  State 0: π(a=0|s=0) = 0.00  π(a=1|s=0) = 0.00  π(a=2|s=0) = 0.00  π(a=3|s=0) = 1.00  --> best action: 3
  State 1: π(a=0|s=1) = 0.00  π(a=1|s=1) = 1.00  π(a=2|s=1) = 0.00  π(a=3|s=1) = 0.00  --> best action: 1
  State 2: π(a=0|s=2) = 0.00  π(a=1|s=2) = 1.00  π(a=2|s=2) = 0.00  π(a=3|s=2) = 0.00  --> best action: 1
  State 3: π(a=0|s=3) = 0.00  π(a=1|s=3) = 0.00  π(a=2|s=3) = 0.00  π(a=3|s=3) = 1.00  --> best action: 3
  State 4: π(a=0|s=4) = 0.00  π(a=1|s=4) = 0.00  π(a=2|s=4) = 0.00  π(a=3|s=4) = 1.00  --> best action: 3
  State 5: π(a=0|s=5) = 0.00  π(a=1|s=5) = 1.00  π(a=2|s=5) = 0.00  π(a=3|s=5) = 0.00  --> best action: 1
  State 6: π(a=0|s=6) = 0.00  π(a=1|s=6) = 0.00  π(a=2|s=6) = 0.00  π(a=3|s=6) = 1.00  --> best action: 3
  State 7: π(a=0|s=7) = 0.00  π(a=1|s=7) = 0.00  π(a=2|s=7) = 0.00  π(a=3|s=7) = 1.00  --> best action: 3
  State 8: π(a=0|s=8) = 1.00  π(a=1|s=8) = 0.00  π(a=2|s=8) = 0.00  π(a=3|s=8) = 0.00  --> best action: 0




J*(π*)   = 0.656100
J(π_FQI)

## 3×3 grid world with a wall (state 4) and a pit (state 5)

In [13]:
# 3x3 grid: nine cells with four moves available in each.
states = torch.arange(9)
actions = torch.arange(4)
N, A = len(states), len(actions)  # number of states, number of actions
gamma = 0.9

x_0 = 0   # fixed initial state

# Two absorbing terminal states
goal = 8
pit = 5

def phi(x, a):
    """Return the one-hot feature vector of the state-action pair ``(x, a)``.

    ``x`` and ``a`` may be plain integers or 0-d tensors. The result is a
    float64 vector of length ``N * A`` that is zero everywhere except for a
    1.0 at position ``x * A + a``.
    """
    x = x.item() if isinstance(x, torch.Tensor) else x
    a = a.item() if isinstance(a, torch.Tensor) else a

    features = torch.zeros(N * A, dtype=torch.float64)
    features[x * A + a] = 1.0
    return features

# Values written into omega: a default for every entry plus one per terminal.
step_cost, goal_reward, pit_reward = -0.1, 1.0, -5.0

# One entry per (state, action) pair at index x * A + a, pre-filled with the step cost.
omega = torch.full((N * A,), step_cost, dtype=torch.float64)

# Terminals: all A action slots of the state replace the step cost.
omega[goal * A : (goal + 1) * A] = goal_reward
omega[pit * A : (pit + 1) * A] = pit_reward

# Helpers converting between a flat state index and (row, col)
def to_rc(s):
    """Split a flat state index (int or 0-d tensor) into ``(row, col)`` on a 3-column grid."""
    index = s.item() if isinstance(s, torch.Tensor) else s
    row, col = divmod(index, 3)
    return row, col

def to_s(r, c):
    """Flatten a ``(row, col)`` pair into a state index on a 3-column grid."""
    return 3 * r + c

wall = 4  # blocked cell: a move that would enter it leaves the agent in place

def next_state(s, a):
    """Return the state reached by taking action ``a`` in state ``s``.

    Actions: 0 = Up, 1 = Down, 2 = Left, 3 = Right. A move that would leave
    the 3x3 grid keeps the agent on the border cell, and a move into the
    ``wall`` cell is cancelled (the agent stays in ``s``). ``goal`` and
    ``pit`` are absorbing: every action returns ``s``.

    ``s`` and ``a`` may be plain integers or 0-d tensors.

    Raises:
        ValueError: if ``s`` is not terminal and ``a`` is not one of 0, 1, 2, 3.
            Without this check the function would fail later with an
            UnboundLocalError because no target cell had been computed.
    """
    if isinstance(s, torch.Tensor): s = s.item()
    if isinstance(a, torch.Tensor): a = a.item()

    # absorbing terminals
    if s == goal or s == pit:
        return s

    r, c = to_rc(s)

    if a == 0:      # Up
        r2, c2 = max(0, r-1), c
    elif a == 1:    # Down
        r2, c2 = min(2, r+1), c
    elif a == 2:    # Left
        r2, c2 = r, max(0, c-1)
    elif a == 3:    # Right
        r2, c2 = r, min(2, c+1)
    else:
        raise ValueError(
            f"Unknown action {a!r}; expected 0 (Up), 1 (Down), 2 (Left) or 3 (Right)"
        )

    sp = to_s(r2, c2)

    # wall blocks transition
    if sp == wall:
        return s

    return sp

def psi(xp):
    """Return the indicator vector of state-action pairs that lead to ``xp``.

    The result is a float64 vector of length ``N * A`` with a 1.0 at index
    ``x * A + a`` for every pair whose ``next_state(x, a)`` equals ``xp``,
    and 0.0 elsewhere. ``xp`` may be a plain integer or a 0-d tensor.
    """
    target = xp.item() if isinstance(xp, torch.Tensor) else xp

    predecessors = torch.zeros(N * A, dtype=torch.float64)
    for x in states:
        for a in actions:
            if next_state(x, a) != target:
                continue
            # x and a are 0-d tensors here, hence .item() before index arithmetic
            predecessors[x.item() * A + a.item()] = 1.0
    return predecessors

# Build the solver for the wall grid; goal and pit are passed as terminal states
terminal_states = [goal, pit]
mdp = PolicySolver(
    states=states,
    actions=actions,
    phi=phi,
    omega=omega,
    gamma=gamma,
    x0=x_0,
    psi=psi,
    terminal_states=terminal_states,
)
del terminal_states  # keep the notebook namespace as it was

In [8]:
# Collect 3000 transitions from the wall-grid MDP and write them to DATASET_PATH.
# The call is left as the last expression so its return value is displayed.
collector = EnvDataCollector(
    mdp=mdp,
    env_name="3grid_wall",
    max_steps=50,
)
collector.collect_dataset(
    n_steps=3000,
    save_path=str(DATASET_PATH),
    verbose=True,
)

✅ Dataset saved to: /shared/home/mauro.diaz/work/FOGAS/datasets/test_fqi.csv
   Total transitions: 3000


Unnamed: 0,episode,step,state,action,reward,next_state
0,0,0,0,0,-0.1,0
1,0,1,0,3,-0.1,1
2,0,2,1,2,-0.1,0
3,0,3,0,1,-0.1,3
4,0,4,3,1,-0.1,6
...,...,...,...,...,...,...
2995,125,5,2,2,-0.1,1
2996,125,6,1,1,-0.1,1
2997,125,7,1,1,-0.1,1
2998,125,8,1,1,-0.1,1


In [18]:
# Fit FQI on the CSV currently stored at DATASET_PATH and report the policy.
solver_fqi = FQISolver(mdp=mdp, csv_path=str(DATASET_PATH), device=device, seed=seed, ridge=1e-6)
evaluator_fqi = FQIEvaluator(solver_fqi)
pi_fqi = solver_fqi.run(K=1000, tau=0.3, verbose=True)

print("\nFQI Policy:")
evaluator_fqi.print_policy()
evaluator_fqi.compare_final_rewards()
evaluator_fqi.print_optimal_path(max_steps=30)

FQI: 100%|█████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 4107.45it/s, theta_norm=1.3305]



FQI Policy:

  State 0: π(a=0|s=0) = 0.00  π(a=1|s=0) = 0.00  π(a=2|s=0) = 0.00  π(a=3|s=0) = 1.00  --> best action: 3
  State 1: π(a=0|s=1) = 0.00  π(a=1|s=1) = 0.00  π(a=2|s=1) = 0.00  π(a=3|s=1) = 1.00  --> best action: 3
  State 2: π(a=0|s=2) = 0.00  π(a=1|s=2) = 1.00  π(a=2|s=2) = 0.00  π(a=3|s=2) = 0.00  --> best action: 1
  State 3: π(a=0|s=3) = 0.00  π(a=1|s=3) = 1.00  π(a=2|s=3) = 0.00  π(a=3|s=3) = 0.00  --> best action: 1
  State 4: π(a=0|s=4) = 1.00  π(a=1|s=4) = 0.00  π(a=2|s=4) = 0.00  π(a=3|s=4) = 0.00  --> best action: 0
  State 5: π(a=0|s=5) = 1.00  π(a=1|s=5) = 0.00  π(a=2|s=5) = 0.00  π(a=3|s=5) = 0.00  --> best action: 0
  State 6: π(a=0|s=6) = 0.00  π(a=1|s=6) = 0.00  π(a=2|s=6) = 0.00  π(a=3|s=6) = 1.00  --> best action: 3
  State 7: π(a=0|s=7) = 0.00  π(a=1|s=7) = 0.00  π(a=2|s=7) = 0.00  π(a=3|s=7) = 1.00  --> best action: 3
  State 8: π(a=0|s=8) = 1.00  π(a=1|s=8) = 0.00  π(a=2|s=8) = 0.00  π(a=3|s=8) = 0.00  --> best action: 0




J*(π*)   = 0.621710
J(π_FQI)

In [27]:
# Re-collect the dataset with the terminal-aware collector.
# NOTE: this writes to the same DATASET_PATH as the earlier collection cell,
# so the CSV produced there is replaced.
terminal_aware_options = dict(policy="random", n_steps=1000, extra_steps=5)
df = collector.collect_dataset_terminal_aware(
    save_path=str(DATASET_PATH),
    **terminal_aware_options,
)
del terminal_aware_options  # keep the notebook namespace as it was

✅ Terminal-aware dataset saved to: /shared/home/mauro.diaz/work/FOGAS/datasets/test_fqi.csv
   Total transitions: 1000


In [28]:
# Fit FQI on the CSV currently stored at DATASET_PATH (tau = 0.0, 2000 iterations)
# and report the policy.
solver_fqi = FQISolver(mdp=mdp, csv_path=str(DATASET_PATH), device=device, seed=seed, ridge=1e-6)
evaluator_fqi = FQIEvaluator(solver_fqi)
pi_fqi = solver_fqi.run(K=2000, tau=0.0, verbose=True)

print("\nFQI Policy:")
evaluator_fqi.print_policy()
evaluator_fqi.compare_final_rewards()
evaluator_fqi.print_optimal_path(max_steps=30)

FQI: 100%|█████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 3970.47it/s, theta_norm=1.3305]



FQI Policy:

  State 0: π(a=0|s=0) = 0.00  π(a=1|s=0) = 0.00  π(a=2|s=0) = 0.00  π(a=3|s=0) = 1.00  --> best action: 3
  State 1: π(a=0|s=1) = 0.00  π(a=1|s=1) = 0.00  π(a=2|s=1) = 0.00  π(a=3|s=1) = 1.00  --> best action: 3
  State 2: π(a=0|s=2) = 0.00  π(a=1|s=2) = 1.00  π(a=2|s=2) = 0.00  π(a=3|s=2) = 0.00  --> best action: 1
  State 3: π(a=0|s=3) = 0.00  π(a=1|s=3) = 1.00  π(a=2|s=3) = 0.00  π(a=3|s=3) = 0.00  --> best action: 1
  State 4: π(a=0|s=4) = 1.00  π(a=1|s=4) = 0.00  π(a=2|s=4) = 0.00  π(a=3|s=4) = 0.00  --> best action: 0
  State 5: π(a=0|s=5) = 1.00  π(a=1|s=5) = 0.00  π(a=2|s=5) = 0.00  π(a=3|s=5) = 0.00  --> best action: 0
  State 6: π(a=0|s=6) = 0.00  π(a=1|s=6) = 0.00  π(a=2|s=6) = 0.00  π(a=3|s=6) = 1.00  --> best action: 3
  State 7: π(a=0|s=7) = 0.00  π(a=1|s=7) = 0.00  π(a=2|s=7) = 0.00  π(a=3|s=7) = 1.00  --> best action: 3
  State 8: π(a=0|s=8) = 1.00  π(a=1|s=8) = 0.00  π(a=2|s=8) = 0.00  π(a=3|s=8) = 0.00  --> best action: 0




J*(π*)   = 0.621710
J(π_FQI)