In [43]:
import torch
import torch.nn as nn
from RubikCube.src.env import *
import torch.nn.functional as F

class ValuePolicyNetwork(nn.Module):
    def __init__(self):
        super(ValuePolicyNetwork, self).__init__()

        # Define shared layers
        self.shared_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(480, 4096),  # Input: 20x24 flattened to 480, then 4096
            nn.ELU(),
            nn.Linear(4096, 2048),  # 4096 -> 2048
            nn.ELU()
        )

        # Define value head
        self.value_head = nn.Sequential(
            nn.Linear(2048, 512),  # 2048 -> 512
            nn.ELU(),
            nn.Linear(512, 1)  # 512 -> 1 (scalar value)
        )

        # Define policy head
        self.policy_head = nn.Sequential(
            nn.Linear(2048, 512),  # 2048 -> 512
            nn.ELU(),
            nn.Linear(512, 12)  # 512 -> 12 (policy logits)
        )

    def forward(self, x):
        shared_out = self.shared_layers(x.unsqueeze(0))
        value_out = self.value_head(shared_out)
        policy_out = self.policy_head(shared_out)
        return value_out, policy_out


In [44]:
import pickle
import torch
from copy import deepcopy, copy
import random
import json

# Assuming Cube class and move definitions are already provided

def generate_samples(k: int, l: int):
    samples = []
    for _ in range(l):
        cube = Cube()
        actions = []
        # Make random moves up to depth k
        for _ in range(k):
            move_index = random.randint(0, 11)
            cube.move(move_index)
            actions.append(move_index)
            state = (deepcopy(cube.get_state()), deepcopy(actions))
            samples.append(state)

    # Transform samples to dictionary format
    samples_dict = []
    for state, actions in samples:
        sample_dict = {
            "state": [state[0].tolist(), state[1].tolist()],
            "actions": actions
        }
        samples_dict.append(sample_dict)

    return samples_dict


def get_all_childs(cube: Cube):
    samples = []
    for i in range(len(idx2move)):
        current_cube = deepcopy(cube)
        actions = []
        move_index = random.randint(0, 11)
        current_cube.move(move_index)
        actions.append(move_index)
        state = (deepcopy(current_cube.get_state()), deepcopy(actions))
        samples.append(state)

    # Transform samples to dictionary format
    samples_dict = []
    for state, actions in samples:
        sample_dict = {
            "state": [state[0].tolist(), state[1].tolist()],
            "actions": actions
        }
        samples_dict.append(sample_dict)

    return samples_dict



In [45]:
corners, edges = torch.tensor(obj[0]['state'][0]), torch.tensor(obj[0]['state'][1])
cube_representation = torch.concat([corners, edges], dim=0)
cube_representation_encoded = F.one_hot(cube_representation, num_classes=24)

In [46]:
cube = Cube(corners, edges)
get_all_childs(cube)


[{'state': [[5, 7, 19, 17, 6, 4, 16, 18],
   [6, 1, 2, 18, 11, 8, 23, 20, 5, 13, 14, 17]],
  'actions': [3]},
 {'state': [[4, 5, 2, 11, 12, 22, 16, 18],
   [4, 1, 10, 3, 9, 14, 23, 20, 12, 13, 21, 17]],
  'actions': [7]},
 {'state': [[0, 1, 2, 3, 12, 13, 14, 15],
   [0, 1, 2, 3, 9, 10, 21, 22, 12, 13, 14, 15]],
  'actions': [8]},
 {'state': [[5, 7, 10, 3, 23, 13, 17, 16],
   [6, 9, 2, 3, 13, 10, 23, 20, 12, 22, 14, 16]],
  'actions': [4]},
 {'state': [[5, 11, 2, 3, 14, 12, 16, 22],
   [6, 1, 2, 3, 9, 10, 7, 20, 13, 15, 12, 17]],
  'actions': [11]},
 {'state': [[7, 6, 2, 20, 12, 9, 16, 18],
   [7, 1, 21, 3, 9, 2, 23, 20, 12, 13, 10, 17]],
  'actions': [6]},
 {'state': [[9, 7, 0, 2, 12, 13, 20, 18],
   [6, 0, 3, 1, 9, 10, 23, 4, 12, 13, 14, 17]],
  'actions': [0]},
 {'state': [[4, 5, 2, 11, 12, 22, 16, 18],
   [4, 1, 10, 3, 9, 14, 23, 20, 12, 13, 21, 17]],
  'actions': [7]},
 {'state': [[5, 7, 19, 17, 6, 4, 16, 18],
   [6, 1, 2, 18, 11, 8, 23, 20, 5, 13, 14, 17]],
  'actions': [3]},
 {'s

In [59]:

from tqdm import tqdm  # Import tqdm for progress bar

def train_autodidactive(model, optimizer, loss_fn, device, iterations, k):
    """
    Train the model using an autodidactic iteration strategy with weighted samples and tqdm progress bar.

    Parameters:
        model: The neural network model that predicts value (v) and policy (p).
        optimizer: Optimizer for the model.
        loss_fn: Loss function for training.
        device: Torch device (e.g., 'cuda' or 'cpu').
        iterations: Number of iterations to train.
        k: Number of scrambled cubes to generate per iteration.
    """
    # Use tqdm to display a progress bar for iterations
    bar = tqdm(range(iterations), desc="Training Progress", unit="iter")
    running_loss = 0
    for iter in bar:
        # Generate scrambled cubes
        X = generate_samples(k, 1)  # List of scrambled cube states
        for dst, xi in enumerate(X):
            v_alls, r_alls, weights = [], [], []

            # Compute distance-based weight for the current sample
            distance_to_solved = dst + 1
            weight_xi = 1 / (distance_to_solved + 1e-6)  # Avoid division by zero
            weights.append(weight_xi)

            # Iterate over all possible actions
            for a in idx2move.keys():
                # Apply move `a` to the current scrambled cube
                cube_xia = Cube(torch.tensor(xi['state'][0]), torch.tensor(xi['state'][1]))
                cube_xia.move(a)

                # Get cube representation after move
                corners, edges = cube_xia.get_state()
                cube_representation = torch.concat([corners, edges], dim=0)
                cube_representation_encoded = F.one_hot(cube_representation, num_classes=24).float().to(device)

                # Predict value (v) and policy (p) using the model
                v_xia, p_xia = model(cube_representation_encoded)
                v_alls.append(v_xia)

                # Calculate reward
                
                if cube_xia.is_solved():
                    r_alls.append(1)  # Reward for solving the cube
                else:
                    r_alls.append(-1)  # Penalty for not solving the cube

            # Compute target values
            r_alls = torch.tensor(r_alls).to(device)
            v_alls = torch.stack(v_alls)

            # Optimal action is the one maximizing reward + value
            y_vi = r_alls + v_alls.squeeze()  # R + V for each action
            a_star = torch.argmax(y_vi)  # Best action

            # Targets for training
            y_vi_target = y_vi[a_star]
            y_pi_target = F.one_hot(a_star, num_classes=len(idx2move)).float()

            # Train the model with weighted loss
            optimizer.zero_grad()
            v_pred, p_pred = model(cube_representation_encoded)
            loss_v = loss_fn(v_pred, y_vi_target.unsqueeze(0))  # Value loss
            loss_p = loss_fn(p_pred, y_pi_target.unsqueeze(0))  # Policy loss
            loss = (loss_v + loss_p) * weight_xi  # Apply sample weight
            running_loss += loss.item()
            bar.set_description(f'loss: {running_loss / (iter + 1):4f}')
            loss.backward()
            optimizer.step()

    print("Training completed.")


In [92]:
from torch.optim import RMSprop

model = ValuePolicyNetwork()
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
optimizer = RMSprop(model.parameters(), lr=0.001, alpha=0.99, eps=1e-8)
loss_fn = nn.MSELoss()
iterations = 500_000
k = 1
model.to(device)
train_autodidactive(model, optimizer, loss_fn, device, iterations, k)

loss: 19.483307:   1%|          | 2605/500000 [01:57<6:14:50, 22.12iter/s] 


KeyboardInterrupt: 

In [93]:
cube = Cube()
cube.move(1)
#cube.move(11)
corners, edges = cube.get_state()
cube_representation = torch.concat([corners, edges], dim=0)
cube_representation_encoded = F.one_hot(cube_representation, num_classes=24).float().to(device)
model.eval()
v, p = model(cube_representation_encoded)
v

tensor([[2.3035]], device='cuda:0', grad_fn=<AddmmBackward0>)