In [1]:
import json
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple, Optional
import math
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cpu


## Tokenizer and Data Loader


In [2]:
class ARCTokenizer:
    """Tokenizer for ARC challenges with special tokens for structure"""
    
    def __init__(self):
        # Value tokens (0-9)
        self.value_tokens = list(range(10))
        
        # Special tokens
        self.PAD_TOKEN = 10
        self.SOS_TOKEN = 11  # Start of sequence
        self.EOS_TOKEN = 12  # End of sequence
        self.TRAIN_TOKEN = 13  # Start of training example
        self.TEST_TOKEN = 14  # Start of test example
        self.INPUT_TOKEN = 15  # Start of input grid
        self.OUTPUT_TOKEN = 16  # Start of output grid
        self.NEWLINE_TOKEN = 17  # Grid separator (], [)
        
        self.vocab_size = 18
        
        # Token mappings
        self.token_to_id = {
            'PAD': self.PAD_TOKEN,
            'SOS': self.SOS_TOKEN,
            'EOS': self.EOS_TOKEN,
            'TRAIN': self.TRAIN_TOKEN,
            'TEST': self.TEST_TOKEN,
            'INPUT': self.INPUT_TOKEN,
            'OUTPUT': self.OUTPUT_TOKEN,
            'NEWLINE': self.NEWLINE_TOKEN
        }
    
    def grid_to_tokens(self, grid: List[List[int]]) -> List[int]:
        """Convert 2D grid to token sequence"""
        if not grid or not grid[0]:
            return []
        
        tokens = []
        for i, row in enumerate(grid):
            for j, value in enumerate(row):
                tokens.append(value)  # Just the value, position will be encoded separately
            if i < len(grid) - 1:  # Add newline between rows (except last)
                tokens.append(self.NEWLINE_TOKEN)
        
        return tokens
    
    def tokens_to_grid(self, tokens: List[int], target_shape: Tuple[int, int]) -> List[List[int]]:
        """Convert token sequence back to 2D grid"""
        h, w = target_shape
        grid = [[0 for _ in range(w)] for _ in range(h)]
        
        # Filter out special tokens and newlines
        values = [t for t in tokens if t < 10]  # Only keep value tokens (0-9)
        
        idx = 0
        for i in range(h):
            for j in range(w):
                if idx < len(values):
                    grid[i][j] = values[idx]
                    idx += 1
        
        return grid
    
    def create_input_sequence(self, train_examples: List[Dict], test_input: List[List[int]]) -> List[int]:
        """Create input sequence from training examples and test input"""
        sequence = [self.SOS_TOKEN]
        
        # Add training examples (exactly 2)
        for i, example in enumerate(train_examples[:2]):
            sequence.append(self.TRAIN_TOKEN)
            
            # Add input
            sequence.append(self.INPUT_TOKEN)
            input_tokens = self.grid_to_tokens(example['input'])
            sequence.extend(input_tokens)
            
            # Add output
            sequence.append(self.OUTPUT_TOKEN)
            output_tokens = self.grid_to_tokens(example['output'])
            sequence.extend(output_tokens)
        
        # Add test input
        sequence.append(self.TEST_TOKEN)
        sequence.append(self.INPUT_TOKEN)
        test_tokens = self.grid_to_tokens(test_input)
        sequence.extend(test_tokens)
        
        return sequence
    
    def create_target_sequence(self, target_grid: List[List[int]]) -> List[int]:
        """Create target sequence for training"""
        sequence = [self.SOS_TOKEN]
        sequence.append(self.OUTPUT_TOKEN)
        target_tokens = self.grid_to_tokens(target_grid)
        sequence.extend(target_tokens)
        sequence.append(self.EOS_TOKEN)
        return sequence
    
    def pad_sequence(self, sequence: List[int], max_length: int) -> List[int]:
        """Pad sequence to max_length"""
        if len(sequence) > max_length:
            return sequence[:max_length]
        return sequence + [self.PAD_TOKEN] * (max_length - len(sequence))

# Initialize tokenizer
tokenizer = ARCTokenizer()
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Special tokens: {tokenizer.token_to_id}")

# Test tokenizer
test_grid = [[1, 2, 3], [4, 5, 6]]
tokens = tokenizer.grid_to_tokens(test_grid)
print(f"\nTest grid: {test_grid}")
print(f"Tokens: {tokens}")
print(f"Back to grid: {tokenizer.tokens_to_grid(tokens, (2, 3))}")


Vocabulary size: 18
Special tokens: {'PAD': 10, 'SOS': 11, 'EOS': 12, 'TRAIN': 13, 'TEST': 14, 'INPUT': 15, 'OUTPUT': 16, 'NEWLINE': 17}

Test grid: [[1, 2, 3], [4, 5, 6]]
Tokens: [1, 2, 3, 17, 4, 5, 6]
Back to grid: [[1, 2, 3], [4, 5, 6]]


## Token converter (enrich with position info)

In [3]:
## Token to 3D Vector Converter

class TokenTo3DConverter:
    """Converts token sequences to 3D vectors [value, x, y] with coordinate information"""
    
    def __init__(self, tokenizer: ARCTokenizer):
        self.tokenizer = tokenizer
    
    def tokens_to_3d(self, 
                     tokens: List[int],
                     input_dims: List[Tuple[int, int]],
                     output_dims: List[Tuple[int, int]],
                     test_input_dims: Tuple[int, int],
                     test_output_dims: Optional[Tuple[int, int]] = None,
                     is_target: bool = False) -> torch.Tensor:
        """
        Convert token sequence to 3D vectors [value, x, y]
        
        Args:
            tokens: List of token IDs
            input_dims: List of (height, width) for training input grids
            output_dims: List of (height, width) for training output grids
            test_input_dims: (height, width) for test input grid
            is_target: If True, this is a target sequence (starts with OUTPUT_TOKEN)
        
        Returns:
            Tensor of shape [seq_len, 3] where each row is [value, x, y]
            Special tokens have x=-1, y=-1
        """
        result = []
        
        # Track current grid context
        current_grid_type = None  # 'train_input', 'train_output', 'test_input'
        current_grid_idx = 0
        current_row = 0
        current_col = 0
        current_grid_dims = None
        
        i = 0
        while i < len(tokens):
            token = tokens[i]
            
            # Handle special tokens that change context
            if token == self.tokenizer.SOS_TOKEN:
                result.append([token, -1, -1])
                i += 1
                continue
            elif token == self.tokenizer.EOS_TOKEN:
                result.append([token, -1, -1])
                i += 1
                continue
            elif token == self.tokenizer.PAD_TOKEN:
                result.append([token, -1, -1])
                i += 1
                continue
            elif token == self.tokenizer.TRAIN_TOKEN:
                current_grid_type = None
                current_grid_idx = 0
                result.append([token, -1, -1])
                i += 1
                continue
            elif token == self.tokenizer.TEST_TOKEN:
                current_grid_type = None
                result.append([token, -1, -1])
                i += 1
                continue
            elif token == self.tokenizer.INPUT_TOKEN:
                # Determine which input grid we're in
                if is_target:
                    # In target sequence, INPUT_TOKEN shouldn't appear
                    result.append([token, -1, -1])
                    i += 1
                    continue
                
                if current_grid_type is None:
                    # First INPUT after TRAIN - this is training input
                    if current_grid_idx < len(input_dims):
                        current_grid_dims = input_dims[current_grid_idx]
                        current_grid_type = 'train_input'
                elif current_grid_type == 'train_output':
                    # INPUT after OUTPUT in training - next training example
                    current_grid_idx += 1
                    if current_grid_idx < len(input_dims):
                        current_grid_dims = input_dims[current_grid_idx]
                        current_grid_type = 'train_input'
                elif current_grid_type is None:
                    # INPUT after TEST - this is test input
                    current_grid_dims = test_input_dims
                    current_grid_type = 'test_input'
                
                current_row = 0
                current_col = 0
                result.append([token, -1, -1])
                i += 1
                continue
            elif token == self.tokenizer.OUTPUT_TOKEN:
                # Determine which output grid we're in
                if current_grid_type == 'train_input':
                    # OUTPUT after INPUT in training
                    if current_grid_idx < len(output_dims):
                        current_grid_dims = output_dims[current_grid_idx]
                        current_grid_type = 'train_output'
                elif current_grid_type is None:
                    # OUTPUT at start (for target sequence) or after TEST
                    if is_target:
                        # For target sequence, use test_output_dims if available
                        if test_output_dims is not None:
                            current_grid_dims = test_output_dims
                        elif len(output_dims) > 0:
                            current_grid_dims = output_dims[0]  # Fallback to first output dims
                        else:
                            current_grid_dims = (1, 1)  # Default fallback
                    elif len(output_dims) > 0:
                        current_grid_dims = output_dims[0]
                    current_grid_type = 'train_output'
                
                current_row = 0
                current_col = 0
                result.append([token, -1, -1])
                i += 1
                continue
            elif token == self.tokenizer.NEWLINE_TOKEN:
                # Move to next row
                if current_grid_dims is not None:
                    current_row += 1
                    current_col = 0
                result.append([token, -1, -1])
                i += 1
                continue
            elif token < 10:  # Value token (0-9)
                # This is a grid value - add coordinates
                if current_grid_dims is not None:
                    h, w = current_grid_dims
                    # Clamp to valid ranges
                    row = min(current_row, h - 1)
                    col = min(current_col, w - 1)
                    result.append([token, col, row])  # x=col, y=row
                    
                    # Move to next column
                    current_col += 1
                else:
                    # No grid context, treat as special
                    result.append([token, -1, -1])
                i += 1
            else:
                # Unknown token, treat as special
                result.append([token, -1, -1])
                i += 1
        
        return torch.tensor(result, dtype=torch.long)


# Initialize converter
token_converter = TokenTo3DConverter(tokenizer)


## Data Loader with Augmentation


In [4]:
import json
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple, Optional
import math
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cpu


In [5]:
class ARCDataset:
    """Dataset class for ARC challenges with data augmentation"""
    
    def __init__(self, challenges_path: str, solutions_path: str = None):
        self.challenges_path = challenges_path
        self.solutions_path = solutions_path
        
        # Load challenges
        with open(challenges_path, 'r') as f:
            self.challenges = json.load(f)
        
        # Load solutions if provided
        self.solutions = None
        if solutions_path:
            with open(solutions_path, 'r') as f:
                self.solutions = json.load(f)
    
    def get_challenge_data(self, challenge_id: str) -> Dict:
        """Get data for a specific challenge"""
        challenge = self.challenges[challenge_id]
        
        # Get training examples
        train_examples = challenge.get('train', [])
        
        # Get test examples
        test_examples = challenge.get('test', [])
        
        # Get solution if available
        solution = None
        if self.solutions and challenge_id in self.solutions:
            solution = self.solutions[challenge_id][0]  # First solution
        
        return {
            'train_examples': train_examples,
            'test_examples': test_examples,
            'solution': solution,
            'challenge_id': challenge_id
        }
    
    def get_all_challenges(self) -> List[str]:
        """Get list of all challenge IDs"""
        return list(self.challenges.keys())
    
    def create_augmented_samples(self, challenge_id: str) -> List[Dict]:
        """Create augmented training samples from a challenge"""
        data = self.get_challenge_data(challenge_id)
        
        # For training data, we can create augmented samples using the original train examples
        # and the test examples (which don't have outputs, so we can't use them for training)
        samples = []
        
        # Use original training examples (these have both input and output)
        if len(data['train_examples']) >= 2:
            # Create sample with first 2 training examples
            train_examples = data['train_examples'][:2]
            test_input = data['test_examples'][0]['input'] if data['test_examples'] else []
            test_output = data['solution']
            
            samples.append({
                'train_examples': train_examples,
                'test_input': test_input,
                'test_output': test_output,
                'challenge_id': challenge_id,
                'sample_id': f"{challenge_id}_orig"
            })
            
            # If we have more training examples, create additional samples
            if len(data['train_examples']) >= 4:
                # Use examples 2 and 3 as training
                train_examples = data['train_examples'][2:4]
                samples.append({
                    'train_examples': train_examples,
                    'test_input': test_input,
                    'test_output': test_output,
                    'challenge_id': challenge_id,
                    'sample_id': f"{challenge_id}_aug_0"
                })
            
            # If we have even more, use examples 1 and 3
            if len(data['train_examples']) >= 4:
                train_examples = [data['train_examples'][1], data['train_examples'][3]]
                samples.append({
                    'train_examples': train_examples,
                    'test_input': test_input,
                    'test_output': test_output,
                    'challenge_id': challenge_id,
                    'sample_id': f"{challenge_id}_aug_1"
                })
        
        return samples

class ARCTorchDataset(Dataset):
    """PyTorch Dataset for ARC challenges"""
    
    def __init__(self, arc_dataset: ARCDataset, tokenizer: ARCTokenizer, 
                 token_converter = None):  # Optional converter
        self.arc_dataset = arc_dataset
        self.tokenizer = tokenizer
        self.token_converter = token_converter  # Optional converter for 3D vectors
        
        # Create all samples with augmentation
        self.samples = []
        for challenge_id in arc_dataset.get_all_challenges():
            samples = arc_dataset.create_augmented_samples(challenge_id)
            self.samples.extend(samples)
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        sample = self.samples[idx]
        
        # Create input sequence
        input_seq = self.tokenizer.create_input_sequence(
            sample['train_examples'], 
            sample['test_input']
        )
        
        # Create target sequence
        if sample['test_output']:
            target_seq = self.tokenizer.create_target_sequence(sample['test_output'])
        else:
            # Create dummy target for test data
            target_seq = [self.tokenizer.SOS_TOKEN, self.tokenizer.EOS_TOKEN]
        
        # Pad sequences
        input_seq = self.tokenizer.pad_sequence(input_seq, 5400)   # 5 * 30x30 + bunch of extra tokens + possible target 30x30= 6*30x30
        target_seq = self.tokenizer.pad_sequence(target_seq, 1000) # max 30x30 + punch of extra tokens
        
        # Calculate dimensions
        input_dims = []
        output_dims = []
        
        for example in sample['train_examples']:
            input_dims.append((len(example['input']), len(example['input'][0]) if example['input'] else 0))
            output_dims.append((len(example['output']), len(example['output'][0]) if example['output'] else 0))
        
        test_input_dims = (len(sample['test_input']), len(sample['test_input'][0]) if sample['test_input'] else 0)
        test_output_dims = (len(sample['test_output']), len(sample['test_output'][0]) if sample['test_output'] else 0)
        
        # Convert to 3D vectors if converter is provided
        if self.token_converter is not None:
            input_3d = self.token_converter.tokens_to_3d(
                input_seq,
                input_dims,
                output_dims,
                test_input_dims,
                test_output_dims=test_output_dims,
                is_target=False
            )
            target_3d = self.token_converter.tokens_to_3d(
                target_seq,
                input_dims,
                output_dims,
                test_input_dims,
                test_output_dims=test_output_dims,
                is_target=True
            )
            return {
                'input': input_3d,  # Shape: [seq_len, 3] - [value, x, y]
                'target': target_3d,  # Shape: [seq_len, 3] - [value, x, y]
                'input_tokens': torch.tensor(input_seq, dtype=torch.int8),  # Keep original tokens too (int8 for memory efficiency)
                'target_tokens': torch.tensor(target_seq, dtype=torch.int8),  # Keep original tokens too (int8 for memory efficiency)
                'sample_id': sample['sample_id'],
                'challenge_id': sample['challenge_id'],
                'input_dims': input_dims,
                'output_dims': output_dims,
                'test_input_dims': test_input_dims,
                'test_output_dims': test_output_dims
            }
        else:
            # Return original token format
            return {
                'input': torch.tensor(input_seq, dtype=torch.int8),  # int8 for memory efficiency
                'target': torch.tensor(target_seq, dtype=torch.int8),  # int8 for memory efficiency
                'sample_id': sample['sample_id'],
                'challenge_id': sample['challenge_id'],
                'input_dims': input_dims,
                'output_dims': output_dims,
                'test_input_dims': test_input_dims,
                'test_output_dims': test_output_dims
            }

# Load datasets
print("Loading datasets...")
train_dataset = ARCDataset(
    challenges_path='arc-agi_training_challenges.json',
    solutions_path='arc-agi_training_solutions.json'
)

test_dataset = ARCDataset(
    challenges_path='arc-agi_test_challenges.json'
)

print(f"Training challenges: {len(train_dataset.get_all_challenges())}")
print(f"Test challenges: {len(test_dataset.get_all_challenges())}")

# Create PyTorch datasets
train_torch_dataset = ARCTorchDataset(train_dataset, tokenizer, token_converter=token_converter)
test_torch_dataset = ARCTorchDataset(test_dataset, tokenizer, token_converter=token_converter)

print(f"\nTraining samples (with augmentation): {len(train_torch_dataset)}")
print(f"Test samples: {len(test_torch_dataset)}")

# Test data loading
sample = train_torch_dataset[0]
print(f"\nSample data:")
print(f"Sample ID: {sample['sample_id']}")
print(f"Challenge ID: {sample['challenge_id']}")
print(f"Input sequence length: {len(sample['input'])}")
print(f"Target sequence length: {len(sample['target'])}")
print(f"Input dims: {sample['input_dims']}")
print(f"Output dims: {sample['output_dims']}")
print(f"Test input dims: {sample['test_input_dims']}")
print(f"Test output dims: {sample['test_output_dims']}")


Loading datasets...
Training challenges: 400
Test challenges: 100

Training samples (with augmentation): 614
Test samples: 152

Sample data:
Sample ID: 007bbfb7_orig
Challenge ID: 007bbfb7
Input sequence length: 5400
Target sequence length: 1000
Input dims: [(3, 3), (3, 3)]
Output dims: [(9, 9), (9, 9)]
Test input dims: (3, 3)
Test output dims: (9, 9)


In [6]:
length_arr =[]
for i in range(len(train_torch_dataset)):
    length_arr.append(len(train_torch_dataset[i]['target']))
max(length_arr)

1000

In [7]:
# self.PAD_TOKEN = 10
# self.SOS_TOKEN = 11  # Start of sequence
# self.EOS_TOKEN = 12  # End of sequence
# self.TRAIN_TOKEN = 13  # Start of training example
# self.TEST_TOKEN = 14  # Start of test example
# self.INPUT_TOKEN = 15  # Start of input grid
# self.OUTPUT_TOKEN = 16  # Start of output grid
# self.NEWLINE_TOKEN = 17  # Grid separator (], [)
train_torch_dataset[0]['input']#[0:300]

tensor([[11, -1, -1],
        [13, -1, -1],
        [15, -1, -1],
        ...,
        [10, -1, -1],
        [10, -1, -1],
        [10, -1, -1]])

In [8]:
train_torch_dataset[0]['target'][0:300]

tensor([[11, -1, -1],
        [16, -1, -1],
        [ 7,  0,  0],
        [ 0,  1,  0],
        [ 7,  2,  0],
        [ 0,  3,  0],
        [ 0,  4,  0],
        [ 0,  5,  0],
        [ 7,  6,  0],
        [ 0,  7,  0],
        [ 7,  8,  0],
        [17, -1, -1],
        [ 7,  0,  1],
        [ 0,  1,  1],
        [ 7,  2,  1],
        [ 0,  3,  1],
        [ 0,  4,  1],
        [ 0,  5,  1],
        [ 7,  6,  1],
        [ 0,  7,  1],
        [ 7,  8,  1],
        [17, -1, -1],
        [ 7,  0,  2],
        [ 7,  1,  2],
        [ 0,  2,  2],
        [ 0,  3,  2],
        [ 0,  4,  2],
        [ 0,  5,  2],
        [ 7,  6,  2],
        [ 7,  7,  2],
        [ 0,  8,  2],
        [17, -1, -1],
        [ 7,  0,  3],
        [ 0,  1,  3],
        [ 7,  2,  3],
        [ 0,  3,  3],
        [ 0,  4,  3],
        [ 0,  5,  3],
        [ 7,  6,  3],
        [ 0,  7,  3],
        [ 7,  8,  3],
        [17, -1, -1],
        [ 7,  0,  4],
        [ 0,  1,  4],
        [ 7,  2,  4],
        [ 

In [9]:
len(train_torch_dataset), len(test_torch_dataset)

(614, 152)

## Autoregressive Dataset for LLM Training

For autoregressive training, we need to:
1. Concatenate input + target into one sequence
2. Create labels shifted by 1 position (next token prediction)
3. Use causal masking so model can't see future tokens

**Important**: This is NOT data leakage! During training, the model learns to predict the next token given previous tokens. During inference, we'll use the same autoregressive generation process.


In [None]:
# Refactored ARCExplodedDataset - works directly with 3D vectors
class ARCExplodedDataset(Dataset):
    """
    Explodes ARCTorchDataset into trainable samples.
    
    Takes each sample from ARCTorchDataset and creates multiple training samples:
    - Sample 0: input → predict target[0]
    - Sample 1: input + target[0] → predict target[1]
    - Sample 2: input + target[0:2] → predict target[2]
    - etc.
    
    Expects both input and target to be in 3D vector format [value, x, y].
    When adding target tokens:
    1. Loop through input sequence and replace first PAD token with target token
    2. If no PAD token found, append to end and remove first token
    """
    
    def __init__(self, torch_dataset: ARCTorchDataset, tokenizer: ARCTokenizer, sequence_length: int = 5400):
        self.torch_dataset = torch_dataset
        self.tokenizer = tokenizer
        self.sequence_length = sequence_length
        
        # Create all exploded samples
        self.exploded_samples = []
        
        print(f"Exploding {len(torch_dataset)} base samples...")
        for base_idx in tqdm(range(len(torch_dataset))):
            base_sample = torch_dataset[base_idx]
            
            # Get input and target as 3D tensors [seq_len, 3]
            input_3d = base_sample['input']  # Shape: [max_length, 3]
            target_3d = base_sample['target']  # Shape: [max_length, 3]
            
            # Find actual length of input (before padding)
            # PAD token has value = PAD_TOKEN (10), x = -1, y = -1
            input_actual_len = 0
            for i in range(input_3d.shape[0]):
                if input_3d[i, 0].item() == self.tokenizer.PAD_TOKEN:
                    break
            input_actual_len = i-1
            
            target_actual_len = 0
            for i in range(target_3d.shape[0]):
                if target_3d[i, 0].item() == self.tokenizer.PAD_TOKEN:
                    break
            target_actual_len = i-1
            
            print(input_actual_len, target_actual_len)
            
            target_vectors = target_3d[0:target_actual_len]
            
            
            # Optimized version - remove unnecessary cloning and use input_actual_len directly
            # Replace the target_vectors collection and loop in cell 9 with this:

            # In the target_vectors collection (around line 43-50):
            # Change: target_vectors.append(target_3d[i].clone())
            # To:     target_vectors.append(target_3d[i])  # No clone needed

            # In the loop (around line 58-67):
            # Replace the entire loop with this optimized version:

            # Start with full input sequence (we'll modify it in place)
            current_seq = input_3d.clone()
            for i, target_vector in enumerate(target_vectors):
                # Calculate position where we should place this target token
                # Start from input_actual_len and add i (position in target sequence)
                target_pos = input_actual_len + i
                
                if i>0:
                    # first target vector is not added to the input sequence
                    if target_pos < self.sequence_length:
                        # Check if position has a PAD token
                        if current_seq[target_pos, 0].item() == self.tokenizer.PAD_TOKEN:
                            # Replace PAD token with target vector
                            current_seq[target_pos] = target_vectors[i-1]
                        else:
                            #print("Sequence is full - append and remove from beginning", target_pos, current_seq.shape)
                            # Sequence is full - append and remove from beginning
                            current_seq = torch.cat([current_seq[1:], target_vectors[i-1].unsqueeze(0)], dim=0)
                    else:
                        current_seq = torch.cat([current_seq[1:], target_vectors[i-1].unsqueeze(0)], dim=0)

                # Store exploded sample
                exploded_sample = {
                    'input_3d': current_seq.clone(),
                    'target_vector': target_vector.clone(),  # Clone here since we store it separately
                    'target_position': i,
                    'base_sample_idx': base_idx,
                    'base_sample_id': base_sample.get('sample_id', f'sample_{base_idx}'),
                    'challenge_id': base_sample.get('challenge_id', ''),
                    'input_dims': base_sample.get('input_dims', []),
                    'output_dims': base_sample.get('output_dims', []),
                    'test_input_dims': base_sample.get('test_input_dims', (0, 0)),
                    'test_output_dims': base_sample.get('test_output_dims', (0, 0)),
                }

                self.exploded_samples.append(exploded_sample)
        
        print(f"Created {len(self.exploded_samples)} exploded samples from {len(torch_dataset)} base samples")
    
    def __len__(self):
        return len(self.exploded_samples)
    
    def __getitem__(self, idx):
        sample = self.exploded_samples[idx]
        
        input_3d = sample['input_3d']  # Shape: [max_length, 3]
        target_vector = sample['target_vector']  # Shape: [3]
        
        # Create attention mask (1 for non-padding, 0 for padding)
        attention_mask = (input_3d[:, 0] != self.tokenizer.PAD_TOKEN).long()
        
        return {
            'input_3d': input_3d,  # [max_length, 3] - full 3D vectors
            'target_vector': target_vector,  # [3] - target as 3D vector
            'target_value': target_vector[0].item(),  # Just the value token for convenience
            'attention_mask': attention_mask,  # [max_length]
            'target_position': sample['target_position'],
            'base_sample_idx': sample['base_sample_idx'],
            'base_sample_id': sample['base_sample_id'],
            'challenge_id': sample['challenge_id'],
            'input_dims': sample['input_dims'],
            'output_dims': sample['output_dims'],
            'test_input_dims': sample['test_input_dims'],
            'test_output_dims': sample['test_output_dims'],
        }

# Create exploded datasets from existing ARCTorchDataset
print("Creating exploded training dataset...")
train_exploded_dataset = ARCExplodedDataset(train_torch_dataset, tokenizer)

Creating exploded training dataset...


AttributeError: 'ARCTorchDataset' object has no attribute 'max_length'

In [None]:
for i in range(10):
    print(train_exploded_dataset[i]['input_3d'][218:228], train_exploded_dataset[i]['target_vector'])