# Vision Transformer Model Training

This notebook demonstrates how to train a Vision Transformer model to generate G-code from images using Hugging Face's ecosystem.

In [1]:
# Install necessary libraries
!pip install transformers datasets torch torchvision pytorch-lightning tokenizers

## Train a Custom Tokenizer

Train a custom tokenizer on the G-code dataset.

In [1]:
import os
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import Sequence, NFD, Lowercase, StripAccents

# Train a custom tokenizer
def train_tokenizer(gcode_dir, vocab_size=8000, min_frequency=2, special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>']):
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()
    
    # Get a list of all G-code files
    gcode_files = [os.path.join(gcode_dir, f) for f in os.listdir(gcode_dir) if f.endswith('.txt')]
    
    # Train the tokenizer
    tokenizer.train(files=gcode_files, vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=special_tokens)
    
    return tokenizer

# Directory containing G-code files
gcode_dir = "dataset/gcode"  # Replace with the path to your G-code directory

# Train and save the tokenizer
tokenizer = train_tokenizer(gcode_dir)
tokenizer.save_model("./gcode_tokenizer")






['./gcode_tokenizer/vocab.json', './gcode_tokenizer/merges.txt']

## Data Preparation

Create a dataset class to handle the image and G-code pairs and a function to load the dataset using the custom tokenizer.

In [2]:
import os
import random
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from transformers import PreTrainedTokenizerFast

# Load the custom tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="./gcode_tokenizer/tokenizer.json")
tokenizer.add_special_tokens({'pad_token': '<pad>', 'eos_token': '</s>', 'bos_token': '<s>'})

# Dataset class to handle image and G-code pairs
class ImageGCodeDataset(Dataset):
    def __init__(self, image_dir, gcode_dir, transform=None, tokenizer=None):
        self.image_dir = image_dir
        self.gcode_dir = gcode_dir
        self.transform = transform
        self.tokenizer = tokenizer
        self.image_files = sorted(os.listdir(image_dir))
        self.gcode_files = sorted(os.listdir(gcode_dir))

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        gcode_path = os.path.join(self.gcode_dir, self.gcode_files[idx])
        
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        
        with open(gcode_path, 'r', encoding='utf-8', errors='ignore') as f:
            gcode = f.read()

        if self.tokenizer:
            gcode = self.tokenizer(gcode, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

        return {"pixel_values": image, "labels": gcode['input_ids'].squeeze()}

# Function to load the dataset
def load_dataset(image_dir, gcode_dir, tokenizer):
    # Define the image transformations
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize the images to 224x224 pixels
        transforms.ToTensor(),          # Convert the images to PyTorch tensors
    ])
    
    # Create the dataset object
    dataset = ImageGCodeDataset(image_dir, gcode_dir, transform, tokenizer)
    return dataset

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load dataset
image_dir = "dataset/images"  # Replace with the path to your image directory
gcode_dir = "dataset/gcode"   # Replace with the path to your G-code directory
tokenizer = PreTrainedTokenizerFast(tokenizer_file="./gcode_tokenizer/tokenizer.json")
tokenizer.add_special_tokens({'pad_token': '<pad>', 'eos_token': '</s>', 'bos_token': '<s>'})
dataset = load_dataset(image_dir, gcode_dir, tokenizer)

In [11]:
tokenizer.decode(dataset[0]['labels'])


'G21 ; Set units to millimeters\nG90 ; Absolute positioning\nG1 X4.329 Y3.405 F1200\nG1 X4.214 Y4.499 F1200\nG1 X3.114 Y4.479 F1200\nG1 X2.999 Y3.385 F1200<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [16]:
dataset[0]['pixel_values'].numpy()

array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]],

       [[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]],

       [[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]], dtype=float32)

In [14]:
from PIL import Image

image = Image.fromarray(dataset[0]['pixel_values'].numpy())
image.save('name.png', format='PNG')

TypeError: Cannot handle this data type: (1, 1, 224), <f4

## Model Definition

Define a Vision Transformer model for image encoding and add a custom head for text (G-code) generation.

In [3]:
from transformers import ViTForImageClassification, ViTConfig, BertModel

class ViTForImageToGCode(ViTForImageClassification):
    def __init__(self, config):
        super().__init__(config)
        self.vit = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k')
        self.text_decoder = BertModel.from_pretrained('bert-base-uncased')

        # Custom linear layer to output G-code tokens
        self.text_decoder.cls = torch.nn.Linear(self.text_decoder.config.hidden_size, self.vit.config.num_labels)

    def forward(self, pixel_values, labels=None):
        # Forward pass through the Vision Transformer
        outputs = self.vit(pixel_values)
        hidden_states = outputs.last_hidden_state

        # Forward pass through the text decoder
        text_outputs = self.text_decoder(input_ids=labels, encoder_hidden_states=hidden_states)

        logits = text_outputs.logits

        loss = None
        if labels is not None:
            # Calculate the loss if labels are provided
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        return {"loss": loss, "logits": logits}

## Training

Define the training loop using Hugging Face's `Trainer`.

In [5]:
from datasets import load_metric
from transformers import Trainer, TrainingArguments
import torch

# Load dataset
image_dir = "dataset/images"  # Replace with the path to your image directory
gcode_dir = "dataset/gcode"   # Replace with the path to your G-code directory
tokenizer = PreTrainedTokenizerFast(tokenizer_file="./gcode_tokenizer/tokenizer.json")
tokenizer.add_special_tokens({'pad_token': '<pad>', 'eos_token': '</s>', 'bos_token': '<s>'})
dataset = load_dataset(image_dir, gcode_dir, tokenizer)

# Split dataset into train and validation
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Load metric
metric = load_metric("accuracy")

# Function to compute metrics
def compute_metrics(p):
    return metric.compute(predictions=p.predictions, references=p.label_ids)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Logging steps
)

# Initialize model
model = ViTForImageToGCode.from_pretrained('google/vit-base-patch16-224-in21k')
model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Ensure dataset is on the correct device
def collate_fn(batch):
    pixel_values = torch.stack([item['pixel_values'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    pixel_values = pixel_values.to(device)
    labels = labels.to(device)
    return {'pixel_values': pixel_values, 'labels': labels}

# Initialize Trainer
trainer = Trainer(
    model=model,                         # The instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Evaluation dataset
    compute_metrics=compute_metrics,     # The callback that computes metrics of interest
    data_collator=collate_fn             # Custom data collator to handle device placement
)

# Train the model
trainer.train()

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ViTForImageToGCode were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight', 'text_decoder.cls.bias', 'text_decoder.cls.weight', 'text_decoder.embeddings.LayerNorm.bias', 'text_decoder.embeddings.LayerNorm.weight', 'text_decoder.embeddings.position_embeddings.weight', 'text_decoder.embeddings.token_type_embeddings.weight', 'text_decoder.embeddings.word_embeddings.weight', 'text_decoder.encoder.layer.0.attention.output.LayerNorm.bias', 'text_decoder.encoder.layer.0.attention.output.LayerNorm.weight', 'text_decoder.encoder.layer.0.attention.output.dense.bias', 'text_decoder.encode

RuntimeError: Mismatched Tensor types in NNPack convolutionOutput

## Evaluation

Evaluate the model's performance on the validation set.

In [6]:
# Evaluate the model
results = trainer.evaluate()

print(f"Validation Accuracy: {results['eval_accuracy']}")

## Final Remarks

This setup provides a basic framework to train a Vision Transformer model to generate G-code from images using Hugging Face's ecosystem. The training script initializes the dataset, defines the model, sets up the trainer, and evaluates the model.

You can customize the model architecture, training parameters, and evaluation metrics according to your specific requirements.