## Example of a Neural Network Pipeline with PyTorch

In [1]:
# Import all necessary modules
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import os
import shutil
import datetime

# Make sure TensorBoard support can be imported; if the import fails, install
# the `tensorboard` package with pip for the running interpreter and retry.
# NOTE(review): SummaryWriter is already imported unconditionally in the
# import list above, so a missing TensorBoard would fail there before this
# guard runs — confirm whether that earlier import should stay.
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    print("Tensorboard is not installed. Installing now...")
    import subprocess
    import sys

    # Use the current interpreter so the package lands in the active environment.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'tensorboard']
    )
    from torch.utils.tensorboard import SummaryWriter
    print("✅ TensorBoard installed successfully!")
else:
    print("✅ TensorBoard is available!")

✅ TensorBoard is available!


In [2]:
# Simple Vision Transformer (ViT) implementation
class SimpleViT(nn.Module):
    """Minimal Vision Transformer classifier.

    The image is cut into non-overlapping square patches, every patch is
    flattened and linearly embedded, a learned position embedding is added,
    the token sequence is run through a Transformer encoder, and the mean of
    all tokens is classified by a LayerNorm + Linear head.

    Args:
        image_size: Side length of the (square) input image.
        patch_size: Side length of one patch; must divide ``image_size``.
        num_classes: Number of output logits.
        dim: Token embedding width.
        depth: Number of Transformer encoder layers.
        heads: Number of attention heads per layer.
        mlp_dim: Hidden width of the feed-forward part of each layer.
        in_channels: Number of image channels (1 for grayscale input).

    Raises:
        AssertionError: If ``image_size`` is not divisible by ``patch_size``.
    """

    def __init__(self, image_size=28, patch_size=7, num_classes=10, dim=64, depth=4, heads=4, mlp_dim=128,
                 in_channels=1):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        # forward() flattens every patch to C * p * p values, so the embedding
        # input width has to include the channel count.
        patch_dim = in_channels * patch_size * patch_size

        self.patch_size = patch_size
        self.dim = dim
        self.patch_embedding = nn.Linear(patch_dim, dim)
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, dim))
        self.transformer = nn.TransformerEncoder(
            # batch_first=True: forward() feeds (batch, tokens, dim). With the
            # default (sequence-first) layout the batch axis would be treated
            # as the sequence and attention would mix different samples.
            nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=mlp_dim,
                                       batch_first=True),
            num_layers=depth
        )
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        """Classify a batch of images.

        Args:
            x: Tensor of shape (B, C, H, W); H and W are split into
                ``patch_size`` x ``patch_size`` patches.

        Returns:
            Tensor of shape (B, num_classes) with the class logits.
        """
        B, C, H, W = x.shape
        p = self.patch_size
        # (B, C, H/p, p, W/p, p) -> (B, H/p, W/p, C, p, p) -> (B, num_patches, C*p*p)
        x = x.reshape(B, C, H // p, p, W // p, p)
        x = x.permute(0, 2, 4, 1, 3, 5).reshape(B, -1, C * p * p)
        x = self.patch_embedding(x)
        x = x + self.pos_embedding
        x = self.transformer(x)
        x = x.mean(dim=1)  # average over the patch tokens
        return self.mlp_head(x)


In [3]:
# Simple Swin Transformer implementation
class SimpleSwinTransformer(nn.Module):
    """Compact Swin-Transformer-style image classifier.

    Pipeline: PatchEmbed -> learned absolute position embedding -> one
    BasicLayer per entry of ``depths`` (every stage except the last ends with
    PatchMerging) -> LayerNorm -> mean over tokens -> linear head.

    Args:
        img_size: Side length of the (square) input image.
        patch_size: Side length of one patch. The default of 7 turns a 28x28
            image into a 4x4 patch grid; the previous default of 4 produced a
            7x7 grid, which PatchMerging rejects because it is odd.
        in_chans: Number of image channels.
        num_classes: Number of output logits.
        embed_dim: Channel width of the first stage; stage ``i`` uses
            ``embed_dim * 2 ** i``.
        depths: Number of blocks in each stage.
        num_heads: Number of attention heads in each stage (same length as
            ``depths``).
        window_size: Attention window side length passed to every stage.
        mlp_ratio: MLP hidden width as a multiple of the stage width.
        drop_rate: Dropout probability after the position embedding and
            inside the blocks.
        drop_path_rate: Value forwarded as ``drop_path`` to every stage.

    Raises:
        ValueError: If ``depths`` and ``num_heads`` differ in length, or if
            the patch grid cannot be halved once per PatchMerging stage.
    """

    def __init__(self, img_size=28, patch_size=7, in_chans=1, num_classes=10,
                embed_dim=96, depths=(2, 2), num_heads=(3, 6),
                window_size=7, mlp_ratio=4., drop_rate=0., drop_path_rate=0.1):
        super().__init__()

        if len(depths) != len(num_heads):
            raise ValueError(
                f"depths ({len(depths)} stages) and num_heads ({len(num_heads)} stages) must have the same length")

        self.num_classes = num_classes
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.patch_size = patch_size
        self.window_size = window_size

        # Every stage but the last halves the grid, so the grid side must be
        # divisible by 2 once per merging step. Fail here with a clear message
        # instead of inside PatchMerging during the first forward pass.
        grid_side = img_size // patch_size
        merge_factor = 2 ** max(self.num_layers - 1, 0)
        if grid_side % merge_factor != 0:
            raise ValueError(
                f"patch grid {grid_side}x{grid_side} (img_size={img_size}, patch_size={patch_size}) "
                f"must be divisible by {merge_factor} for {self.num_layers} stages")

        # Patch embedding layer
        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        # Absolute position embedding
        self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # Build layers: the width doubles and the grid side halves per stage
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            stage_side = img_size // patch_size // (2 ** i_layer)
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),
                input_resolution=(stage_side, stage_side),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                drop=drop_rate,
                drop_path=drop_path_rate,
                # The last stage keeps its resolution (no patch merging)
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None)
            self.layers.append(layer)

        # Classification head operates on the width of the last stage
        final_dim = int(embed_dim * 2 ** (self.num_layers - 1))
        self.norm = nn.LayerNorm(final_dim)
        self.head = nn.Linear(final_dim, num_classes)

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear layers (truncated normal, zero bias) and LayerNorm (weight 1, bias 0)."""
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x):
        """Return class logits of shape (B, num_classes) for an image batch ``x``."""
        # Patch embedding
        x = self.patch_embed(x)  # B, L, C
        x = x + self.absolute_pos_embed
        x = self.pos_drop(x)

        # Forward through layers
        for layer in self.layers:
            x = layer(x)

        # Classification head
        x = self.norm(x)  # B, L, C
        x = x.mean(dim=1)  # Global average pooling: B, C
        x = self.head(x)
        return x


class PatchEmbed(nn.Module):
    """Turn an image into a sequence of patch embeddings.

    A strided convolution (kernel size == stride == ``patch_size``) projects
    each non-overlapping patch to an ``embed_dim``-wide vector.

    Attributes set here: ``img_size``, ``patch_size``, ``patches_resolution``
    (patches per side as ``[rows, cols]``) and ``num_patches``.
    """

    def __init__(self, img_size=28, patch_size=4, in_chans=1, embed_dim=96):
        super().__init__()
        patches_per_side = img_size // patch_size

        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = [patches_per_side, patches_per_side]
        self.num_patches = patches_per_side * patches_per_side

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        """Map (B, C, H, W) images to (B, num_patches, embed_dim) tokens."""
        # The values are unused; the unpacking only requires a 4-D input.
        batch, channels, height, width = x.shape
        feature_map = self.proj(x)            # B, embed_dim, Ph, Pw
        tokens = feature_map.flatten(2)       # B, embed_dim, Ph*Pw
        return tokens.transpose(1, 2)         # B, Ph*Pw, embed_dim


class WindowAttention(nn.Module):
    """Multi-head self-attention inside one window with a learned relative position bias.

    Args:
        dim: Channel width of the input tokens.
        window_size: ``(height, width)`` of the attention window.
        num_heads: Number of attention heads.
        qkv_bias: Whether the fused q/k/v projection has a bias.
        attn_drop: Dropout probability on the attention weights.
        proj_drop: Dropout probability after the output projection.
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, attn_drop=0., proj_drop=0.):
        super().__init__()
        win_h, win_w = window_size[0], window_size[1]

        self.dim = dim
        self.window_size = window_size
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5

        # One learnable bias per head for every possible (dy, dx) offset
        # between two positions of the same window.
        num_offsets = (2 * win_h - 1) * (2 * win_w - 1)
        self.relative_position_bias_table = nn.Parameter(torch.zeros(num_offsets, num_heads))

        # Lookup table: for every pair of window positions, the row of the
        # bias table that belongs to their relative offset.
        grid = torch.stack(
            torch.meshgrid([torch.arange(win_h), torch.arange(win_w)], indexing='ij')
        ).flatten(1)                                                      # 2, N
        offsets = (grid[:, :, None] - grid[:, None, :]).permute(1, 2, 0).contiguous()  # N, N, 2
        # Shift both offsets to start at 0, then combine them row-major.
        row_term = (offsets[:, :, 0] + (win_h - 1)) * (2 * win_w - 1)
        col_term = offsets[:, :, 1] + (win_w - 1)
        self.register_buffer("relative_position_index", (row_term + col_term).contiguous())

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)

    def forward(self, x, mask=None):
        """Attend within each window.

        Args:
            x: Tensor of shape (num_windows * B, N, C).
            mask: Optional tensor of shape (num_windows, N, N) that is added
                to the attention logits before the softmax.

        Returns:
            Tensor with the same shape as ``x``.
        """
        total_windows, tokens, channels = x.shape
        heads = self.num_heads

        fused = self.qkv(x).reshape(total_windows, tokens, 3, heads, channels // heads)
        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)

        logits = (q * self.scale) @ k.transpose(-2, -1)

        # Gather the per-head bias for every position pair: heads, N, N
        window_area = self.window_size[0] * self.window_size[1]
        bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        bias = bias.view(window_area, window_area, -1).permute(2, 0, 1).contiguous()
        logits = logits + bias.unsqueeze(0)

        if mask is not None:
            # The mask is defined per window, so expose the window axis,
            # broadcast over batch and heads, then fold it back.
            windows_per_image = mask.shape[0]
            logits = logits.view(total_windows // windows_per_image, windows_per_image, heads, tokens, tokens)
            logits = logits + mask.unsqueeze(1).unsqueeze(0)
            logits = logits.view(-1, heads, tokens, tokens)

        weights = self.attn_drop(logits.softmax(dim=-1))

        merged = (weights @ v).transpose(1, 2).reshape(total_windows, tokens, channels)
        return self.proj_drop(self.proj(merged))


class SwinTransformerBlock(nn.Module):
    """One Swin block: (shifted) window attention and an MLP, each on a residual path.

    Args:
        dim: Channel width of the tokens.
        input_resolution: ``(H, W)`` of the token grid this block receives.
        num_heads: Number of attention heads.
        window_size: Side length of the attention window. If the smaller grid
            side is not larger than this, the window is shrunk to that side
            and shifting is disabled.
        shift_size: Cyclic shift applied before windowing (0 = no shift).
        mlp_ratio: MLP hidden width as a multiple of ``dim``.
        qkv_bias: Whether the q/k/v projection has a bias.
        drop: Dropout probability in the MLP and after the attention projection.
        attn_drop: Dropout probability on the attention weights.
        drop_path: Dropout probability applied to both residual branches.

    Raises:
        ValueError: If the effective ``shift_size`` is not in ``[0, window_size)``.
    """

    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
                mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0.):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio

        # A window that covers the whole grid needs no partitioning or shift
        if min(self.input_resolution) <= self.window_size:
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        if not 0 <= self.shift_size < self.window_size:
            raise ValueError(
                f"shift_size must be in [0, {self.window_size}), got {self.shift_size}")

        self.norm1 = nn.LayerNorm(dim)
        self.attn = WindowAttention(
            dim, window_size=(self.window_size, self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)

        # NOTE: this is element-wise nn.Dropout on the residual branch, not
        # per-sample stochastic depth.
        self.drop_path = nn.Identity() if drop_path <= 0. else nn.Dropout(drop_path)
        self.norm2 = nn.LayerNorm(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_hidden_dim),
            nn.GELU(),
            nn.Dropout(drop),
            nn.Linear(mlp_hidden_dim, dim),
            nn.Dropout(drop)
        )

        # Non-persistent: the mask is derived from the configuration, so it is
        # kept out of the state_dict and existing checkpoints still load.
        self.register_buffer("attn_mask", self._build_attn_mask(), persistent=False)

    def _build_attn_mask(self):
        """Return the (num_windows, N, N) additive mask for shifted windows, or None without shift.

        After the cyclic shift a window can contain tokens that were wrapped
        around from the opposite border. Each token is labelled with the
        region it came from; pairs from different regions get -100 so the
        softmax effectively ignores them.
        """
        if self.shift_size == 0:
            return None

        H, W = self.input_resolution
        region_ids = torch.zeros((1, H, W, 1))
        bands = (slice(0, -self.window_size),
                 slice(-self.window_size, -self.shift_size),
                 slice(-self.shift_size, None))
        region = 0
        for rows in bands:
            for cols in bands:
                region_ids[:, rows, cols, :] = region
                region += 1

        per_window = window_partition(region_ids, self.window_size)
        per_window = per_window.view(-1, self.window_size * self.window_size)
        difference = per_window.unsqueeze(1) - per_window.unsqueeze(2)
        return torch.zeros_like(difference).masked_fill(difference != 0, -100.0)

    def forward(self, x):
        """Apply the block to tokens ``x`` of shape (B, H*W, C); the shape is preserved."""
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # Cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_x = x

        # Partition windows
        x_windows = window_partition(shifted_x, self.window_size)
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)

        # W-MSA/SW-MSA (the mask is None for unshifted blocks)
        attn_windows = self.attn(x_windows, mask=self.attn_mask)

        # Merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, H, W)

        # Reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x
        x = x.view(B, H * W, C)

        # Residual connections around attention and FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x


class PatchMerging(nn.Module):
    """Downsample a token grid by merging every 2x2 neighbourhood.

    The four tokens of each 2x2 group are concatenated (4 * dim channels),
    normalised and linearly reduced to 2 * dim channels.

    Args:
        input_resolution: ``(H, W)`` of the incoming token grid.
        dim: Channel width of the incoming tokens.
    """

    def __init__(self, input_resolution, dim):
        super().__init__()
        concat_dim = 4 * dim

        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(concat_dim, 2 * dim, bias=False)
        self.norm = nn.LayerNorm(concat_dim)

    def forward(self, x):
        """Map (B, H*W, C) tokens to (B, H/2 * W/2, 2*C); H and W must be even."""
        height, width = self.input_resolution
        batch, length, channels = x.shape
        assert length == height * width, "input feature has wrong size"
        assert height % 2 == 0 and width % 2 == 0, f"x size ({height}*{width}) are not even."

        grid = x.view(batch, height, width, channels)

        # Order: top-left, bottom-left, top-right, bottom-right of each 2x2
        # group; every slice is B, H/2, W/2, C.
        corners = [grid[:, row::2, col::2, :] for col in (0, 1) for row in (0, 1)]
        merged = torch.cat(corners, -1)                    # B, H/2, W/2, 4*C
        merged = merged.view(batch, -1, 4 * channels)      # B, H/2*W/2, 4*C

        return self.reduction(self.norm(merged))


class BasicLayer(nn.Module):
    """One Swin stage: ``depth`` transformer blocks followed by an optional downsampling module.

    Blocks with an even index use no shift, blocks with an odd index use a
    shift of ``window_size // 2``.

    Args:
        dim: Channel width of the stage.
        input_resolution: ``(H, W)`` of the token grid.
        depth: Number of SwinTransformerBlock instances.
        num_heads: Attention heads per block.
        window_size: Attention window side length.
        mlp_ratio, qkv_bias, drop, attn_drop: Forwarded to every block.
        drop_path: A single value used for all blocks, or a list with one
            value per block.
        downsample: Optional class/callable invoked as
            ``downsample(input_resolution, dim=dim)``; its result is applied
            after the blocks.
    """

    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
                mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0.,
                drop_path=0., downsample=None):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth

        # Build blocks
        rate_per_block = isinstance(drop_path, list)
        stage_blocks = []
        for index in range(depth):
            use_shift = index % 2 != 0
            stage_blocks.append(
                SwinTransformerBlock(
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    window_size=window_size,
                    shift_size=window_size // 2 if use_shift else 0,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    drop=drop,
                    attn_drop=attn_drop,
                    drop_path=drop_path[index] if rate_per_block else drop_path,
                )
            )
        self.blocks = nn.ModuleList(stage_blocks)

        # Patch merging layer (only if a downsample factory was given)
        self.downsample = None if downsample is None else downsample(input_resolution, dim=dim)

    def forward(self, x):
        """Run all blocks in order, then the downsampling module if present."""
        for block in self.blocks:
            x = block(x)
        if self.downsample is None:
            return x
        return self.downsample(x)


def window_partition(x, window_size):
    """Cut a (B, H, W, C) feature map into non-overlapping square windows.

    Returns a tensor of shape (B * H/ws * W/ws, ws, ws, C) with
    ``ws = window_size``; windows are ordered by image, then row-major within
    each image.
    """
    batch, height, width, channels = x.shape
    rows = height // window_size
    cols = width // window_size

    tiles = x.view(batch, rows, window_size, cols, window_size, channels)
    # Bring the two window-grid axes together ahead of the in-window axes.
    tiles = tiles.permute(0, 1, 3, 2, 4, 5).contiguous()
    return tiles.view(-1, window_size, window_size, channels)


def window_reverse(windows, window_size, H, W):
    """Reassemble windows into a (B, H, W, C) feature map (inverse of window_partition).

    Args:
        windows: Tensor of shape (B * H/ws * W/ws, ws, ws, C), ws = window_size.
        window_size: Side length of one window.
        H: Height of the feature map to rebuild.
        W: Width of the feature map to rebuild.

    Returns:
        Tensor of shape (B, H, W, C).

    Raises:
        ValueError: If H or W is not a multiple of ``window_size``.
    """
    if H % window_size != 0 or W % window_size != 0:
        raise ValueError(
            f"H ({H}) and W ({W}) must be multiples of window_size ({window_size})")

    rows = H // window_size
    cols = W // window_size
    # Integer arithmetic keeps the batch size exact (no float round trip).
    B = windows.shape[0] // (rows * cols)
    x = windows.view(B, rows, cols, window_size, window_size, -1)
    # Interleave window-grid and in-window axes back into image rows/columns.
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x

In [4]:
# Data preparation: MNIST training split, images converted to tensors
transform = transforms.Compose([transforms.ToTensor()])

train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Make sure the TensorBoard log directory ('runs') exists before training
if not os.path.exists('runs'):
    print("❗ 'runs' directory does not exist. Creating new logs directory...")
    os.makedirs('runs', exist_ok=True)
else:
    print("✅ 'runs' directory already exists. Using existing logs directory...")

✅ 'runs' directory already exists. Using existing logs directory...


In [5]:
# Model setup
#model = SimpleViT() # or SimpleSwinTransformer()
model = SimpleSwinTransformer() # or SimpleViT()
# model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Report the patch grid the model actually uses instead of hard-coded numbers,
# so the message stays correct when the patch size or model changes.
image_size = 28  # side length of the MNIST images used for training
patch_size = model.patch_size
patch_grid = image_size // patch_size

print(f"Model created with patch_size={patch_size}")
print(f"Initial resolution after patching: {image_size}//{patch_size} = {patch_grid}x{patch_grid}")
if patch_grid % 2 == 0:
    print("✅ This ensures even dimensions for patch merging!")
else:
    print(f"❗ {patch_grid}x{patch_grid} is odd: patch merging requires even dimensions!")

Model created with patch_size=7
Initial resolution after patching: 4x4 = 4x4
✅ This ensures even dimensions for patch merging!




In [6]:
# TensorBoard writer with automatic model type detection
import datetime

# Detect model type automatically
model_type = type(model).__name__
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Create descriptive experiment name
experiment_name = f"{model_type}_{timestamp}"

# Create TensorBoard writer with custom experiment name
try:
    writer = SummaryWriter(f'runs/{experiment_name}')
    print("✅ TensorBoard writer created successfully!")
    print(f"📊 Experiment: {experiment_name}")
    print(f"📁 Logging to: {writer.log_dir}")
except Exception as e:
    # Deliberate best-effort fallback: log to a temporary directory instead
    print(f"❌ Error creating TensorBoard writer: {e}")
    import tempfile
    temp_dir = tempfile.mkdtemp(prefix=f'tensorboard_{model_type}_')
    writer = SummaryWriter(temp_dir)
    print(f"✅ Using temporary directory: {temp_dir}")

# Model description logged as text for comparison between runs
model_params = sum(p.numel() for p in model.parameters())
model_info = f"""
**Model Type**: {model_type}
**Total Parameters**: {model_params:,}
**Patch Size**: {getattr(model, 'patch_size', 'N/A')}
**Embed Dim**: {getattr(model, 'embed_dim', getattr(model, 'dim', 'N/A'))}
**Training Date**: {timestamp}
"""

# Training loop with enhanced logging
print(f"\n🏋️ Starting training with {model_type}...")
model.train()

loss = None  # stays None if the loader yields no batch
try:
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Log every 10 batches, under a model-specific tag and a shared tag
        if batch_idx % 10 == 0:
            loss_value = loss.item()
            writer.add_scalar(f'{model_type}/Loss_Train', loss_value, batch_idx)
            writer.add_scalar('Training/Loss', loss_value, batch_idx)
            print(f"[{model_type}] Batch {batch_idx}, Loss: {loss_value:.4f}")

        if batch_idx >= 100:  # limit for demo
            break

    # Log final metrics with model info
    if loss is None:
        print("❗ No batches were processed; skipping final loss logging.")
    else:
        final_loss = loss.item()
        writer.add_scalar(f'{model_type}/Final_Loss', final_loss, 0)
        writer.add_scalar('Final/TrainingLoss', final_loss, 0)

    writer.add_text('Model/Info', model_info)
finally:
    # Always flush and close the event file, even if training raised
    writer.close()

print(f"\n✅ Training complete!")
print(f"🤖 Model: {model_type}")
print(f"📊 Parameters: {model_params:,}")
print(f"📁 TensorBoard logs saved to: {writer.log_dir}")
print(f"🌐 Run TensorBoard to compare {model_type} with other models!")

✅ TensorBoard writer created successfully!
Logging to: runs/Aug06_09-35-25_Moritz-HP

🏋️ Starting training...
Batch 0, Loss: 2.4274
Batch 10, Loss: 2.3069
Batch 20, Loss: 2.3067
Batch 30, Loss: 2.3092
Batch 40, Loss: 2.3644
Batch 50, Loss: 2.3401
Batch 60, Loss: 2.2718
Batch 70, Loss: 2.2712
Batch 80, Loss: 2.0993
Batch 90, Loss: 1.9156
Batch 100, Loss: 1.6519

✅ Training complete!
📊 TensorBoard logs saved to: runs/Aug06_09-35-25_Moritz-HP
🌐 Now run the TensorBoard launcher below to visualize the results!


In [7]:
# Verify that TensorBoard support can be imported before launching it; if the
# import fails, install the `tensorboard` package with pip and import again.
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    print("Tensorboard is not installed. Installing now...")
    import subprocess
    import sys

    # Install into the environment of the interpreter running this notebook.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'tensorboard']
    )
    from torch.utils.tensorboard import SummaryWriter
    print("✅ TensorBoard installed successfully!")
else:
    print("✅ TensorBoard is available!")

✅ TensorBoard is available!


In [8]:
# Launch TensorBoard from within the notebook
import subprocess
import sys
import os
import threading
import time

def launch_tensorboard(logdir=None, port=6006):
    """Run TensorBoard as a child process and wait until it exits.

    The call blocks for as long as TensorBoard is running, which is why the
    notebook starts it from a separate thread.

    Args:
        logdir: Directory with the event files. Defaults to ``runs`` inside
            the current working directory.
        port: TCP port TensorBoard should listen on.

    Returns:
        None. If the log directory does not exist, a warning is printed and
        TensorBoard is not started.
    """
    runs_dir = os.path.join(os.getcwd(), 'runs') if logdir is None else logdir

    # Check if runs directory exists
    if not os.path.exists(runs_dir):
        print(f"Warning: {runs_dir} directory not found. Make sure you have run the training loop first.")
        return

    print("Starting TensorBoard...")
    print(f"Log directory: {runs_dir}")
    print(f"TensorBoard will be available at: http://localhost:{port}")
    print("Press Ctrl+C in the terminal to stop TensorBoard")

    # Launch through the current interpreter so the TensorBoard installed in
    # this environment is used.
    cmd = [sys.executable, '-m', 'tensorboard.main', '--logdir', runs_dir, '--port', str(port)]
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error starting TensorBoard: {e}")
    except KeyboardInterrupt:
        print("\nTensorBoard stopped by user")

# Start TensorBoard in a separate thread so it doesn't block the notebook.
# daemon=True means the thread does not keep the Python process alive on exit.
tensorboard_thread = threading.Thread(target=launch_tensorboard, daemon=True)
tensorboard_thread.start()

# Give it a moment to start; the fixed 2-second pause only delays the hints
# below and does not check whether TensorBoard is actually up.
time.sleep(2)
print("\nIf TensorBoard started successfully, you can access it at:")
print("http://localhost:6006")
print("\nTo stop TensorBoard, restart the kernel or use Ctrl+C in the terminal.")

Starting TensorBoard...
Log directory: /home/moritz_s/Documents/RKIM_1/F_u_E_KI_Pipeline/KI_Training_Pipeline/runs
TensorBoard will be available at: http://localhost:6006
Press Ctrl+C in the terminal to stop TensorBoard


TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.19.0 at http://localhost:6006/ (Press CTRL+C to quit)



If TensorBoard started successfully, you can access it at:
http://localhost:6006

To stop TensorBoard, restart the kernel or use Ctrl+C in the terminal.


## 📊 TensorBoard Anleitung

### Was ist TensorBoard?
TensorBoard ist ein Visualisierungstool für Machine Learning Experimente. Es hilft dir dabei:
- **Trainingsverläufe** zu visualisieren (Loss, Accuracy, etc.)
- **Modellarchitekturen** zu betrachten
- **Histogramme** von Gewichten und Gradienten zu analysieren
- **Bilder und Embeddings** zu visualisieren

### 🚀 Wie verwendest du TensorBoard?

#### Schritt 1: Training ausführen
Stelle sicher, dass du die Trainingszellen ausgeführt hast. Diese erstellen die Log-Dateien im `runs/` Ordner.

#### Schritt 2: TensorBoard öffnen
- Öffne deinen Browser
- Gehe zu: `http://localhost:6006`
- Du siehst jetzt deine Trainingsgraphen!

### 🔍 Was siehst du in TensorBoard?
- **SCALARS Tab**: Hier siehst du den Trainingsloss über die Zeit
- **GRAPHS Tab**: Hier kannst du die Modellarchitektur visualisieren (nur wenn sie mit `writer.add_graph(...)` geloggt wurde – das Beispiel oben loggt keinen Graphen)
- **DISTRIBUTIONS/HISTOGRAMS**: Gewichtsverteilungen (falls geloggt)

1. **SCALARS Tab**: 
   - Du siehst einen Graphen namens "Training/Loss" sowie einen modellspezifischen Graphen, z. B. "SimpleSwinTransformer/Loss_Train"
   - Dieser zeigt, wie der Trainingsloss über die ersten 100 Batches sinkt (im obigen Lauf von ~2.4 auf ~1.65)
   - Du siehst auch "Final/TrainingLoss" mit dem finalen Loss-Wert

2. **Was die Graphen bedeuten**:
   - X-Achse: Batch-Nummer (0 bis 100)
   - Y-Achse: Loss-Wert
   - Der Graph sollte eine fallende Tendenz zeigen → das Modell lernt! 📈

### 💡 Tipps
- TensorBoard aktualisiert sich automatisch, wenn neue Daten hinzugefügt werden
- Du kannst mehrere Experimente vergleichen, indem du verschiedene Ordner in `runs/` erstellst
- Verwende aussagekräftige Namen für deine Logs: `writer = SummaryWriter('runs/experiment_1')`