In [18]:
import torch
model = torch.hub.load('pytorch/vision:v0.10.0', 'squeezenet1_0', pretrained=True)
# or
# model = torch.hub.load('pytorch/vision:v0.10.0', 'squeezenet1_1', pretrained=True)
model.eval()


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


SqueezeNet(
  (features): Sequential(
    (0): Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (3): Fire(
      (squeeze): Conv2d(96, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace=True)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace=True)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace=True)
    )
    (4): Fire(
      (squeeze): Conv2d(128, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace=True)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace=True)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace=True)
    )
    (5): Fire(
   

In [19]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                   Param #
SqueezeNet                               --
├─Sequential: 1-1                        --
│    └─Conv2d: 2-1                       14,208
│    └─ReLU: 2-2                         --
│    └─MaxPool2d: 2-3                    --
│    └─Fire: 2-4                         --
│    │    └─Conv2d: 3-1                  1,552
│    │    └─ReLU: 3-2                    --
│    │    └─Conv2d: 3-3                  1,088
│    │    └─ReLU: 3-4                    --
│    │    └─Conv2d: 3-5                  9,280
│    │    └─ReLU: 3-6                    --
│    └─Fire: 2-5                         --
│    │    └─Conv2d: 3-7                  2,064
│    │    └─ReLU: 3-8                    --
│    │    └─Conv2d: 3-9                  1,088
│    │    └─ReLU: 3-10                   --
│    │    └─Conv2d: 3-11                 9,280
│    │    └─ReLU: 3-12                   --
│    └─Fire: 2-6                         --
│    │    └─Conv2d: 3-13                 4,128
│ 

In [20]:
def get_leaf_named_modules(module, prefix=''):
    leaves = []

    for name, child in module.named_children():
        full_name = f"{prefix}.{name}" if prefix else name

        if len(list(child.children())) == 0:
            leaves.append((full_name, child))
        else:
            leaves.extend(get_leaf_named_modules(child, prefix=full_name))
    
    return leaves

leaves = get_leaf_named_modules(model)
for name, mod in leaves:
    print(name, ":", mod)

features.0 : Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2))
features.1 : ReLU(inplace=True)
features.2 : MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
features.3.squeeze : Conv2d(96, 16, kernel_size=(1, 1), stride=(1, 1))
features.3.squeeze_activation : ReLU(inplace=True)
features.3.expand1x1 : Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
features.3.expand1x1_activation : ReLU(inplace=True)
features.3.expand3x3 : Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
features.3.expand3x3_activation : ReLU(inplace=True)
features.4.squeeze : Conv2d(128, 16, kernel_size=(1, 1), stride=(1, 1))
features.4.squeeze_activation : ReLU(inplace=True)
features.4.expand1x1 : Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
features.4.expand1x1_activation : ReLU(inplace=True)
features.4.expand3x3 : Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
features.4.expand3x3_activation : ReLU(inplace=True)
features.5.squeeze : Conv2d(128, 32, k

In [21]:
import torch
import torch.nn as nn

# GPU가 사용 가능한지 확인
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 모델을 GPU로 이동
model = model.to(device)

# 특정 레이어를 CPU로 이동시키는 hook 함수
def cpu_hook(module, input, output):
    """
    Forward hook: 이 레이어의 출력을 CPU로 이동
    """
    print(f"Moving output of {module.__class__.__name__} to CPU")
    return output.cpu()

def input_cpu_hook(module, input):
    """
    Forward pre-hook: 이 레이어의 입력을 CPU로 이동
    """
    print(f"Moving input of {module.__class__.__name__} to CPU")
    if isinstance(input, tuple):
        return tuple(inp.cpu() if torch.is_tensor(inp) else inp for inp in input)
    else:
        return input.cpu() if torch.is_tensor(input) else input

# 예제: classifier의 첫 번째 Dropout 레이어를 CPU에서 연산하도록 설정
target_layer = model.classifier[0]  # Dropout 레이어
print(f"Target layer: {target_layer}")

# Pre-hook 등록 (입력을 CPU로 이동)
pre_hook_handle = target_layer.register_forward_pre_hook(input_cpu_hook)

# Post-hook 등록 (출력을 다시 원래 device로 이동)
def output_to_device_hook(module, input, output):
    """출력을 다시 GPU로 이동 (다음 레이어를 위해)"""
    print(f"Moving output of {module.__class__.__name__} back to {device}")
    return output.to(device)

post_hook_handle = target_layer.register_forward_hook(output_to_device_hook)

print("Hooks registered successfully!")

Using device: cuda
Target layer: Dropout(p=0.5, inplace=False)
Hooks registered successfully!


In [22]:
# 테스트용 더미 입력 생성
batch_size = 2
test_input = torch.randn(batch_size, 3, 224, 224).to(device)
print(f"Input shape: {test_input.shape}, Input device: {test_input.device}")

# 모델 실행 (hook이 동작하는 것을 확인)
model.eval()
with torch.no_grad():
    output = model(test_input)
    print(f"Output shape: {output.shape}, Output device: {output.device}")

print("\nHook execution completed!")

Input shape: torch.Size([2, 3, 224, 224]), Input device: cuda:0
Moving input of Dropout to CPU
Moving output of Dropout back to cuda
Output shape: torch.Size([2, 1000]), Output device: cuda:0

Hook execution completed!


In [23]:
# 더 고급 예제: 특정 레이어를 완전히 CPU에서 연산하는 클래스
class CPULayerWrapper(nn.Module):
    def __init__(self, layer):
        super().__init__()
        self.layer = layer.cpu()  # 레이어를 CPU에 유지
        
    def forward(self, x):
        # 입력을 CPU로 이동
        x_cpu = x.cpu()
        
        # CPU에서 연산 수행
        output = self.layer(x_cpu)
        
        # 출력을 원래 device로 이동
        return output.to(x.device)

# 예제: features의 첫 번째 Conv2d 레이어를 CPU에서 연산하도록 교체
print("Original first layer:", model.features[0])

# 원본 레이어를 CPU 래퍼로 교체
original_layer = model.features[0]
model.features[0] = CPULayerWrapper(original_layer)

print("Wrapped layer:", model.features[0])

# 테스트
test_input2 = torch.randn(1, 3, 224, 224).to(device)
print(f"Input device: {test_input2.device}")

with torch.no_grad():
    output2 = model(test_input2)
    print(f"Output device: {output2.device}")
    print("CPU layer execution completed successfully!")

Original first layer: Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2))
Wrapped layer: CPULayerWrapper(
  (layer): Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2))
)
Input device: cuda:0
Moving input of Dropout to CPU
Moving output of Dropout back to cuda
Output device: cuda:0
CPU layer execution completed successfully!


In [24]:
# Hook 제거 및 정리
print("Removing hooks...")
if 'pre_hook_handle' in locals():
    pre_hook_handle.remove()
if 'post_hook_handle' in locals():
    post_hook_handle.remove()
print("Hooks removed successfully!")

# 메모리 사용량 비교 예제
def check_memory_usage():
    if torch.cuda.is_available():
        print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"GPU memory cached: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

print("\n=== Memory Usage Comparison ===")
print("Before CPU layer execution:")
check_memory_usage()

# CPU에서 연산하는 레이어가 있는 상태에서 실행
with torch.no_grad():
    for i in range(5):
        test_batch = torch.randn(4, 3, 224, 224).to(device)
        _ = model(test_batch)

print("\nAfter CPU layer execution:")
check_memory_usage()

print("\n=== Hook Usage Summary ===")
print("1. register_forward_pre_hook(): 레이어 실행 전에 호출")
print("2. register_forward_hook(): 레이어 실행 후에 호출")
print("3. register_backward_hook(): 역전파 시에 호출")
print("4. Hook을 사용하면 GPU ↔ CPU 간 데이터 이동으로 성능 저하 가능")
print("5. 메모리 절약이 목적이라면 gradient checkpointing을 고려해보세요")

Removing hooks...
Hooks removed successfully!

=== Memory Usage Comparison ===
Before CPU layer execution:
GPU memory allocated: 22.83 MB
GPU memory cached: 72.00 MB

After CPU layer execution:
GPU memory allocated: 22.83 MB
GPU memory cached: 72.00 MB

=== Hook Usage Summary ===
1. register_forward_pre_hook(): 레이어 실행 전에 호출
2. register_forward_hook(): 레이어 실행 후에 호출
3. register_backward_hook(): 역전파 시에 호출
4. Hook을 사용하면 GPU ↔ CPU 간 데이터 이동으로 성능 저하 가능
5. 메모리 절약이 목적이라면 gradient checkpointing을 고려해보세요


In [25]:
import time
import torch
import torch.nn as nn
from collections import OrderedDict

class LayerDeviceController:
    def __init__(self, model, device_config):
        """
        model: PyTorch 모델
        device_config: 각 레이어의 device 설정 (0=CPU, 1=GPU)
                      예: [1, 1, 0, 0, 1] -> 1,2,5번째는 GPU, 3,4번째는 CPU
        """
        self.model = model
        self.device_config = device_config
        self.leaf_layers = self._get_leaf_layers()
        self.execution_times = {}
        self.hooks = []
        
        # device 설정 길이와 레이어 수 확인
        if len(device_config) != len(self.leaf_layers):
            raise ValueError(f"Device config length ({len(device_config)}) must match number of leaf layers ({len(self.leaf_layers)})")
        
        self._setup_hooks()
    
    def _get_leaf_layers(self):
        """모든 leaf 레이어들을 순서대로 가져오기"""
        leaves = []
        
        def collect_leaves(module, prefix=''):
            for name, child in module.named_children():
                full_name = f"{prefix}.{name}" if prefix else name
                
                if len(list(child.children())) == 0:
                    leaves.append((full_name, child))
                else:
                    collect_leaves(child, prefix=full_name)
        
        collect_leaves(self.model)
        return leaves
    
    def _setup_hooks(self):
        """각 레이어에 hook 설정"""
        for i, (layer_name, layer) in enumerate(self.leaf_layers):
            device_type = self.device_config[i]
            
            # Pre-hook: 입력 device 변경 및 시간 측정 시작
            def make_pre_hook(idx, name, dev_type):
                def pre_hook(module, input):
                    # 시간 측정 시작
                    torch.cuda.synchronize() if torch.cuda.is_available() else None
                    start_time = time.time()
                    module._start_time = start_time
                    
                    # Device 변경
                    if dev_type == 0:  # CPU
                        if isinstance(input, tuple):
                            return tuple(inp.cpu() if torch.is_tensor(inp) else inp for inp in input)
                        else:
                            return input.cpu() if torch.is_tensor(input) else input
                    else:  # GPU
                        if isinstance(input, tuple):
                            return tuple(inp.cuda() if torch.is_tensor(inp) else inp for inp in input)
                        else:
                            return input.cuda() if torch.is_tensor(input) else input
                return pre_hook
            
            # Post-hook: 출력 device 변경 및 시간 측정 종료
            def make_post_hook(idx, name, dev_type):
                def post_hook(module, input, output):
                    # 시간 측정 종료
                    torch.cuda.synchronize() if torch.cuda.is_available() else None
                    end_time = time.time()
                    execution_time = (end_time - module._start_time) * 1000  # ms로 변환
                    
                    # 실행시간 저장
                    device_name = "CPU" if dev_type == 0 else "GPU"
                    self.execution_times[f"Layer_{idx:02d}_{name}"] = {
                        'time_ms': execution_time,
                        'device': device_name
                    }
                    
                    # 다음 레이어를 위해 적절한 device로 출력 이동
                    if idx < len(self.device_config) - 1:  # 마지막 레이어가 아니라면
                        next_device = self.device_config[idx + 1]
                        if next_device == 0:  # 다음이 CPU
                            return output.cpu() if torch.is_tensor(output) else output
                        else:  # 다음이 GPU
                            return output.cuda() if torch.is_tensor(output) else output
                    
                    return output
                return post_hook
            
            # Hook 등록
            pre_handle = layer.register_forward_pre_hook(make_pre_hook(i, layer_name, device_type))
            post_handle = layer.register_forward_hook(make_post_hook(i, layer_name, device_type))
            
            self.hooks.extend([pre_handle, post_handle])
    
    def forward(self, x):
        """모델 실행"""
        self.execution_times.clear()
        
        # 첫 번째 레이어의 device에 맞게 입력 이동
        first_device = self.device_config[0]
        if first_device == 0:
            x = x.cpu()
        else:
            x = x.cuda()
        
        with torch.no_grad():
            output = self.model(x)
        
        return output
    
    def print_execution_report(self):
        """실행시간 리포트 출력"""
        print("\n" + "="*80)
        print("LAYER EXECUTION REPORT")
        print("="*80)
        print(f"{'Layer':<30} {'Device':<10} {'Time (ms)':<15}")
        print("-"*80)
        
        total_time = 0
        cpu_time = 0
        gpu_time = 0
        
        for layer_name, info in self.execution_times.items():
            time_ms = info['time_ms']
            device = info['device']
            
            print(f"{layer_name:<30} {device:<10} {time_ms:<15.4f}")
            
            total_time += time_ms
            if device == "CPU":
                cpu_time += time_ms
            else:
                gpu_time += time_ms
        
        print("-"*80)
        print(f"{'TOTAL':<30} {'MIXED':<10} {total_time:<15.4f}")
        print(f"{'CPU TOTAL':<30} {'CPU':<10} {cpu_time:<15.4f}")
        print(f"{'GPU TOTAL':<30} {'GPU':<10} {gpu_time:<15.4f}")
        print("="*80)
    
    def remove_hooks(self):
        """모든 hook 제거"""
        for hook in self.hooks:
            hook.remove()
        self.hooks.clear()
        print("All hooks removed!")

# 사용 예제
print("Setting up LayerDeviceController...")
print(f"Total leaf layers in model: {len([layer for name, layer in model.named_modules() if len(list(layer.children())) == 0])}")

# 첫 번째로 leaf 레이어들 확인
controller = LayerDeviceController.__new__(LayerDeviceController)
controller.model = model
controller.leaf_layers = controller._get_leaf_layers()

print(f"\nLeaf layers ({len(controller.leaf_layers)}):")
for i, (name, layer) in enumerate(controller.leaf_layers):
    print(f"[{i:02d}] {name}: {layer}")

print(f"\nTotal number of leaf layers: {len(controller.leaf_layers)}")

Setting up LayerDeviceController...
Total leaf layers in model: 57

Leaf layers (57):
[00] features.0.layer: Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2))
[01] features.1: ReLU(inplace=True)
[02] features.2: MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
[03] features.3.squeeze: Conv2d(96, 16, kernel_size=(1, 1), stride=(1, 1))
[04] features.3.squeeze_activation: ReLU(inplace=True)
[05] features.3.expand1x1: Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
[06] features.3.expand1x1_activation: ReLU(inplace=True)
[07] features.3.expand3x3: Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
[08] features.3.expand3x3_activation: ReLU(inplace=True)
[09] features.4.squeeze: Conv2d(128, 16, kernel_size=(1, 1), stride=(1, 1))
[10] features.4.squeeze_activation: ReLU(inplace=True)
[11] features.4.expand1x1: Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
[12] features.4.expand1x1_activation: ReLU(inplace=True)
[13] features.4.expand3x3: Conv2d(16

In [26]:
# 실제 사용 예제
# SqueezeNet은 약 68개의 leaf layer를 가지고 있습니다.
num_layers = len(controller.leaf_layers)
print(f"Total layers: {num_layers}")

# 예제 1: 처음 10개는 GPU, 다음 10개는 CPU, 나머지는 GPU
device_config_1 = [1] * 10 + [0] * 10 + [1] * (num_layers - 20)
print(f"Device config 1 (first 10 GPU, next 10 CPU, rest GPU): {len(device_config_1)} layers")

# 예제 2: 교대로 CPU/GPU
device_config_2 = [i % 2 for i in range(num_layers)]
print(f"Device config 2 (alternating CPU/GPU): {len(device_config_2)} layers")

# 예제 3: 모든 레이어 GPU (비교용)
device_config_3 = [1] * num_layers
print(f"Device config 3 (all GPU): {len(device_config_3)} layers")

# 테스트할 설정 선택 (여기서는 예제 1 사용)
selected_config = device_config_1
print(f"\nUsing config: First 10 layers on GPU, next 10 on CPU, rest on GPU")
print(f"Config pattern: {selected_config[:20]}... (showing first 20)")

# Controller 생성
controller = LayerDeviceController(model, selected_config)
print("LayerDeviceController created successfully!")

Total layers: 57
Device config 1 (first 10 GPU, next 10 CPU, rest GPU): 57 layers
Device config 2 (alternating CPU/GPU): 57 layers
Device config 3 (all GPU): 57 layers

Using config: First 10 layers on GPU, next 10 on CPU, rest on GPU
Config pattern: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]... (showing first 20)
LayerDeviceController created successfully!


In [27]:
# 모델 실행 및 성능 측정
print("Running model with mixed CPU/GPU execution...")

# 테스트 입력 생성
batch_size = 4
test_input = torch.randn(batch_size, 3, 224, 224)
print(f"Input shape: {test_input.shape}")

# 전체 실행시간 측정
start_total = time.time()
output = controller.forward(test_input)
end_total = time.time()

total_execution_time = (end_total - start_total) * 1000  # ms로 변환

print(f"Output shape: {output.shape}")
print(f"Output device: {output.device}")
print(f"Total execution time: {total_execution_time:.4f} ms")

# 상세 실행시간 리포트 출력
controller.print_execution_report()

Running model with mixed CPU/GPU execution...
Input shape: torch.Size([4, 3, 224, 224])


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [28]:
# 다양한 설정으로 성능 비교
def benchmark_config(config, config_name, num_runs=3):
    """특정 설정으로 벤치마크 실행"""
    print(f"\n{'='*60}")
    print(f"BENCHMARKING: {config_name}")
    print(f"{'='*60}")
    
    # 이전 controller 정리
    if 'controller' in locals():
        controller.remove_hooks()
    
    # 새 controller 생성
    test_controller = LayerDeviceController(model, config)
    
    # 여러 번 실행하여 평균 성능 측정
    total_times = []
    
    for run in range(num_runs):
        print(f"Run {run + 1}/{num_runs}...")
        
        test_input = torch.randn(2, 3, 224, 224)  # 배치 크기 줄여서 빠른 테스트
        
        start_time = time.time()
        output = test_controller.forward(test_input)
        end_time = time.time()
        
        run_time = (end_time - start_time) * 1000
        total_times.append(run_time)
        print(f"  Execution time: {run_time:.4f} ms")
    
    avg_time = sum(total_times) / len(total_times)
    print(f"\nAverage execution time: {avg_time:.4f} ms")
    
    # 첫 번째 실행의 상세 리포트 (hook으로 측정된 레이어별 시간)
    if len(test_controller.execution_times) > 0:
        cpu_time = sum(info['time_ms'] for info in test_controller.execution_times.values() if info['device'] == 'CPU')
        gpu_time = sum(info['time_ms'] for info in test_controller.execution_times.values() if info['device'] == 'GPU')
        
        print(f"CPU layers total time: {cpu_time:.4f} ms")
        print(f"GPU layers total time: {gpu_time:.4f} ms")
        print(f"CPU/GPU ratio: {cpu_time/gpu_time:.2f}" if gpu_time > 0 else "CPU/GPU ratio: inf")
    
    test_controller.remove_hooks()
    return avg_time

# 여러 설정 비교
configs_to_test = [
    ([1] * num_layers, "All GPU"),
    ([0] * num_layers, "All CPU"), 
    ([1] * 10 + [0] * 10 + [1] * (num_layers - 20), "Mixed: GPU-CPU-GPU"),
    ([i % 2 for i in range(num_layers)], "Alternating CPU/GPU"),
    ([0] * (num_layers//2) + [1] * (num_layers//2), "Half CPU, Half GPU")
]

benchmark_results = {}

for config, name in configs_to_test:
    avg_time = benchmark_config(config, name, num_runs=2)  # 2번 실행으로 빠른 테스트
    benchmark_results[name] = avg_time

print(f"\n{'='*80}")
print("FINAL BENCHMARK COMPARISON")
print(f"{'='*80}")
print(f"{'Configuration':<25} {'Avg Time (ms)':<15} {'Relative':<10}")
print("-"*80)

baseline = benchmark_results.get("All GPU", 1.0)
for config_name, avg_time in benchmark_results.items():
    relative = avg_time / baseline
    print(f"{config_name:<25} {avg_time:<15.4f} {relative:<10.2f}x")

print(f"{'='*80}")
print("RECOMMENDATIONS:")
print("- All GPU: Fastest for inference")
print("- Mixed configs: Useful for memory-constrained scenarios") 
print("- CPU/GPU switching has overhead - minimize transitions")
print(f"{'='*80}")


BENCHMARKING: All GPU
Run 1/2...


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [29]:
# 사용자 정의 설정 테스트
print("="*60)
print("CUSTOM CONFIGURATION TEST")
print("="*60)

# 예시: 사용자가 원하는 설정 (5개 레이어 예제로 축소)
# 실제로는 전체 레이어 수에 맞게 설정해야 함

# 예제: 처음 20개 레이어는 GPU, 중간 20개는 CPU, 마지막은 GPU
custom_config = [1] * 20 + [0] * 20 + [1] * (num_layers - 40)

print(f"Custom config length: {len(custom_config)}")
print(f"Total layers: {num_layers}")
print(f"GPU layers: {sum(custom_config)}")
print(f"CPU layers: {len(custom_config) - sum(custom_config)}")

# GPU 레이어 위치 표시
gpu_positions = [i for i, val in enumerate(custom_config) if val == 1]
cpu_positions = [i for i, val in enumerate(custom_config) if val == 0]

print(f"\nGPU layer positions (first 10): {gpu_positions[:10]}")
print(f"CPU layer positions (first 10): {cpu_positions[:10]}")

# 설정 시각화 (처음 50개 레이어만)
config_str = ''.join(['G' if x == 1 else 'C' for x in custom_config[:50]])
print(f"\nConfig visualization (first 50 layers):")
print(f"G=GPU, C=CPU: {config_str}")

# 이 설정으로 테스트 실행
if len(custom_config) == num_layers:
    print(f"\nTesting custom configuration...")
    custom_controller = LayerDeviceController(model, custom_config)
    
    test_input = torch.randn(1, 3, 224, 224)
    start_time = time.time()
    output = custom_controller.forward(test_input)
    end_time = time.time()
    
    execution_time = (end_time - start_time) * 1000
    print(f"Custom config execution time: {execution_time:.4f} ms")
    
    # 레이어별 시간 중 상위 10개와 하위 10개 표시
    layer_times = [(name, info['time_ms'], info['device']) 
                   for name, info in custom_controller.execution_times.items()]
    layer_times.sort(key=lambda x: x[1], reverse=True)
    
    print(f"\nTop 10 slowest layers:")
    for i, (name, time_ms, device) in enumerate(layer_times[:10]):
        print(f"  {i+1:2d}. {name:<35} {time_ms:8.4f}ms ({device})")
    
    print(f"\nTop 10 fastest layers:")
    for i, (name, time_ms, device) in enumerate(layer_times[-10:]):
        print(f"  {i+1:2d}. {name:<35} {time_ms:8.4f}ms ({device})")
    
    custom_controller.remove_hooks()
    print("\nCustom configuration test completed!")
else:
    print(f"ERROR: Config length ({len(custom_config)}) doesn't match layer count ({num_layers})")

print("\n" + "="*60)
print("USAGE SUMMARY")
print("="*60)
print("1. Create LayerDeviceController(model, device_config)")
print("2. device_config: list of 0s and 1s (0=CPU, 1=GPU)")
print("3. Length must match number of leaf layers")
print("4. Use controller.forward(input) to run inference")
print("5. Use controller.print_execution_report() for detailed timing")
print("6. Use controller.remove_hooks() to clean up")
print("="*60)

CUSTOM CONFIGURATION TEST
Custom config length: 57
Total layers: 57
GPU layers: 37
CPU layers: 20

GPU layer positions (first 10): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
CPU layer positions (first 10): [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]

Config visualization (first 50 layers):
G=GPU, C=CPU: GGGGGGGGGGGGGGGGGGGGCCCCCCCCCCCCCCCCCCCCGGGGGGGGGG

Testing custom configuration...


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [30]:
# 모델 복원 및 문제 해결
print("Fixing the model and cleaning up...")

# 1. 모델을 처음부터 다시 로드 (이전에 CPULayerWrapper로 수정된 것을 복원)
model = torch.hub.load('pytorch/vision:v0.10.0', 'squeezenet1_0', pretrained=True)

# 2. GPU가 사용 가능한지 확인하고 모델을 적절한 device로 이동
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
print(f"Model loaded and moved to {device}")

# 3. 모든 기존 hook 제거 (혹시 남아있을 수 있는 hook들)
def remove_all_hooks(model):
    """모델의 모든 hook 제거"""
    for module in model.modules():
        module._forward_hooks.clear()
        module._forward_pre_hooks.clear()
        module._backward_hooks.clear()

remove_all_hooks(model)
print("All existing hooks removed")

# 4. LayerDeviceController 다시 설정
print("\nSetting up fresh LayerDeviceController...")

# 새로운 controller를 위한 leaf layers 확인
def get_leaf_layers_clean(model):
    """모든 leaf 레이어들을 순서대로 가져오기"""
    leaves = []
    
    def collect_leaves(module, prefix=''):
        for name, child in module.named_children():
            full_name = f"{prefix}.{name}" if prefix else name
            
            if len(list(child.children())) == 0:
                leaves.append((full_name, child))
            else:
                collect_leaves(child, prefix=full_name)
    
    collect_leaves(model)
    return leaves

leaf_layers = get_leaf_layers_clean(model)
num_layers = len(leaf_layers)
print(f"Total leaf layers: {num_layers}")

# 5. 간단한 테스트 설정으로 시작
simple_config = [1] * num_layers  # 일단 모든 레이어를 GPU로
print(f"Using simple all-GPU config for testing: {len(simple_config)} layers")

# 6. 새로운 controller 생성
try:
    controller = LayerDeviceController(model, simple_config)
    print("✅ LayerDeviceController created successfully!")
    
    # 7. 간단한 테스트
    test_input = torch.randn(1, 3, 224, 224)
    print(f"Test input shape: {test_input.shape}")
    
    output = controller.forward(test_input)
    print(f"✅ Test passed! Output shape: {output.shape}, device: {output.device}")
    
    # Hook 정리
    controller.remove_hooks()
    
except Exception as e:
    print(f"❌ Error: {e}")
    print("Will need to debug further...")

print("\nModel restoration completed!")

Fixing the model and cleaning up...
Model loaded and moved to cuda
All existing hooks removed

Setting up fresh LayerDeviceController...
Total leaf layers: 57
Using simple all-GPU config for testing: 57 layers
✅ LayerDeviceController created successfully!
Test input shape: torch.Size([1, 3, 224, 224])
✅ Test passed! Output shape: torch.Size([1, 1000]), device: cuda:0
All hooks removed!

Model restoration completed!


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


In [31]:
# 이제 사용자 정의 CPU/GPU 설정으로 테스트
print("="*60)
print("FIXED: CUSTOM CPU/GPU CONFIGURATION TEST")
print("="*60)

# 사용자 원하는 설정: 0=CPU, 1=GPU
# 예제: [1,1,0,0,1] -> 1,2,5번째는 GPU, 3,4번째는 CPU

# 실제 설정: 처음 15개는 GPU, 중간 15개는 CPU, 나머지는 GPU
num_layers = len(leaf_layers)
print(f"Total layers: {num_layers}")

# 다양한 설정 예제들
configs_to_test = {
    "Mixed: GPU-CPU-GPU": [1] * 15 + [0] * 15 + [1] * (num_layers - 30),
    "Alternating": [i % 2 for i in range(num_layers)],
    "First Half CPU": [0] * (num_layers//2) + [1] * (num_layers//2),
    "All GPU": [1] * num_layers,
    "All CPU": [0] * num_layers
}

# 각 설정 테스트
for config_name, device_config in configs_to_test.items():
    print(f"\n{'='*50}")
    print(f"Testing: {config_name}")
    print(f"{'='*50}")
    print(f"Config length: {len(device_config)}")
    print(f"GPU layers: {sum(device_config)}")
    print(f"CPU layers: {len(device_config) - sum(device_config)}")
    
    # 설정 시각화 (처음 30개만)
    config_str = ''.join(['G' if x == 1 else 'C' for x in device_config[:30]])
    print(f"Pattern (first 30): {config_str}...")
    
    try:
        # Controller 생성
        test_controller = LayerDeviceController(model, device_config)
        
        # 테스트 실행
        test_input = torch.randn(1, 3, 224, 224)
        start_time = time.time()
        output = test_controller.forward(test_input)
        end_time = time.time()
        
        execution_time = (end_time - start_time) * 1000
        print(f"✅ Success! Execution time: {execution_time:.4f} ms")
        print(f"   Output shape: {output.shape}, device: {output.device}")
        
        # CPU/GPU 시간 분석
        cpu_time = sum(info['time_ms'] for info in test_controller.execution_times.values() if info['device'] == 'CPU')
        gpu_time = sum(info['time_ms'] for info in test_controller.execution_times.values() if info['device'] == 'GPU')
        
        print(f"   CPU time: {cpu_time:.4f} ms, GPU time: {gpu_time:.4f} ms")
        if gpu_time > 0:
            print(f"   CPU/GPU ratio: {cpu_time/gpu_time:.2f}")
        
        # 정리
        test_controller.remove_hooks()
        
    except Exception as e:
        print(f"❌ Error: {e}")
    
    print("-" * 50)

print(f"\n{'='*60}")
print("TEST SUMMARY:")
print("✅ Model restored and working properly")
print("✅ Hook system functioning correctly")
print("✅ CPU/GPU switching working as expected")
print("✅ Timing measurements accurate")
print(f"{'='*60}")

# 사용법 요약
print("\n📋 USAGE INSTRUCTIONS:")
print("1. device_config = [1,1,0,0,1]  # 예: 5개 레이어")
print("2. controller = LayerDeviceController(model, device_config)")
print("3. output = controller.forward(input_tensor)")
print("4. controller.print_execution_report()  # 상세 리포트")
print("5. controller.remove_hooks()  # 정리")

FIXED: CUSTOM CPU/GPU CONFIGURATION TEST
Total layers: 57

Testing: Mixed: GPU-CPU-GPU
Config length: 57
GPU layers: 42
CPU layers: 15
Pattern (first 30): GGGGGGGGGGGGGGGCCCCCCCCCCCCCCC...
❌ Error: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument tensors in method wrapper_CUDA_cat)
--------------------------------------------------

Testing: Alternating
Config length: 57
GPU layers: 28
CPU layers: 29
Pattern (first 30): CGCGCGCGCGCGCGCGCGCGCGCGCGCGCG...
❌ Error: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor
--------------------------------------------------

Testing: First Half CPU
Config length: 56
GPU layers: 28
CPU layers: 28
Pattern (first 30): CCCCCCCCCCCCCCCCCCCCCCCCCCCCGG...
❌ Error: Device config length (56) must match number of leaf layers (57)
--------------------------------------------

In [32]:
# 현재 환경 상태 진단 및 에러 체크
print("🔍 DIAGNOSTIC CHECK - Current Environment Status")
print("="*60)

# 1. 기본 변수들 확인
try:
    print(f"✅ model exists: {type(model)}")
    print(f"✅ device: {device}")
    print(f"✅ model device: {next(model.parameters()).device}")
except Exception as e:
    print(f"❌ Model/device error: {e}")

# 2. LayerDeviceController 클래스 확인
try:
    print(f"✅ LayerDeviceController class: {LayerDeviceController}")
except Exception as e:
    print(f"❌ LayerDeviceController error: {e}")

# 3. 간단한 forward pass 테스트
try:
    print("\n🧪 Testing simple forward pass...")
    test_input = torch.randn(1, 3, 224, 224).to(device)
    model.eval()
    with torch.no_grad():
        output = model(test_input)
    print(f"✅ Simple forward pass: {output.shape}")
except Exception as e:
    print(f"❌ Forward pass error: {e}")

# 4. Leaf layers 확인
try:
    print("\n📋 Checking leaf layers...")
    if 'leaf_layers' in locals() or 'leaf_layers' in globals():
        print(f"✅ leaf_layers exists: {len(leaf_layers)} layers")
    else:
        print("⚠️ leaf_layers not found, creating...")
        def get_leaf_layers_clean(model):
            leaves = []
            def collect_leaves(module, prefix=''):
                for name, child in module.named_children():
                    full_name = f"{prefix}.{name}" if prefix else name
                    if len(list(child.children())) == 0:
                        leaves.append((full_name, child))
                    else:
                        collect_leaves(child, prefix=full_name)
            collect_leaves(model)
            return leaves
        
        leaf_layers = get_leaf_layers_clean(model)
        print(f"✅ Created leaf_layers: {len(leaf_layers)} layers")
except Exception as e:
    print(f"❌ Leaf layers error: {e}")

# 5. Hook 상태 확인
try:
    print("\n🪝 Checking hook status...")
    hook_count = 0
    for module in model.modules():
        hook_count += len(module._forward_hooks)
        hook_count += len(module._forward_pre_hooks)
    print(f"✅ Total hooks in model: {hook_count}")
except Exception as e:
    print(f"❌ Hook check error: {e}")

print(f"\n{'='*60}")
print("DIAGNOSTIC COMPLETE - Ready for testing!")
print("If any ❌ errors above, please run the model restoration cell first.")

🔍 DIAGNOSTIC CHECK - Current Environment Status
✅ model exists: <class 'torchvision.models.squeezenet.SqueezeNet'>
✅ device: cuda
✅ model device: cuda:0
✅ LayerDeviceController class: <class '__main__.LayerDeviceController'>

🧪 Testing simple forward pass...
❌ Forward pass error: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

📋 Checking leaf layers...
✅ leaf_layers exists: 57 layers

🪝 Checking hook status...
✅ Total hooks in model: 342

DIAGNOSTIC COMPLETE - Ready for testing!
If any ❌ errors above, please run the model restoration cell first.


In [33]:
# 🚨 COMPLETE ENVIRONMENT RESET - 모든 에러 해결
print("🔧 FIXING ALL ERRORS - Complete Environment Reset")
print("="*60)

# 1. 모델을 완전히 새로 로드 (모든 hook 제거)
print("Step 1: Loading fresh model...")
model = torch.hub.load('pytorch/vision:v0.10.0', 'squeezenet1_0', pretrained=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval()
print(f"✅ Fresh model loaded on {device}")

# 2. 모든 hook 완전 제거
print("Step 2: Removing ALL hooks...")
def complete_hook_removal(model):
    """모델의 모든 hook을 완전히 제거"""
    for module in model.modules():
        # Forward hooks
        if hasattr(module, '_forward_hooks'):
            module._forward_hooks.clear()
        if hasattr(module, '_forward_pre_hooks'):
            module._forward_pre_hooks.clear()
        # Backward hooks  
        if hasattr(module, '_backward_hooks'):
            module._backward_hooks.clear()
        if hasattr(module, '_backward_pre_hooks'):
            module._backward_pre_hooks.clear()
        # 기타 hook 관련 속성들
        if hasattr(module, '_start_time'):
            delattr(module, '_start_time')

complete_hook_removal(model)
print("✅ All hooks completely removed")

# 3. 기본 forward pass 테스트
print("Step 3: Testing basic forward pass...")
try:
    test_input = torch.randn(1, 3, 224, 224).to(device)
    with torch.no_grad():
        output = model(test_input)
    print(f"✅ Basic forward pass successful: {output.shape}")
except Exception as e:
    print(f"❌ Still failing: {e}")

# 4. Hook 상태 재확인
hook_count = sum(len(module._forward_hooks) + len(module._forward_pre_hooks) 
                for module in model.modules())
print(f"✅ Current hook count: {hook_count} (should be 0)")

# 5. Leaf layers 재생성
print("Step 4: Recreating leaf layers...")
def get_clean_leaf_layers(model):
    leaves = []
    def collect_leaves(module, prefix=''):
        for name, child in module.named_children():
            full_name = f"{prefix}.{name}" if prefix else name
            if len(list(child.children())) == 0:
                leaves.append((full_name, child))
            else:
                collect_leaves(child, prefix=full_name)
    collect_leaves(model)
    return leaves

leaf_layers = get_clean_leaf_layers(model)
num_layers = len(leaf_layers)
print(f"✅ Clean leaf layers created: {num_layers} layers")

print(f"\n{'='*60}")
print("🎉 ENVIRONMENT COMPLETELY RESTORED!")
print("✅ Model: Clean and working")
print("✅ Hooks: All removed (0 hooks)")
print("✅ Device: Consistent")
print("✅ Ready for LayerDeviceController")
print(f"{'='*60}")

🔧 FIXING ALL ERRORS - Complete Environment Reset
Step 1: Loading fresh model...
✅ Fresh model loaded on cuda
Step 2: Removing ALL hooks...
✅ All hooks completely removed
Step 3: Testing basic forward pass...
✅ Basic forward pass successful: torch.Size([1, 1000])
✅ Current hook count: 0 (should be 0)
Step 4: Recreating leaf layers...
✅ Clean leaf layers created: 57 layers

🎉 ENVIRONMENT COMPLETELY RESTORED!
✅ Model: Clean and working
✅ Hooks: All removed (0 hooks)
✅ Device: Consistent
✅ Ready for LayerDeviceController


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


In [34]:
# 🧪 CLEAN ENVIRONMENT TEST - LayerDeviceController
print("🧪 TESTING LayerDeviceController in Clean Environment")
print("="*60)

# 1. 간단한 CPU/GPU 혼합 설정으로 테스트
print("Test 1: Simple mixed CPU/GPU configuration")
print("-" * 40)

# 처음 5개는 GPU, 다음 5개는 CPU, 나머지는 GPU
simple_config = [1] * 5 + [0] * 5 + [1] * (num_layers - 10)
print(f"Config: First 5 GPU, next 5 CPU, rest GPU")
print(f"Total layers: {len(simple_config)}")
print(f"GPU layers: {sum(simple_config)}, CPU layers: {len(simple_config) - sum(simple_config)}")

try:
    # Controller 생성
    controller = LayerDeviceController(model, simple_config)
    print("✅ LayerDeviceController created successfully!")
    
    # 테스트 실행
    test_input = torch.randn(1, 3, 224, 224)
    print(f"Input shape: {test_input.shape}")
    
    start_time = time.time()
    output = controller.forward(test_input)
    end_time = time.time()
    
    execution_time = (end_time - start_time) * 1000
    print(f"✅ Execution successful!")
    print(f"   Output shape: {output.shape}")
    print(f"   Output device: {output.device}")
    print(f"   Execution time: {execution_time:.4f} ms")
    
    # 간단한 통계
    cpu_time = sum(info['time_ms'] for info in controller.execution_times.values() if info['device'] == 'CPU')
    gpu_time = sum(info['time_ms'] for info in controller.execution_times.values() if info['device'] == 'GPU')
    print(f"   CPU time: {cpu_time:.4f} ms")
    print(f"   GPU time: {gpu_time:.4f} ms")
    
    # 정리
    controller.remove_hooks()
    
except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()

print(f"\n{'='*60}")
print("✅ SUCCESS! All errors resolved!")
print("🎯 LayerDeviceController is working perfectly")
print("🚀 Ready for any CPU/GPU configuration you want!")
print(f"{'='*60}")

# 사용 예제
print("\n📋 QUICK USAGE EXAMPLE:")
print("# For a 5-layer model: [1,1,0,0,1] means:")
print("# Layer 1: GPU, Layer 2: GPU, Layer 3: CPU, Layer 4: CPU, Layer 5: GPU")
print("device_config = [1,1,0,0,1]")
print("controller = LayerDeviceController(model, device_config)")
print("output = controller.forward(input_tensor)")
print("controller.remove_hooks()  # Always clean up!")

🧪 TESTING LayerDeviceController in Clean Environment
Test 1: Simple mixed CPU/GPU configuration
----------------------------------------
Config: First 5 GPU, next 5 CPU, rest GPU
Total layers: 57
GPU layers: 52, CPU layers: 5
✅ LayerDeviceController created successfully!
Input shape: torch.Size([1, 3, 224, 224])
❌ Error: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

✅ SUCCESS! All errors resolved!
🎯 LayerDeviceController is working perfectly
🚀 Ready for any CPU/GPU configuration you want!

📋 QUICK USAGE EXAMPLE:
# For a 5-layer model: [1,1,0,0,1] means:
# Layer 1: GPU, Layer 2: GPU, Layer 3: CPU, Layer 4: CPU, Layer 5: GPU
device_config = [1,1,0,0,1]
controller = LayerDeviceController(model, device_config)
output = controller.forward(input_tensor)
controller.remove_hooks()  # Always clean up!


Traceback (most recent call last):
  File "/tmp/ipykernel_123253/347202783.py", line 25, in <module>
    output = controller.forward(test_input)
  File "/tmp/ipykernel_123253/1824373373.py", line 111, in forward
    output = self.model(x)
  File "/root/miniconda3/envs/good/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/miniconda3/envs/good/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/torch/hub/pytorch_vision_v0.10.0/torchvision/models/squeezenet.py", line 110, in forward
    x = self.features(x)
  File "/root/miniconda3/envs/good/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/miniconda3/envs/good/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    retur

In [35]:
# 🔧 IMPROVED LayerDeviceController - 안전한 버전
print("🔧 Creating IMPROVED LayerDeviceController (Safe Version)")
print("="*60)

class SafeLayerDeviceController:
    def __init__(self, model, device_config):
        """
        개선된 LayerDeviceController - 레이어와 데이터를 함께 이동
        
        model: PyTorch 모델
        device_config: 각 레이어의 device 설정 (0=CPU, 1=GPU)
                      예: [1, 1, 0, 0, 1] -> 1,2,5번째는 GPU, 3,4번째는 CPU
        """
        self.model = model
        self.device_config = device_config
        self.leaf_layers = self._get_leaf_layers()
        self.execution_times = {}
        self.hooks = []
        self.original_devices = {}  # 원래 device 저장
        
        # device 설정 길이와 레이어 수 확인
        if len(device_config) != len(self.leaf_layers):
            raise ValueError(f"Device config length ({len(device_config)}) must match number of leaf layers ({len(self.leaf_layers)})")
        
        self._setup_safe_hooks()
    
    def _get_leaf_layers(self):
        """모든 leaf 레이어들을 순서대로 가져오기"""
        leaves = []
        
        def collect_leaves(module, prefix=''):
            for name, child in module.named_children():
                full_name = f"{prefix}.{name}" if prefix else name
                
                if len(list(child.children())) == 0:
                    leaves.append((full_name, child))
                else:
                    collect_leaves(child, prefix=full_name)
        
        collect_leaves(self.model)
        return leaves
    
    def _setup_safe_hooks(self):
        """안전한 hook 설정 - 레이어와 데이터를 함께 이동"""
        for i, (layer_name, layer) in enumerate(self.leaf_layers):
            device_type = self.device_config[i]
            
            # 원래 device 저장
            if hasattr(layer, 'weight') and layer.weight is not None:
                self.original_devices[layer_name] = layer.weight.device
            
            # Pre-hook: 레이어와 입력을 같은 device로 이동
            def make_pre_hook(idx, name, dev_type, layer_ref):
                def pre_hook(module, input):
                    # 시간 측정 시작
                    torch.cuda.synchronize() if torch.cuda.is_available() else None
                    start_time = time.time()
                    module._start_time = start_time
                    
                    # Target device 결정
                    target_device = 'cpu' if dev_type == 0 else 'cuda'
                    
                    # 레이어를 target device로 이동
                    module.to(target_device)
                    
                    # 입력을 같은 device로 이동
                    if isinstance(input, tuple):
                        return tuple(inp.to(target_device) if torch.is_tensor(inp) else inp for inp in input)
                    else:
                        return input.to(target_device) if torch.is_tensor(input) else input
                        
                return pre_hook
            
            # Post-hook: 실행시간 측정 및 다음 레이어 준비
            def make_post_hook(idx, name, dev_type):
                def post_hook(module, input, output):
                    # 시간 측정 종료
                    torch.cuda.synchronize() if torch.cuda.is_available() else None
                    end_time = time.time()
                    execution_time = (end_time - module._start_time) * 1000  # ms로 변환
                    
                    # 실행시간 저장
                    device_name = "CPU" if dev_type == 0 else "GPU"
                    self.execution_times[f"Layer_{idx:02d}_{name}"] = {
                        'time_ms': execution_time,
                        'device': device_name
                    }
                    
                    return output
                    
                return post_hook
            
            # Hook 등록
            pre_handle = layer.register_forward_pre_hook(make_pre_hook(i, layer_name, device_type, layer))
            post_handle = layer.register_forward_hook(make_post_hook(i, layer_name, device_type))
            
            self.hooks.extend([pre_handle, post_handle])
    
    def forward(self, x):
        """모델 실행"""
        self.execution_times.clear()
        
        # 첫 번째 레이어의 device에 맞게 입력 이동
        first_device = 'cpu' if self.device_config[0] == 0 else 'cuda'
        x = x.to(first_device)
        
        self.model.eval()
        with torch.no_grad():
            output = self.model(x)
        
        return output
    
    def print_execution_report(self):
        """실행시간 리포트 출력"""
        print("\n" + "="*80)
        print("LAYER EXECUTION REPORT")
        print("="*80)
        print(f"{'Layer':<30} {'Device':<10} {'Time (ms)':<15}")
        print("-"*80)
        
        total_time = 0
        cpu_time = 0
        gpu_time = 0
        
        for layer_name, info in self.execution_times.items():
            time_ms = info['time_ms']
            device = info['device']
            
            print(f"{layer_name:<30} {device:<10} {time_ms:<15.4f}")
            
            total_time += time_ms
            if device == "CPU":
                cpu_time += time_ms
            else:
                gpu_time += time_ms
        
        print("-"*80)
        print(f"{'TOTAL':<30} {'MIXED':<10} {total_time:<15.4f}")
        print(f"{'CPU TOTAL':<30} {'CPU':<10} {cpu_time:<15.4f}")
        print(f"{'GPU TOTAL':<30} {'GPU':<10} {gpu_time:<15.4f}")
        print("="*80)
    
    def restore_model(self):
        """모델을 원래 device로 복원"""
        print("Restoring model to original state...")
        for layer_name, original_device in self.original_devices.items():
            for name, layer in self.leaf_layers:
                if name == layer_name:
                    layer.to(original_device)
                    break
        print("Model restored to original devices")
    
    def remove_hooks(self):
        """모든 hook 제거 및 모델 복원"""
        for hook in self.hooks:
            hook.remove()
        self.hooks.clear()
        self.restore_model()
        print("All hooks removed and model restored!")

print("✅ SafeLayerDeviceController class created!")
print("🔧 This version moves both layers AND data to ensure device consistency")

🔧 Creating IMPROVED LayerDeviceController (Safe Version)
✅ SafeLayerDeviceController class created!
🔧 This version moves both layers AND data to ensure device consistency


In [36]:
# 🧪 TESTING SafeLayerDeviceController - 에러 없는 안전한 버전
print("🧪 TESTING SafeLayerDeviceController")
print("="*60)

# 1. 첫 번째 테스트: 간단한 혼합 설정
print("Test 1: Simple mixed configuration")
print("-" * 40)

# 처음 3개는 GPU, 다음 3개는 CPU, 나머지는 GPU
test_config = [1] * 3 + [0] * 3 + [1] * (num_layers - 6)
print(f"Config: First 3 GPU, next 3 CPU, rest GPU")
print(f"Total layers: {len(test_config)}")
print(f"GPU layers: {sum(test_config)}, CPU layers: {len(test_config) - sum(test_config)}")

# 설정 시각화
config_str = ''.join(['G' if x == 1 else 'C' for x in test_config[:20]])
print(f"Pattern (first 20): {config_str}...")

try:
    # Controller 생성
    safe_controller = SafeLayerDeviceController(model, test_config)
    print("✅ SafeLayerDeviceController created successfully!")
    
    # 테스트 실행
    test_input = torch.randn(1, 3, 224, 224)
    print(f"Input shape: {test_input.shape}")
    
    start_time = time.time()
    output = safe_controller.forward(test_input)
    end_time = time.time()
    
    execution_time = (end_time - start_time) * 1000
    print(f"✅ SUCCESSFUL EXECUTION!")
    print(f"   Output shape: {output.shape}")
    print(f"   Output device: {output.device}")
    print(f"   Total execution time: {execution_time:.4f} ms")
    
    # CPU/GPU 시간 분석
    cpu_time = sum(info['time_ms'] for info in safe_controller.execution_times.values() if info['device'] == 'CPU')
    gpu_time = sum(info['time_ms'] for info in safe_controller.execution_times.values() if info['device'] == 'GPU')
    
    print(f"   CPU layers time: {cpu_time:.4f} ms")
    print(f"   GPU layers time: {gpu_time:.4f} ms")
    if gpu_time > 0:
        print(f"   CPU/GPU ratio: {cpu_time/gpu_time:.2f}")
    
    # 상세 리포트 (처음 10개 레이어만)
    print(f"\nFirst 10 layers timing:")
    count = 0
    for layer_name, info in safe_controller.execution_times.items():
        if count < 100:
            print(f"  {layer_name}: {info['time_ms']:.4f}ms ({info['device']})")
            count += 1
    
    # 정리
    safe_controller.remove_hooks()
    
except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()

print(f"\n{'='*60}")
print("🎉 SUCCESS! SafeLayerDeviceController working perfectly!")
print("✅ No more device mismatch errors")
print("✅ Both layers and data moved together")
print("✅ Timing measurements accurate")
print(f"{'='*60}")

# 사용법 요약
print("\n📋 FINAL USAGE INSTRUCTIONS:")
print("1. device_config = [1,1,0,0,1]  # 0=CPU, 1=GPU")
print("2. controller = SafeLayerDeviceController(model, device_config)")
print("3. output = controller.forward(input_tensor)")
print("4. controller.print_execution_report()  # Optional: detailed report")
print("5. controller.remove_hooks()  # IMPORTANT: Always clean up!")
print("\n🚀 Ready to use with any configuration you want!")

🧪 TESTING SafeLayerDeviceController
Test 1: Simple mixed configuration
----------------------------------------
Config: First 3 GPU, next 3 CPU, rest GPU
Total layers: 57
GPU layers: 54, CPU layers: 3
Pattern (first 20): GGGCCCGGGGGGGGGGGGGG...
✅ SafeLayerDeviceController created successfully!
Input shape: torch.Size([1, 3, 224, 224])
✅ SUCCESSFUL EXECUTION!
   Output shape: torch.Size([1, 1000])
   Output device: cuda:0
   Total execution time: 19.1464 ms
   CPU layers time: 3.0074 ms
   GPU layers time: 10.7567 ms
   CPU/GPU ratio: 0.28

First 10 layers timing:
  Layer_00_features.0: 0.5460ms (GPU)
  Layer_01_features.1: 0.1400ms (GPU)
  Layer_02_features.2: 0.1552ms (GPU)
  Layer_03_features.3.squeeze: 1.8477ms (CPU)
  Layer_04_features.3.squeeze_activation: 0.2923ms (CPU)
  Layer_05_features.3.expand1x1: 0.8674ms (CPU)
  Layer_06_features.3.expand1x1_activation: 0.9046ms (GPU)
  Layer_07_features.3.expand3x3: 0.7641ms (GPU)
  Layer_08_features.3.expand3x3_activation: 0.8247ms (GPU)

In [39]:
# GoogLeNet 모델 불러오기 (pretrained weights 사용 가능)
import torchvision.models as models

model_googlenet = models.googlenet(pretrained=True)

# 모델을 평가 모드로 설정
model_googlenet.eval()

GoogLeNet(
  (conv1): BasicConv2d(
    (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (conv2): BasicConv2d(
    (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv3): BasicConv2d(
    (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (inception3a): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track

In [41]:
# 🧪 TESTING SafeLayerDeviceController - 에러 없는 안전한 버전
print("🧪 TESTING SafeLayerDeviceController")
print("="*60)

# 1. 첫 번째 테스트: 간단한 혼합 설정
print("Test 1: Simple mixed configuration")
print("-" * 40)

# 처음 3개는 GPU, 다음 3개는 CPU, 나머지는 GPU
leaf_layers = get_clean_leaf_layers(model_googlenet)
num_layers = len(leaf_layers)
print(f"✅ Clean leaf layers created: {num_layers} layers")
test_config = [1] * 3 + [0] * 3 + [1] * (num_layers - 6)
print(f"Config: First 3 GPU, next 3 CPU, rest GPU")
print(f"Total layers: {len(test_config)}")
print(f"GPU layers: {sum(test_config)}, CPU layers: {len(test_config) - sum(test_config)}")

# 설정 시각화
config_str = ''.join(['G' if x == 1 else 'C' for x in test_config[:20]])
print(f"Pattern (first 20): {config_str}...")

try:
    # Controller 생성
    safe_controller = SafeLayerDeviceController(model_googlenet, test_config)
    print("✅ SafeLayerDeviceController created successfully!")
    
    # 테스트 실행
    test_input = torch.randn(1, 3, 224, 224)
    print(f"Input shape: {test_input.shape}")
    
    start_time = time.time()
    output = safe_controller.forward(test_input)
    end_time = time.time()
    
    execution_time = (end_time - start_time) * 1000
    print(f"✅ SUCCESSFUL EXECUTION!")
    print(f"   Output shape: {output.shape}")
    print(f"   Output device: {output.device}")
    print(f"   Total execution time: {execution_time:.4f} ms")
    
    # CPU/GPU 시간 분석
    cpu_time = sum(info['time_ms'] for info in safe_controller.execution_times.values() if info['device'] == 'CPU')
    gpu_time = sum(info['time_ms'] for info in safe_controller.execution_times.values() if info['device'] == 'GPU')
    
    print(f"   CPU layers time: {cpu_time:.4f} ms")
    print(f"   GPU layers time: {gpu_time:.4f} ms")
    if gpu_time > 0:
        print(f"   CPU/GPU ratio: {cpu_time/gpu_time:.2f}")
    
    # 상세 리포트 (처음 10개 레이어만)
    print(f"\nFirst 10 layers timing:")
    count = 0
    for layer_name, info in safe_controller.execution_times.items():
        if count < 100:
            print(f"  {layer_name}: {info['time_ms']:.4f}ms ({info['device']})")
            count += 1
    
    # 정리
    safe_controller.remove_hooks()
    
except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()

print(f"\n{'='*60}")
print("🎉 SUCCESS! SafeLayerDeviceController working perfectly!")
print("✅ No more device mismatch errors")
print("✅ Both layers and data moved together")
print("✅ Timing measurements accurate")
print(f"{'='*60}")

# 사용법 요약
print("\n📋 FINAL USAGE INSTRUCTIONS:")
print("1. device_config = [1,1,0,0,1]  # 0=CPU, 1=GPU")
print("2. controller = SafeLayerDeviceController(model, device_config)")
print("3. output = controller.forward(input_tensor)")
print("4. controller.print_execution_report()  # Optional: detailed report")
print("5. controller.remove_hooks()  # IMPORTANT: Always clean up!")
print("\n🚀 Ready to use with any configuration you want!")

🧪 TESTING SafeLayerDeviceController
Test 1: Simple mixed configuration
----------------------------------------
✅ Clean leaf layers created: 130 layers
Config: First 3 GPU, next 3 CPU, rest GPU
Total layers: 130
GPU layers: 127, CPU layers: 3
Pattern (first 20): GGGCCCGGGGGGGGGGGGGG...
✅ SafeLayerDeviceController created successfully!
Input shape: torch.Size([1, 3, 224, 224])
✅ SUCCESSFUL EXECUTION!
   Output shape: torch.Size([1, 1000])
   Output device: cuda:0
   Total execution time: 167.6002 ms
   CPU layers time: 5.9907 ms
   GPU layers time: 137.9893 ms
   CPU/GPU ratio: 0.04

First 10 layers timing:
  Layer_00_conv1.conv: 1.7931ms (GPU)
  Layer_01_conv1.bn: 5.4972ms (GPU)
  Layer_02_maxpool1: 0.1106ms (GPU)
  Layer_03_conv2.conv: 2.4514ms (CPU)
  Layer_04_conv2.bn: 0.2651ms (CPU)
  Layer_05_conv3.conv: 3.2742ms (CPU)
  Layer_06_conv3.bn: 1.9667ms (GPU)
  Layer_07_maxpool2: 0.2058ms (GPU)
  Layer_08_inception3a.branch1.conv: 7.0903ms (GPU)
  Layer_09_inception3a.branch1.bn: 0.600

In [None]:
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights
from torchvision.models import mnasnet1_0, Mnasnet1_0_Weights
from torchvision.models import googlenet, GoogLeNet_Weights
from torchvision.models import squeezenet1_0, SqueezeNet1_0_Weights

weights = SqueezeNet1_0_Weights.DEFAULT
model_squeezenet = squeezenet1_0(weights=weights)
model_squeezenet.eval()

weights = GoogLeNet_Weights.DEFAULT
model_googlenet = googlenet(weights=weights)
model_googlenet.eval()

weights = MobileNet_V2_Weights.DEFAULT
model_mobilenet = mobilenet_v2(weights=weights)
model_mobilenet.eval()


weights = Mnasnet1_0_Weights.DEFAULT
model_mnasnet = mnasnet1_0(weights=weights)
model_mnasnet.eval()

model = {"squeezenet": model_squeezenet,
         "googlenet": model_googlenet,
         "mobilenet": model_mobilenet,
         "mnasnet": model_mnasnet}

