# DDP_KBIT Jupyter Notebook Interface

This notebook provides a simple interface to run the DDP_KBIT distributed deep learning system without using command line arguments. It wraps the existing `main.py` functionality for easy experimentation.

## Setup and Imports

## 세션 초기화 (매번 실행 필요)

아래 셀을 매 세션마다 가장 먼저 실행하여 로컬 모듈에 연결하세요.

In [1]:
import os
import sys
import json
import logging
from typing import Dict, Any, Optional

# DDP_KBIT 모듈 경로 설정 - 현재 디렉토리 또는 상위 디렉토리에서 찾기
current_dir = os.getcwd()
possible_paths = [
    current_dir,  # 현재 디렉토리
    os.path.dirname(current_dir),  # 상위 디렉토리
    r"/mnt/data/DDP_KBIT",  # Jupyter 환경
    r"D:\Nextcloud3\kbit\DDP_KBIT",  # Windows 로컬 환경
]

ddp_kbit_path = None
for path in possible_paths:
    if os.path.exists(os.path.join(path, "main.py")):
        ddp_kbit_path = path
        break

if ddp_kbit_path:
    if ddp_kbit_path not in sys.path:
        sys.path.insert(0, ddp_kbit_path)
    print(f"✓ Found DDP_KBIT at: {ddp_kbit_path}")
else:
    print("❌ Could not find DDP_KBIT directory")

# Import the main functions from main.py
try:
    # 패키지로 임포트 시도
    try:
        import DDP_KBIT
        from DDP_KBIT.main import (
            setup_logging, 
            load_external_config,
            run_training_mode,
            run_experiment_mode, 
            create_sample_config
        )
        print("✓ Successfully imported DDP_KBIT as package")
    except ImportError:
        # 직접 임포트 시도
        from main import (
            setup_logging, 
            load_external_config,
            run_training_mode,
            run_experiment_mode, 
            create_sample_config
        )
        print("✓ Successfully imported DDP_KBIT modules directly")
        
except ImportError as e:
    print(f"❌ Error importing DDP_KBIT modules: {e}")
    print(f"Current working directory: {os.getcwd()}")
    print("Available Python files:")
    try:
        import glob
        py_files = glob.glob("*.py")
        if py_files:
            print(f"  Python files found: {py_files}")
        else:
            print("  No Python files found in current directory")
    except:
        pass
    
    # 대안: 절대 경로로 main.py 실행
    if ddp_kbit_path and os.path.exists(os.path.join(ddp_kbit_path, "main.py")):
        print("\nTrying to execute main.py directly...")
        try:
            exec(open(os.path.join(ddp_kbit_path, "main.py")).read(), globals())
            print("✓ Successfully loaded main.py using exec method")
        except Exception as exec_error:
            print(f"❌ Exec method failed: {exec_error}")
    else:
        print("❌ main.py not found in any expected location")
        print("Please ensure you're running from the correct directory and all dependencies are installed.")

✓ Found DDP_KBIT at: /mnt/data/DDP_KBIT


  from torch.distributed.optim import ZeroRedundancyOptimizer


Error importing DDP_KBIT modules: cannot import name 'BenchmarkSuite' from 'experiments.benchmarks' (/mnt/data/DDP_KBIT/experiments/benchmarks.py)
Please ensure you're running from the correct directory and all dependencies are installed.
✓ Successfully imported DDP_KBIT modules directly




## Configuration Setup

In [2]:
# Setup logging
setup_logging("INFO")

# Create a mock args object to simulate command line arguments
class NotebookArgs:
    def __init__(self):
        self.config_path = "sample_config.json"
        self.distributed = False
        self.experiment_type = "single"
        self.iterations = 3
        self.log_level = "INFO"

# Initialize default arguments
args = NotebookArgs()

print("✓ Configuration setup complete")
print(f"Config path: {args.config_path}")
print(f"Distributed: {args.distributed}")
print(f"Iterations: {args.iterations}")

✓ Configuration setup complete
Config path: sample_config.json
Distributed: False
Iterations: 3


## Create Sample Configuration (Run this first)

In [3]:
# Create a sample configuration file
create_sample_config()
print("✓ Sample configuration created!")

# Display the configuration
if os.path.exists("sample_config.json"):
    with open("sample_config.json", 'r') as f:
        config = json.load(f)
    print("\nCurrent configuration:")
    print(json.dumps(config, indent=2))

❌ Create sample config not available - missing dependencies
✓ Sample configuration created!


## Training Mode

Run single node or distributed training.

In [4]:
# Single node training
print("🚀 Starting single node training...")
args.distributed = False

try:
    run_training_mode(args)
    print("✅ Training completed successfully!")
except Exception as e:
    print(f"❌ Training failed: {e}")

🚀 Starting single node training...
❌ Training mode not available - missing dependencies
✅ Training completed successfully!


In [5]:
# Distributed training (uncomment to run)
# print("🚀 Starting distributed training...")
# args.distributed = True

# try:
#     run_training_mode(args)
#     print("✅ Distributed training completed successfully!")
# except Exception as e:
#     print(f"❌ Distributed training failed: {e}")

## Experiment Mode

Run single experiments or multiple iterations with statistical analysis.

In [6]:
# Single experiment
print("🧪 Running single experiment...")
args.experiment_type = "single"

try:
    run_experiment_mode(args)
    print("✅ Single experiment completed successfully!")
except Exception as e:
    print(f"❌ Single experiment failed: {e}")

🧪 Running single experiment...
❌ Experiment mode not available - missing dependencies
✅ Single experiment completed successfully!


In [7]:
# Multiple experiments with statistical analysis
print("🧪 Running multiple experiments...")
args.experiment_type = "multiple"
args.iterations = 5  # You can change this number

try:
    run_experiment_mode(args)
    print(f"✅ {args.iterations} experiments completed successfully!")
except Exception as e:
    print(f"❌ Multiple experiments failed: {e}")

🧪 Running multiple experiments...
❌ Experiment mode not available - missing dependencies
✅ 5 experiments completed successfully!


## Custom Configuration

Modify configuration parameters for your specific needs.

In [8]:
# Customize configuration
custom_config = {
    "spark_config": {
        "master": "local[*]",
        "app_name": "DDP_KBIT_Custom",
        "executor_instances": 4,
        "executor_cores": 2,
        "executor_memory": "8g"
    },
    "training_config": {
        "epochs": 10,
        "batch_size": 128,
        "learning_rate": 0.0001
    },
    "data_config": {
        "kafka_servers": ["localhost:9092"],
        "topic": "custom_topic",
        "batch_size": 64
    }
}

# Save custom configuration
custom_config_path = "custom_config.json"
with open(custom_config_path, "w") as f:
    json.dump(custom_config, f, indent=2)

# Update args to use custom config
args.config_path = custom_config_path

print(f"✓ Custom configuration saved to: {custom_config_path}")
print("\nCustom configuration:")
print(json.dumps(custom_config, indent=2))

✓ Custom configuration saved to: custom_config.json

Custom configuration:
{
  "spark_config": {
    "master": "local[*]",
    "app_name": "DDP_KBIT_Custom",
    "executor_instances": 4,
    "executor_cores": 2,
    "executor_memory": "8g"
  },
  "training_config": {
    "epochs": 10,
    "batch_size": 128,
    "learning_rate": 0.0001
  },
  "data_config": {
    "kafka_servers": [
      "localhost:9092"
    ],
    "topic": "custom_topic",
    "batch_size": 64
  }
}


## Utility Functions

Helper functions for notebook usage.

In [9]:
def quick_train(distributed=False, config_path="sample_config.json"):
    """Quick training function for easy execution."""
    args.distributed = distributed
    args.config_path = config_path
    
    print(f"🚀 Quick training - Distributed: {distributed}")
    try:
        run_training_mode(args)
        print("✅ Training completed!")
    except Exception as e:
        print(f"❌ Training failed: {e}")

def quick_experiment(experiment_type="single", iterations=3):
    """Quick experiment function for easy execution."""
    args.experiment_type = experiment_type
    args.iterations = iterations
    
    print(f"🧪 Quick experiment - Type: {experiment_type}, Iterations: {iterations}")
    try:
        run_experiment_mode(args)
        print("✅ Experiment completed!")
    except Exception as e:
        print(f"❌ Experiment failed: {e}")

print("✓ Utility functions loaded!")
print("\nUse these functions for quick execution:")
print("- quick_train(distributed=False)")
print("- quick_experiment(experiment_type='multiple', iterations=5)")

✓ Utility functions loaded!

Use these functions for quick execution:
- quick_train(distributed=False)
- quick_experiment(experiment_type='multiple', iterations=5)


## Quick Execution Examples

Use the utility functions for quick execution.

In [10]:
# Example: Quick single training
# quick_train()

# Example: Quick multiple experiments
# quick_experiment(experiment_type="multiple", iterations=3)

print("💡 Uncomment the lines above to run quick examples!")

💡 Uncomment the lines above to run quick examples!


In [11]:
import os
print("Current working directory:", os.getcwd())
print("Files in current directory:")
for f in os.listdir('.'):
    print(f"  {f}")

Current working directory: /home/jovyan
Files in current directory:
  .bash_logout
  .profile
  .bashrc
  custom_config.json
  .ipython
  .local
  .npm
  .jupyter
  .cache
  .conda
  .config
  .wget-hsts
  work
