In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import wandb

# 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
wandb.login(key="8008bd2f8a012ffc99c28c078c8aab3755934e4a")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/minwookkim/.netrc


True

In [14]:
# wandb 설정
wandb.init(project="pytorch-image-classification",
#     entity="kylehobson" # Team space가 있을 경우
          )

In [15]:
# wandb.config 설정
config = wandb.config
config.learning_rate = 0.001
config.epochs = 10
config.batch_size = 64
config.momentum = 0.9

In [16]:
# 데이터셋 로드 및 전처리
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# 전체 train 데이터셋 로드
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

# train 데이터셋을 train과 validation으로 나누기
train_size = int(0.8 * len(trainset))  # 80%를 train에 사용
val_size = len(trainset) - train_size  # 20%를 validation에 사용
trainset, valset = torch.utils.data.random_split(trainset, [train_size, val_size])

# train, validation 데이터 로더 생성
trainloader = torch.utils.data.DataLoader(trainset, batch_size=config.batch_size,
                                          shuffle=True, num_workers=2)
valloader = torch.utils.data.DataLoader(valset, batch_size=config.batch_size,
                                        shuffle=False, num_workers=2)

# test 데이터셋 로드
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=config.batch_size,
                                         shuffle=False, num_workers=2)

# 클래스 정의
classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:14<00:00, 11609566.38it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [17]:
# 모델 정의
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net().to(device)

In [18]:
# 손실 함수 및 옵티마이저 설정
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=config.learning_rate, momentum=config.momentum)

In [19]:
import wandb

def train(net, trainloader, valloader, criterion, optimizer, device, config):
    for epoch in range(config.epochs):  # config에서 에포크 수 가져오기
        running_loss = 0.0
        correct = 0
        total = 0

        # Training loop
        net.train()
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Accuracy 계산
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        train_accuracy = 100 * correct / total
        
        print(f'[Epoch: {epoch + 1}] train loss: {running_loss / len(trainloader):.3f}, train accuracy: {train_accuracy:.2f}%')
        
        wandb.log({"epoch": epoch + 1, "train_accuracy": train_accuracy})

        # Validation loop
        val_loss = 0.0
        correct = 0
        total = 0
        net.eval()
        with torch.no_grad():
            for data in valloader:
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                # Accuracy 계산
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_loss /= len(valloader)
        val_accuracy = 100 * correct / total
        wandb.log({"epoch": epoch + 1, "val_loss": val_loss, "val_accuracy": val_accuracy})
        print(f'[Epoch: {epoch + 1}] validation loss: {val_loss:.3f}, validation accuracy: {val_accuracy:.2f}%')
        net.train()

    print('Finished Training')

def test(net, testloader, device):
    net.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy of the network on the 10000 test images: {accuracy} %')
    wandb.log({"test_accuracy": accuracy})

    return accuracy


In [20]:
# 학습 및 테스트 함수 호출
train(net, trainloader, valloader, criterion, optimizer, device, config)
accuracy = test(net, testloader, device)

[Epoch: 1] train loss: 2.298, train accuracy: 12.85%
[Epoch: 1] validation loss: 2.288, validation accuracy: 16.85%
[Epoch: 2] train loss: 2.186, train accuracy: 20.75%
[Epoch: 2] validation loss: 2.055, validation accuracy: 24.66%
[Epoch: 3] train loss: 1.941, train accuracy: 29.45%
[Epoch: 3] validation loss: 1.887, validation accuracy: 31.74%
[Epoch: 4] train loss: 1.759, train accuracy: 35.70%
[Epoch: 4] validation loss: 1.697, validation accuracy: 38.35%
[Epoch: 5] train loss: 1.645, train accuracy: 39.55%
[Epoch: 5] validation loss: 1.619, validation accuracy: 39.91%
[Epoch: 6] train loss: 1.572, train accuracy: 42.32%
[Epoch: 6] validation loss: 1.536, validation accuracy: 44.00%
[Epoch: 7] train loss: 1.506, train accuracy: 44.89%
[Epoch: 7] validation loss: 1.497, validation accuracy: 45.43%
[Epoch: 8] train loss: 1.457, train accuracy: 46.63%
[Epoch: 8] validation loss: 1.446, validation accuracy: 48.00%
[Epoch: 9] train loss: 1.418, train accuracy: 48.54%
[Epoch: 9] validati

In [10]:
# 모델 저장
torch.save(net.state_dict(), "model.pth")
wandb.save("model.pth")

['/home/minwookkim/ai/08_30_wandb/wandb/run-20240830_142228-ibiqtdco/files/model.pth']

In [11]:
# wandb 세션 종료
wandb.finish()

# Sweep
 - 하이퍼파라미터 서칭 & 최적화
 - Sweep 설정을 YAML, pythond dict 등으로 설정 후 로드

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import wandb

# 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터셋 로드 및 전처리
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# 전체 train 데이터셋 로드
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

# train 데이터셋을 train과 validation으로 나누기
train_size = int(0.8 * len(trainset))  # 80%를 train에 사용
val_size = len(trainset) - train_size  # 20%를 validation에 사용
trainset, valset = torch.utils.data.random_split(trainset, [train_size, val_size])


# test 데이터셋 로드
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64,
                                         shuffle=False, num_workers=2)

# 클래스 정의
classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

# 모델 정의
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def train(net, trainloader, valloader, criterion, optimizer, device, config):
    for epoch in range(config.epochs):  # config에서 에포크 수 가져오기
        running_loss = 0.0
        correct = 0
        total = 0

        # Training loop
        net.train()
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Accuracy 계산
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        train_accuracy = 100 * correct / total
        
        print(f'[Epoch: {epoch + 1}] train loss: {running_loss / len(trainloader):.3f}, train accuracy: {train_accuracy:.2f}%')
        
        wandb.log({"epoch": epoch + 1, "train_accuracy": train_accuracy})

        # Validation loop
        val_loss = 0.0
        correct = 0
        total = 0
        net.eval()
        with torch.no_grad():
            for data in valloader:
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                # Accuracy 계산
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_loss /= len(valloader)
        val_accuracy = 100 * correct / total
        wandb.log({"epoch": epoch + 1, "val_loss": val_loss, "val_accuracy": val_accuracy})
        print(f'[Epoch: {epoch + 1}] validation loss: {val_loss:.3f}, validation accuracy: {val_accuracy:.2f}%')
        net.train()

    print('Finished Training')

def test(net, testloader, device):
    net.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy of the network on the 10000 test images: {accuracy} %')
    wandb.log({"test_accuracy": accuracy})

    return accuracy

### Sweep config

In [30]:
# Sweep 설정
sweep_config = {
    "method": "random", # random, grid, bayes
    "metric": {
        "name": "val_accuracy", # val_loss, 등 
        "goal": "maximize" # minimize, maximize
    },
    "parameters": {
        "learning_rate": {
            "values": [0.1, 0.01, 0.001]
        },
        "epochs": {
            "values": [5, 10]
        },
        "batch_size": {
            "values": [32, 64, 128]
        },
        "momentum": {
            "values": [0.9, 0.95, 0.99]
        }
    }
}


In [32]:
# train, validation 데이터 로더 생성
def get_data_loaders(batch_size):
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                              shuffle=True, num_workers=2)
    valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size,
                                            shuffle=False, num_workers=2)
    return trainloader, valloader

In [33]:
def sweep_search():
    wandb.init(project="cifar10-sweep")
    
    # Hyperparameters from wandb config
    config = wandb.config
    net = Net().to(device)
    
    # Data loaders
    trainloader, valloader = get_data_loaders(config.batch_size)
    
    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config.learning_rate, momentum=config.momentum)
    
    # Train and test
    train(net, trainloader, valloader, criterion, optimizer, device, config)
    accuracy = test(net, testloader, device)
    wandb.log({"test_accuracy": accuracy})

In [34]:
sweep_id = wandb.sweep(sweep_config, project="cifar10-sweep")
wandb.agent(sweep_id, sweep_search, count=5)

Create sweep with ID: bmbo5lqr
Sweep URL: https://wandb.ai/geuncheoloh/cifar10-sweep/sweeps/bmbo5lqr


[34m[1mwandb[0m: Agent Starting Run: s93pcc8j with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	momentum: 0.95


[Epoch: 1] train loss: 2.144, train accuracy: 18.58%
[Epoch: 1] validation loss: 2.129, validation accuracy: 18.65%
[Epoch: 2] train loss: 2.173, train accuracy: 16.92%
[Epoch: 2] validation loss: 2.175, validation accuracy: 18.59%
[Epoch: 3] train loss: 2.197, train accuracy: 15.81%
[Epoch: 3] validation loss: 2.312, validation accuracy: 9.82%
[Epoch: 4] train loss: 2.242, train accuracy: 13.20%
[Epoch: 4] validation loss: 2.116, validation accuracy: 19.58%
[Epoch: 5] train loss: 2.150, train accuracy: 17.41%
[Epoch: 5] validation loss: 2.154, validation accuracy: 17.63%
Finished Training
Accuracy of the network on the 10000 test images: 18.43 %


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▃▃▅▅▆▆██
test_accuracy,▁▁
train_accuracy,█▆▄▁▆
val_accuracy,▇▇▁█▇
val_loss,▁▃█▁▂

0,1
epoch,5.0
test_accuracy,18.43
train_accuracy,17.4125
val_accuracy,17.63
val_loss,2.15375


[34m[1mwandb[0m: Agent Starting Run: jv70eawy with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.99


[Epoch: 1] train loss: 2.274, train accuracy: 13.62%
[Epoch: 1] validation loss: 2.145, validation accuracy: 20.43%
[Epoch: 2] train loss: 1.998, train accuracy: 26.67%
[Epoch: 2] validation loss: 1.852, validation accuracy: 32.93%
[Epoch: 3] train loss: 1.691, train accuracy: 38.16%
[Epoch: 3] validation loss: 1.558, validation accuracy: 42.75%
[Epoch: 4] train loss: 1.476, train accuracy: 46.03%
[Epoch: 4] validation loss: 1.390, validation accuracy: 49.14%
[Epoch: 5] train loss: 1.357, train accuracy: 50.70%
[Epoch: 5] validation loss: 1.345, validation accuracy: 51.67%
Finished Training
Accuracy of the network on the 10000 test images: 52.49 %


VBox(children=(Label(value='0.018 MB of 0.018 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▃▃▅▅▆▆██
test_accuracy,▁▁
train_accuracy,▁▃▆▇█
val_accuracy,▁▄▆▇█
val_loss,█▅▃▁▁

0,1
epoch,5.0
test_accuracy,52.49
train_accuracy,50.705
val_accuracy,51.67
val_loss,1.34543


[34m[1mwandb[0m: Agent Starting Run: hm14725t with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.95


[Epoch: 1] train loss: 2.104, train accuracy: 21.93%
[Epoch: 1] validation loss: 1.821, validation accuracy: 32.62%
[Epoch: 2] train loss: 1.636, train accuracy: 40.18%
[Epoch: 2] validation loss: 1.519, validation accuracy: 44.23%
[Epoch: 3] train loss: 1.460, train accuracy: 47.21%
[Epoch: 3] validation loss: 1.407, validation accuracy: 49.80%
[Epoch: 4] train loss: 1.344, train accuracy: 51.51%
[Epoch: 4] validation loss: 1.346, validation accuracy: 52.03%
[Epoch: 5] train loss: 1.255, train accuracy: 55.08%
[Epoch: 5] validation loss: 1.292, validation accuracy: 53.50%
[Epoch: 6] train loss: 1.189, train accuracy: 57.84%
[Epoch: 6] validation loss: 1.223, validation accuracy: 56.55%
[Epoch: 7] train loss: 1.128, train accuracy: 59.99%
[Epoch: 7] validation loss: 1.156, validation accuracy: 59.10%
[Epoch: 8] train loss: 1.076, train accuracy: 61.90%
[Epoch: 8] validation loss: 1.160, validation accuracy: 59.60%
[Epoch: 9] train loss: 1.027, train accuracy: 63.66%
[Epoch: 9] validati

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
test_accuracy,▁▁
train_accuracy,▁▄▅▆▆▇▇▇██
val_accuracy,▁▄▅▆▆▇████
val_loss,█▅▄▃▂▂▁▁▁▁

0,1
epoch,10.0
test_accuracy,60.24
train_accuracy,65.205
val_accuracy,59.96
val_loss,1.14832


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0y6dokqk with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	momentum: 0.99


[Epoch: 1] train loss: 1.940, train accuracy: 28.04%
[Epoch: 1] validation loss: 1.571, validation accuracy: 41.44%
[Epoch: 2] train loss: 1.487, train accuracy: 45.56%
[Epoch: 2] validation loss: 1.387, validation accuracy: 49.71%
[Epoch: 3] train loss: 1.313, train accuracy: 52.87%
[Epoch: 3] validation loss: 1.288, validation accuracy: 54.01%
[Epoch: 4] train loss: 1.218, train accuracy: 56.55%
[Epoch: 4] validation loss: 1.286, validation accuracy: 54.90%
[Epoch: 5] train loss: 1.149, train accuracy: 59.13%
[Epoch: 5] validation loss: 1.214, validation accuracy: 58.01%
Finished Training
Accuracy of the network on the 10000 test images: 59.18 %


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▃▃▅▅▆▆██
test_accuracy,▁▁
train_accuracy,▁▅▇▇█
val_accuracy,▁▄▆▇█
val_loss,█▄▂▂▁

0,1
epoch,5.0
test_accuracy,59.18
train_accuracy,59.135
val_accuracy,58.01
val_loss,1.21365


[34m[1mwandb[0m: Agent Starting Run: plgxij3a with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	momentum: 0.99


[Epoch: 1] train loss: 2.371, train accuracy: 9.99%
[Epoch: 1] validation loss: 2.389, validation accuracy: 9.64%
[Epoch: 2] train loss: 2.369, train accuracy: 9.93%
[Epoch: 2] validation loss: 2.359, validation accuracy: 9.83%
[Epoch: 3] train loss: 2.365, train accuracy: 9.86%
[Epoch: 3] validation loss: 2.422, validation accuracy: 9.83%
[Epoch: 4] train loss: 2.359, train accuracy: 10.07%
[Epoch: 4] validation loss: 2.327, validation accuracy: 10.32%
[Epoch: 5] train loss: 2.387, train accuracy: 10.15%
[Epoch: 5] validation loss: 2.431, validation accuracy: 9.83%
Finished Training
Accuracy of the network on the 10000 test images: 10.0 %


VBox(children=(Label(value='0.018 MB of 0.018 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▃▃▅▅▆▆██
test_accuracy,▁▁
train_accuracy,▄▃▁▆█
val_accuracy,▁▃▃█▃
val_loss,▅▃▇▁█

0,1
epoch,5.0
test_accuracy,10.0
train_accuracy,10.15
val_accuracy,9.83
val_loss,2.43063
