# Convolutional neural networks (CNN) for CIFAR-10/100 using PyTorch

Markus Enzweiler, markus.enzweiler@hs-esslingen.de

This is a demo used in a Computer Vision & Machine Learning lecture. Feel free to use and contribute.

We build and train a CNN for CIFAR-10 / CIFAR-100 image classification, see https://www.cs.toronto.edu/~kriz/cifar.html. We use the Python code from https://github.com/menzHSE/torch-cifar-10-cnn.git and execute it via this notebook. 

## Setup

Adapt `packagePath` to point to the directory containing this notebeook.

In [1]:
# Imports
import sys
import os
import threading
import subprocess
import fcntl
import errno

In [2]:
# Package Path
package_path = "./" # local
print(f"Package path: {package_path}")


def check_for_colab():
  try:
      import google.colab
      return True
  except ImportError:
      return False

# Running on Colab?
on_colab = check_for_colab()

Package path: ./


In [3]:
# Clone git repository

# Absolute path of the repository directory
repo_dir = os.path.join(package_path, "torch-cifar-10-cnn")
repo_url = "https://github.com/menzHSE/torch-cifar-10-cnn.git"

# Store the original working directory
original_cwd = os.getcwd()

# Check if the directory already exists using the absolute path
if os.path.exists(os.path.join(original_cwd, repo_dir)):
    print("Repository exists. Resetting to HEAD...")
    # Navigate into the repository directory
    os.chdir(repo_dir)
    # Fetch the latest changes from the remote
    subprocess.run(["git", "fetch", "origin"])
    # Reset the local branch to the latest commit from the remote
    subprocess.run(["git", "reset", "--hard", "origin/HEAD"])
    # Change back to the original working directory
    os.chdir(original_cwd)
else:
    print("Cloning repository...")
    # Clone the repository if it doesn't exist
    subprocess.run(["git", "clone", repo_url, repo_dir])


Repository exists. Resetting to HEAD...
HEAD is now at ba4f381 Update README.md


From https://github.com/menzHSE/torch-cifar-10-cnn
   e66ffc0..ba4f381  main       -> origin/main


In [4]:
# Install requirements in the current Jupyter kernel
req_file = os.path.join(repo_dir, "requirements.txt")
if os.path.exists(req_file):
    !{sys.executable} -m pip install -r {req_file}
else:
    print(f"Requirements file not found: {req_file}")



## Functions to interface with the code in the repository

In [5]:
def execute(script_name, params=None):
    if on_colab:
        executeCaptureColab(script_name, params)
    else:
        executeCapture(script_name, params)

def executeCapture(script_name, params=None):
    script_path = os.path.join(repo_dir, script_name)
    if os.path.exists(script_path):
        print(f"Executing script: {script_path}")
        # Create the command list starting with Python and the script path
        command = ["python", script_path]
        # Add additional arguments from the params dictionary
        if params:
            for key, value in params.items():
                command.append(f"--{key}")
                command.append(str(value))
        print(command)
        subprocess.run(command)
    else:
        print(f"Script not found: {script_path}")

# This is very hacky ... but it's hard to capture the output of a subprocess in Colab
def executeCaptureColab(script_name, params=None):
    script_path = os.path.join(repo_dir, script_name)
    if os.path.exists(script_path):
        print(f"Executing script: {script_path}")
        # Create the command list starting with Python and the script path
        command = ["python", script_path]
        # Add additional arguments from the params dictionary
        if params:
            for key, value in params.items():
                if value is not None:  # Check if the value is None
                    command.append(f"--{key}")
                    command.append(str(value))
                else:
                    command.append(f"--{key}")
        print("Command:", " ".join(command))

        # Start the subprocess
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

        # Set the stdout to non-blocking
        fd = process.stdout.fileno()
        fl = fcntl.fcntl(fd, fcntl.F_GETFL)
        fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

        # Function to continuously output lines from a stream
        def stream_output(stream):
            while True:
                try:
                    line = stream.readline()
                    if line:
                        print(line, end='')
                    elif process.poll() is not None:
                        break
                except IOError as e:
                    # Ignore the error if no data is available yet
                    if e.errno != errno.EAGAIN and e.errno != errno.EWOULDBLOCK:
                        raise

        # Use a thread to capture the output stream
        output_thread = threading.Thread(target=stream_output, args=(process.stdout,))
        output_thread.start()

        # Wait for the subprocess to complete and the output thread to end
        process.wait()
        output_thread.join()

    else:
        print(f"Script not found: {script_path}")

In [6]:
# Let's see what we can do with train.py
execute("train.py", {"help": None})

Executing script: ./torch-cifar-10-cnn/train.py
['python', './torch-cifar-10-cnn/train.py', '--help', 'None']
usage: Train a simple CNN on CIFAR-10 / CIFAR_100 with PyTorch.
       [-h] [--cpu] [--seed SEED] [--batchsize BATCHSIZE] [--epochs EPOCHS]
       [--lr LR] [--dataset {CIFAR-10,CIFAR-100}]
       [--finetune {resnet18,resnet34,resnet50}]

optional arguments:
  -h, --help            show this help message and exit
  --cpu                 Use CPU instead of GPU (cuda/mps) acceleration
  --seed SEED           Random seed
  --batchsize BATCHSIZE
                        Batch size for training
  --epochs EPOCHS       Number of training epochs
  --lr LR               Learning rate
  --dataset {CIFAR-10,CIFAR-100}
                        Select the dataset to use (CIFAR-10 or CIFAR-100)
  --finetune {resnet18,resnet34,resnet50}
                        Select the model for fine-tuning (resnet18, resnet34,
                        resnet50), omit for training from scratch


# Train and test CNN on CIFAR-10



## Parameters

In [7]:
# parameters
batchsize = 32
seed      = 42
lr        = 3e-4
epochs    = 30
dataset   = "CIFAR-10"

## Train

In [8]:
params = {
    "dataset": dataset,           # dataset name
    "batchsize": batchsize,       # batch size
    "seed": seed,                 # random seed
    "lr": lr,                     # learning rate
    "epochs": epochs              # number of epochs     
}

# Execute 'train.py' with parameters
execute("train.py", params=params)

Executing script: ./torch-cifar-10-cnn/train.py
['python', './torch-cifar-10-cnn/train.py', '--dataset', 'CIFAR-10', '--batchsize', '32', '--seed', '42', '--lr', '0.0003', '--epochs', '30']
Using device: mps
Options: 
  Device: GPU
  Seed: 42
  Batch size: 32
  Number of epochs: 30
  Learning rate: 0.0003
  Dataset: CIFAR-10
Files already downloaded and verified
Files already downloaded and verified


  action_fn=lambda data: sys.getsizeof(data.storage()),


Layer (type (var_name))                  Output Shape              Param #
CNN (CNN)                                [1, 10]                   --
├─Conv2d (conv1)                         [1, 32, 32, 32]           896
├─BatchNorm2d (bn1)                      [1, 32, 32, 32]           64
├─Conv2d (skip2)                         [1, 32, 32, 32]           1,056
├─Conv2d (conv2)                         [1, 32, 32, 32]           9,248
├─BatchNorm2d (bn2)                      [1, 32, 32, 32]           64
├─MaxPool2d (pool)                       [1, 32, 16, 16]           --
├─Dropout (drop)                         [1, 32, 16, 16]           --
├─Conv2d (conv3)                         [1, 64, 16, 16]           18,496
├─BatchNorm2d (bn3)                      [1, 64, 16, 16]           128
├─Conv2d (skip4)                         [1, 64, 16, 16]           4,160
├─Conv2d (conv4)                         [1, 64, 16, 16]           36,928
├─BatchNorm2d (bn4)                      [1, 64, 16, 16]          

[Epoch   0] :  ........

KeyboardInterrupt: 

## Test 

In [None]:
# parameters
params = {
    "model": f"models/model_{dataset}_{epochs-1:03d}.pth" # model name    
}

# Execute 'train.py' with parameters
execute("test.py", params=params)

# Train and test CNN on CIFAR-100

## Parameters

In [None]:
# parameters
batchsize = 32
seed      = 42
lr        = 3e-4
epochs    = 30
dataset   = "CIFAR-100"

## Train

In [None]:
params = {
    "dataset": dataset,           # dataset name
    "batchsize": batchsize,       # batch size
    "seed": seed,                 # random seed
    "lr": lr,                     # learning rate
    "epochs": epochs              # number of epochs
}

# Execute 'train.py' with parameters
execute("train.py", params=params)

## Test

In [None]:
# parameters
params = {
    "dataset": dataset,           # dataset name
    "model": f"models/model_{dataset}_{epochs-1:03d}.pth" # model name    
}

# Execute 'train.py' with parameters
execute("test.py", params=params)

# Finetune a ResNet on CIFAR-100

## Parameters

In [None]:
# parameters
batchsize = 32
seed      = 42
lr        = 3e-4
epochs    = 5
dataset   = "CIFAR-100"
finetune  = "resnet18"

## Train

In [10]:
params = {
    "dataset": dataset,           # dataset name
    "batchsize": batchsize,       # batch size
    "seed": seed,                 # random seed
    "lr": lr,                     # learning rate
    "epochs": epochs,             # number of epochs
    "finetune": finetune          # finetune model
}

# Execute 'train.py' with parameters
execute("train.py", params=params)

Executing script: ./torch-cifar-10-cnn/train.py
['python', './torch-cifar-10-cnn/train.py', '--dataset', 'CIFAR-100', '--batchsize', '32', '--seed', '42', '--lr', '0.0003', '--epochs', '5', '--finetune', 'resnet18']
Using device: mps
Options: 
  Device: GPU
  Seed: 42
  Batch size: 32
  Number of epochs: 5
  Learning rate: 0.0003
  Dataset: CIFAR-100
  Fine-tuning model: resnet18
Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [02:57<00:00, 950974.92it/s] 


Extracting ./data/cifar-100-python.tar.gz to ./data
Files already downloaded and verified


  action_fn=lambda data: sys.getsizeof(data.storage()),


Layer (type (var_name))                       Output Shape              Param #
CNNResnet (CNNResnet)                         [1, 100]                  --
├─ResNet (backbone)                           [1, 100]                  --
│    └─Conv2d (conv1)                         [1, 64, 112, 112]         9,408
│    └─BatchNorm2d (bn1)                      [1, 64, 112, 112]         128
│    └─ReLU (relu)                            [1, 64, 112, 112]         --
│    └─MaxPool2d (maxpool)                    [1, 64, 56, 56]           --
│    └─Sequential (layer1)                    [1, 64, 56, 56]           --
│    │    └─BasicBlock (0)                    [1, 64, 56, 56]           73,984
│    │    └─BasicBlock (1)                    [1, 64, 56, 56]           73,984
│    └─Sequential (layer2)                    [1, 128, 28, 28]          --
│    │    └─BasicBlock (0)                    [1, 128, 28, 28]          230,144
│    │    └─BasicBlock (1)                    [1, 128, 28, 28]          295,42

[Epoch   0] :  ................ done (1563 batches)
[Epoch   0] : | time: 694.138s | trainLoss:  1.676 | trainAccuracy:  0.550 | valLoss:  1.192 | valAccuracy:  0.653 | throughput:    285.911 img/s |
[Epoch   1] :  ................ done (1563 batches)
[Epoch   1] : | time: 684.555s | trainLoss:  0.933 | trainAccuracy:  0.725 | valLoss:  1.086 | valAccuracy:  0.688 | throughput:    289.701 img/s |
[Epoch   2] :  ................ done (1563 batches)
[Epoch   2] : | time: 679.002s | trainLoss:  0.636 | trainAccuracy:  0.806 | valLoss:  0.976 | valAccuracy:  0.721 | throughput:    293.090 img/s |
[Epoch   3] :  ................ done (1563 batches)
[Epoch   3] : | time: 683.615s | trainLoss:  0.414 | trainAccuracy:  0.872 | valLoss:  1.053 | valAccuracy:  0.711 | throughput:    291.201 img/s |
[Epoch   4] :  ................ done (1563 batches)
[Epoch   4] : | time: 678.860s | trainLoss:  0.298 | trainAccuracy:  0.907 | valLoss:  1.150 | valAccuracy:  0.712 | throughput:    293.938 img/s |


## Test

In [11]:
# parameters
params = {
    "dataset": dataset,           # dataset name
    "model": f"models/model_{finetune}_{dataset}_{epochs-1:03d}.pth", # model name  
    "finetune": finetune          # finetune model  
}

# Execute 'train.py' with parameters
execute("test.py", params=params)

Executing script: ./torch-cifar-10-cnn/test.py
['python', './torch-cifar-10-cnn/test.py', '--dataset', 'CIFAR-100', '--model', 'models/model_resnet18_CIFAR-100_004.pth', '--finetune', 'resnet18']
Using device: mps
Files already downloaded and verified
