# PyTorch with NoKode

In this notebook we will walk through training a character recognition model using the MNIST dataset with PyTorch.
We will then show you how to use Kubeflow Fairing to run the same training job on both Kubeflow and CMLE.

In [21]:
%%writefile requirements.txt

torch
torchvision


Overwriting requirements.txt


In [10]:
#you can skip this step if you have already installed the necessary dependencies
!pip install -U -r requirements.txt

Requirement already up-to-date: torch in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 1)) (1.6.0)
Requirement already up-to-date: torchvision in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 2)) (0.7.0)
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [11]:
#%%writefile train_model.py
import argparse
import os
import subprocess
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets
from torchvision import transforms
# For mac users you may get hit with this bug https://github.com/pytorch/pytorch/issues/20030
# temporary solution is "brew install libomp"

## PyTorch Model Definition

Set up a convolutional neural network using PyTorch.

In [12]:
#%%writefile train_model.py -a
class Net(nn.Module):
    """Small convolutional classifier that emits log-probabilities for 10 classes.

    Layout: two 5x5 convolutions (1 -> 10 -> 20 channels), each followed by
    2x2 max-pooling and ReLU, then a 320 -> 50 -> 10 fully connected head.
    The fixed 320-wide flatten corresponds to 20 channels of 4x4 features,
    which is what a 1x28x28 input (e.g. MNIST) yields: 28 -> 24 -> 12 -> 8 -> 4.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor.
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        # Classifier head.
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for ``x``."""
        # First conv stage: convolve, pool, activate.
        stage1 = F.max_pool2d(self.conv1(x), 2)
        stage1 = F.relu(stage1)

        # Second conv stage, with channel-wise dropout applied before pooling.
        stage2 = self.conv2_drop(self.conv2(stage1))
        stage2 = F.relu(F.max_pool2d(stage2, 2))

        # Flatten to (N, 320) for the fully connected layers.
        flat = stage2.view(-1, 320)
        hidden = F.relu(self.fc1(flat))
        # Functional dropout must be told explicitly whether we are training.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

## PyTorch Training and Test Functions
A simple training function that batches the data set. 

In [13]:
#%%writefile train_model.py -a
def train(model, device, train_loader, optimizer, epoch, log_interval):
    """Run one pass over ``train_loader``, updating ``model`` with ``optimizer``.

    Args:
        model: module whose output is fed to ``F.nll_loss`` (i.e. log-probabilities).
        device: device each batch is moved to before the forward pass.
        train_loader: iterable of ``(data, target)`` batches; must support
            ``len()`` and expose ``.dataset`` for the progress message.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress message.
        log_interval: print progress every ``log_interval`` batches
            (batch 0 is never logged).
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        target = target.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()

        # Skip logging unless we are on a non-zero multiple of log_interval.
        if batch_idx % log_interval != 0 or batch_idx <= 0:
            continue

        samples_seen = batch_idx * len(data)
        percent_done = 100. * batch_idx / len(train_loader)
        print('Train Epoch: {}\t[{}/{}\t({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, samples_seen, len(train_loader.dataset),
            percent_done, loss.item()))

In [14]:
#%%writefile train_model.py -a
def test(model, device, test_loader, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(
              output, target, size_average=False).item()  # sum up batch loss
            pred = output.max(
              1, keepdim=True)[1]  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
          test_loss, correct, len(test_loader.dataset),
          100. * correct / len(test_loader.dataset)))

In [15]:
#%%writefile train_model.py -a
def nkTrain(batch_size=64, epochs=1, log_interval=100, lr=0.01, model_dir=None,
            momentum=0.5, no_cuda=False, seed=1, test_batch_size=1000):
    """Train and evaluate ``Net`` on MNIST, optionally uploading the weights.

    Args:
        batch_size: training batch size.
        epochs: number of train/evaluate rounds.
        log_interval: batches between progress messages during training.
        lr: SGD learning rate.
        model_dir: if truthy, the trained ``state_dict`` is saved locally and
            copied there with ``gsutil cp`` (presumably a ``gs://`` location;
            confirm against the deployment).
        momentum: SGD momentum.
        no_cuda: force CPU even when CUDA is available.
        seed: value passed to ``torch.manual_seed``.
        test_batch_size: evaluation batch size.
    """
    use_cuda = not no_cuda and torch.cuda.is_available()
    torch.manual_seed(seed)
    device = torch.device('cuda' if use_cuda else 'cpu')
    print("Using {} for training.".format(device))

    # Extra DataLoader options are only supplied when running on the GPU.
    if use_cuda:
        loader_options = {'num_workers': 1, 'pin_memory': True}
    else:
        loader_options = {}

    def make_transform():
        # Convert to tensor, then normalize with the given mean and std.
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
        ])

    # The training split is created first and is the one that triggers the download.
    train_set = datasets.MNIST(
        'data', train=True, download=True, transform=make_transform())
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=True, **loader_options)

    test_set = datasets.MNIST(
        'data', train=False, transform=make_transform())
    test_loader = torch.utils.data.DataLoader(
        test_set, batch_size=test_batch_size, shuffle=True, **loader_options)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

    for epoch in range(1, epochs + 1):
        epoch_began = time.time()
        train(model, device, train_loader, optimizer, epoch, log_interval)
        print("Time taken for epoch #{}: {:.2f}s".format(epoch, time.time()-epoch_began))
        test(model, device, test_loader, epoch)

    # Nothing more to do unless the caller asked for the model to be exported.
    if not model_dir:
        return

    model_file_name = 'torch.model'
    local_copy = os.path.join('/tmp', model_file_name)
    torch.save(model.state_dict(), local_copy)
    destination = os.path.join(model_dir, model_file_name)
    subprocess.check_call(['gsutil', 'cp', local_copy, destination])

## Training locally

### Check Local Resources

In [6]:
import multiprocessing
import os
import sys

def local_resources():
    """Print the local CPU count and total physical memory (in GiB)."""
    cpu_total = multiprocessing.cpu_count()
    print("CPU count: {}".format(cpu_total))

    # Physical memory in bytes = page size * number of physical pages.
    bytes_per_page = os.sysconf('SC_PAGE_SIZE')
    physical_pages = os.sysconf('SC_PHYS_PAGES')
    memory_gib = bytes_per_page * physical_pages/(1024.**3)
    print("Memory: {}".format(memory_gib))

local_resources()

CPU count: 2
Memory: 7.5006561279296875


In [7]:
nkTrain()

Using cpu for training.
Time taken for epoch #1: 19.63s





Test set: Average loss: 0.2063, Accuracy: 9387/10000 (94%)



## Build Docker Images and train using the container.

In this block we set some Docker config. Fairing will use this information to package up the `nkTrain` function into a Docker image.

In [22]:
!nkode create:image -d "data/MNIST"

hello auto from create image
sh /usr/lib/node_modules/nkode/scripts/remoteTrain/remote_train.sh  auto data/MNIST tensorflow/tensorflow:1.15.0-py3  nkTrain
auto data/MNIST tensorflow/tensorflow:1.15.0-py3 nkTrain
[I 201027 07:04:47 utils:320] IMDS ENDPOINT: http://169.254.169.254/
[W 201027 07:04:47 function:49] The FunctionPreProcessor is optimized for using in a notebook or IPython environment. For it to work, the python version should be same for both local python and the python in the docker. Please look at alternatives like BasePreprocessor or FullNotebookPreprocessor.
[W 201027 07:04:47 tasks:62] Using builder: <class 'kubeflow.fairing.builders.cluster.cluster.ClusterBuilder'>
[I 201027 07:04:47 tasks:66] Building the docker image.
[I 201027 07:04:47 cluster:46] Building image using cluster builder.
[W 201027 07:04:47 base:94] /usr/local/lib/python3.6/dist-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[I 201027 07:04:47 base:107] Creating doc

# RUN on Additional resources

In [None]:
import random, string
import os
import subprocess
import importlib
from kubeflow import fairing
from kubeflow.fairing import TrainJob
from kubeflow.fairing.backends import KubeflowAWSBackend
from kubeflow.fairing.kubernetes.utils import get_resource_mutator


# `sys` is needed for PY_VERSION below; import it here so this cell does not
# depend on an earlier cell having been run.
import sys

# --- Job configuration -------------------------------------------------------
FAIRING_BACKEND = 'KubeflowAWSBackend'
AWS_ACCOUNT_ID = fairing.cloud.aws.guess_account_id()
AWS_REGION = 'us-west-2'
DOCKER_REGISTRY = '{}.dkr.ecr.{}.amazonaws.com'.format(AWS_ACCOUNT_ID, AWS_REGION)
# Local interpreter version as "major.minor.micro"; only referenced by the
# commented-out BASE_IMAGE line below.
PY_VERSION = ".".join([str(x) for x in sys.version_info[0:3]])
#BASE_IMAGE = '{}/python:{}'.format(DOCKER_REGISTRY, PY_VERSION)
# TODO: bug to fix. use tensorflow image temporarily
#BASE_IMAGE = 'tensorflow/tensorflow:1.15.0-py3'
BASE_DOCKER_IMAGE = 'python:3.6.9'
DATASET = 'data/MNIST'
REQUIREMENTS = 'requirements.txt'
S3_BUCKET = 'pjz16s-eks-ml-data'


# --- Build context -----------------------------------------------------------
# Always bind BuildContext so that choosing a different FAIRING_BACKEND does not
# fail with a NameError further down. Only the AWS backend gets an S3 context.
BuildContext = None
if FAIRING_BACKEND == 'KubeflowAWSBackend':
    from kubeflow.fairing.builders.cluster.s3_context import S3ContextSource
    BuildContext = S3ContextSource(
        aws_account=AWS_ACCOUNT_ID, region=AWS_REGION,
        bucket_name=S3_BUCKET
    )

# Resolve the backend class by name so FAIRING_BACKEND is the single switch.
BackendClass = getattr(importlib.import_module('kubeflow.fairing.backends'), FAIRING_BACKEND)

# Pass a build context only when one was configured; otherwise the backend is
# constructed with no arguments (NOTE(review): presumably each backend supplies
# its own default build context -- verify before using a non-AWS backend).
backend_kwargs = {} if BuildContext is None else {'build_context_source': BuildContext}

# --- Submit the training job -------------------------------------------------
train_job = TrainJob(nkTrain, input_files=[DATASET, REQUIREMENTS],
                     base_docker_image=BASE_DOCKER_IMAGE,
                     docker_registry=DOCKER_REGISTRY,
                     backend=BackendClass(**backend_kwargs),
                     pod_spec_mutators=[get_resource_mutator(cpu=1, memory=1)])

train_job.submit()

[W 201027 07:20:32 tasks:62] Using builder: <class 'kubeflow.fairing.builders.cluster.cluster.ClusterBuilder'>
[I 201027 07:20:32 tasks:66] Building the docker image.
[I 201027 07:20:32 cluster:46] Building image using cluster builder.
[W 201027 07:20:32 base:94] /usr/local/lib/python3.6/dist-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[I 201027 07:20:32 base:107] Creating docker context: /tmp/fairing_context_if2sgrso
[W 201027 07:20:32 base:94] /usr/local/lib/python3.6/dist-packages/kubeflow/fairing/__init__.py already exists in Fairing context, skipping...
[W 201027 07:20:32 aws:70] Not able to find aws credentials secret: aws-secret
[W 201027 07:20:32 manager:298] Waiting for fairing-builder-ttghx-vzj9g to start...
[W 201027 07:20:32 manager:298] Waiting for fairing-builder-ttghx-vzj9g to start...
[W 201027 07:24:20 manager:298] Waiting for fairing-builder-ttghx-vzj9g to start...
[W 201027 07:24:20 manager:298] Waiting for fairing-builder-ttg