In [1]:
# %load py_src/put_public_mnist_to_local_minio.py
# %load py_src/put_public_mnist_to_local_minio.py


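# The %%writefile line below is kept commented out so this cell can be run and edited in place; uncomment it to write the cell to the file.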
# %%writefile py_src/put_public_mnist_to_local_minio.py
def put_public_mnist_to_local_minio(args:dict) :
    """Download the public MNIST dataset and copy it into a MinIO bucket.

    The dataset is fetched with torchvision into a throw-away local directory,
    uploaded below the ``data/original`` prefix of the bucket, and the local
    copy is removed again, also when the download or the upload fails.

    Args:
        args: Step arguments. Only ``bucket_name`` is read; the bucket is
            created when it does not exist yet.

    Raises:
        ValueError: If ``bucket_name`` is missing or empty.
        KeyError: If ``MINIO_SERVICE_SERVICE_HOST`` or
            ``MINIO_SERVICE_SERVICE_PORT_HTTP`` is not set in the environment.
    """
    # Imports stay inside the function, as in the original cell (presumably so
    # the function can be shipped as a self-contained pipeline step).
    import glob
    import os
    import tempfile
    from shutil import rmtree

    from minio import Minio
    from torchvision import datasets

    bucket_name = args.get("bucket_name", None)
    if not bucket_name:
        # Fail early with a clear message instead of deep inside the MinIO client.
        raise ValueError("args['bucket_name'] must name the target MinIO bucket")

    def upload_local_directory_to_minio(minio_client, local_path, bucket_name, minio_path):
        """Recursively upload everything below ``local_path`` to ``minio_path``."""
        # Without recursive=True, '**' matches only the direct children, so
        # sub-directories are handled by the explicit recursion below.
        for local_file in glob.glob(local_path + '/**'):
            local_file = local_file.replace(os.sep, "/") # Replace \ with / on Windows
            entry_name = os.path.basename(local_file)
            if os.path.isfile(local_file):
                remote_path = os.path.join(minio_path, entry_name)
                remote_path = remote_path.replace(os.sep, "/")  # Replace \ with / on Windows
                minio_client.fput_object(bucket_name, remote_path, local_file)
            else:
                upload_local_directory_to_minio(
                    minio_client, local_file, bucket_name, minio_path + "/" + entry_name)

    def get_minio_url():
        """Build ``host:port`` of the MinIO service from the environment."""
        minio_host, minio_port = os.environ["MINIO_SERVICE_SERVICE_HOST"], os.environ["MINIO_SERVICE_SERVICE_PORT_HTTP"]
        return "{}:{}".format(minio_host, minio_port)

    minio_url = get_minio_url()
    print("minio url:", minio_url)

    # Credentials can be overridden through the environment; the previous
    # hard-coded values remain the defaults so existing setups keep working.
    config = {"endpoint": minio_url,
        "access_key": os.environ.get("MINIO_ACCESS_KEY", "minio"),
        "secret_key": os.environ.get("MINIO_SECRET_KEY", "minio123"),
        "secure": False}
    minio_client = Minio(**config)

    print("try to find bucket {}".format(bucket_name))
    found = minio_client.bucket_exists(bucket_name)
    print("found", found)
    if not found:
        minio_client.make_bucket(bucket_name)
    else:
        print("Bucket '{}' already exists".format(bucket_name))

    # Unique scratch directory; always removed again in the ``finally`` below.
    mnist_data_dirpath = tempfile.mkdtemp(prefix="mnist_")
    try:
        # The first call downloads the dataset; the second only loads the test
        # split from the same directory (no download requested).
        datasets.MNIST(mnist_data_dirpath, train=True, download=True)
        datasets.MNIST(mnist_data_dirpath, train=False)

        upload_local_directory_to_minio(minio_client, mnist_data_dirpath, bucket_name, "data/original")
    finally:
        rmtree(mnist_data_dirpath, ignore_errors=True)
    


In [2]:
# Create the py_src directory referenced by the %load / %%writefile lines of the other cells (-p: no error if it already exists).
!mkdir -p py_src

In [3]:
# %load py_src/train_model.py
# %load py_src/train_model.py


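# The %%writefile line below is kept commented out so this cell can be run and edited in place; uncomment it to write the cell to the file.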
# %%writefile py_src/train_model.py
def train_model(args:dict) :
    """Train a small CNN on MNIST data stored in MinIO and upload the result.

    The dataset is downloaded from the ``data/original`` prefix of the bucket
    into ``/tmp/data/original``, a subset of the training split is used to
    train the network, and both the ``state_dict`` (``mnist.pt``) and a
    TorchScript export (``model_scripted.pt``) are uploaded to
    ``<model_save_prefix>/<version>`` in the same bucket.

    Args:
        args: Step arguments. Recognised keys (defaults in parentheses):
            ``bucket_name`` (required), ``device_name`` ("cpu"),
            ``epochs`` (1), ``optimizer`` ("adam"; informational only, see the
            note in the body), ``model_save_prefix``
            ("models/trained/detect-digits"), ``version`` ("1"), ``lr`` (0.03),
            ``gamma`` (0.7), ``batch_size`` (64), ``test_batch_size`` (1000),
            ``log_interval`` (100), ``take_nth_in_subset`` (10),
            ``seed`` (None; when given it is passed to ``torch.manual_seed``),
            ``evaluate_each_epoch`` (False; when true the test split is
            evaluated after every epoch).

    Raises:
        ValueError: If ``bucket_name`` is missing or empty.
        KeyError: If ``MINIO_SERVICE_SERVICE_HOST`` or
            ``MINIO_SERVICE_SERVICE_PORT_HTTP`` is not set in the environment.
    """
    # Imports stay inside the function, as in the original cell (presumably so
    # the function can be shipped as a self-contained pipeline step).
    import glob
    import os

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from minio import Minio
    from torch.optim.lr_scheduler import StepLR
    from torchvision import datasets, transforms

    bucket_name = args.get("bucket_name", None)
    if not bucket_name:
        raise ValueError("args['bucket_name'] must name the MinIO bucket to use")
    device_name = args.get("device_name", "cpu")
    epochs = args.get("epochs", 1)
    optimizer_name = args.get("optimizer", "adam")
    model_save_prefix = args.get("model_save_prefix", "models/trained/detect-digits")
    version = args.get("version", "1")
    lr = args.get("lr", 0.03)
    gamma = args.get("gamma", 0.7)
    batch_size = args.get("batch_size", 64)
    test_batch_size = args.get("test_batch_size", 1000)
    log_interval = args.get("log_interval", 100)
    take_nth_in_subset = args.get("take_nth_in_subset", 10)
    seed = args.get("seed", None)
    evaluate_each_epoch = args.get("evaluate_each_epoch", False)

    if seed is not None:
        # Makes weight initialisation and dropout reproducible between runs.
        torch.manual_seed(seed)

    class Net(nn.Module):
        """Two convolution layers followed by two fully connected layers."""

        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 32, 3, 1)
            self.conv2 = nn.Conv2d(32, 64, 3, 1)
            self.dropout1 = nn.Dropout(0.25)
            self.dropout2 = nn.Dropout(0.5)
            self.fc1 = nn.Linear(9216, 128)
            self.fc2 = nn.Linear(128, 10)

        def forward(self, x):
            x = self.conv1(x)
            x = F.relu(x)
            x = self.conv2(x)
            x = F.relu(x)
            x = F.max_pool2d(x, 2)
            x = self.dropout1(x)
            x = torch.flatten(x, 1)
            x = self.fc1(x)
            x = F.relu(x)
            x = self.dropout2(x)
            x = self.fc2(x)
            output = F.log_softmax(x, dim=1)
            return output

    def train(log_interval,model, device, train_loader, optimizer, epoch):
        """Run one training epoch, logging the loss every ``log_interval`` batches."""
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))

    def test(model, device, test_loader):
        """Print average loss and accuracy of ``model`` on ``test_loader``."""
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
                pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
                correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))

    def upload_local_directory_to_minio(minio_client, local_path, bucket_name, minio_path):
        """Recursively upload everything below ``local_path`` to ``minio_path``."""
        # Without recursive=True, '**' matches only the direct children, so
        # sub-directories are handled by the explicit recursion below.
        for local_file in glob.glob(local_path + '/**'):
            local_file = local_file.replace(os.sep, "/") # Replace \ with / on Windows
            entry_name = os.path.basename(local_file)
            if os.path.isfile(local_file):
                remote_path = os.path.join(minio_path, entry_name)
                remote_path = remote_path.replace(os.sep, "/")  # Replace \ with / on Windows
                minio_client.fput_object(bucket_name, remote_path, local_file)
            else:
                upload_local_directory_to_minio(
                    minio_client, local_file, bucket_name, minio_path + "/" + entry_name)

    def get_minio_url():
        """Build ``host:port`` of the MinIO service from the environment."""
        minio_host, minio_port = os.environ["MINIO_SERVICE_SERVICE_HOST"], os.environ["MINIO_SERVICE_SERVICE_PORT_HTTP"]
        return "{}:{}".format(minio_host, minio_port)

    minio_url = get_minio_url()

    # Credentials can be overridden through the environment; the previous
    # hard-coded values remain the defaults so existing setups keep working.
    config = {"endpoint": minio_url,
        "access_key": os.environ.get("MINIO_ACCESS_KEY", "minio"),
        "secret_key": os.environ.get("MINIO_SECRET_KEY", "minio123"),
        "secure": False}
    minio_client = Minio(**config)

    print("downlaod training data from the bucket:", bucket_name)
    train_data_saved_path="/tmp"
    model_data_remote_path="data/original"
    # Address the bucket directly instead of scanning every bucket for a name match.
    for item in minio_client.list_objects(bucket_name, model_data_remote_path, recursive=True):
        local_name = train_data_saved_path+"/"+item.object_name
        print("remote name:",item.object_name)
        print("local name:", local_name)
        minio_client.fget_object(bucket_name, item.object_name, local_name)

    train_kwargs = {'batch_size':  batch_size}
    test_kwargs = {'batch_size': test_batch_size}

    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
        ])

    local_mnist_data_rootdir=train_data_saved_path+"/"+model_data_remote_path
    print("local_mnist_data_rootdir:", local_mnist_data_rootdir)
    os.makedirs(local_mnist_data_rootdir, exist_ok=True)
    files = os.listdir(local_mnist_data_rootdir)
    print("local_mnist_data_rootdir:",files)

    # download=False: the data must already have been fetched from MinIO above.
    dataset1 = datasets.MNIST(local_mnist_data_rootdir, train=True, download=False,
                       transform=transform)
    dataset2 = datasets.MNIST(local_mnist_data_rootdir, train=False, download=False,
                       transform=transform)

    # Keep every `take_nth_in_subset`-th training sample to shorten training.
    nth = list(range(0, len(dataset1), take_nth_in_subset))
    dataset1 = torch.utils.data.Subset(dataset1, nth)

    train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    device = torch.device(device_name)
    model = Net().to(device)

    # NOTE: only Adadelta is implemented. The `optimizer` argument used to be
    # overwritten silently; the optimizer itself is unchanged so results stay
    # comparable, but the mismatch is now reported.
    if str(optimizer_name).lower() != "adadelta":
        print("note: optimizer '{}' was requested but only Adadelta is implemented; using Adadelta".format(optimizer_name))
    optimizer = optim.Adadelta(model.parameters(), lr=lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)
    for epoch in range(1, epochs + 1):
        train(log_interval, model, device, train_loader, optimizer, epoch)
        if evaluate_each_epoch:
            test(model, device, test_loader)
        scheduler.step()

    model_remote_path="{}/{}".format(model_save_prefix, version)
    model_save_dir="/tmp/model_{}/{}".format(model_save_prefix, version)
    os.makedirs(model_save_dir, exist_ok=True)

    model_save_path="{}/{}".format(model_save_dir, "mnist.pt")
    torch.save(model.state_dict(), model_save_path)

    # Switch to inference mode before exporting so the scripted module is not
    # saved with dropout enabled; the state_dict saved above is unaffected.
    model.eval()
    model_script_save_path="{}/{}".format(model_save_dir, "model_scripted.pt")
    model_scripted = torch.jit.script(model) # Export to TorchScript
    model_scripted.save(model_script_save_path) # Save

    upload_local_directory_to_minio(minio_client, model_save_dir,bucket_name,model_remote_path)
    
    
    



In [4]:
# %load py_src/test_model_and_save_metrics.py
# %load py_src/test_model_and_save_metrics.py


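# The %%writefile line below is kept commented out so this cell can be run and edited in place; uncomment it to write the cell to the file.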
# %%writefile py_src/test_model_and_save_metrics.py

from typing import NamedTuple

def test_model_and_save_metrics(args:dict) -> NamedTuple('Output', [('mlpipeline_ui_metadata', 'UI_metadata'),('mlpipeline_metrics', 'Metrics')]) :
    """Evaluate the uploaded TorchScript model and build UI metadata/metrics.

    The scripted model is fetched from ``<model_save_prefix>/<version>`` and the
    dataset from ``data/original`` in the bucket. The MNIST test split is
    evaluated once, and the results are rendered as a confusion matrix, a
    markdown summary and two scalar metrics.

    Args:
        args: Step arguments. Recognised keys (defaults in parentheses):
            ``bucket_name`` (required), ``device_name`` ("cpu"),
            ``test_batch_size`` (1000), ``model_save_prefix``
            ("models/trained/detect-digits"), ``version`` ("1").

    Returns:
        A named tuple ``Output(mlpipeline_ui_metadata, mlpipeline_metrics)``
        whose two fields are JSON strings.

    Raises:
        ValueError: If ``bucket_name`` is missing or empty.
        KeyError: If ``MINIO_SERVICE_SERVICE_HOST`` or
            ``MINIO_SERVICE_SERVICE_PORT_HTTP`` is not set in the environment.
    """
    # Imports stay inside the function, as in the original cell (presumably so
    # the function can be shipped as a self-contained pipeline step).
    import json
    import os
    from collections import namedtuple

    import numpy as np
    import pandas as pd
    import torch
    import torch.nn.functional as F
    from minio import Minio
    from sklearn.metrics import confusion_matrix as sk_confusion_matrix
    from torch.nn.modules.module import _addindent
    from torchvision import datasets, transforms

    bucket_name = args.get("bucket_name", None)
    if not bucket_name:
        raise ValueError("args['bucket_name'] must name the MinIO bucket to use")
    device_name = args.get("device_name", "cpu")
    test_batch_size = args.get("test_batch_size", 1000)
    model_save_prefix = args.get("model_save_prefix", "models/trained/detect-digits")
    version = args.get("version", "1")

    def torch_summarize(model, show_weights=True, show_parameters=True):
        """Summarizes torch model by showing trainable parameters and weights."""
        # NOTE(review): `Container` is looked up defensively because it may be
        # absent from some torch releases - confirm against the pinned version.
        container_module = torch.nn.modules.container
        container_types = [
            t for t in (getattr(container_module, "Container", None),
                        container_module.Sequential)
            if t is not None
        ]
        tmpstr = model.__class__.__name__ + ' (\n'
        for key, module in model._modules.items():
            # if it contains layers let call it recursively to get params and weights
            if type(module) in container_types:
                # Forward the display flags so nested summaries match the outer one.
                modstr = torch_summarize(module, show_weights, show_parameters)
            else:
                modstr = module.__repr__()
            modstr = _addindent(modstr, 2)

            params = sum(p.numel() for p in module.parameters())
            weights = tuple(tuple(p.size()) for p in module.parameters())

            tmpstr += '  (' + key + '): ' + modstr
            if show_weights:
                tmpstr += ', weights={}'.format(weights)
            if show_parameters:
                tmpstr +=  ', parameters={}'.format(params)
            tmpstr += '\n'

        tmpstr = tmpstr + ')'
        return tmpstr

    def test(model, device, test_loader):
        """Evaluate ``model`` once; return loss, accuracy (0-100) and labels.

        The returned dict holds ``loss``, ``accuracy``, ``y_true`` and
        ``y_pred`` so that the confusion matrix needs no second pass.
        """
        model.eval()
        test_loss = 0
        correct = 0
        y_true = []
        y_pred = []
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
                pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
                correct += pred.eq(target.view_as(pred)).sum().item()
                y_pred.extend(pred.view(-1).cpu().numpy())  # Save Prediction
                y_true.extend(target.cpu().numpy())  # Save Truth

        test_loss /= len(test_loader.dataset)
        model_accuracy = 100. * correct / len(test_loader.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset), model_accuracy))
        return {"loss":test_loss, "accuracy":model_accuracy,
                "y_true": y_true, "y_pred": y_pred}

    # generate confusion matrix csv
    def gen_cm_csv(y_test=None,test_predictions=None):
        """Return the confusion matrix as ``target,predicted,count`` CSV rows."""
        # Use one explicit label list for both the matrix and the row/column
        # names; deriving the names from y_test alone can misalign them when a
        # predicted class never occurs in the ground truth.
        vocab = list(np.unique(list(y_test) + list(test_predictions)))
        confusion_matrix = sk_confusion_matrix(y_test, test_predictions, labels=vocab)
        data = []
        for target_index, target_row in enumerate(confusion_matrix):
            for predicted_index, count in enumerate(target_row):
                data.append((vocab[target_index], vocab[predicted_index], count))

        df_cm = pd.DataFrame(data, columns=['target', 'predicted', 'count'])
        cm_csv = df_cm.to_csv(header=False, index=False)
        return cm_csv

    def get_minio_url():
        """Build ``host:port`` of the MinIO service from the environment."""
        minio_host, minio_port = os.environ["MINIO_SERVICE_SERVICE_HOST"], os.environ["MINIO_SERVICE_SERVICE_PORT_HTTP"]
        return "{}:{}".format(minio_host, minio_port)

    minio_url = get_minio_url()

    # Credentials can be overridden through the environment; the previous
    # hard-coded values remain the defaults so existing setups keep working.
    config = {"endpoint": minio_url,
        "access_key": os.environ.get("MINIO_ACCESS_KEY", "minio"),
        "secret_key": os.environ.get("MINIO_SECRET_KEY", "minio123"),
        "secure": False}
    minio_client = Minio(**config)

    model_remote_path="{}/{}".format(model_save_prefix, version)
    model_saved_path="/tmp/{}/{}".format(model_save_prefix, version)
    model_script_remote_path="{}/model_scripted.pt".format(model_remote_path)
    model_script_save_path="{}/model_scripted.pt".format(model_saved_path)

    device = torch.device(device_name)

    print(bucket_name,model_script_remote_path,model_script_save_path)
    minio_client.fget_object(bucket_name,model_script_remote_path,model_script_save_path)
    #load model without class prototype
    # map_location puts the weights on the requested device; previously the
    # model stayed where it was saved while the inputs were moved to `device`.
    model = torch.jit.load(model_script_save_path, map_location=device)
    model.eval()

    transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
    ])

    train_data_saved_path="/tmp"
    model_data_remote_path="data/original"
    # Address the bucket directly instead of scanning every bucket for a name match.
    for item in minio_client.list_objects(bucket_name, model_data_remote_path, recursive=True):
        local_name = train_data_saved_path+"/"+item.object_name
        print("remote name:",item.object_name)
        print("local name:", local_name)
        minio_client.fget_object(bucket_name, item.object_name, local_name)
    local_mnist_data_rootdir=train_data_saved_path+"/"+model_data_remote_path
    test_data = datasets.MNIST(local_mnist_data_rootdir, train=False, download=False,transform=transform)

    test_kwargs = {'batch_size': test_batch_size}
    test_loader = torch.utils.data.DataLoader(test_data, **test_kwargs)

    # Single evaluation pass: loss, accuracy and the per-sample labels.
    model_test_result = test(model, device, test_loader)
    model_loss = model_test_result["loss"]
    model_accuracy = model_test_result["accuracy"]
    y_true = model_test_result["y_true"]
    y_pred = model_test_result["y_pred"]

    cm_csv = gen_cm_csv(y_test=y_true,test_predictions=y_pred)

    metric_model_summary = torch_summarize(model)

    output_confussion_matrix = {
                "type": "confusion_matrix",
                "format": "csv",
                "schema": [
                    {'name': 'target', 'type': 'CATEGORY'},
                    {'name': 'predicted', 'type': 'CATEGORY'},
                    {'name': 'count', 'type': 'NUMBER'},
                  ],
                "target_col" : "actual",
                "predicted_col" : "predicted",
                "source": cm_csv,
                "storage": "inline",
                "labels": list(np.arange(10)) #0..9 labels
            }
    output_model_summary = {
                'type': 'markdown',
                'storage': 'inline',
                'source': f'''# Model Overview
## Model Summary

```
{metric_model_summary}
```

## Model Performance

**Accuracy**: {model_accuracy}
**Loss**: {model_loss}

'''
            }

    metadata = {"outputs": [output_confussion_matrix, output_model_summary]}
    # NOTE(review): a PERCENTAGE metric is assumed to be a 0..1 fraction in the
    # pipeline UI, so the 0..100 accuracy is scaled down; the loss is not a
    # percentage and is reported as a RAW number. Confirm against the KFP docs.
    metrics = {
      'metrics': [{
          'name': 'model_accuracy',
          'numberValue':  float(model_accuracy) / 100.0,
          'format' : "PERCENTAGE"
        },{
          'name': 'model_loss',
          'numberValue':  float(model_loss),
          'format' : "RAW"
        }]}

    class NpJsonEncoder(json.JSONEncoder):
        """Serializes numpy objects as json."""

        def default(self, obj):
            if isinstance(obj, np.integer):
                return int(obj)
            elif isinstance(obj, np.bool_):
                return bool(obj)
            elif isinstance(obj, np.floating):
                if np.isnan(obj):
                    return None  # Serialized as JSON null.
                return float(obj)
            elif isinstance(obj, np.ndarray):
                return obj.tolist()
            else:
                return super().default(obj)

    output = namedtuple('Output', ['mlpipeline_ui_metadata', 'mlpipeline_metrics'])
    return output(json.dumps(metadata, cls=NpJsonEncoder),json.dumps(metrics, cls=NpJsonEncoder))


In [None]:
# One argument dictionary is shared by all three steps; each step looks up
# its own keys with `args.get`.
# ("epochs" used to appear twice in this literal; a repeated key in a dict
# literal silently keeps only the last value, so the duplicate was removed.)
args = {
    "epochs": 1,
    "optimizer": "adam",
    "batch_size":64,
    "test_batch_size":1000,
    "device_name":"cpu",
    "lr":0.03,
    "gamma":0.7,
    "seed":1,
    "log_interval":100,
    "model_save_prefix": "models/trained/detect-digits",
    "bucket_name":"datapipeline-028",
    "take_nth_in_subset":100,
    "version": "16"
}
# Run the steps in order: upload the dataset, train and upload the model,
# then evaluate the uploaded model.
step1 = put_public_mnist_to_local_minio(args)
step2 = train_model(args)
step3 = test_model_and_save_metrics(args)







minio url: 10.152.183.245:9000
try to find bucket datapipeline-028
found True
Bucket 'datapipeline-028' already exists
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw/train-images-idx3-ubyte.gz to /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw/train-labels-idx1-ubyte.gz to /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw/t10k-images-idx3-ubyte.gz to /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw/t10k-labels-idx1-ubyte.gz to /tmp/f4297038-5284-46f8-b29f-e5aecbfa6dc9/MNIST/raw

Processing...


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Done!
downlaod training data from the bucket: datapipeline-028
remote name: data/original/MNIST/processed/test.pt
local name: /tmp/data/original/MNIST/processed/test.pt
remote name: data/original/MNIST/processed/training.pt
local name: /tmp/data/original/MNIST/processed/training.pt
remote name: data/original/MNIST/raw/t10k-images-idx3-ubyte
local name: /tmp/data/original/MNIST/raw/t10k-images-idx3-ubyte
remote name: data/original/MNIST/raw/t10k-images-idx3-ubyte.gz
local name: /tmp/data/original/MNIST/raw/t10k-images-idx3-ubyte.gz
remote name: data/original/MNIST/raw/t10k-labels-idx1-ubyte
local name: /tmp/data/original/MNIST/raw/t10k-labels-idx1-ubyte
remote name: data/original/MNIST/raw/t10k-labels-idx1-ubyte.gz
local name: /tmp/data/original/MNIST/raw/t10k-labels-idx1-ubyte.gz
remote name: data/original/MNIST/raw/train-images-idx3-ubyte
local name: /tmp/data/original/MNIST/raw/train-images-idx3-ubyte
