<a href="https://colab.research.google.com/github/pavansai26/WB_DataModelVersioning_Artifacts.ipynb/blob/main/WB_DataModelVersioning_Artifacts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

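**Note:** `build_model_and_log` must use the project name `artifacts-example`, the same W&B project as every other step in this notebook; rename it if it differs, otherwise the training step cannot find the `convnet` artifact.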

In [None]:
# Compatible with wandb version 0.9.2+
!pip install wandb -qqq
!apt install tree

[K     |████████████████████████████████| 1.7 MB 24.8 MB/s 
[K     |████████████████████████████████| 139 kB 41.7 MB/s 
[K     |████████████████████████████████| 97 kB 6.5 MB/s 
[K     |████████████████████████████████| 180 kB 58.4 MB/s 
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 40.7 kB of archives.
After this operation, 105 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]
Fetched 40.7 kB in 0s (150 kB/s)
Selecting previously unselected package tree.
(Reading database ... 155047 files and directories currently installed.)
Preparin

In [None]:
import os
import wandb

In [None]:
# Define how the training, validation, and test datasets are loaded
import random

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Set the random seeds.
#
# The built-in hash() of a str is salted per interpreter process (see
# PYTHONHASHSEED), so seeding from it gives a different seed after every
# kernel restart. In addition, `hash(s) % 2**32 - 1` parses as
# `(hash(s) % 2**32) - 1`, which can evaluate to -1 and make np.random.seed
# raise. A CRC32 of the same phrases is stable across runs and always falls in
# the [0, 2**32 - 1] range accepted by all three seeding functions.
import zlib


def _stable_seed(phrase):
    """Return a run-to-run stable unsigned 32-bit seed derived from `phrase`."""
    return zlib.crc32(phrase.encode("utf-8"))


os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
random.seed(_stable_seed("setting random seeds"))
np.random.seed(_stable_seed("improves reproducibility"))
tf.random.set_seed(_stable_seed("by removing stochasticity"))

from collections import namedtuple

# Lightweight container pairing one split's inputs (x) with its labels (y).
Dataset = namedtuple("Dataset", ["x", "y"])

# Data parameters
num_classes = 10  # number of label classes; read by preprocess() when one-hot encoding
input_shape = (28, 28, 1)  # presumably (height, width, channels) of one preprocessed image

def load(train_size=50_000):
    """Fetch MNIST and carve it into training, validation and test splits.

    The first `train_size` samples of the Keras training partition form the
    training split and the remainder forms the validation split (held out for
    hyperparameter tuning). The Keras test partition is used unchanged.

    Args:
        train_size: number of leading training samples kept for training.

    Returns:
        A list `[training, validation, test]` of `Dataset` tuples.
    """
    train_part, test_part = keras.datasets.mnist.load_data()
    images, labels = train_part

    training_set = Dataset(images[:train_size], labels[:train_size])
    validation_set = Dataset(images[train_size:], labels[train_size:])
    test_set = Dataset(*test_part)

    return [training_set, validation_set, test_set]

In [None]:
def load_and_log():
    """Load the raw MNIST splits and log them to W&B as the `mnist-raw` artifact.

    Each split is written into the artifact as its own `<name>.npz` file with
    arrays stored under the keys `x` and `y`.
    """
    split_names = ("training", "validation", "test")

    # 🚀 open a run, labelled by job type, inside the project it belongs to
    with wandb.init(project="artifacts-example", job_type="load-data") as run:

        splits = load()  # dataset loading lives in its own function

        split_sizes = []
        for split in splits:
            split_sizes.append(len(split.x))

        # 🏺 describe the Artifact before anything is written into it
        raw_data = wandb.Artifact(
            "mnist-raw",
            type="dataset",
            description="Raw MNIST dataset, split into train/val/test",
            metadata={"source": "keras.datasets.mnist", "sizes": split_sizes},
        )

        # 🐣 create one new file per split inside the artifact and fill it
        for split, split_name in zip(splits, split_names):
            with raw_data.new_file(split_name + ".npz", mode="wb") as file:
                np.savez(file, x=split.x, y=split.y)

        # ✍️ hand the finished artifact to W&B
        run.log_artifact(raw_data)

load_and_log()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


VBox(children=(Label(value=' 52.41MB of 52.41MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.…

In [None]:
def preprocess(dataset, normalize=True, expand_dims=True, to_categorical=True):
    """Apply the selected preparation steps to one dataset split.

    Args:
        dataset: a `Dataset` whose `x` holds images and `y` holds labels.
        normalize: cast the images to float32 and divide them by 255.
        expand_dims: append a trailing axis to the images.
        to_categorical: one-hot encode the labels over `num_classes` classes.

    Returns:
        A new `Dataset` holding the transformed images and labels.
    """
    images = dataset.x
    labels = dataset.y

    # Scale images to the [0, 1] range
    images = images.astype("float32") / 255 if normalize else images

    # Make sure images have shape (28, 28, 1)
    images = np.expand_dims(images, -1) if expand_dims else images

    # Convert class vectors to binary class matrices
    labels = keras.utils.to_categorical(labels, num_classes) if to_categorical else labels

    return Dataset(images, labels)

In [None]:
def preprocess_and_log(steps):
    """Preprocess the latest `mnist-raw` artifact and log it as `mnist-preprocess`.

    Args:
        steps: keyword arguments forwarded to `preprocess`; the same mapping is
            stored as the new artifact's metadata.
    """
    with wandb.init(project="artifacts-example", job_type="preprocess-data") as run:

        processed_data = wandb.Artifact(
            "mnist-preprocess",
            type="dataset",
            description="Preprocessed MNIST dataset",
            metadata=steps,
        )

        # ✔️ declare the artifact this run consumes, 📥 then fetch it locally
        raw_dataset = run.use_artifact('mnist-raw:latest').download()

        for split in ("training", "validation", "test"):
            processed_split = preprocess(read(raw_dataset, split), **steps)

            with processed_data.new_file(split + ".npz", mode="wb") as file:
                np.savez(file, x=processed_split.x, y=processed_split.y)

        run.log_artifact(processed_data)


def read(data_dir, split):
    """Load one dataset split from an `.npz` file in `data_dir`.

    Args:
        data_dir: directory that contains `<split>.npz`.
        split: split name without the extension, e.g. "training".

    Returns:
        A `Dataset` built from the archive's "x" and "y" arrays.
    """
    filename = split + ".npz"

    # np.load on an .npz returns a lazy NpzFile that keeps the underlying file
    # open. Indexing it reads each array fully into memory, so the archive can
    # be closed as soon as both arrays have been extracted.
    with np.load(os.path.join(data_dir, filename)) as data:
        return Dataset(x=data["x"], y=data["y"])

In [None]:
# Preprocessing switches; the keys mirror preprocess()'s keyword arguments
steps = dict(normalize=True, expand_dims=True, to_categorical=True)

preprocess_and_log(steps)

[34m[1mwandb[0m: Currently logged in as: [33mgman001[0m (use `wandb login --relogin` to force relogin)


[34m[1mwandb[0m: Downloading large artifact mnist-raw:latest, 52.41MB. 3 files... Done. 0:0:0


VBox(children=(Label(value=' 212.02MB of 212.02MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=…

In [None]:
# Define the model
def build_model(hidden_layer_sizes=[32, 64],
                kernel_sizes=[(3, 3)],
                activation="relu",
                pool_sizes=[(2, 2)],
                dropout=0.5,
                num_classes=10,
                input_shape=(28, 28, 1)):
    """Assemble a two-block convolutional classifier and print its summary.

    The first entry of `hidden_layer_sizes`, `kernel_sizes` and `pool_sizes`
    configures the first conv/pool block and the last entry configures the
    second, so a one-element list applies the same setting to both blocks.

    Args:
        hidden_layer_sizes: filter counts for the convolution layers.
        kernel_sizes: kernel sizes for the convolution layers.
        activation: activation used by both convolution layers.
        pool_sizes: pool sizes for the max-pooling layers.
        dropout: dropout rate applied before the output layer.
        num_classes: width of the softmax output layer.
        input_shape: shape of a single input sample.

    Returns:
        The uncompiled `keras.Sequential` model.
    """
    stack = [keras.Input(shape=input_shape)]

    # First convolution + pooling block
    stack.append(layers.Conv2D(hidden_layer_sizes[0],
                               kernel_size=kernel_sizes[0],
                               activation=activation))
    stack.append(layers.MaxPooling2D(pool_size=pool_sizes[0]))

    # Second convolution + pooling block
    stack.append(layers.Conv2D(hidden_layer_sizes[-1],
                               kernel_size=kernel_sizes[-1],
                               activation=activation))
    stack.append(layers.MaxPooling2D(pool_size=pool_sizes[-1]))

    # Classifier head
    stack.append(layers.Flatten())
    stack.append(layers.Dropout(dropout))
    stack.append(layers.Dense(num_classes, activation="softmax"))

    model = keras.Sequential(stack)
    model.summary()

    return model

In [None]:
def build_model_and_log(config):
    """Build an untrained model and log it to W&B as the `convnet` artifact.

    Args:
        config: keyword arguments for `build_model`; also recorded as the run
            config and as the artifact's metadata.
    """
    # The project must match the one used by every other step: train_and_log
    # looks up "convnet:latest" from a run in "artifacts-example", so logging
    # the model to a different project would leave that lookup with nothing
    # to find.
    with wandb.init(project="artifacts-example", job_type="initialize", config=config) as run:
        config = wandb.config

        model = build_model(**config)

        model_artifact = wandb.Artifact(
            "convnet", type="model",
            description="Simple AlexNet style CNN",
            metadata=dict(config))

        model.save("initialized_model.keras")
        # ➕ another way to add a file to an Artifact
        model_artifact.add_file("initialized_model.keras")
        wandb.save("initialized_model.keras")

        run.log_artifact(model_artifact)

# Architecture settings passed straight through to build_model
model_config = {"hidden_layer_sizes": [32, 64],
                "kernel_sizes": [(3, 3)],
                "activation": "relu",
                "pool_sizes": [(2, 2)],
                "dropout": 0.5,
                "num_classes": 10,
                "input_shape": (28, 28, 1)}

build_model_and_log(model_config)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
flatten (Flatten)            (None, 1600)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1600)              0         
_________________________________________________________________
dense (Dense)                (None, 10)                1

VBox(children=(Label(value=' 0.30MB of 0.30MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [None]:
def train(model, training, validation, config):
    """Compile `model` and fit it, reporting progress to W&B via a Keras callback.

    Args:
        model: the Keras model to train; it is compiled and fitted in place.
        training: `Dataset` used for fitting.
        validation: `Dataset` used for validation during fitting; its first 32
            samples are also handed to the W&B callback.
        config: run configuration providing `optimizer`, `batch_size`,
            `epochs` and a `"callback_config"` mapping of extra callback options.
    """
    model.compile(
        loss="categorical_crossentropy",
        optimizer=config.optimizer,
        metrics=["accuracy"],
    )

    preview_batch = (validation.x[:32], validation.y[:32])
    class_labels = list(map(str, range(10)))
    wandb_callback = wandb.keras.WandbCallback(
        validation_data=preview_batch,
        input_type="images",
        labels=class_labels,
        **config["callback_config"],
    )

    model.fit(
        training.x,
        training.y,
        validation_data=(validation.x, validation.y),
        batch_size=config.batch_size,
        epochs=config.epochs,
        callbacks=[wandb_callback],
    )

In [None]:
def evaluate(model, test_dataset):
    """Score the trained model on the test split and gather its hardest examples.

    Args:
        model: compiled Keras model to evaluate.
        test_dataset: `Dataset` with test inputs `x` and labels `y`.

    Returns:
        A 6-tuple: the loss and accuracy reported by `model.evaluate`, followed
        by the four values returned by `get_hardest_k_examples`.
    """
    metrics = model.evaluate(test_dataset.x, test_dataset.y, verbose=1)
    loss, accuracy = metrics

    hardest = get_hardest_k_examples(test_dataset, model)

    return (loss, accuracy, *hardest)


def get_hardest_k_examples(test_dataset, model, k=32):
    """Find the `k` test examples on which the model's loss is highest.

    Args:
        test_dataset: `Dataset` with inputs `x` and one-hot labels `y`.
        model: callable Keras model returning class probabilities.
        k: how many of the highest-loss examples to return.

    Returns:
        `(highest_k_losses, hardest_k_examples, true_labels, predictions)`.
        All four are ordered by ascending loss and aligned index-for-index, so
        entry `i` of each describes the same test example.
    """
    class_probs = model(test_dataset.x)
    losses = np.array(
        keras.losses.categorical_crossentropy(test_dataset.y, class_probs))

    # Indices of the k largest losses. An explicit start index is used because
    # `order[-k:]` would return *every* example when k == 0.
    order = np.argsort(losses)
    hardest = order[max(len(order) - k, 0):]

    highest_k_losses = losses[hardest]
    hardest_k_examples = test_dataset.x[hardest]
    true_labels = np.argmax(test_dataset.y[hardest], axis=1)
    # Restrict predictions to the same examples; previously the predictions
    # for the whole test set were returned, so callers zipping them with the
    # hardest examples paired each image with an unrelated prediction.
    predictions = np.argmax(class_probs, axis=1)[hardest]

    return highest_k_losses, hardest_k_examples, true_labels, predictions

In [None]:
def train_and_log(config):
    """Train the logged `convnet` model and log the result as `trained-model`.

    Uses the latest `mnist-preprocess` data artifact and the latest `convnet`
    model artifact. The model artifact's metadata is merged into the run
    config before training.

    Args:
        config: training configuration passed to `wandb.init` and on to `train`.

    Returns:
        The trained Keras model.
    """
    with wandb.init(project="artifacts-example", job_type="train", config=config) as run:
        config = wandb.config

        # Preprocessed data: declare the dependency, then fetch it locally
        data_dir = run.use_artifact('mnist-preprocess:latest').download()
        training_dataset = read(data_dir, "training")
        validation_dataset = read(data_dir, "validation")

        # Untrained model produced by the initialization step
        initial_artifact = run.use_artifact("convnet:latest")
        initial_path = os.path.join(initial_artifact.download(),
                                    "initialized_model.keras")
        model = keras.models.load_model(initial_path)

        # Record the architecture settings alongside the training settings
        config.update(initial_artifact.metadata)

        train(model, training_dataset, validation_dataset, config)

        trained_artifact = wandb.Artifact(
            "trained-model",
            type="model",
            description="NN model trained with model.fit",
            metadata=dict(config),
        )

        model.save("trained_model.keras")
        trained_artifact.add_file("trained_model.keras")
        wandb.save("trained_model.keras")

        run.log_artifact(trained_artifact)

    return model

    
def evaluate_and_log(config=None):
    """Evaluate the latest `trained-model` artifact on the test split and report to W&B.

    Writes the test loss and accuracy to the run summary and logs the images
    returned by `evaluate` under "high-loss-examples", each captioned
    "<prediction>,<label>".

    Args:
        config: optional run configuration passed to `wandb.init`.
    """
    with wandb.init(project="artifacts-example", job_type="report", config=config) as run:
        data_dir = run.use_artifact('mnist-preprocess:latest').download()
        test_dataset = read(data_dir, "test")

        trained_artifact = run.use_artifact("trained-model:latest")
        model_path = os.path.join(trained_artifact.download(), "trained_model.keras")
        model = keras.models.load_model(model_path)

        loss, accuracy, highest_losses, hardest_examples, true_labels, preds = evaluate(model, test_dataset)

        run.summary.update({"loss": loss, "accuracy": accuracy})

        captioned_images = []
        for hard_example, pred, label in zip(hardest_examples, preds, true_labels):
            captioned_images.append(
                wandb.Image(hard_example, caption=f"{pred!s},{label!s}"))

        wandb.log({"high-loss-examples": captioned_images})

In [None]:
# Extra options that train() forwards to the W&B Keras callback
callback_config = dict(log_weights=True, save_model=False, log_batch_frequency=10)

# Training hyperparameters read by train() from the run config
train_config = dict(
    batch_size=128,
    epochs=5,
    optimizer="adam",
    callback_config=callback_config,
)

model = train_and_log(train_config)
evaluate_and_log()

[34m[1mwandb[0m: Downloading large artifact mnist-preprocess:latest, 212.02MB. 3 files... Done. 0:0:0


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


VBox(children=(Label(value=' 0.92MB of 0.92MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▄▆▆▆▇▇▇████████████████████████████████
epoch,▁▃▅▆█
loss,█▅▄▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▆▇██
val_loss,█▄▂▁▁

0,1
accuracy,0.98088
best_epoch,4.0
best_val_loss,0.04459
epoch,4.0
loss,0.06152
val_accuracy,0.986
val_loss,0.04459


[34m[1mwandb[0m: Downloading large artifact mnist-preprocess:latest, 212.02MB. 3 files... Done. 0:0:0




VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.64542132416…

0,1
accuracy,0.9887
loss,0.03596
