## Deep Learning - Deep Vision Classifier

### Environment Setup -- Reinstall Horovod to Match the New PyTorch Version

In [None]:
%pip install /dbfs/FileStore/shared_uploads/serenaruan@microsoft.com/synapseml_dl-0.9.5.dev1-py3-none-any.whl

In [None]:
%sh
# Remove the outdated NVIDIA signing key.
sudo apt-key del 7fa2af80

# Install the new cuda-keyring package.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
sudo dpkg -i cuda-keyring_1.0-1_all.deb

# Add the signing key and the repository definition for NVIDIA's
# machine-learning package repository. The key is fetched over HTTPS, like
# every other download from this host, so it cannot be altered in transit.
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/7fa2af80.pub
wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/nvidia-machine-learning-repo-ubuntu2004_1.0.0-1_amd64.deb
dpkg -i ./nvidia-machine-learning-repo-ubuntu2004_1.0.0-1_amd64.deb


# Refresh the package index, then install the pinned CUDA 11.0 / NCCL
# development packages. --allow-downgrades lets apt replace newer installed
# versions with the exact versions pinned here.
apt-get update
apt-get install --allow-downgrades --no-install-recommends -y \
cuda-nvml-dev-11-0=11.0.167-1 \
cuda-nvcc-11-0=11.0.221-1 \
cuda-cudart-dev-11-0=11.0.221-1 \
cuda-libraries-dev-11-0=11.0.3-1 \
libnccl-dev=2.10.3-1+cuda11.0 \
libcusparse-dev-11-0=11.1.1.245-1

In [None]:
%sh
git clone --recursive https://github.com/horovod/horovod.git
# Stop if the checkout is missing so the commands below never run (and never
# delete build/ or dist/) in the wrong directory.
cd horovod || exit 1
# Pin the source tree to the commit used for version 0.24.3.
git reset --hard 7707267a4bef79e09a9df1d41b0652feb61b76c7
# `git reset` does not move submodule working trees; check out the submodule
# commits recorded by the pinned revision so the build uses matching sources.
git submodule update --init --recursive
# Remove artifacts of any previous build so dist/ only contains the new wheel.
rm -rf build/ dist/
# Build the wheel with the Databricks Python interpreter: NCCL allreduce,
# PyTorch support enabled, MXNet support disabled.
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11/ HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 \
/databricks/python3/bin/python setup.py bdist_wheel

# Print the absolute path of the wheel that was just built.
readlink -f dist/horovod-*.whl

In [None]:
%pip install --no-cache-dir /databricks/driver/horovod/dist/horovod-0.24.3-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps

In [None]:
! horovodrun --check-build

In [None]:
import os
import numpy as np
from PIL import Image
import sys

from pyspark.sql.functions import udf, col
from pyspark.sql.types import DoubleType

from pyspark.sql.functions import udf
import pyspark.sql.types as T
import numpy as np

from pyspark.ml.linalg import DenseVector, VectorUDT
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import torchvision.transforms as transforms

### Read Dataset

In [None]:
def list_jpg_files(root):
    """Return the paths of all ``.jpg`` files found anywhere under ``root``.

    The tree is walked recursively with ``os.walk``. A file is kept only when
    its extension is exactly ``.jpg`` (the comparison is case-sensitive).

    Args:
        root: Directory to search.

    Returns:
        list[str]: One joined ``directory/filename`` path per matching file,
        in the order ``os.walk`` reports them. Empty when ``root`` does not
        exist, because ``os.walk`` ignores listing errors by default.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(root)
        for filename in filenames
        if os.path.splitext(filename)[1] == ".jpg"
    ]


# these files already exist for internal build test machine
train_files = list_jpg_files("/dbfs/tmp/17flowers/train")
test_files = list_jpg_files("/dbfs/tmp/17flowers/test")


def extract_path_and_label(path):
    """Pair an image path with the class label derived from its file name.

    The file name (the text after the last ``/``) is expected to look like
    ``<prefix>_<number>.<ext>``. The number is integer-divided by 81 to obtain
    the label.

    NOTE(review): if these are the standard 17-flowers files (80 images per
    class, numbered from 1), ``(number - 1) // 80`` would be the exact class
    index -- confirm against the dataset layout before changing the divisor.

    Args:
        path: Image file path using ``/`` as separator.

    Returns:
        tuple: ``(path, label)`` where ``label`` is an ``int``.
    """
    file_name = path.rsplit("/", 1)[-1]
    stem = file_name.split(".", 1)[0]
    image_number = int(stem.split("_")[1])
    return path, image_number // 81


# Turn each list of file paths into a two-column DataFrame ("image" holds the
# path, "label" the value returned by extract_path_and_label). The label is
# produced as a Python int, so it is cast to double afterwards.
train_rows = [extract_path_and_label(path) for path in train_files]
train_df = spark.createDataFrame(train_rows, ["image", "label"]).withColumn(
    "label", col("label").cast(DoubleType())
)

test_rows = [extract_path_and_label(path) for path in test_files]
test_df = spark.createDataFrame(test_rows, ["image", "label"]).withColumn(
    "label", col("label").cast(DoubleType())
)

# Show at most the first 100 training rows as a sanity check.
display(train_df.limit(100))

### Training

In [None]:
# Image preprocessing pipeline, applied in this order:
#   1. random crop resized to 224x224,
#   2. random horizontal flip,
#   3. conversion to a tensor,
#   4. per-channel normalization with the mean/std values that torchvision
#      documents for its ImageNet-pretrained models.
_preprocessing_steps = [
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(_preprocessing_steps)


def _transform_row(row):
    """Load and preprocess the image referenced by one training row.

    Args:
        row: Mapping with an ``"image"`` entry holding the image file path and
            a ``"label"`` entry.

    Returns:
        dict: ``{"image": <numpy array produced by transform>, "label": <the
        row's label, unchanged>}``.
    """
    # Open through a context manager so the file handle is released
    # deterministically; convert() returns an independent in-memory image.
    with Image.open(row["image"]) as raw_image:
        rgb_image = raw_image.convert("RGB")
    return {"image": transform(rgb_image).numpy(), "label": row["label"]}


def readImageAndTransform(path):
    """Load the image at ``path`` and return it as a flat ``DenseVector``.

    The image is converted to RGB, passed through the module-level
    ``transform`` pipeline, and flattened to one dimension.

    NOTE(review): ``transform`` contains random crop and random flip steps, so
    repeated calls on the same file can produce different vectors -- confirm
    this is intended where the function is used for scoring.

    Args:
        path: Path of the image file to read.

    Returns:
        DenseVector: The transformed image flattened with ``reshape(-1)``.
    """
    # Open through a context manager so the file handle is released
    # deterministically; convert() returns an independent in-memory image.
    with Image.open(path) as raw_image:
        rgb_image = raw_image.convert("RGB")
    return DenseVector(transform(rgb_image).numpy().reshape(-1))


# Spark UDF wrapper: maps an image path to the vector returned by
# readImageAndTransform.
read_image_and_transform_udf = udf(
    lambda image_path: readImageAndTransform(image_path),
    returnType=VectorUDT(),
)

In [None]:
from horovod.spark.common.store import DBFSLocalStore
from horovod.spark.common.backend import SparkBackend
from pytorch_lightning.callbacks import Callback, ModelCheckpoint
from synapse.ml.dl import *

# Number of training epochs; also read by MyDummyCallback's end-of-training
# check.
epochs = 10

# Directory handed to the Horovod store.
run_output_dir = "/dbfs/FileStore/test/resnet50"
store = DBFSLocalStore(run_output_dir)

# num_proc is flagged as the important setting here -- presumably the number
# of Horovod training processes; confirm it matches the cluster size.
# The notebook's stdout/stderr are passed in and timestamp prefixing is
# switched on.
backend = SparkBackend(
    num_proc=2,
    prefix_output_with_timestamp=True,
    stdout=sys.stdout,
    stderr=sys.stderr,
)


class MyDummyCallback(Callback):
    """Callback that counts epoch-end hook calls and checks them after training.

    Three counters are kept: one for ``on_epoch_end``, one for
    ``on_train_epoch_end`` and one for ``on_validation_epoch_end``. Each hook
    prints a short message and increments its counter. ``on_train_end`` prints
    the totals and asserts that they are consistent.
    """

    def __init__(self):
        super().__init__()
        self.epoch_end_counter = 0
        self.train_epoch_end_counter = 0
        self.validation_epoch_end_counter = 0

    def on_init_start(self, trainer):
        """Report that trainer initialization is starting."""
        print("Starting to init trainer!")

    def on_init_end(self, trainer):
        """Report that trainer initialization has finished."""
        print("Trainer is initialized.")

    def on_epoch_end(self, trainer, model):
        """Count one call of the generic epoch-end hook."""
        print("A epoch ended.")
        self.epoch_end_counter += 1

    def on_train_epoch_end(self, trainer, model, unused=None):
        """Count one call of the train epoch-end hook."""
        print("A train epoch ended.")
        self.train_epoch_end_counter += 1

    def on_validation_epoch_end(self, trainer, model, unused=None):
        """Count one call of the validation epoch-end hook."""
        print("A val epoch ended.")
        self.validation_epoch_end_counter += 1

    def on_train_end(self, trainer, model):
        """Print the counters and assert that they are consistent.

        Raises:
            AssertionError: If more train epochs were counted than the
                notebook-level ``epochs`` setting, or if the train and
                validation counters do not add up to the generic epoch-end
                counter.
        """
        print(
            "Training ends: "
            f"epoch_end_counter={self.epoch_end_counter}, "
            f"train_epoch_end_counter={self.train_epoch_end_counter}, "
            f"validation_epoch_end_counter={self.validation_epoch_end_counter} \n"
        )
        # `epochs` is the notebook-level setting that is also passed to the
        # classifier.
        assert (
            self.train_epoch_end_counter <= epochs
        ), "more train epochs were counted than configured"
        assert (
            self.train_epoch_end_counter + self.validation_epoch_end_counter
            == self.epoch_end_counter
        ), "train + validation epoch-end calls do not match the epoch-end total"


# Callbacks handed to the classifier: the counting callback defined in this
# notebook, then a ModelCheckpoint whose file name template uses the epoch and
# the train loss (two decimals).
counting_callback = MyDummyCallback()
checkpoint_callback = ModelCheckpoint(filename="{epoch}-{train_loss:.2f}")
callbacks = [counting_callback, checkpoint_callback]

In [None]:
# Configure the estimator. Arguments are grouped by concern; all are passed by
# keyword, so their order does not matter.
deep_vision_classifier = DeepVisionClassifier(
    # model
    backbone="resnet50",
    num_classes=17,
    input_shapes=[[-1, 3, 224, 224]],
    # data
    feature_cols=["image"],
    label_cols=["label"],
    transformation_fn=_transform_row,
    validation=0.1,
    # training loop
    epochs=epochs,
    batch_size=16,
    callbacks=callbacks,
    partitions_per_process=1,
    # runtime
    store=store,
    backend=backend,
    verbose=1,
    profiler=None,
)

# Fit on the training DataFrame, then set the model's output columns to
# "label_prob".
fitted_model = deep_vision_classifier.fit(train_df)
deep_vision_model = fitted_model.setOutputCols(["label_prob"])

### Prediction

In [None]:
# Add a "features" vector column computed from each test image path.
test_df_trans = test_df.withColumn("features", read_image_and_transform_udf("image"))

# Point the fitted model at that column and score the test set.
scoring_model = deep_vision_model.setFeatureColumns(["features"])
pred_df = scoring_model.transform(test_df_trans)

# The predicted label is the index of the largest entry of "label_prob",
# stored as a double so it is comparable with the "label" column.
argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
pred_df = pred_df.withColumn("label_pred", argmax(pred_df["label_prob"]))

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="label_pred", metricName="accuracy"
)
test_accuracy = evaluator.evaluate(pred_df)
print("Test accuracy:", test_accuracy)