# Distributed Deep Learning with Spark on the CIFAR-10 Dataset
![CIFAR-10](https://cntk.ai/jup/201/cifar-10.png)

In [None]:
from mmlspark import CNTKLearner
import os, tarfile, pickle
import urllib.request
cdnURL = "https://amldockerdatasets.azureedge.net"
# Please note that this is a copy of the CIFAR10 dataset originally found here:
# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
dataFile = "cifar-10-python.tar.gz"
dataURL = cdnURL + "/CIFAR10/" + dataFile
if not os.path.isfile(dataFile):
    # Download under a temporary name and rename only once the transfer has
    # finished: an interrupted download must not leave a truncated archive
    # that the isfile() check above would accept on the next run.
    partialFile = dataFile + ".part"
    urllib.request.urlretrieve(dataURL, partialFile)
    os.replace(partialFile, dataFile)
with tarfile.open(dataFile, "r:gz") as f:
    # SECURITY NOTE: unpickling can execute arbitrary code. This is only
    # acceptable because the archive comes from the trusted URL above; do not
    # point dataURL at an untrusted source.
    with f.extractfile("cifar-10-batches-py/test_batch") as batchFile:
        # encoding="latin1" is kept from the original load call
        test_dict = pickle.load(batchFile, encoding="latin1")

In [None]:
# Used for debugging: print the host name of the machine executing this cell
import socket
print(socket.gethostname())

In [None]:
# Create the images with labels from CIFAR dataset,
# reformat the labels using OneHotEncoder
import array
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql.functions import col
from pyspark.sql.types import *

def reshape_image(record):
    """Convert one (image, label, filename) record into plain Python values.

    The image is viewed as a 3x32x32 array (which also rejects inputs of the
    wrong size), flattened again, and returned as a list of Python floats.
    The label and filename are passed through unchanged.
    """
    pixels, label, filename = record
    flat_pixels = pixels.reshape(3, 32, 32).flatten()
    data = list(map(float, flat_pixels))
    return data, label, filename

# Identity UDF whose declared return type marks the "images" column as
# array<double> before it is turned into a vector
convert_to_double = udf(lambda x: x, ArrayType(DoubleType()))

# Pair every test image with its label and file name, distribute the records
# and convert each one with reshape_image
image_rdd = zip(test_dict["data"], test_dict["labels"], test_dict["filenames"])
image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)

imagesWithLabels = image_rdd.toDF(["images", "labels", "filename"])

# Turns a list of doubles into a dense ML vector
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

imagesWithLabels = imagesWithLabels.withColumn(
                       "images",
                       list_to_vector_udf(convert_to_double(col("images")))) \
                       .select("images", "labels")

# OneHotEncoderEstimator is an Estimator, not a Transformer: it has to be
# fitted first, and the resulting model is what provides transform().
# setDropLast(False) keeps one output slot per category.
ohe = OneHotEncoderEstimator() \
        .setInputCols(["labels"]).setOutputCols(["tmplabels"]) \
        .setDropLast(False)
imagesWithLabels = ohe.fit(imagesWithLabels).transform(imagesWithLabels) \
                      .select("images", "tmplabels") \
                      .withColumnRenamed("tmplabels", "labels")

imagesWithLabels.printSchema()

# Cache before counting so the count materialises the prepared DataFrame
imagesWithLabels.cache()
print(imagesWithLabels.count())

In [None]:
# Define the neural network to be trained via CNTK's BrainScript file notation.
# The text is handed as-is to CNTKLearner (brainScript=...) in the training
# cell; its output node "z" is the node the evaluation cell selects with
# setOutputNodeName("z").
brainscriptText = """
    # ConvNet applied on CIFAR-10 dataset, with no data augmentation.

    parallelTrain = true

    TrainNetwork = {
        action = "train"

        BrainScriptNetworkBuilder = {
            imageShape = 32:32:3
            labelDim = 10

            featMean = 128
            featScale = 1/256
            Normalize{m,f} = x => f .* (x - m)

            model = Sequential (
                Normalize {featMean, featScale} :
                ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
                ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
                  MaxPoolingLayer {(3:3), stride = (2:2)} :
                ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
                ConvolutionalLayer {64, (3:3), pad = true} : ReLU :
                  MaxPoolingLayer {(3:3), stride = (2:2)} :
                DenseLayer {256} : ReLU : Dropout :
                DenseLayer {128} : ReLU : Dropout :
                LinearLayer {labelDim}
            )

            # inputs
            features = Input {imageShape}
            labels   = Input {labelDim}

            # apply model to features
            z = model (features)

            # connect to system
            ce       = CrossEntropyWithSoftmax     (labels, z)
            errs     = ClassificationError         (labels, z)
            top5Errs = ClassificationError         (labels, z, topN=5)  # only used in Eval action

            featureNodes    = (features)
            labelNodes      = (labels)
            criterionNodes  = (ce)
            evaluationNodes = (errs)  # top5Errs only used in Eval
            outputNodes     = (z)
        }

        SGD = {
            epochSize = 0
            minibatchSize = 32

            learningRatesPerSample = 0.0015625*10:0.00046875*10:0.00015625
            momentumAsTimeConstant = 0*20:607.44
            maxEpochs = 30
            L2RegWeight = 0.002
            dropoutRate = 0*5:0.5

            numMBsToShowResult = 100
            parallelTrain = {
                parallelizationMethod = "DataParallelSGD"
                parallelizationStartEpoch = 2  # warm start: don't use 1-bit SGD for first epoch
                distributedMBReading = true
                dataParallelSGD = { gradientBits = 1 }
            }
        }
    }
"""

In [None]:
# Partition the labelled images into a training and a test DataFrame using
# weights 0.6 / 0.4 and a fixed seed, then preview the training part
train, test = imagesWithLabels.randomSplit(weights=[0.6, 0.4], seed=123)
train.printSchema()
train.show(5)

In [None]:
# Specify the working directory and GPU node name and GPU count.
# Each gpum entry is a single "<node name>,<GPU count>" string; "mygpuvm" looks
# like a placeholder for the real GPU VM host name -- TODO confirm before running.
workingDir = "file:/tmp/gpuwork/"
gpum = ["mygpuvm,4"]
print("Working in " + workingDir)

In [None]:
# Train the distributed learner using the VM configured above.
# The expression is parenthesised so the chained .fit() call can sit on its
# own line (a bare continuation line after the closing parenthesis is a
# syntax error). Note that `learner` ends up holding the object returned by
# fit(), which the evaluation cell then configures and applies.
learner = (CNTKLearner(brainScript=brainscriptText, dataTransfer="hdfs",
                       gpuMachines=gpum, workingDir=workingDir)
           .fit(train))

In [None]:
# Score the held-out images: read network node "z", feed it the "images"
# column and write the result to a new "scored" column
scoredImages = (learner.setOutputNodeName("z")
                       .setInputCol("images")
                       .setOutputCol("scored")
                       .transform(test))
scoredImages.show(10)

In [None]:
# Convert the raw network outputs (node z) and the one-hot labels to class indices
def argmax(x):
    """Return the index of the largest element of x (the first one on ties).

    Raises ValueError when x is empty, because max() is given no items.
    """
    def value_of(indexed):
        return indexed[1]

    winner = max(enumerate(x), key=value_of)
    return winner[0]
# Wrap argmax as an integer-returning UDF and apply it to both the network
# scores and the one-hot labels, so each row holds two class indices
argmaxUDF = udf(argmax, IntegerType())
imagePredictions = scoredImages.withColumn("predictions", argmaxUDF("scored"))\
                               .withColumn("labels", argmaxUDF("labels")) \
                               .select("predictions", "labels")
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; the view name queried by the SQL cell is unchanged
imagePredictions.createOrReplaceTempView("ImagePredictions")

In [None]:
%%sql -q -o imagePredictions
select * from ImagePredictions

In [None]:
%%local
# Pull the true and the predicted class columns out of imagePredictions
y = imagePredictions["labels"]
y_hat = imagePredictions["predictions"]

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

# Class captions; tick i on either axis is captioned with labels[i]
labels = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog",
          "horse", "ship", "truck"]
tick_marks = np.arange(len(labels))

# Confusion matrix of true classes (y) against predicted classes (y_hat)
cm = confusion_matrix(y, y_hat)

# Draw the matrix as a heat map with a colour scale next to it
plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
plt.colorbar()

# Horizontal axis: predictions (captions rotated so they do not overlap)
plt.xlabel("Predicted label")
plt.xticks(tick_marks, labels, rotation=90)

# Vertical axis: ground truth
plt.ylabel("True Label")
plt.yticks(tick_marks, labels)

plt.show()