In [None]:
import pyspark
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType, DoubleType
from pyspark.ml import Transformer, Estimator, Pipeline
from pyspark.ml.classification import LogisticRegression

from mmlspark import ImageReader, ImageFeaturizer, UnrollImage, ImageTransformer, TrainClassifier, \
    SelectColumns, Repartition, ImageFeaturizer, ModelDownloader

import numpy as np, pandas as pd, os, sys, time
from os.path import join, abspath, exists
from urllib.request import urlretrieve

In [None]:
# Locations of the flowers dataset archive (remote and local) and of the
# pretrained model that later cells download.

cdnURL = "https://mmlspark.azureedge.net/datasets"
dataFile = "flowers_and_labels.parquet"
dataZipFile = "".join([dataFile, ".zip"])
dataURL = "".join([cdnURL, "/Flowers/", dataZipFile])

localDataDir = "/tmp/Flowers/"
localDataZipFile = join(localDataDir, dataZipFile)
localDataFile = join(localDataDir, dataFile)

modelDir = "file:" + abspath("models")
modelName = "ResNet50"

def maybeDownload(url, path):
    """Download `url` to `path` unless a file already exists there.

    Args:
        url: Location to fetch; anything `urlretrieve` accepts.
        path: Destination file path (made absolute before use).

    The parent directory is created when missing. The data is first written
    to a sibling "<path>.part" file and only moved to `path` once the
    transfer has finished, so an interrupted download is never mistaken for
    a complete file on the next run.

    Raises:
        Whatever `urlretrieve` raises when the transfer fails; in that case
        no file is left at `path`.
    """
    path = abspath(path)
    if os.path.isfile(path):
        print("found {} skipping download".format(path))
        return

    os.makedirs(os.path.dirname(path), exist_ok=True)
    partPath = path + ".part"
    print("downloading to {}".format(path))
    try:
        urlretrieve(url, partPath)
        # Atomic rename: `path` only ever appears fully written.
        os.replace(partPath, path)
    finally:
        # Only present here if the download or rename failed.
        if os.path.exists(partPath):
            os.remove(partPath)

def maybeUnzip(zipFilePath, outputDir):
    """Extract `zipFilePath` into `outputDir` unless it was already extracted.

    Args:
        zipFilePath: Path to the zip archive.
        outputDir: Directory the archive contents are extracted into.

    The archive counts as already extracted when `outputDir` contains a file
    or directory named like the archive without its trailing ".zip".
    """
    zipSuffix = ".zip"
    archiveName = os.path.basename(zipFilePath)
    # Strip only a trailing ".zip"; str.replace would also remove ".zip"
    # from the middle of a name such as "data.zipcodes.zip".
    if archiveName.endswith(zipSuffix):
        unzippedName = archiveName[:-len(zipSuffix)]
    else:
        unzippedName = archiveName
    unzippedPath = join(outputDir, unzippedName)

    if os.path.isfile(unzippedPath) or os.path.isdir(unzippedPath):
        print("found {}, skipping unzipping".format(unzippedPath))
    else:
        import zipfile
        print("unzipping {}".format(zipFilePath))
        with zipfile.ZipFile(zipFilePath, "r") as zf:
            zf.extractall(outputDir)

# Create the local data directory if needed, then fetch the dataset archive
# and unpack it there. Both helpers skip work that has already been done.
os.makedirs(abspath(localDataDir), exist_ok=True)
maybeDownload(dataURL, localDataZipFile)
maybeUnzip(localDataZipFile, localDataDir)

In [None]:
%%local
# Dataset locations are defined again for this cell, which runs under the
# %%local magic.
# NOTE(review): presumably %%local uses a namespace separate from the
# cluster session, which is why these are repeated - confirm.
from os.path import join
dataFile = "flowers_and_labels.parquet"
dataZipFile = dataFile + ".zip"
cdnURL = "https://mmlspark.azureedge.net/datasets"
dataURL = cdnURL + "/Flowers/" + dataZipFile
localDataDir = "/tmp/Flowers/"
localDataFile = join(localDataDir, dataFile)
localDataZipFile = join(localDataDir, dataZipFile)

import subprocess

# Works on both Python 3 and Python 2 local kernels.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve


def _runAndPrint(command):
    """Run `command` (a list of arguments) and print its stdout and stderr.

    Raises subprocess.CalledProcessError when the command exits non-zero.
    """
    output = subprocess.check_output(command, stderr=subprocess.STDOUT)
    # check_output returns bytes on Python 3; decode so the text is printed
    # instead of a b'...' representation.
    print(output if isinstance(output, str) else output.decode("utf-8", "replace"))


# `hdfs dfs -test -d` exits non-zero when the directory is not on HDFS, so
# the download and upload only happen the first time.
if subprocess.call(["hdfs", "dfs", "-test", "-d", localDataDir]):
    # Argument lists (no shell) avoid quoting problems in the paths.
    _runAndPrint(["mkdir", "-p", localDataDir])
    urlretrieve(dataURL, localDataZipFile)
    _runAndPrint(["unzip", localDataZipFile, "-d", localDataDir])
    _runAndPrint(["hdfs", "dfs", "-mkdir", "-p", localDataDir])
    _runAndPrint(["hdfs", "dfs", "-copyFromLocal", "-f", localDataFile, localDataFile])
    # Remove the local unzipped copy once it has been uploaded.
    _runAndPrint(["rm", "-rf", localDataFile])

In [None]:
# Name of the pretrained model to download and the location handed to
# ModelDownloader.
# NOTE(review): if the local download cell was also run, this replaces the
# "file:" modelDir assigned there - confirm which cell applies per platform.
modelName = "ResNet50"
modelDir = "wasb:///models/"

In [None]:
# Download the model called modelName using modelDir as the downloader's
# location. `model` is read later for its `uri` and `layerNames` when the
# CNTK featurizer is built.
d = ModelDownloader(spark, modelDir)
model = d.downloadByName(modelName)

In [None]:
# Path of the flowers parquet dataset that the next cell reads.
# NOTE(review): repeats values from the download cells, presumably so this
# session has them when only the %%local cell defined them - confirm.
dataFile = "flowers_and_labels.parquet"
localDataDir = "/tmp/Flowers/"
localDataFile = join(localDataDir, dataFile)

In [None]:
# Load the images
# Read the parquet dataset at localDataFile and print its schema.
imagesWithLabels = spark.read.parquet(localDataFile)
imagesWithLabels.printSchema()

![Smiley face](https://i.imgur.com/p2KgdYL.jpg)

In [None]:
# Baseline featurizer: resize each image to 60x60 into the "scaled" column,
# then have UnrollImage turn "scaled" into the "features" column.
it = (
    ImageTransformer()
    .setOutputCol("scaled")
    .resize(height=60, width=60)
)

ur = (
    UnrollImage()
    .setInputCol("scaled")
    .setOutputCol("features")
)

basicFeaturizer = Pipeline(stages=[it, ur])

In [None]:
# Featurizer backed by the downloaded model: reads the "image" column and
# writes "features", with one output layer cut off.
cntkFeaturizer = (
    ImageFeaturizer()
    .setInputCol("image")
    .setOutputCol("features")
    .setModelLocation(spark, model.uri)
    .setLayerNames(model.layerNames)
    .setCutOutputLayers(1)
)

![Resnet 18](https://i.imgur.com/Mb4Dyou.png)

### How does it work?

![Convolutional network weights](http://i.stack.imgur.com/Hl2H6.png)

In [None]:
# Helpers shared by the experiments below
def featurize(featurizer, train, test, name):
    """Apply `featurizer` to both splits and report how long it took.

    Args:
        featurizer: Pipeline stage that produces a "features" column.
        train: Training data; the pipeline is fitted on it.
        test: Test data.
        name: Label for the featurizer, used only in the printed message.

    Returns:
        (trainFeats, testFeats): the cached, featurized splits, reduced to
        the "features" and "labels" columns and repartitioned into 4.
    """
    startTime = time.time()
    stages = [
        featurizer,
        SelectColumns(cols=["features","labels"]),
        Repartition(n=4),
    ]
    fitted = Pipeline(stages=stages).fit(train)
    trainFeats = fitted.transform(train).cache()
    testFeats = fitted.transform(test).cache()

    # The counts are evaluated before the elapsed time is taken, so the
    # reported duration includes that work.
    imageCount = trainFeats.count() + testFeats.count()
    elapsed = time.time() - startTime
    print("Featurized {} images with {} featurizer in {} seconds".format(imageCount, name, elapsed))
    sys.stdout.flush()
    return trainFeats, testFeats

def predict(model, train, test, name):
    """Fit `model` on `train`, score `test`, and report how long it took.

    Args:
        model: Estimator stage to fit.
        train: Featurized training data.
        test: Featurized test data.
        name: Label for the feature set, used only in the printed message.

    Returns:
        The cached scored test data, reduced to the "scored_labels" and
        "labels" columns.
    """
    startTime = time.time()
    stages = [model, SelectColumns(cols=(["scored_labels","labels"]))]
    fitted = Pipeline(stages=stages).fit(train)
    predictions = fitted.transform(test).cache()

    # The count is evaluated before the elapsed time is taken, so the
    # reported duration includes that work.
    scoredCount = predictions.count()
    elapsed = time.time() - startTime
    print("Classified {} images from {} features in {} seconds".format(scoredCount, name, elapsed))
    sys.stdout.flush()

    return predictions

### Run the experiment

In [None]:
# Increase or remove the sampling in order to get better results
# Fixed seeds make the sampled subset and the 80/20 split repeatable between
# runs on the same input, so timings and accuracies can be compared.
train, test = imagesWithLabels.sample(False, 0.03, seed=42).randomSplit([.8,.2], seed=42)
train, test = train.repartition(1), test.repartition(1)

In [None]:
# Cache both splits and display their row counts as the cell output.
train.cache()
test.cache()
train.count(), test.count()

In [None]:
model = TrainClassifier().setModel(LogisticRegression()).setLabelCol("labels")

In [None]:
trainFeatsBasic, testFeatsBasic = featurize(basicFeaturizer, train, test, "basic")

In [None]:
basicPredictions = predict(model, trainFeatsBasic, testFeatsBasic, "basic")

In [None]:
trainFeatsCNTK, testFeatsCNTK = featurize(cntkFeaturizer, train, test, "cntk")

In [None]:
cntkPredictions = predict(model, trainFeatsCNTK, testFeatsCNTK, "cntk")

In [None]:
# Convert both prediction sets with toPandas() for the plotting cell.
# NOTE(review): the names are rebound to the converted objects, so cells run
# after this one see the toPandas() results rather than the originals.
basicPredictions = basicPredictions.toPandas()
cntkPredictions = cntkPredictions.toPandas()

In [None]:
# Expose both prediction sets to SQL under the names the %%sql cells select
# from. createOrReplaceTempView replaces the deprecated registerTempTable
# and can be re-run safely.
basicPredictions.createOrReplaceTempView("basicPredictions")
cntkPredictions.createOrReplaceTempView("cntkPredictions")

In [None]:
%%sql -q -o basicPredictions
select * from basicPredictions

In [None]:
%%sql -q -o cntkPredictions
select * from cntkPredictions

### Plot confusion matrix.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import pandas as pd
from glob import glob
import numpy as np
def evaluate(results, name):
    """Draw a row-normalized confusion matrix for `results` on the current axes.

    Args:
        results: Mapping/frame with "labels" (true) and "scored_labels"
            (predicted) columns; both are converted to int before comparing.
        name: Model description used in the plot title.

    The accuracy (percentage, one decimal) is written onto the plot. Rows
    for classes that never occur as a true label are shown as all zeros
    instead of NaN.
    """
    # Cast both sides so true and predicted labels share one type.
    y = [int(l) for l in results["labels"]]
    y_hat = [int(l) for l in results["scored_labels"]]

    accuracy = np.mean([1. if pred==true else 0. for (pred,true) in zip(y_hat,y)])
    counts = confusion_matrix(y, y_hat).astype("float")
    rowTotals = counts.sum(axis=1, keepdims=True)
    # A class that is predicted but never true has a zero row total; skip the
    # division there and leave the row at 0.
    cm = np.divide(counts, rowTotals, out=np.zeros_like(counts), where=rowTotals != 0)

    # Raw string: "\%" is not a valid escape sequence in a normal literal.
    plt.text(40, 10,r"$Accuracy$ $=$ ${}\%$".format(round(accuracy*100,1)),fontsize=14)
    plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
    plt.colorbar()
    plt.xlabel("$Predicted$ $label$", fontsize=18)
    plt.ylabel("$True$ $Label$", fontsize=18)
    plt.title("$Normalized$ $CM$ $for$ ${}$".format(name))

# Side-by-side comparison: CNTK-featurized model on the left, plain
# logistic regression on resized pixels on the right.
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
evaluate(cntkPredictions,"CNTKModel + LR")
plt.subplot(1,2,2)
evaluate(basicPredictions,"LR")
plt.show()