In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11 pyspark-shell"

In [2]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

sc = SparkContext('local')
spark = SparkSession(sc)

In [3]:
from pyspark.ml.image import ImageSchema
image_df = ImageSchema.readImages("images")
image_df.toPandas()

Unnamed: 0,image
0,(file:/Users/agnieszkabiernacka/Desktop/git/Da...
1,(file:/Users/agnieszkabiernacka/Desktop/git/Da...
2,(file:/Users/agnieszkabiernacka/Desktop/git/Da...


In [4]:
image_df = ImageSchema.readImages("images")

## Transfer learning

In [5]:
from pyspark.sql.functions import *
tulips_df = ImageSchema.readImages("flower_photos/tulips").withColumn("label", lit(1)).limit(10)
roses_df = ImageSchema.readImages("flower_photos/roses").withColumn("label", lit(0)).limit(10)


In [6]:

tulips_train, tulips_test, _ = tulips_df.randomSplit([0.6, 0.3, 0.1])  # use larger training sets (e.g. [0.6, 0.4] for non-community edition clusters)
roses_train, roses_test, _ = roses_df.randomSplit([0.6, 0.3, 0.1])     # use larger training sets (e.g. [0.6, 0.4] for non-community edition clusters)
train_df = tulips_train.unionAll(roses_train)
test_df = tulips_test.unionAll(roses_test)

# Under the hood, each of the partitions is fully loaded in memory, which may be expensive.
# This ensure that each of the paritions has a small size.
train_df = train_df.repartition(100)
test_df = test_df.repartition(100)

In [7]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from sparkdl import DeepImageFeaturizer 

featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features", modelName="InceptionV3")
lr = LogisticRegression(maxIter=5, regParam=0.05, elasticNetParam=0.3, labelCol="label")
p = Pipeline(stages=[featurizer, lr])

p_model = p.fit(train_df)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [8]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

tested_df = p_model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(tested_df.select("prediction", "label"))))

Test set accuracy = 0.7


In [9]:
tested_df.toPandas()

Unnamed: 0,image,label,features,rawPrediction,probability,prediction
0,(file:/Users/agnieszkabiernacka/Desktop/git/Da...,1,"[0.2271287590265274, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-2.1270269158902897, 2.1270269158902897]","[0.10649756717954208, 0.8935024328204579]",1.0
1,(file:/Users/agnieszkabiernacka/Desktop/git/Da...,1,"[0.2719690799713135, 0.5799758434295654, 0.0, ...","[-2.091161151399062, 2.091161151399062]","[0.10995888340422419, 0.8900411165957758]",1.0
2,(file:/Users/agnieszkabiernacka/Desktop/git/Da...,1,"[0.42978203296661377, 0.0, 0.19137954711914062...","[-1.7315559533843232, 1.7315559533843232]","[0.1503886644381542, 0.8496113355618459]",1.0
3,(file:/Users/agnieszkabiernacka/Desktop/git/Da...,1,"[0.0, 0.0, 0.0, 0.518147349357605, 0.0, 0.0, 0...","[-27.620443125823375, 27.620443125823375]","[1.0106341348329834e-12, 0.9999999999989895]",1.0
4,(file:/Users/agnieszkabiernacka/Desktop/git/Da...,1,"[0.0, 0.2156921923160553, 0.6671097278594971, ...","[-17.192780406046005, 17.192780406046005]","[3.4140535298693253e-08, 0.9999999658594647]",1.0
5,(file:/Users/agnieszkabiernacka/Desktop/git/Da...,0,"[0.7097122073173523, 0.07345157861709595, 0.26...","[-469.9846518379927, 469.9846518379927]","[7.731417527395182e-205, 1.0]",1.0
6,(file:/Users/agnieszkabiernacka/Desktop/git/Da...,0,"[0.0, 0.3295484185218811, 1.0895251035690308, ...","[0.01773589645516419, -0.01773589645516419]","[0.5044338578874581, 0.49556614211254185]",0.0
7,(file:/Users/agnieszkabiernacka/Desktop/git/Da...,0,"[0.7112337946891785, 0.0, 0.3489874005317688, ...","[-7.70352164348588, 7.70352164348588]","[0.0004510317702174508, 0.9995489682297825]",1.0
8,(file:/Users/agnieszkabiernacka/Desktop/git/Da...,0,"[0.0, 0.0, 0.0, 0.6153160333633423, 0.0, 0.0, ...","[1.0048001739756212, -1.0048001739756212]","[0.732001302709658, 0.2679986972903419]",0.0
9,(file:/Users/agnieszkabiernacka/Desktop/git/Da...,0,"[0.0, 0.27889925241470337, 0.07056798785924911...","[-202.93339892805423, 202.93339892805423]","[7.364523488877299e-89, 1.0]",1.0


## Learning

In [10]:
import PIL.Image
import numpy as np
from keras.applications.imagenet_utils import preprocess_input
from sparkdl.estimators.keras_image_file_estimator import KerasImageFileEstimator

def load_image_from_uri(local_uri):
    img = (PIL.Image.open(local_uri).convert('RGB').resize((299, 299), PIL.Image.ANTIALIAS))
    img_arr = np.array(img).astype(np.float32)
    img_tnsr = preprocess_input(img_arr[np.newaxis, :])
    return img_tnsr

In [11]:
from keras.layers import Activation, Dense, Flatten
from keras.models import Sequential

model = Sequential()
model.add(Flatten(input_shape=(299, 299, 3)))
model.add(Dense(2))
model.add(Activation("softmax"))
model.save('model-full.h5')

In [12]:
estimator = KerasImageFileEstimator( inputCol="uri",
                                     outputCol="prediction",
                                     labelCol="one_hot_label",
                                     imageLoader=load_image_from_uri,
                                     kerasOptimizer='adam',
                                     kerasLoss='categorical_crossentropy',
                                     modelFile='model-full.h5' # local file path for model
                                   ) 

In [14]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = (
  ParamGridBuilder()
  .addGrid(estimator.kerasFitParams, [{"batch_size": 32, "verbose": 0},
                                      {"batch_size": 64, "verbose": 0}])
  .build()
)
bc = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label" )
cv = CrossValidator(estimator=estimator, estimatorParamMaps=paramGrid, evaluator=bc, numFolds=2)


In [15]:

train_df = spark.createDataFrame([
    ("flower_photos/tulips/10791227_7168491604.jpg", 1),
    ("flower_photos/tulips/11746080_963537acdc.jpg", 1),
    ("flower_photos/tulips/11746276_de3dec8201.jpg", 2),
    ("flower_photos/tulips/11746367_d23a35b085_n.jpg", 2),
    ("flower_photos/roses/12240303_80d87f77a3_n.jpg", 0),
    ("flower_photos/roses/22679076_bdb4c24401_m.jpg", 0),
    ("flower_photos/roses/24781114_bc83aa811e_n.jpg", 0)
], ["uri", "label"])

In [16]:
test_df = spark.createDataFrame([
    ("flower_photos/tulips/10791227_7168491604.jpg", 1),
    ("flower_photos/roses/24781114_bc83aa811e_n.jpg", 0)
], ["uri", "label"])

In [17]:
from pyspark.ml.feature import OneHotEncoderEstimator

oh_encoder = OneHotEncoderEstimator(inputCols=["label"],
                                 outputCols=["one_hot_label"])
oh_model = oh_encoder.fit(train_df)

In [18]:

train_df = oh_model.transform(train_df)
test_df = oh_model.transform(test_df)

In [19]:
train_df.limit(1).toPandas()

Unnamed: 0,uri,label,one_hot_label
0,flower_photos/tulips/10791227_7168491604.jpg,1,"(0.0, 1.0)"


In [20]:
cvModel = cv.fit(train_df)


Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 2 variables.
INFO:tensorflow:Converted 2 variables to const ops.





INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.



INFO:tensorflow:Froze 2 variables.
INFO:tensorflow:Converted 2 variables to const ops.
INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.
INFO:tensorflow:Froze 2 variables.
INFO:tensorflow:Converted 2 variables to const ops.
INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.
INFO:tensorflow:Froze 2 variables.
INFO:tensorflow:Converted 2 variables to const ops.
INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.


In [21]:
cvModel.transform(test_df).limit(1).toPandas()

INFO:tensorflow:Froze 2 variables.
INFO:tensorflow:Converted 2 variables to const ops.




INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.


Unnamed: 0,uri,label,one_hot_label,prediction
0,flower_photos/tulips/10791227_7168491604.jpg,1,"(0.0, 1.0)","[1.0, 0.0]"


In [22]:
bc.evaluate(cvModel.transform(test_df))

INFO:tensorflow:Froze 2 variables.
INFO:tensorflow:Converted 2 variables to const ops.
INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.




0.0