Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# **This cell configures the Spark session - do not change**

In [55]:
%%configure -f
{
"conf": {
     "spark.rpc.message.maxSize": 1024,
     "spark.kryoserializer.buffer.max": "256m"
   }
}

StatementMeta(, 27, -1, Finished, Available)

# **These are the parameters that need to be changed to your values**

In [56]:
# --- Input and clustering ---

# Root folder that is scanned for images (sub-folders are searched too)
input_directory = 'abfss://[].dfs.core.windows.net/animal_faces/walkthrough/'
# Number of clusters requested from KMeans - begin with an estimate and tune it
number_of_clusters = 3

# --- Output ---

# Folder that receives the clustering result
output_directory = 'abfss://[].dfs.core.windows.net/animal_faces/output/'
# Name of the CSV output written below output_directory
output_filename = 'animal_faces_clustered_walkthrough_pca2.csv'

# --- Storage account (used to build SAS-signed blob URLs) ---

# Blob endpoint of the account: https://[accountname].blob.core.windows.net
account_url = "https://[].blob.core.windows.net"
# Storage account name: [accountname]
account_name = ''
# Storage account key (looks like [iufquq34r423r2==]); the SAS token is signed with it.
# NOTE(review): avoid saving a real key in the notebook - consider reading it from a secret store.
account_key = ''

# Name of the primary ADLS file system (share)
file_system_name = "share"
# Folder inside the file system that holds the images, e.g. bbc/videos/
directory_name = 'animal_faces/walkthrough'

# --- Behaviour switches ---

# True produces 3 scatter-plot coordinates (X, Y, Z); False produces 2 (X, Y)
SCATTER_PLOT_3D = False
# True writes the output without coalescing, so the Coalesce notebook has to be run
# afterwards to merge the partition files into a single file
LOW_MEMORY_MODE = True

# --- Azure ML experiment tracking ---

# Azure subscription id
subscription_id = ""
# Resource group of the AzureML workspace
resource_group = ""
# Name of the AzureML workspace
workspace_name = ""

StatementMeta(DataDiscovery, 27, 1, Finished, Available)

## Track the Experiment in Azure ML

In [None]:
from azureml.core import Workspace, Experiment, Run
import mlflow

# Point MLflow at the tracking server of the configured AzureML workspace.
ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name)
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

# One experiment per notebook name + Spark job id.
experiment_name = f"({mssparkutils.runtime.context['notebookname']}_{str(mssparkutils.env.getJobId())})"
mlflow.set_experiment(experiment_name)

# Notebook parameters from the parameter cell.
# The input location is `input_directory`; the name `input_filename` that was logged
# here before is not defined anywhere in the notebook and raised a NameError.
# The account key is deliberately not logged.
notebook_params = {
    "input_directory": input_directory,
    "number_of_clusters": number_of_clusters,
    "output_directory": output_directory,
    "output_filename": output_filename,
    "account_url": account_url,
    "account_name": account_name,
    "file_system_name": file_system_name,
    "directory_name": directory_name,
    "SCATTER_PLOT_3D": SCATTER_PLOT_3D,
    "LOW_MEMORY_MODE": LOW_MEMORY_MODE,
}
mlflow.log_params(notebook_params)

# Synapse runtime context of this run.
params = {
    "sparkpool": mssparkutils.runtime.context['sparkpool'],
    "workspace": mssparkutils.runtime.context['workspace'],
    "notebookname": mssparkutils.runtime.context['notebookname'],
    "isForPipeline": mssparkutils.runtime.context['isForPipeline'],
    "pipelinejobid": mssparkutils.runtime.context['pipelinejobid']
}

mlflow.log_params(params)
# Automatically log the pyspark.ml estimators fitted later in the notebook.
mlflow.pyspark.ml.autolog()

In [57]:
import numpy as np
import io
import pandas as pd
import ntpath
import os

from pyspark.sql.types import StringType
from pyspark.sql.functions import col, pandas_udf, lit, struct, PandasUDFType, udf
import pyspark.sql.types as Types
from pyspark.ml.linalg import Vectors, VectorUDT

from PIL import Image
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input, decode_predictions

from pyspark.sql.types import StringType
from pyspark.sql.functions import col, pandas_udf, lit, struct, PandasUDFType, udf
import pyspark.sql.types as Types
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import PCA
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.ml import Pipeline
            

# Read every *.jpg below input_directory (including sub-folders) as binary files
images = (
    spark.read.format("binaryFile")
    .option("recursiveFileLookup", "true")
    .option("pathGlobFilter", "*.jpg")
    .load(input_directory)
)

# Keep only the file path and the raw file bytes
images_df = images.select(col("path"), col("content"))


def get_filename(row):
    """Return the final component (file name) of the path string `row`.

    Both "/" and "\\" are treated as separators because `ntpath` is used.
    """
    _, file_name = ntpath.split(row)
    return file_name

def preprocess(img_data):
  """Decode raw image bytes into a model-ready float32 array.

  The bytes are opened with PIL, converted to RGB, resized to 299x299 and
  passed through InceptionV3's `preprocess_input`.

  Args:
    img_data: raw bytes of one image file.

  Returns:
    The result of `preprocess_input` on a (299, 299, 3) float32 array. If the
    bytes cannot be opened/decoded (PIL raises OSError) an all-zero image of the
    same shape is used instead, so one bad file does not fail the whole batch.
  """
  try:
    img = Image.open(io.BytesIO(img_data)).convert('RGB')
    img = img.resize((299, 299))
    x = np.asarray(img, dtype="float32")
  except OSError:
    # Same dtype as the decoded path: a float64 placeholder would upcast the
    # whole stacked batch to float64 when the caller builds np.array([...]).
    x = np.zeros((299, 299, 3), dtype="float32")
  return preprocess_input(x)

def keras_model_udf(model_fn):
  """Wrap a Keras model factory in a scalar-iterator pandas UDF.

  Args:
    model_fn: zero-argument callable that returns the model; it is called once
      per invocation of the inner `predict` generator, before the first batch.

  Returns:
    A pandas UDF producing a struct with the top-1 `class`, `desc` and `score`
    plus `inceptionv3`, the model's full prediction vector for the image.
  """
  def predict(image_batch_iter):
    # Build the model once and reuse it for every batch of this iterator.
    model = model_fn()
    for img_series in image_batch_iter:
      batch = np.array([preprocess(raw_bytes) for raw_bytes in img_series])
      predictions = model.predict(batch, batch_size=64)
      top_labels = decode_predictions(predictions, top=1)
      # One row per image: the top-1 (class, desc, score) tuple followed by the
      # image's complete prediction vector.
      rows = [labels[0] + (scores,) for labels, scores in zip(top_labels, predictions)]
      yield pd.DataFrame(rows)

  return pandas_udf(
    "class: string, desc: string, score:float, inceptionv3: array<float>",
    PandasUDFType.SCALAR_ITER,
  )(predict)

def inceptionv3_fn():
    """Build an InceptionV3 model that carries the broadcast weights.

    Returns:
        An InceptionV3 model whose weights are set from `bc_model_weights`.
    """
    # weights=None: only the architecture is needed here. The weights are replaced
    # by the broadcast copy on the next line, so loading the ImageNet weights
    # again on every worker would be wasted work.
    model = InceptionV3(weights=None)
    model.set_weights(bc_model_weights.value)
    return model


# Load InceptionV3 once on the driver and broadcast its weights so that
# inceptionv3_fn can rebuild the model from bc_model_weights.
model = InceptionV3()
bc_model_weights = sc.broadcast(model.get_weights())

# Score every image; the UDF result is stored in the struct column "preds".
inceptionv3_udf = keras_model_udf(inceptionv3_fn)
scored_content = inceptionv3_udf(col("content"))
predictions = images_df.withColumn("preds", scored_content)

# pyspark.ml stages need a Vector column, so convert the array<float> predictions.
list_to_vector_udf = udf(lambda values: Vectors.dense(values), VectorUDT())
inception_scores = predictions["preds.inceptionv3"]
df_with_vectors = predictions.select(
    predictions["path"],
    predictions["content"],
    predictions["preds.desc"],
    inception_scores,
    list_to_vector_udf(inception_scores).alias("features"),
)

k = number_of_clusters

# Stage 1: reduce the "features" vectors to 20 principal components.
pca_1 = PCA(k=20, inputCol="features", outputCol="pca_features")

# Stage 3: project the 20 components down to the scatter-plot coordinates
# (3 for a 3D plot, 2 otherwise).
scatter_dimensions = 3 if SCATTER_PLOT_3D else 2
pca_2 = PCA(k=scatter_dimensions, inputCol="pca_features", outputCol="pca_scatterplot_features")

# Stage 2: cosine-distance KMeans with a fixed seed.
# NOTE(review): KMeans reads "features" (its default featuresCol, spelled out here),
# i.e. the full vectors and not "pca_features" - confirm that this is intended.
kmeans = KMeans(featuresCol="features", k=k, seed=42, initMode="k-means||", distanceMeasure="cosine")

pipeline = Pipeline(stages=[pca_1, kmeans, pca_2])

# Fit all stages and add their output columns to every image row.
model = pipeline.fit(df_with_vectors)
df_coords = model.transform(df_with_vectors)


StatementMeta(DataDiscovery, 27, 2, Finished, Available)

2022-06-21 12:52:44.552054: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [58]:

from azure.identity import DefaultAzureCredential
from datetime import datetime, timedelta
from azure.storage.blob import BlobServiceClient, generate_container_sas, BlobSasPermissions
from azure.storage.filedatalake import DataLakeServiceClient, DirectorySasPermissions, generate_directory_sas

token_credential = DefaultAzureCredential()

# NOTE(review): blob_service_client is not used by any cell shown in this notebook.
blob_service_client = BlobServiceClient(
    account_url=account_url,
    credential=token_credential
)

# Lifetime of the SAS token that is embedded in every blob_path of the output.
# NOTE(review): 100 days is long for a token that ends up in a CSV - shorten if possible.
SAS_EXPIRY_DAYS = 100

# Read-only SAS for the image directory, signed with the storage account key.
# generate_directory_sas expects DirectorySasPermissions (or a permission string),
# not the blob-level BlobSasPermissions.
SAS = generate_directory_sas(
        account_name=account_name,
        file_system_name=file_system_name,
        directory_name=directory_name,
        credential=account_key,
        permission=DirectorySasPermissions(read=True),
        expiry=datetime.utcnow() + timedelta(days=SAS_EXPIRY_DAYS))

# Query-string form that is appended to each blob URL.
SAS_key  = "?" + SAS

storage_path = os.path.join(account_url, file_system_name, directory_name)
SAS_path = []

def build_sas_path(row):
    """Build the SAS-signed blob URL for one image path.

    Args:
        row: full path string of the image (the "path" column).

    Returns:
        account_url/file_system_name/directory_name/<relative path> followed by
        SAS_key. The relative path is everything after "/<directory_name>/" in
        `row`, so images found in nested sub-folders keep their sub-folder. If
        `row` does not contain the configured directory, only the file name is
        used.
    """
    configured_dir = directory_name.strip("/")
    relative_path = ""
    if configured_dir:
        # Images are read with recursiveFileLookup, so the file may sit in a
        # sub-folder of directory_name; the bare file name would drop that part.
        relative_path = row.partition("/" + configured_dir + "/")[2]
    if not relative_path:
        relative_path = ntpath.basename(row)
    return account_url + "/" + file_system_name + "/" + directory_name + "/" + relative_path + SAS_key

udf_build_sas_path = udf(build_sas_path, StringType())

def _coordinate_as_str(vector, axis):
    """Return element `axis` of `vector.values`, converted with str()."""
    return str(vector.values[axis])

def get_X(row):
    """Return the first element of `row.values` as a string."""
    return _coordinate_as_str(row, 0)

def get_Y(row):
    """Return the second element of `row.values` as a string."""
    return _coordinate_as_str(row, 1)

def get_Z(row):
    """Return the third element of `row.values` as a string."""
    return _coordinate_as_str(row, 2)

# Spark UDFs that return one scatter-plot coordinate each, as a string
udf_get_X = udf(get_X, StringType())
udf_get_Y = udf(get_Y, StringType())
udf_get_Z = udf(get_Z, StringType())

# SAS-signed URL of every image
df_coords = df_coords.withColumn("blob_path", udf_build_sas_path(df_coords.path))

# X and Y are always produced; Z only for the 3D scatter plot
axis_udfs = [("X", udf_get_X), ("Y", udf_get_Y)]
if SCATTER_PLOT_3D:
    axis_udfs.append(("Z", udf_get_Z))

for axis_name, axis_udf in axis_udfs:
    df_coords = df_coords.withColumn(
        axis_name,
        axis_udf(df_coords.pca_scatterplot_features).cast('string'),
    )

# Remove the raw bytes and the intermediate vector columns before writing
df_coords = df_coords.drop('inceptionv3', 'features','pca_features', 'content', 'pca_scatterplot_features')


StatementMeta(DataDiscovery, 27, 3, Finished, Available)

In [60]:
# Destination of the CSV output (written with a header row; existing output is overwritten).
output_path = os.path.join(output_directory, output_filename)

# Low-memory mode writes the partitions as they are; the Coalesce notebook then has
# to merge the part files. Otherwise the data is coalesced to a single partition first.
df_to_write = df_coords if LOW_MEMORY_MODE else df_coords.coalesce(1)
df_to_write.write.mode('overwrite').options(header='true').csv(output_path)

# End the run through MLflow's public API; the previous call reached the same
# function through the namespace of the mlflow.pyspark.ml module.
mlflow.end_run()

StatementMeta(DataDiscovery, 27, 5, Finished, Available)