In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from PIL import Image
import io
import cv2
import numpy as np
import tensorflow as tf
import os
import time
from pyspark.sql.types import StringType, ArrayType, BooleanType
from IPython.display import display, clear_output,Image
import s3fs

# 1. Set up the input and output paths for the images and the models

In [14]:
# Client for the S3-compatible object store; the endpoint host is read from the environment.
endpoint = "https://" + os.environ['AWS_S3_ENDPOINT']
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': endpoint})


def _touch_and_inspect(folder, scheme=""):
    """Create a `.keep` marker object under `folder`, then return `fs.info(folder)`.

    `scheme` is prepended to `folder` only when building the marker path; it is
    used for folders whose path is stored without a URI scheme.
    """
    fs.touch(scheme + folder + '/.keep')
    return fs.info(folder)


# Model locations: the cascade file and the VGG19 file live in the same folder.
model_path = "s3a://pengfei/diffusion/computer_vision/trained_model"
fs.info(model_path)
cascade_model_path = vgg19_model_path = model_path
cascade_model_name = "haarcascade_frontalface_default.xml"
vgg19_model_name = "masknet.h5"

#### Modify: change bucket_name to the name of your own MinIO bucket
bucket_name = "pengfei"

# Streaming checkpoint folder (stored without a URI scheme).
check_point_path = bucket_name + "/tmp/checkpoint"
_touch_and_inspect(check_point_path, scheme='s3a://')

# Folder receiving one cropped image per detected face.
faces_output_path = "s3a://" + bucket_name + "/tmp/sparkcv/output/faces"
_touch_and_inspect(faces_output_path)

# Folder receiving the annotated copies of the input images.
final_output_path = "s3a://" + bucket_name + "/tmp/sparkcv/output/final"
_touch_and_inspect(final_output_path)

# Folder watched for incoming images.
image_input_folder_path = "s3a://" + bucket_name + "/tmp/sparkcv/input"
_touch_and_inspect(image_input_folder_path)

# Spark event log folder (stored without a URI scheme); its info is the cell output.
event_log_path = bucket_name + "/tmp/spark-history"
_touch_and_inspect(event_log_path, scheme='s3://')

{'Key': 'pengfei/tmp/spark-history',
 'name': 'pengfei/tmp/spark-history',
 'type': 'directory',
 'Size': 0,
 'size': 0,
 'StorageClass': 'DIRECTORY'}

# 2. Create a spark session

In [4]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session that runs its executors on Kubernetes.
spark = (
    SparkSession.builder
    .master("k8s://https://kubernetes.default.svc:443")
    .appName("Evaluate data format")
    # Kubernetes deployment settings; account and namespace come from the environment.
    .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master")
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
    .config("spark.executor.instances", "5")
    .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
    # Event log written to the folder prepared in the configuration cell.
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "s3a://" + event_log_path)
    # Delta Lake extension, catalog and package.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.8.0")
    .getOrCreate()
)

In [23]:
# Shell escape: list the Kubernetes pods with kubectl.
! kubectl get pods

NAME                                                   READY   STATUS      RESTARTS   AGE
deleting-pods-with-completed-status-1622538000-2bpcr   0/1     Completed   0          5m18s
jupyter-1622533988-5599d5bb49-jxkj6                    1/1     Running     0          72m
mlflow-deployment-dd54d6c6b-xnglp                      1/1     Running     0          42d
mlflow-model-deployment-869dd96bbf-f7mgs               1/1     Running     0          11d
postgres-1616502799-67f86f5bdf-wfgjx                   1/1     Running     0          69d
ubuntu-1616490233-56d6684bb4-trwqt                     1/1     Running     2          70d


## 2.1 helper functions

### 2.1.1 face extraction spark udf

In [7]:
def face_extraction(image_name):
    """Detect faces in one input image and save each detected face as its own PNG.

    Args:
        image_name: File name of the image inside `image_input_folder_path`.

    Returns:
        The list of file names (not full paths) of the face crops written under
        `faces_output_path`. Each name is
        `<image name without extension>_x<x>_y<y>_w<w>_h<h>.png`, where
        (x, y, w, h) is the face rectangle reported by the cascade classifier.
        The list is empty when no face is detected.

    Raises:
        IOError: If the input image cannot be read.
    """
    # Join folder and file name with an explicit "/" (plain concatenation
    # produced ".../inputname.png", which never points at the image).
    image_path = "{}/{}".format(image_input_folder_path, image_name)

    # NOTE(review): cv2.imread / cv2.CascadeClassifier / cv2.imwrite work on local
    # file paths, while the folders configured in this notebook are s3a:// URIs --
    # confirm these locations are actually readable by OpenCV on the executors.
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError("Could not read image: {}".format(image_path))

    # Run detection on a real grayscale copy. cv2.IMREAD_GRAYSCALE is an imread
    # flag, not a cvtColor conversion code, so it must not be passed to cvtColor.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    face_model = cv2.CascadeClassifier("{}/{}".format(cascade_model_path, cascade_model_name))
    faces = face_model.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4)  # (x, y, w, h) per face

    # NOTE(review): this swaps the R and B channels before the crops are saved, as
    # the original code did. It is kept so the pixel data consumed by the mask
    # model is unchanged -- confirm which channel order the model expects.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    # Strip the extension whatever its length (".png", ".jpeg", ...).
    base_name = os.path.splitext(image_name)[0]

    # Extract faces from the original image
    extracted_face_list = []
    for (x, y, w, h) in faces:
        crop = img[y:y + h, x:x + w]
        extracted_face_img_name = "{}_x{}_y{}_w{}_h{}.png".format(base_name, x, y, w, h)
        extracted_face_list.append(extracted_face_img_name)
        extracted_face_output_path = "{}/{}".format(faces_output_path, extracted_face_img_name)
        cv2.imwrite(extracted_face_output_path, crop)

    return extracted_face_list


# Spark UDF wrapper: one input image name -> array of extracted face image names.
Face_Extraction_UDF = f.udf(face_extraction, ArrayType(StringType()))


### 2.1.2 Column function to extract the image name


In [8]:
def extract_file_name(path):
    """Return a Column holding the part of `path` that follows its last "/"."""
    last_segment = f.substring_index(path, "/", -1)
    return last_segment

### 2.1.3 mask detection spark udf


In [9]:
def face_mask_prediction(face_image_name):
    """Predict with the pre-trained VGG19 model whether a cropped face wears a mask.

    Args:
        face_image_name: File name of a face crop stored under `faces_output_path`.

    Returns:
        True when the highest-scoring class index is 0 (treated as "mask"),
        False otherwise.
    """
    face_path = "{}/{}".format(faces_output_path, face_image_name)
    model_file = "{}/{}".format(vgg19_model_path, vgg19_model_name)

    # Read the raw face crop and resize it to the 128x128 input used below.
    face = cv2.resize(cv2.imread(face_path), (128, 128))
    # Add a leading batch axis, then divide the pixel values by 255.
    batch = np.reshape(face, [1, 128, 128, 3]) / 255.0

    # The model file is loaded on every call.
    classifier = tf.keras.models.load_model(model_file)
    score = classifier.predict(batch)
    return True if np.argmax(score) == 0 else False


# Spark UDF wrapper: face image name -> boolean "has mask".
Face_Mask_Prediction_UDF = f.udf(face_mask_prediction, BooleanType())

### 2.1.4 Get the face position in the original image

In [10]:
def get_face_coordinate_of_origin_image(face_image_name):
    """Parse the face rectangle encoded in an extracted face image name.

    The expected name layout is `<anything>_x<x>_y<y>_w<w>_h<h>[.<ext>]`. The
    four coordinate fields are taken from the END of the name, so the leading
    part (the original image name) may itself contain underscores.

    Args:
        face_image_name: File name of an extracted face image.

    Returns:
        A tuple `(x, y, w, h)` of ints.

    Raises:
        ValueError: If the name does not end with the four coordinate fields or
            a field is not an integer.
    """
    # Split from the right: only the last four "_"-separated fields are coordinates.
    parts = face_image_name.rsplit("_", 4)
    if len(parts) != 5:
        raise ValueError("Unexpected face image name: {!r}".format(face_image_name))
    _, x_field, y_field, w_field, h_field = parts

    # The last field still carries the file extension (e.g. "h40.png").
    h_field = h_field.split(".")[0]

    # Drop the one-letter prefix ("x", "y", "w", "h") of each field.
    return int(x_field[1:]), int(y_field[1:]), int(w_field[1:]), int(h_field[1:])

### 2.1.5 prediction integration spark udf


In [11]:
def integrate_face_mask_prediction(face_image_name, origin_image_name, has_mask):
    """Draw one face's mask prediction (label + rectangle) onto the original image.

    If an annotated copy of the image already exists in `final_output_path`
    (the image has several faces and another face was handled first), that copy
    is updated; otherwise the untouched image from `image_input_folder_path` is
    used as the starting point. The result is written to `final_output_path`.

    Args:
        face_image_name: Extracted face image name; it encodes the face rectangle.
        origin_image_name: File name of the image the face was extracted from.
        has_mask: Prediction for this face; truthy means "MASK".

    Returns:
        The string "Done".

    Raises:
        IOError: If the image to annotate cannot be read.
    """
    treated_image_path = "{}/{}".format(final_output_path, origin_image_name)

    # NOTE(review): os.path.isfile and cv2.imread/imwrite work on local file
    # paths, while the folders configured in this notebook are s3a:// URIs --
    # confirm these locations are reachable this way. Also note that several
    # faces of the same image read-modify-write the same output file.
    if os.path.isfile(treated_image_path):
        # Already annotated for another face: keep adding to that copy.
        source_path = treated_image_path
    else:
        # First face of this image: start from the untreated input.
        source_path = "{}/{}".format(image_input_folder_path, origin_image_name)

    image = cv2.imread(source_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError("Could not read image: {}".format(source_path))

    # Label text and its colour: (0, 255, 0) for "MASK", (0, 0, 255) for "NO MASK".
    if has_mask:
        mask_label, label_color = "MASK", (0, 255, 0)
    else:
        mask_label, label_color = "NO MASK", (0, 0, 255)

    # Get the coordinate and size of the face image
    (x, y, w, h) = get_face_coordinate_of_origin_image(face_image_name)

    # Put the label just above the face, but never above the top edge of the image.
    label_y = max(y - 10, 15)
    image = cv2.putText(image, mask_label, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                        label_color, 2)
    # Draw a rectangle around the face
    image = cv2.rectangle(image, (x, y), (x + w, y + h), label_color, 1)
    # Save the image
    cv2.imwrite(treated_image_path, image)

    return "Done"


# Spark UDF wrapper; the return type (a string status) is stated explicitly.
Integrate_Face_Mask_Prediction_UDF = f.udf(integrate_face_mask_prediction, StringType())

### 2.1.6 render image in jupyter notebook




In [12]:
def render_image(image_folder_path, image_list):
    """Display each named image from `image_folder_path` inline in the notebook.

    Args:
        image_folder_path: Folder holding the images; either a local path or an
            `s3://` / `s3a://` location.
        image_list: Iterable of image file names inside that folder.
    """
    for image_name in image_list:
        image_path = "{}/{}".format(image_folder_path, image_name)
        if image_path.startswith(("s3://", "s3a://")):
            # Image(filename=...) opens a local file, so object-store images are
            # fetched through the notebook's s3fs client and passed as raw bytes.
            with fs.open(image_path, "rb") as image_file:
                display(Image(data=image_file.read()))
        else:
            display(Image(filename=image_path))

# 3. Process image

To check whether everyone in an image is wearing a face mask, we follow these steps:
1. Read the raw image
2. Detect faces in the raw image and save each extracted face as a separate image (Haar cascade)
3. Use a pre-trained VGG19 model to check whether a mask is worn
4. Integrate the predictions as labels on the original image

## 3.1 Read raw image

In [15]:
# Take the schema from a batch read of the input folder and reuse it for the stream.
image_schema = spark.read.format("binaryFile").load(image_input_folder_path).schema

# Streaming source over the PNG files of the input folder, stamped with the current timestamp.
raw_image_df_stream = (
    spark.readStream
    .format("binaryFile")
    .schema(image_schema)
    .option("maxFilesPerTrigger", "500")
    .option("recursiveFileLookup", "true")
    .option("pathGlobFilter", "*.png")
    .load(image_input_folder_path)
    .withColumn("time_stamp", f.current_timestamp())
)



## 3.2 Detect faces and save each extracted face as a separate image

In [16]:
# Keep only the timestamp and the file name (last path segment) of each image.
image_name_df = (
    raw_image_df_stream
    .select("path", "time_stamp")
    .withColumn("origin_image_name", extract_file_name(f.col("path")))
    .drop("path")
)

# Run the face detection UDF on every row; it yields the list of face image names.
detected_face_list_df = image_name_df.withColumn(
    "detected_face_list",
    Face_Extraction_UDF("origin_image_name"),
)

# One output row per detected face.
detected_face_df = (
    detected_face_list_df
    .withColumn("extracted_face_image_name", f.explode(f.col("detected_face_list")))
    .drop("detected_face_list")
)

## 3.3 Predict whether each face wears a mask

In [17]:
# Add the boolean mask prediction for every extracted face.
predict_mask_df_stream = detected_face_df.withColumn(
    "has_mask",
    Face_Mask_Prediction_UDF("extracted_face_image_name"),
)


## 3.4 Integrate the labelled faces into the original image


In [18]:
# Draw each face's prediction onto its original image; the new column holds the UDF's status string.
complete_df_stream = predict_mask_df_stream.withColumn(
    "integration",
    Integrate_Face_Mask_Prediction_UDF(
        "extracted_face_image_name",
        "origin_image_name",
        "has_mask",
    ),
)

# 4. View the output data frame and image

In [19]:
# Start the streaming query; results go to an in-memory table queryable by its query name.
stream = (
    complete_df_stream
    .withWatermark("time_stamp", "10 seconds")
    .writeStream
    .outputMode("append")
    .format("memory")
    .queryName("raw_image_df_stream")
    .start()
)

In [20]:
# Poll the in-memory result table up to 20 times, 5 seconds apart, and show its content.
col_name = "origin_image_name"
for poll_round in range(20):
    clear_output(wait=True)
    display(stream.status)
    _df = spark.sql('SELECT * FROM raw_image_df_stream')
    if _df.count() > 0:
        # show() prints the rows itself and returns None, so it is not wrapped in display().
        _df.show(10, False)
        img_list = _df.select(col_name).distinct().toPandas()[col_name]
        display(img_list)
        render_image(final_output_path, img_list)
    if not stream.isActive:
        # The query has stopped (finished or failed): no new rows will arrive.
        break
    time.sleep(5)


{'message': 'Terminated with exception: Writing job aborted.',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [21]:
# Stop the streaming query started above.
stream.stop()

In [22]:
# Stop the Spark session (this also stops its underlying SparkContext).
spark.stop()