In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from PIL import Image
import io
import cv2
import numpy as np
import tensorflow as tf
import os
import time

from pyspark.sql.types import StringType, ArrayType, BooleanType

# 1. Set up the input and output paths for the images and the model

In [2]:
# --- Image locations ---------------------------------------------------------
# Folder the raw input images are read from.
image_input_folder_path = "/tmp/sparkcv/input/"
# Folder the cropped face images are written to.
faces_output_path = "/tmp/sparkcv/output/faces/"

# --- Face detection model ----------------------------------------------------
# Directory and file name of the Haar cascade; the two are joined with "/" when
# the classifier is loaded, so the directory carries no trailing slash.
cascade_model_path = "/mnt/hgfs/Centos7_share_folder/trained_models"
cascade_model_name = "haarcascade_frontalface_default.xml"


# 2. Create a Spark session

In [3]:
# Local Spark session configured with the Delta Lake SQL extension, catalog
# implementation and package coordinates.
spark = (
    SparkSession.builder
    .master("local")
    .appName("StreamingExample")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.8.0")
    .getOrCreate()
)

# 3. Read the raw images

In [4]:
# Streaming queries currently active on this session.
query_list = spark.streams.active
print(query_list)

[]


In [5]:
# Take the schema from a one-off batch read of the input folder and hand it to
# the streaming reader explicitly.
image_schema = spark.read.format("binaryFile").load(image_input_folder_path).schema

# Streaming DataFrame over the *.png files of the input folder.
raw_image_df_stream = (
    spark.readStream
    .format("binaryFile")
    .option("pathGlobFilter", "*.png")
    .schema(image_schema)
    .load(image_input_folder_path)
)

# Start a query that collects the stream into the in-memory table
# "raw_image_df_stream", triggering every 2 seconds.
# NOTE(review): "truncate" looks like a console-sink option and is presumably
# ignored by the memory sink -- confirm before relying on it.
stream = (
    raw_image_df_stream.writeStream
    .format("memory")
    .option("truncate", "false")
    .trigger(processingTime='2 seconds')
    .queryName("raw_image_df_stream")
    .start()
)



In [None]:
# NOTE(review): `Image` imported here rebinds the `Image` name imported from PIL
# in the first cell. The import is left untouched so that any later cell relying
# on either binding behaves as before -- consider aliasing one of the two.
from IPython.display import display, clear_output,Image

# Poll the query until it stops: every 10 seconds, replace the cell output with
# the query status and the rows collected so far in the in-memory table.
while stream.isActive:
    clear_output(wait=True)
    display(stream.status)
    # DataFrame.show() prints the table itself and returns None, so it must not
    # be wrapped in display() -- doing so rendered a stray "None" under the table.
    spark.sql('SELECT * FROM raw_image_df_stream').show()
    time.sleep(10)
# The loop only exits once the query is inactive, so this call does not block;
# it is kept because awaitTermination surfaces the exception of a failed query.
stream.awaitTermination(200)

{'message': 'Waiting for next trigger',
 'isDataAvailable': False,
 'isTriggerActive': False}

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/tmp/sparkcv...|2020-05-22 07:19:34|787983|[89 50 4E 47 0D 0...|
|file:/tmp/sparkcv...|2020-05-22 07:19:28|206029|[89 50 4E 47 0D 0...|
+--------------------+-------------------+------+--------------------+



None

In [None]:
# Stop the streaming query. Only the StreamingQuery handle returned by start()
# can be stopped: `raw_image_df_stream` is a DataFrame and has no stop() method,
# so calling it raised AttributeError before the query itself was stopped.
stream.stop()

In [None]:
def extract_file_name(path):
    """Build a column expression holding the last "/"-separated segment of a path.

    Args:
        path: Column (or column name) whose values are "/"-separated file paths.

    Returns:
        A Column with everything after the final "/", i.e. the file name.
    """
    # A count of -1 keeps everything to the right of the last delimiter.
    file_name = f.substring_index(path, "/", -1)
    return file_name

In [None]:
def face_extraction(image_name):
    """Detect faces in one input image and save every face as its own PNG.

    The image is read from ``image_input_folder_path + image_name``. Each
    detected face is cropped from the colour image and written to
    ``faces_output_path`` as
    ``<image stem>_x<x>_y<y>_w<w>_h<h>.png``.

    Args:
        image_name: File name (not a full path) of an image in the input folder.

    Returns:
        list[str]: File names of the face crops, one per detected face. Empty
        when the image cannot be read or no face is detected.
    """
    image_path = image_input_folder_path + image_name
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # instead of raising; there is nothing to detect in that case.
        return []

    # Haar cascades run on a single-channel image. cv2.imread yields BGR, so the
    # conversion code is COLOR_BGR2GRAY (IMREAD_GRAYSCALE is an imread flag, not
    # a cvtColor conversion code).
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    face_model = cv2.CascadeClassifier("{}/{}".format(cascade_model_path, cascade_model_name))
    # Each detection is an (x, y, w, h) rectangle in pixel coordinates.
    faces = face_model.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4)

    # cv2.imwrite returns False instead of raising when the target folder is
    # missing, so make sure it exists before writing any crop.
    os.makedirs(faces_output_path, exist_ok=True)

    # splitext drops the extension whatever its length (the old [:-4] slice
    # assumed exactly three characters after the dot).
    image_stem = os.path.splitext(image_name)[0]

    extracted_face_list = []
    for (x, y, w, h) in faces:
        # Crop from the untouched BGR image: cv2.imwrite expects BGR, so no
        # channel swap is needed before saving.
        crop = img[y:y + h, x:x + w]
        extracted_face_img_name = "{}_x{}_y{}_w{}_h{}.png".format(image_stem, x, y, w, h)
        extracted_face_list.append(extracted_face_img_name)
        cv2.imwrite(faces_output_path + extracted_face_img_name, crop)

    return extracted_face_list


# Spark UDF wrapper: maps an image-name column to an array of face file names.
Face_Extraction_UDF = f.udf(face_extraction, ArrayType(StringType()))

In [None]:
def detect_faces_streaming(raw_image_df_stream):
    """Turn a DataFrame of image paths into one row per detected face.

    Args:
        raw_image_df_stream: DataFrame with a ``path`` column holding the
            location of each image.

    Returns:
        DataFrame with the columns ``origin_image_name`` (file name taken from
        ``path``) and ``extracted_face_image_name`` (one face crop per row).
    """
    # Keep only the path and reduce it to the bare file name.
    names_only = raw_image_df_stream.select("path")
    names_only = names_only.withColumn("origin_image_name", extract_file_name(f.col("path")))
    names_only = names_only.drop("path")

    # Run face detection per image; the UDF yields a list of crop file names.
    with_face_lists = names_only.withColumn(
        "detected_face_list", Face_Extraction_UDF("origin_image_name")
    )

    # Flatten the list column into one row per face, then discard the list.
    one_row_per_face = with_face_lists.withColumn(
        "extracted_face_image_name", f.explode(f.col("detected_face_list"))
    )
    detected_face_df = one_row_per_face.drop("detected_face_list")
    return detected_face_df