In [1]:
import io
import os
import numpy as np
import pandas as pd
from pyspark.sql.functions import col, pandas_udf, regexp_extract, regexp_replace, split
from PIL import Image
import tensorflow as tf
import s3fs

In [2]:
# S3 (MinIO) client; the endpoint host is read from the AWS_S3_ENDPOINT environment variable.
endpoint = "https://" + os.environ['AWS_S3_ENDPOINT']
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': endpoint})

# Location of the source images read by the streaming job.
data_path = "s3a://pengfei/diffusion/computer_vision"
fs.info(data_path)

#### Modify, you need to change the bucket_name to your own minio bucket name
bucket_name = "pengfei"


def _ensure_s3_folder(folder):
    """Create a `.keep` marker object under `folder` and return `fs.info(folder)`.

    `folder` is a "bucket/key" string without a URL scheme.
    """
    fs.touch('s3a://' + folder + '/.keep')
    return fs.info(folder)


# Streaming checkpoint folder.
check_point_path = "{}/tmp/checkpoint".format(bucket_name)
_ensure_s3_folder(check_point_path)

# Output folder.
output_path = "{}/tmp/sparkcv/output".format(bucket_name)
_ensure_s3_folder(output_path)

# Input folder. The original cell touched `output_path` a second time here,
# so the input folder marker was never created.
image_input_path = "{}/tmp/sparkcv/input".format(bucket_name)
_ensure_s3_folder(image_input_path)

# Spark event-log folder (used for spark.eventLog.dir).
event_log_path = "{}/tmp/spark-history".format(bucket_name)
_ensure_s3_folder(event_log_path)

PermissionError: Forbidden

In [None]:
from pyspark.sql import SparkSession

# Extra settings applied to the session builder, in this order.
spark_settings = {
    # Kubernetes: executor image, driver service account, executor count, namespace.
    "spark.kubernetes.container.image": "inseefrlab/jupyter-datascience:master",
    "spark.kubernetes.authenticate.driver.serviceAccountName": os.environ['KUBERNETES_SERVICE_ACCOUNT'],
    "spark.executor.instances": "5",
    "spark.kubernetes.namespace": os.environ['KUBERNETES_NAMESPACE'],
    # Event log written to the bucket folder prepared earlier.
    "spark.eventLog.enabled": "true",
    "spark.eventLog.dir": "s3a://" + event_log_path,
    # Delta Lake SQL extension, catalog and package.
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.jars.packages": "io.delta:delta-core_2.12:0.8.0",
}

session_builder = (
    SparkSession.builder
    .master("k8s://https://kubernetes.default.svc:443")
    .appName("Evaluate data format")
)
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

In [None]:
# Shell escape: list the pods kubectl can see (presumably to confirm that the Spark executor pods were created - verify).
! kubectl get pods

In [None]:
# A streaming source needs an explicit schema, so take it from a one-off batch read
# of the same location with the same "binaryFile" format.
image_schema = (
    spark.read
    .format("binaryFile")
    .load(data_path)
    .schema
)

# Streaming read of the PNG files under data_path (sub-folders included),
# with at most 500 files per trigger.
images2 = (
    spark.readStream
    .format("binaryFile")
    .schema(image_schema)
    .option("maxFilesPerTrigger", "500")
    .option("recursiveFileLookup", "true")
    .option("pathGlobFilter", "*.png")
    .load(data_path)
)

In [None]:
def extract_photo_name(path2):
    """Return a column holding the "/<name>_x" fragment of a path column.

    The pattern captures a "/" followed by a run of word characters (or a
    literal "|") and a trailing "_x". Group 1 is the whole fragment, so the
    result keeps both the leading "/" and the trailing "_x".
    NOTE(review): if only the bare name is wanted, group 2 is probably the
    intended one - confirm before changing.

    :param path2: column (or column name) containing the file path
    :return: Column with the extracted fragment
    """
    # Raw string: "\w" and "\d" are invalid escapes in a normal Python literal.
    return regexp_extract(path2, r"(/([\w|\d]*)_x)", 1)

def extract_photo_name1(path1):
    """Return a column with the leading directory part of a path removed.

    Everything from the start of the value up to and including the last "/"
    matched by the greedy pattern is replaced by an empty string.
    """
    directory_prefix = "(^.*/)"
    replacement = ""
    return regexp_replace(path1, directory_prefix, replacement)

def extract_photo_name2(path2):
    """Return a column with the "_...<digit>" part of a file name removed.

    The greedy pattern matches from the first "_" through the last digit that
    follows it, and that span is replaced by an empty string
    (e.g. "photo_x10_y20.png" becomes "photo.png").

    :param path2: column (or column name) containing the file name
    :return: Column with the underscore/digit span stripped
    """
    # Raw string: "\d" is an invalid escape in a normal Python literal.
    return regexp_replace(path2, r"(_.*)\d", "")

In [None]:
# Build the "photo" column in two passes (strip the directory, then strip the
# "_...<digit>" span), and drop the binary "content" column at the end.
photo_file_name = extract_photo_name1(col("path")).alias("photo")
df2 = (
    images2
    .select(photo_file_name, col("content"))
    .withColumn("photo", extract_photo_name2(col("photo")))
    .drop(col("content"))
)

In [None]:
import cv2

def face_extraction(image_name,
                    image_input_folder_path="/tmp/sparkcv/input/",
                    cascade_model_path="/mnt/hgfs/Centos7_share_folder/trained_models",
                    faces_output_path="/tmp/sparkcv/output/faces/"):
    """Detect frontal faces in one image and save every face as its own PNG.

    :param image_name: file name of the image, relative to `image_input_folder_path`
    :param image_input_folder_path: folder (with trailing "/") holding the input images
    :param cascade_model_path: folder holding haarcascade_frontalface_default.xml
    :param faces_output_path: folder (with trailing "/") receiving the cropped faces;
        created if it does not exist
    :return: number of detected faces, as a string
    :raises FileNotFoundError: if the image or the cascade model cannot be loaded
    """
    image_path = image_input_folder_path + image_name

    # cv2.imread does not raise on a missing/unreadable file, it returns None.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError("Cannot read image: {}".format(image_path))

    # cv2.IMREAD_GRAYSCALE is an imread flag, not a colour-conversion code;
    # the grayscale conversion of a BGR image is COLOR_BGR2GRAY.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # loading haarcascade_frontalface_default.xml, you can get all the pre-trained model from
    # https://github.com/opencv/opencv/tree/3.4/data/haarcascades
    model_file = "{}/haarcascade_frontalface_default.xml".format(cascade_model_path)
    face_model = cv2.CascadeClassifier(model_file)
    if face_model.empty():
        raise FileNotFoundError("Cannot load cascade model: {}".format(model_file))

    # returns a list of (x,y,w,h) tuples
    faces = face_model.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4)

    # cv2.imwrite fails silently when the target folder is missing.
    os.makedirs(faces_output_path, exist_ok=True)
    base_name = os.path.splitext(image_name)[0]

    # Extract every face found in the photo. The crop is taken from the original
    # BGR image, which is the channel order cv2.imwrite expects.
    for (x, y, w, h) in faces:
        crop = img[y:y + h, x:x + w]
        extracted_face_output_path = "{}{}_x{}_y{}_w{}_h{}.png".format(
            faces_output_path, base_name, x, y, w, h)
        cv2.imwrite(extracted_face_output_path, crop)

    return str(len(faces))

In [None]:
from pyspark.sql.functions import udf
# Wrap the face_extraction function defined above as a Spark UDF
# (no return type is passed to udf()).
Extraction_des_visages_UDF = udf(
    lambda image_name: face_extraction(image_name)
)

In [None]:
# check_point_path is a bare "bucket/key" string. Without a scheme Spark would not
# resolve it to the S3 bucket, so prefix it with s3a:// exactly as is done for
# spark.eventLog.dir when the session is built.
spark.conf.set("spark.sql.streaming.checkpointLocation", "s3a://" + check_point_path)

In [1]:
from delta.tables import *

# DataStreamWriter has no .table() method; the call that starts a streaming write
# into a named table is toTable() (available from Spark 3.1). On an older Spark,
# start the query with .start(<delta path>) instead.
extraction_visage_query = (
    df2.writeStream
    .format("delta")
    .outputMode("append")
    .toTable("extraction_visage")
)
extraction_visage_query

ModuleNotFoundError: No module named 'delta'

In [None]:
# Import from the public IPython.display module (IPython.core.display is a deprecated
# import path for these names) and alias the class so it does not overwrite the PIL
# `Image` module imported at the top of the notebook.
from IPython.display import Image as IPythonImage, display
display(IPythonImage(filename='test.png'))