In [1]:
import findspark
findspark.init()

import numpy as np
import cv2
import face_recognition
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from PIL import Image
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.functions import udf

In [2]:

# Build the local Spark session shared by every later cell.
#
# The memory sizes are passed through the builder so that they are part of the
# configuration the JVM is launched with. SparkContext.setSystemProperty has to
# start the JVM gateway before it can set a property, which is too late for
# heap-size settings such as spark.driver.memory to take effect.
spark = (
    SparkSession.builder
    .appName("Streaming from Kafka")
    .master("local[*]")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "8g")
    .config("spark.sql.shuffle.partitions", 5)
    # The heartbeat interval is kept below the network timeout.
    .config("spark.network.timeout", "360000ms")
    .config("spark.executor.heartbeatInterval", "300000ms")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)
# Only surface errors; the WARN-level startup chatter is not useful here.
spark.sparkContext.setLogLevel("ERROR")

your 131072x1 screen size is bogus. expect trouble
24/05/19 21:33:46 WARN Utils: Your hostname, LAPTOP-390RNSVU resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/05/19 21:33:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
def _load_labeled_images(folder, label):
    """Read one image folder and tag every row with ``label`` in a ``target`` column."""
    reader = spark.read.format("image").option("dropInvalid", True)
    return reader.load(folder, inferschema=True).withColumn('target', lit(label))


# One frame per person; the folder name and the label string differ per person.
images_df_ntp = _load_labeled_images('./image/NTP', "phat")
images_df_tqb = _load_labeled_images('./image/TQB', "bao")
images_df_hai = _load_labeled_images('./image/DuyHai', "hai")
images_df_huy = _load_labeled_images('./image/QuangHuy', "huy")

In [4]:
# Stack the four per-person frames into one labelled dataset.
images_df = images_df_ntp
for _person_df in (images_df_tqb, images_df_hai, images_df_huy):
    images_df = images_df.union(_person_df)

In [5]:
# Print the column tree of the combined frame (image struct plus the target label).
images_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- target: string (nullable = false)



In [6]:
# Preview the pixel dimensions and channel count of the loaded images.
images_df.select("image.width", "image.height", "image.nChannels").show()

[Stage 0:>                                                          (0 + 1) / 1]

+-----+------+---------+
|width|height|nChannels|
+-----+------+---------+
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
|  112|   112|        3|
+-----+------+---------+
only showing top 20 rows



                                                                                

In [7]:

def bin_to_128d(img, shape=(112, 112, 3)):
    """Turn the raw pixel buffer of one image into a face-encoding vector.

    Parameters
    ----------
    img : bytes or bytearray or None
        Raw pixel bytes (the ``image.data`` column). They are treated as BGR
        values laid out as ``shape`` and converted to RGB before detection.
    shape : tuple of int, optional
        ``(height, width, channels)`` of the buffer. It must describe a
        3-channel image because the conversion uses ``cv2.COLOR_BGR2RGB``.

    Returns
    -------
    pyspark.ml.linalg.DenseVector or None
        Encoding of the first detected face. ``None`` is returned when the
        buffer is missing or does not match ``shape``, when no face is
        located, or when encoding the cropped face fails.
    """
    if img is None:
        return None

    # frombuffer accepts both bytes and bytearray; np.asarray would turn a
    # bytes object into a 0-d string array instead of a pixel vector.
    pixels = np.frombuffer(img, dtype=np.uint8)
    if pixels.size != int(np.prod(shape)):
        # Unexpected image size: skip this row instead of failing the whole job.
        return None

    _img = cv2.cvtColor(pixels.reshape(shape), cv2.COLOR_BGR2RGB)
    faces = face_recognition.face_locations(_img, 0)

    # Covers both an empty result and None.
    if not faces:
        return None

    try:
        top, right, bottom, left = faces[0]
        # The encoder receives a contiguous copy of the cropped face region.
        face_crop = np.ascontiguousarray(_img[top:bottom, left:right])
        encodings = face_recognition.face_encodings(face_crop)
        if not encodings:
            return None
        return Vectors.dense(encodings[0].tolist())
    except Exception:
        # Best effort: a row that cannot be encoded yields a null feature,
        # which the next cell filters out. KeyboardInterrupt/SystemExit are
        # deliberately not caught.
        return None


udf_bin_to_128d = udf(bin_to_128d, VectorUDT())
featured_df = images_df.withColumn(
    'featuresModel', udf_bin_to_128d(images_df['image.data'])
).select("featuresModel", "target")

In [8]:
# Drop rows for which no face encoding could be produced, then cache the
# result: the encoding UDF is expensive and this frame feeds several later
# actions (show, indexer fit, split, pipeline fit). Caching also means both
# sides of the later randomSplit read the same materialised rows instead of
# re-running the UDF.
featured_df = featured_df.filter("featuresModel IS NOT NULL").cache()

In [9]:
# Preview the encoded features next to their labels (vectors are truncated in the display).
featured_df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+------+
|       featuresModel|target|
+--------------------+------+
|[-0.0836555808782...|  phat|
|[-0.0923014730215...|  phat|
|[-0.1357773244380...|  phat|
|[-0.0669603273272...|  phat|
|[-0.0742524415254...|  phat|
|[-0.0810490772128...|  phat|
|[-0.0958993211388...|  phat|
|[-0.0982806235551...|  phat|
|[-0.1040647700428...|  phat|
|[-0.1188037693500...|  phat|
|[-0.1336424797773...|  phat|
|[-0.0861320197582...|  phat|
|[-0.0946999341249...|  phat|
|[-0.0901736244559...|  phat|
|[-0.0832401886582...|  phat|
|[-0.0952133983373...|  phat|
|[-0.0551990941166...|  phat|
|[-0.1333475112915...|  phat|
|[-0.1324640214443...|  phat|
|[-0.0895575582981...|  phat|
+--------------------+------+
only showing top 20 rows



                                                                                

In [10]:
# Confirm that featuresModel is a vector column and target is a string column.
featured_df.printSchema()

root
 |-- featuresModel: vector (nullable = true)
 |-- target: string (nullable = false)



In [11]:
# Fit the string -> numeric label mapping on the full feature frame.
_target_indexer = StringIndexer(inputCol="target", outputCol="indexedTarget")
labelIndexer = _target_indexer.fit(featured_df)

                                                                                

In [12]:
# Elastic-net regularised logistic regression over the assembled "udt" column.
lr = LogisticRegression(
    featuresCol="udt",
    labelCol="indexedTarget",
    maxIter=5,
    regParam=0.03,
    elasticNetParam=0.5,
)

In [13]:
# Copy the encoding vector into the "udt" column that the classifier reads.
vector_assembler = VectorAssembler(
    inputCols=["featuresModel"],
    outputCol="udt",
)

In [14]:
# 80/20 train/test split with a fixed seed.
df_train_split, df_test_split = featured_df.randomSplit(weights=[0.8, 0.2], seed=42)

In [15]:
# Chain label indexing, feature assembly and the classifier, then fit on the training split.
_pipeline_stages = [labelIndexer, vector_assembler, lr]
sparkdn = Pipeline(stages=_pipeline_stages)
spark_model = sparkdn.fit(df_train_split)

                                                                                

In [16]:
# Apply the fitted pipeline to the held-out split.
predictions = spark_model.transform(df_test_split)

                                                                                

In [17]:
# Preview the scored test rows, including the columns added by the pipeline.
predictions.show()



+--------------------+------+-------------+--------------------+--------------------+--------------------+----------+
|       featuresModel|target|indexedTarget|                 udt|       rawPrediction|         probability|prediction|
+--------------------+------+-------------+--------------------+--------------------+--------------------+----------+
|[-0.1333475112915...|  phat|          1.0|[-0.1333475112915...|[-2.0582456043261...|[0.00645414716123...|       1.0|
|[-0.1040647700428...|  phat|          1.0|[-0.1040647700428...|[-1.5610983359862...|[0.00643514160497...|       1.0|
|[-0.0958993211388...|  phat|          1.0|[-0.0958993211388...|[-1.6341758059338...|[0.00783170639610...|       1.0|
|[-0.0901736244559...|  phat|          1.0|[-0.0901736244559...|[-1.7378060341455...|[0.00808411272476...|       1.0|
|[-0.0832401886582...|  phat|          1.0|[-0.0832401886582...|[-1.6923266118875...|[0.00560746525020...|       1.0|
|[-0.0742524415254...|  phat|          1.0|[-0.074252441

                                                                                

In [18]:
# Score the test predictions by plain accuracy and report the complementary error rate.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedTarget",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
_test_error = 1.0 - accuracy
print("Test Error = %g" % _test_error)



Test Error = 0


                                                                                

In [19]:
# Raw accuracy value computed by the evaluator in the previous cell.
print(accuracy)

1.0


In [20]:
# Persist only the fitted logistic-regression stage (index 2 of the pipeline).
# overwrite() lets this cell be re-run: a plain save() raises once the
# "lrmodel" directory already exists.
# NOTE: the label mapping held by labelIndexer is not part of this artifact.
spark_model.stages[2].write().overwrite().save("lrmodel")

                                                                                

In [21]:
# Show the fitted labels; the position in this list is the numeric index stored in indexedTarget.
labelIndexer.labels

['hai', 'phat', 'huy', 'bao']