# Imports

In [5]:
from PIL import Image
import numpy as np
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql import functions as F
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.util import MLUtils
from pyspark.mllib.feature import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import PCA, StandardScaler

# Environment

In [6]:
# Spark Session
appName = "faces"
master = "local"

conf = (SparkConf()
    .set("spark.driver.maxResultSize", "8g")
    .set("spark.driver.memory", "16g") )

sc = SparkContext(master, appName, conf = conf)
sqlContext = SQLContext(sc)
spark = SparkSession.builder \
    .appName(appName) \
    .master(master) \
    .getOrCreate()

# Faces

In [7]:
# Inbound
raw = spark.read.csv("raw/rostros_db_e1.csv")
raw = raw.withColumnRenamed("_c576", "label")
raw = raw.withColumn("label", F.regexp_replace(F.col("label"), "no_rostro", "0"))
raw = raw.withColumn("label", F.regexp_replace(F.col("label"), "rostro", "1"))
# raw.show(1, vertical = True)

faces = raw.filter(F.col("label") == "1")
faces = faces.drop("label")
# faces.show(1, vertical = True)

not_faces = raw.filter(F.col("label") == "0")
# not_faces.show(1, vertical = True)

# SVMLIB format
!rm -r libsvm/faces
r = raw.rdd.map(lambda line:LabeledPoint(line[-1], Vectors.dense(line[0:575])))
MLUtils.saveAsLibSVMFile(r, "libsvm/faces/")
faces_libsvm = spark.read.format("libsvm").load("libsvm/faces/")
# faces_libsvm.show(truncate = False)

# PCA
pca = PCA(k = 15, inputCol = "features", outputCol = "pca_features")
model = pca.fit(faces_libsvm)
faces_libsvm = model.transform(faces_libsvm)#.select("pca_features")
# faces_libsvm.show(1, truncate = False)

# Mean face
mean_face = faces.select(*[F.mean(c).alias(c) for c in faces.columns])
mean_face_array = np.array(mean_face.collect()).astype(np.uint8).reshape(-1)
mean_face_array = mean_face_array.reshape(24, 24)
img = Image.fromarray(mean_face_array)
img.save('mean_face.png')

# Normalize
scaler = StandardScaler(inputCol = "pca_features", outputCol = "norm_features",
                        withStd = True, withMean = True)
scaler_model = scaler.fit(faces_libsvm)
faces_libsvm = scaler_model.transform(faces_libsvm)
faces_libsvm = faces_libsvm.select("label", "pca_features")
faces_libsvm = faces_libsvm.withColumnRenamed("pca_features", "features")
# faces_libsvm.show(truncate = False)

# Classifier (MLP)
data = faces_libsvm
# Train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]
# Define network
layers = [15, 10, 8, 2]
trainer = MultilayerPerceptronClassifier(maxIter = 100, layers = layers, 
                                         blockSize = 128, seed = 1234)
# Training.
model = trainer.fit(train)
# Accuracy of test
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName = "accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.9320388349514563
