In [1]:
// Glob over the entries of the lfw directory.
val path = "lfw/*"
// Load the matched files as (fileName, content) pairs; only the file names are used below.
val rdd = sc.wholeTextFiles(path)
// Fetch one record as a quick check that the load works.
// NOTE(review): `first` is not referenced by any of the visible cells.
val first = rdd.first

In [2]:
// Keep only each file's path, dropping the leading URI scheme so the result is a
// plain file-system path.
// stripPrefix removes just the leading "file:"; a plain replace would also delete any
// later occurrence of "file:" inside the path (e.g. a directory named "profile:1").
val files = rdd.map { case (fileName, _) =>
    fileName.stripPrefix("file:")
}
println(files.first)

/Users/omar/Documents/Projects/Scala/MLSpark/lfw/Aaron_Eckhart/Aaron_Eckhart_0001.jpg


In [3]:
// Print how many file paths were collected.
println(files.count)

1054


In [4]:
import java.awt.image.BufferedImage
import javax.imageio.ImageIO
import java.io.File

/** Reads the image stored at `path` using ImageIO.
  *
  * @param path location of the image file
  * @return the decoded image (ImageIO.read yields null when no registered reader
  *         can decode the file)
  */
def loadImageFromFile(path: String): BufferedImage = {
    val source = new File(path)
    ImageIO.read(source)
}

// Load the first image of the data set to experiment with.
val aeImage = loadImageFromFile(files.first)

In [5]:
/** Draws `image` onto a new grayscale canvas of the requested size.
  *
  * @param image  source image
  * @param width  width of the result in pixels
  * @param height height of the result in pixels
  * @return a new TYPE_BYTE_GRAY image of size `width` x `height` holding the
  *         rescaled source
  */
def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
    val canvas = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
    val pen = canvas.createGraphics()
    // Drawing into the full (0, 0, width, height) rectangle rescales the source.
    pen.drawImage(image, 0, 0, width, height, null)
    pen.dispose()
    canvas
}

// Convert the sample image to a 100x100 grayscale version.
val grayImage = processImage(aeImage, 100, 100)

In [6]:
// Save the grayscale sample as a JPEG for visual inspection; the Boolean result
// reports whether a suitable writer was found.
// NOTE(review): the relative tmp/ directory presumably has to exist already — confirm.
ImageIO.write(grayImage, "jpg", new File("tmp/aeGray.jpg"))

true

In [7]:
/** Copies the samples of `image` into a flat array of doubles.
  *
  * The buffer holds width * height values, i.e. one value per pixel.
  * NOTE(review): this presumably expects a single-band (grayscale) image, since a
  * multi-band raster has more samples than pixels — confirm.
  *
  * @param image image whose samples are read
  * @return the array filled by Raster.getPixels for the whole image area
  */
def getPixelsFromImage(image: BufferedImage): Array[Double] = {
    val (w, h) = (image.getWidth, image.getHeight)
    val buffer = new Array[Double](w * h)
    val raster = image.getData
    raster.getPixels(0, 0, w, h, buffer)
}

/** Loads the image at `path`, converts it to a `width` x `height` grayscale image
  * and returns its pixel values.
  *
  * @param path   location of the image file
  * @param width  width to rescale to
  * @param height height to rescale to
  * @return the pixel values of the processed image as a flat array
  */
def extractPixels(path: String, width: Int, height: Int): Array[Double] =
    getPixelsFromImage(processImage(loadImageFromFile(path), width, height))

// Turn every image file into the pixel array of its 50x50 grayscale version.
val pixels = files.map(f => extractPixels(f, 50, 50))
// Preview: the first 10 values of each of the first 10 images, one image per line.
println(pixels.take(10).map(_.take(10).mkString("", ",", ", ...")).mkString("\n"))

1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0, ...
247.0,173.0,159.0,144.0,139.0,155.0,32.0,7.0,4.0,5.0, ...
253.0,254.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0,253.0, ...
242.0,242.0,246.0,239.0,238.0,239.0,225.0,165.0,140.0,167.0, ...
47.0,221.0,205.0,46.0,41.0,154.0,127.0,214.0,232.0,232.0, ...
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, ...
75.0,76.0,72.0,72.0,72.0,74.0,71.0,78.0,54.0,26.0, ...
25.0,27.0,24.0,22.0,26.0,27.0,19.0,16.0,22.0,25.0, ...
240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0, ...
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, ...


In [8]:
import org.apache.spark.mllib.linalg.Vectors
// Wrap each pixel array in an MLlib dense vector.
val vectors = pixels.map(p => Vectors.dense(p))
// Name the RDD and cache it, since it is used both to fit the scaler and to be transformed.
vectors.setName("image-vectors")
vectors.cache

image-vectors MapPartitionsRDD[5] at map at <console>:40

In [9]:
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.feature.StandardScaler
// Fit a scaler that only centres the data (withMean = true) and leaves the
// standard deviation untouched (withStd = false).
val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)

In [10]:
// Apply the fitted scaler to every image vector.
val scaledVectors = vectors.map(v => scaler.transform(v))

In [11]:
//Running PCA on the LFW DataSet
// Build a distributed row matrix from the scaled vectors (one row per image).
val matrix = new RowMatrix(scaledVectors)
// Number of principal components to compute.
val K = 10
val pc = matrix.computePrincipalComponents(K)

In [12]:
// Shape of the principal-component matrix, printed as a (rows, cols) pair.
val rows = pc.numRows
val cols = pc.numCols
println((rows, cols))

(2500,10)


In [13]:
import breeze.linalg.DenseMatrix
// Copy the principal components into a Breeze matrix of the same shape so that
// Breeze's csvwrite can be used on it.
val pcBreeze = new DenseMatrix(rows, cols, pc.toArray)

import breeze.linalg.csvwrite
// Write the principal components to tmp/pc.csv.
csvwrite(new File("tmp/pc.csv"), pcBreeze)

In [14]:
// Project every image onto the principal components and print the resulting
// shape as a (rows, cols) pair.
val projected = matrix.multiply(pc)
println((projected.numRows, projected.numCols))

(1054,10)


In [15]:
// Show the first five projected rows, one per line.
println(projected.rows.take(5).mkString("\n"))

[2656.255334446328,1331.4316152623849,443.77171439925996,-352.5378024086936,52.35190158301246,377.3800577741128,487.0249575522312,-469.5189260655325,80.88622666722512,-84.82988295536593]
[177.0310856502517,663.9809715438986,261.50327924203305,-708.5431250876696,467.0380132620281,181.4509192089409,-37.15151425523848,635.0116960435249,882.0389729322873,-534.4893725108707]
[-1058.983438535667,390.9754848782779,1508.454706207631,363.79206833776055,275.1957888077916,-623.0196254444063,537.4147515744895,-218.67299199041472,-231.55887927232297,-99.98392390187095]
[-4685.773699057371,255.26635771944402,-153.10119543377468,-24.569787433435064,522.6588196148455,-375.9264880075217,-539.8743970690424,-470.0706533730587,-67.54765928695977,51.92673828087255]
[-2762.7905683587305,622.6539180572763,-405.00678943894866,-462.90978295573234,866.4534195252717,-919.4904224431655,-31.69129561997938,-782.0657727943528,516.2915128509082,237.11383873779545]


In [16]:
// Compute a rank-10 SVD of the same matrix, keeping U, and print the size of each factor.
// NOTE(review): the literal 10 equals K used for the PCA — presumably the two should stay in sync.
val svd = matrix.computeSVD(10, computeU = true)
println(s"U dimension: (${svd.U.numRows}, ${svd.U.numCols})")
println(s"S dimension: (${svd.s.size}, )")
println(s"V dimension: (${svd.V.numRows}, ${svd.V.numCols})")

U dimension: (1054, 10)
S dimension: (10, )
V dimension: (2500, 10)


In [17]:
/** Checks whether two arrays agree element-wise up to `tolerance`, ignoring signs.
  *
  * Only the absolute values of the elements are compared, so the sign of each
  * principal component / singular vector element is ignored.
  *
  * @param array1    first array of values
  * @param array2    second array of values
  * @param tolerance largest allowed difference between the absolute values of
  *                  two paired elements
  * @return true when both arrays have the same length and every pair of elements
  *         is within `tolerance`; a pair whose difference is NaN is not equal
  */
def approxEqual(array1: Array[Double], array2: Array[Double], tolerance: Double = 1e-6): Boolean = {
    // zip silently truncates to the shorter array, so the lengths are compared first.
    array1.length == array2.length &&
        array1.zip(array2).forall { case (v1, v2) =>
            // Compare against the caller-supplied tolerance rather than a fixed constant.
            math.abs(math.abs(v1) - math.abs(v2)) <= tolerance
        }
}

In [18]:
// Sanity check: identical arrays compare as equal.
println(approxEqual(Array(1.0, 2.0, 3.0), Array(1.0, 2.0, 3.0)))

true


In [19]:
// Sanity check: the same values in a different order do not compare as equal.
println(approxEqual(Array(1.0, 2.0, 3.0), Array(3.0, 2.0, 1.0)))

false


In [20]:
// Compare the V factor of the SVD with the principal components, ignoring signs.
println(approxEqual(svd.V.toArray, pc.toArray))

true


In [21]:
// Rebuild the projection from the SVD: multiply each row of U element-wise by the
// singular values.
val breezeS = breeze.linalg.DenseVector(svd.s.toArray)
val projectedSVD = svd.U.rows.map{ v =>
    val breezeV = breeze.linalg.DenseVector(v.toArray)
    val multV = breezeV :* breezeS
    Vectors.dense(multV.data)
}
// Count the rows on which the PCA projection and the SVD-based one agree (up to sign).
// The filter has to keep only the `true` comparison results; a constant-true
// predicate would count every row whether or not it matched.
projected.rows.zip(projectedSVD).map{ case (v1, v2) => approxEqual(v1.toArray, v2.toArray)}.filter(b => b).count

1054

In [22]:
//Evaluating Dimensionality Reduction
// Singular values obtained when asking for 1, 2, ..., 5 components; U is not needed here.
val sValues = for (i <- 1 to 5) yield matrix.computeSVD(i, computeU = false).s
sValues.foreach(s => println(s))

[54091.009971103565]
[54091.009971103565,33757.70286798242]
[54091.009971103565,33757.702867982414,24541.193694775917]
[54091.00997110355,33757.70286798243,24541.19369477594,23309.584188883004]
[54091.00997110355,33757.702867982414,24541.19369477596,23309.58418888302,21803.09841158358]


In [23]:
// Compute up to 300 singular values (U is not needed) and save them as one CSV row.
val svd300 = matrix.computeSVD(300, computeU = false)
// Size the matrix from the values actually returned: computeSVD may yield fewer
// than the requested 300, and a hard-coded column count would then not match the data.
val sMatrix = new DenseMatrix(1, svd300.s.size, svd300.s.toArray)
csvwrite(new File("tmp/s.csv"), sMatrix)