### references

[Supervised learning](https://en.wikipedia.org/wiki/Supervised_learning)

[Unsupervised learning](https://en.wikipedia.org/wiki/Unsupervised_learning)

[Kmeans clustering](https://en.wikipedia.org/wiki/K-means_clustering)

[standard score](https://en.wikipedia.org/wiki/Standard_score)

[mahanobis distance](https://en.wikipedia.org/wiki/Mahalanobis_distance)

[cluster analysis](https://en.wikipedia.org/wiki/Cluster_analysis)

[Silhouette](https://en.wikipedia.org/wiki/Silhouette_(clustering))


In [1]:
%%init_spark
launcher.driver_memory = '6g'

In [2]:
val rawData = sc.textFile("hdfs://localhost:9000/ds/kddcup.data")

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.13:4040
SparkContext available as 'sc' (version = 2.3.1, master = local[*], app id = local-1574114462865)
SparkSession available as 'spark'


rawData: org.apache.spark.rdd.RDD[String] = hdfs://localhost:9000/ds/kddcup.data MapPartitionsRDD[1] at textFile at <console>:25


In [None]:
rawData.map(_.split(',').last).countByValue().toSeq.sortBy(_._2).reverse.foreach(println)

In [3]:
import org.apache.spark.mllib.linalg._

val labelsAndData = rawData.map { line => 
    val buffer = line.split(',').toBuffer
    buffer.remove(1,3)
    val label = buffer.remove(buffer.length - 1)
    val vector = Vectors.dense(buffer.map(_.toDouble).toArray)
    (label, vector)
}

val data = labelsAndData.values.cache()

import org.apache.spark.mllib.linalg._
labelsAndData: org.apache.spark.rdd.RDD[(String, org.apache.spark.mllib.linalg.Vector)] = MapPartitionsRDD[2] at map at <console>:29
data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[3] at values at <console>:37


In [4]:
import org.apache.spark.mllib.clustering._

val kmeans = new KMeans()
val model = kmeans.run(data)

model.clusterCenters.foreach(println)

2019-11-18 23:01:30 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2019-11-18 23:01:30 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
[48.34019491959669,1834.6215497618625,826.2031900016945,5.7161172049003456E-6,6.487793027561892E-4,7.961734678254053E-6,0.012437658596734055,3.205108575604837E-5,0.14352904910348827,0.00808830584493399,6.818511237273984E-5,3.6746467745787934E-5,0.012934960793560386,0.0011887482315762398,7.430952366370449E-5,0.0010211435092468404,0.0,4.082940860643104E-7,8.351655530445469E-4,334.9735084506668,295.26714620807076,0.17797031701994342,0.1780369894027253,0.05766489875327374,0.05772990937912739,0.7898841322630883,0.021179610609908736,0.02826081009629284,232.98107822302248,189.21428335201279,0.7537133898006421,0.030710978823798966,0.6050519309248854,0.006464107887636004,0.1780911843182601,0.17788589813474293,0.05792761150001131,0.05765922142400886]
[10999.0,0.0,1.309937401

import org.apache.spark.mllib.clustering._
kmeans: org.apache.spark.mllib.clustering.KMeans = org.apache.spark.mllib.clustering.KMeans@3900cb7c
model: org.apache.spark.mllib.clustering.KMeansModel = org.apache.spark.mllib.clustering.KMeansModel@3705e55f


In [None]:
val clusterLabelCount = labelsAndData.map { case (label,datum) =>
    val cluster = model.predict(datum)
    (cluster,label)
}.countByValue

clusterLabelCount.toSeq.sorted.foreach {
    case ((cluster,label),count) => println(f"$cluster%1s$label%18s$count%8s")
}

In [5]:
def distance(a: Vector, b: Vector) = math.sqrt(a.toArray.zip(b.toArray).map( p => p._1 - p._2).map(d => d * d).sum)
def distToCentroid(datum: Vector, model: KMeansModel) = {
    val cluster = model.predict(datum)
    val centroid = model.clusterCenters(cluster)
    distance(centroid, datum)
}

distance: (a: org.apache.spark.mllib.linalg.Vector, b: org.apache.spark.mllib.linalg.Vector)Double
distToCentroid: (datum: org.apache.spark.mllib.linalg.Vector, model: org.apache.spark.mllib.clustering.KMeansModel)Double


In [6]:
import org.apache.spark.rdd._

def clusteringScore(data: RDD[Vector], k: Int) = {
    val kmeans = new KMeans()
    kmeans.setK(k)
    val model = kmeans.run(data)
    data.map(datum => distToCentroid(datum,model)).mean()
}

import org.apache.spark.rdd._
clusteringScore: (data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector], k: Int)Double


In [None]:
(5 to 40 by 5).map( k=> (k, clusteringScore(data,k))).foreach(println)

In [None]:
val sample = data.map(datum => model.predict(datum) + "," + datum.toArray.mkString(",")).sample(false, 0.05)
sample.saveAsTextFile("hdfs://localhost:9000/ds/sample")

In [9]:
val dataAsArray = data.map(_.toArray)
val numCols = dataAsArray.first().length
val n = dataAsArray.count()
val sums = dataAsArray.reduce(
    (a,b) => a.zip(b).map(t => t._1 + t._2)
)
val sumSquares = dataAsArray.fold(
    new Array[Double](numCols)
)(
    (a,b) => a.zip(b).map(t => t._1 + t._2 * t._2)
)

dataAsArray: org.apache.spark.rdd.RDD[Array[Double]] = MapPartitionsRDD[27] at map at <console>:39
numCols: Int = 38
n: Long = 4898431
sums: Array[Double] = Array(2.3680206E8, 8.986765238E9, 5.357035893E9, 28.0, 3178.0, 39.0, 60925.0, 157.0, 703067.0, 39620.0, 334.0, 180.0, 63361.0, 5823.0, 364.0, 5002.0, 0.0, 2.0, 4091.0, 1.640844284E9, 1.446345448E9, 871775.1400000013, 872101.7299999918, 282468.4699999987, 282786.919999999, 3869193.1300014798, 103746.83999989525, 138433.59999998374, 1.141241758E9, 9.26852923E8, 3692012.2800011584, 150436.22999986156, 2963805.5300003863, 31663.98000003283, 872367.200000095, 871361.6200001689, 283755.3500000004, 282440.6600000077)
sumSquares: Array[Double] = Array(2.4638491956744675E24, 1.5029566894234366E37, 4.148072161728185E36, 348.0, 2.0716658E7, 41...

In [10]:
val stdevs = sumSquares.zip(sums).map {
    case(sumSq,sum) => math.sqrt(n*sumSq - sum*sum)/n
}
val means = sums.map(_ / n)

stdevs: Array[Double] = Array(7.092160637568065E8, 1.751639521459706E15, 9.202263271748564E14, 0.0084287083113581, 2.056512384574052, 0.09166285000710928, 287.68577159744495, 0.09072036889476061, 233.81918676097567, 28130.323433474063, 0.1083573070299824, 0.08124082227268468, 28479.8027413705, 19.74717642001751, 0.09118608360562963, 1.6570203724101165, 0.0, 6.3897874456691E-4, 1.1076200324017111, 1.644073889668342E8, 1.5848272926710385E8, 216.36118707751112, 216.67593997485977, 98.6021992658583, 98.62855554368699, 735.9728858210965, 8.914593473131658, 28.381068274048623, 5.432462364556549E7, 4.5319163933310404E7, 708.1851544280705, 15.559303546893325, 623.6970123393882, 2.253339083152612, 216.41685907071218, 216.5968049851283, 98.2159104102263, 97.78114589612957)
means: Array[Double] = ...

In [11]:
def normalize(datum: Vector) = {
    val normalizedArray = (datum.toArray, means, stdevs).zipped.map(
        (value, mean, stdev) =>
            if (stdev <= 0) (value - mean) else (value - mean) / stdev
    )
    Vectors.dense(normalizedArray)
}

normalize: (datum: org.apache.spark.mllib.linalg.Vector)org.apache.spark.mllib.linalg.Vector


In [16]:
val normalizedData = data.map(normalize).cache()
// (60 to 120 by 10).par.map(k => (k, clusteringScore(normalizedData, k))).toList.foreach(println)

normalizedData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[113] at map at <console>:50


In [12]:
def entropy(counts: Iterable[Int]) = {
    val values =counts.filter(_ > 0)
    val n: Double = values.sum
    values.map{ v =>
        val p = v/n
        -p * math.log(p)
    }.sum
}

def clusteringScore(normalizedLabelsAndData: RDD[(String, Vector)], k: Int) = {
    val kmeans = new KMeans()
    kmeans.setK(k)
    val model = kmeans.run(normalizedLabelsAndData.values)
    val labelsAndClusters = normalizedLabelsAndData.mapValues(model.predict)
    val clustersAndLabels = labelsAndClusters.map(_.swap)
    val labelsInCluster = clustersAndLabels.groupByKey().values
    val labelCounts = labelsInCluster.map(
        _.groupBy(l => l).map(_._2.size)
    )
    val n = normalizedLabelsAndData.count()
    labelCounts.map(m => m.sum * entropy(m)).sum / n
    
}

entropy: (counts: Iterable[Int])Double
clusteringScore: (normalizedLabelsAndData: org.apache.spark.rdd.RDD[(String, org.apache.spark.mllib.linalg.Vector)], k: Int)Double


In [13]:
val normalizedLabelsAndData = rawData.map { line => 
    val buffer = line.split(',').toBuffer
    buffer.remove(1,3)
    val label = buffer.remove(buffer.length - 1)
    val vector = Vectors.dense(buffer.map(_.toDouble).toArray)
    (label, normalize(vector))
}.cache()

// (80 to 160).map(k => (k, clusteringScore(normalizedLabelsAndData, k))).toList.foreach(println)

normalizedLabelsAndData: org.apache.spark.rdd.RDD[(String, org.apache.spark.mllib.linalg.Vector)] = MapPartitionsRDD[28] at map at <console>:50


In [14]:
val clusteScore = clusteringScore(normalizedLabelsAndData, 150)

2019-11-18 23:02:39 WARN  KMeans:66 - The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.
2019-11-18 23:04:03 WARN  KMeans:66 - The input data was not directly cached, which may hurt performance if its parent RDDs are also uncached.


clusteScore: Double = 0.7189396941456284


In [19]:
val kmeans = new KMeans()
kmeans.setK(150)
val model = kmeans.run(normalizedData)

kmeans: org.apache.spark.mllib.clustering.KMeans = org.apache.spark.mllib.clustering.KMeans@6c47985
model: org.apache.spark.mllib.clustering.KMeansModel = org.apache.spark.mllib.clustering.KMeansModel@61cc09c7


In [20]:
val distances = normalizedData.map(
    datum => distToCentroid(datum, model)
)
val threshold = distances.top(100).last



distances: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[193] at map at <console>:63
threshold: Double = 0.4079889463143644


In [22]:
val labelsAndData = rawData.map { line => 
    val buffer = line.split(',').toBuffer
    buffer.remove(1,3)
    val label = buffer.remove(buffer.length - 1)
    val vector = Vectors.dense(buffer.map(_.toDouble).toArray)
    (label, vector)
}

val data = labelsAndData.values.cache()

labelsAndData: org.apache.spark.rdd.RDD[(String, org.apache.spark.mllib.linalg.Vector)] = MapPartitionsRDD[195] at map at <console>:38
data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[196] at values at <console>:46


In [26]:
val anomalies = labelsAndData.filter{
    case (original, datum) =>
    val normalized = normalize(datum)
    distToCentroid(normalized, model) > threshold
}

anomalies: org.apache.spark.rdd.RDD[(String, org.apache.spark.mllib.linalg.Vector)] = MapPartitionsRDD[197] at filter at <console>:66


In [31]:
val result = anomalies.foreach(println)

(normal.,[2552.0,589.0,28390.0,0.0,0.0,0.0,0.0,0.0,1.0,21.0,1.0,1.0,17.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,2.0,0.01,0.02,0.0,0.0,0.0,0.0,0.0,0.0])
(normal.,[13724.0,10738.0,19584.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,37.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,20.0,7.0,0.25,0.15,0.05,0.29,0.0,0.0,0.0,0.0])
(normal.,[0.0,167.0,19827.0,0.0,0.0,0.0,0.0,0.0,1.0,83.0,1.0,2.0,91.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,21.0,8.0,0.29,0.14,0.05,0.25,0.05,0.12,0.0,0.0])
(normal.,[0.0,145.0,13236.0,0.0,0.0,0.0,0.0,0.0,1.0,31.0,1.0,2.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,29.0,10.0,0.28,0.1,0.03,0.2,0.07,0.2,0.0,0.0])
(normal.,[9733.0,5102.0,14935.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,17.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,64.0,16.0,0.22,0.05,0.02,0.12,0.03,0.12,0.0,0.0])
(normal.,[11565.0,565.0,111864.0,0.0,0.0,0.0,0.0,0.0,1.0,217.0,1.0,2.0,247.0,0.0,0.0,4.0,0.0,0.0,0.0

(normal.,[189.0,536.0,1664.0,0.0,0.0,0.0,11.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0,2.0,0.33,0.5,0.17,0.0,0.0,0.0,0.17,0.5])
(normal.,[14517.0,794.0,152110.0,0.0,0.0,0.0,0.0,0.0,1.0,520.0,1.0,2.0,572.0,0.0,0.0,8.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0,3.0,0.38,0.25,0.12,0.0,0.0,0.0,0.0,0.0])
(normal.,[18349.0,6060.0,36611.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,24.0,6.0,0.25,0.12,0.04,0.0,0.0,0.0,0.0,0.0])
(normal.,[15377.0,975.0,165912.0,0.0,0.0,0.0,0.0,0.0,1.0,676.0,1.0,2.0,754.0,0.0,0.0,6.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,130.0,25.0,0.19,0.03,0.01,0.0,0.01,0.04,0.02,0.08])
(normal.,[0.0,224.0,2776333.0,0.0,0.0,0.0,0.0,0.0,1.0,1739.0,1.0,2.0,1743.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,7.0,5.0,0.71,0.43,0.14,0.0,0.14,0.2,0.0,0.0])
(normal.,[20252.0,737.0,112155.0,0.0,0.0,0.0,0.0,0.0,1.0,457.0,1.0,2.0,508.0,0.0,0.0,2.0,0.0

(normal.,[8625.0,866.0,128974.0,0.0,0.0,0.0,0.0,0.0,1.0,538.0,1.0,2.0,605.0,0.0,0.0,5.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,11.0,0.04,0.06,0.0,0.0,0.87,0.0,0.01,0.18])
(normal.,[24.0,228.0,767.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0,0.33,0.67,0.33,1.0,0.0,0.0,0.0,0.5])
(normal.,[16187.0,1119.0,176336.0,0.0,0.0,0.0,0.0,0.0,1.0,691.0,1.0,2.0,766.0,0.0,0.0,8.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0])
(normal.,[11233.0,2361.0,46593.0,0.0,0.0,0.0,8.0,0.0,1.0,54.0,1.0,2.0,39.0,9.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,205.0,104.0,0.51,0.01,0.0,0.0,0.98,0.96,0.0,0.0])
(normal.,[15263.0,9933.0,14772.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,113.0,0.44,0.01,0.0,0.0,0.78,0.88,0.0,0.0])
(normal.,[15908.0,888.0,43963.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0

result: Unit = ()
