## algos
[regression analysis](https://en.wikipedia.org/wiki/Regression_analysis)

[Statistical classification](https://en.wikipedia.org/wiki/Statistical_classification)

[supervised learning](https://en.wikipedia.org/wiki/Supervised_learning)

[decision tree](https://en.wikipedia.org/wiki/Decision_tree_learning)

[random forest](https://en.wikipedia.org/wiki/Random_forest#:~:text=Random%20forests%20or%20random%20decision,regression)

[gini inpurity](https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity)

[entroy - information theory](https://en.wikipedia.org/wiki/Entropy_(information_theory))

[Naive Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier)

[Support vector machines](https://en.wikipedia.org/wiki/Support-vector_machine)

[logistic regression](https://en.wikipedia.org/wiki/Logistic_regression)

In [1]:
%%init_spark
launcher.driver_memory = '6g'

In [2]:
val init : Int = 2

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.20:4040
SparkContext available as 'sc' (version = 2.4.4, master = local[*], app id = local-1573155825581)
SparkSession available as 'spark'


init: Int = 2


In [3]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression._

val rawData = sc.textFile("hdfs://localhost:9000/ds/covtype/covtype.data")
val data = rawData.map { line =>
    val values = line.split(',').map(_.toDouble)
    val featureVector = Vectors.dense(values.init)
    val label = values.last - 1
    LabeledPoint(label, featureVector)
}

import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.regression._
rawData: org.apache.spark.rdd.RDD[String] = hdfs://localhost:9000/ds/covtype/covtype.data MapPartitionsRDD[1] at textFile at <console>:28
data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[2] at map at <console>:29


In [4]:
val Array(trainData, cvData, testData) = data.randomSplit(Array(0.8, 0.1, 0.1))
trainData.cache()
cvData.cache()
testData.cache()

trainData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[3] at randomSplit at <console>:32
cvData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[4] at randomSplit at <console>:32
testData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[5] at randomSplit at <console>:32
res0: testData.type = MapPartitionsRDD[5] at randomSplit at <console>:32


In [5]:
import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree.model._
import org.apache.spark.rdd._

def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    val predictionAndLabels = data.map(example => 
        (model.predict(example.features), example.label)
    )
    new MulticlassMetrics(predictionAndLabels)
}
val model = DecisionTree.trainClassifier(
    trainData, 7, Map[Int, Int](), "gini", 4, 100
)
val metrics = getMetrics(model, cvData)

import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree.model._
import org.apache.spark.rdd._
getMetrics: (model: org.apache.spark.mllib.tree.model.DecisionTreeModel, data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint])org.apache.spark.mllib.evaluation.MulticlassMetrics
model: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel classifier of depth 4 with 23 nodes
metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@37c945c


In [6]:
metrics.confusionMatrix

res1: org.apache.spark.mllib.linalg.Matrix =
14177.0  6703.0   9.0     0.0    0.0  2.0   275.0
5361.0   22680.0  325.0   18.0   0.0  6.0   34.0
0.0      755.0    2730.0  66.0   0.0  14.0  0.0
0.0      0.0      167.0   133.0  0.0  0.0   0.0
0.0      948.0    11.0    3.0    0.0  0.0   0.0
0.0      485.0    1092.0  25.0   0.0  60.0  0.0
1137.0   38.0     0.0     0.0    0.0  0.0   855.0


In [7]:
(0 until 7).map(
    cat => (metrics.precision(cat), metrics.recall(cat))
).foreach(println)

(0.6857073760580411,0.6698006236416895)
(0.7175171628333702,0.7979172530256121)
(0.6299030918320259,0.7657784011220197)
(0.5428571428571428,0.44333333333333336)
(0.0,0.0)
(0.7317073170731707,0.036101083032490974)
(0.7345360824742269,0.4211822660098522)


In [8]:
import org.apache.spark.rdd._

def classProbabilities(data: RDD[LabeledPoint]): Array[Double] = {
    val countByCategory = data.map(_.label).countByValue()
    val counts = countByCategory.toArray.sortBy(_._1).map(_._2)
    counts.map(_.toDouble / counts.sum)
}

val trainPriorProbabilities = classProbabilities(trainData)
val cvPriorProbabilities = classProbabilities(cvData)
trainPriorProbabilities.zip(cvPriorProbabilities).map {
    case (trainProb, cvProb) => trainProb * cvProb
}.sum

import org.apache.spark.rdd._
classProbabilities: (data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint])Array[Double]
trainPriorProbabilities: Array[Double] = Array(0.3646933869975963, 0.4874675885911841, 0.06154405156585267, 0.004691322987482639, 0.0163056798978315, 0.0300958474696526, 0.0352021224904002)
cvPriorProbabilities: Array[Double] = Array(0.36424650226298855, 0.4891497014231875, 0.0613502211361407, 0.005162711456056721, 0.01655509473575522, 0.028601421466554235, 0.03493434751931714)
res3: Double = 0.3774433664190958


In [11]:
val evaluations =
    for (
        impurity <- Array("gini", "entropy");
        depth <- Array(1, 20);
        bins <- Array(10, 300)
    ) yield {
        val model = DecisionTree.trainClassifier(
            trainData, 7, Map[Int, Int](), impurity, depth, bins)
        val predictionsAnsLabels = cvData.map(example => (model.predict(example.features), example.label)
        )
        val accuracy = new MulticlassMetrics(predictionsAnsLabels).precision
        ((impurity, depth, bins), accuracy)
    }

evaluations: Array[((String, Int, Int), Double)] = Array(((gini,1,10),0.6371302207919599), ((gini,1,300),0.6366999948372886), ((gini,20,10),0.8891393760002754), ((gini,20,300),0.9044210019102032), ((entropy,1,10),0.4891497014231875), ((entropy,1,300),0.4891497014231875), ((entropy,20,10),0.8949904489838063), ((entropy,20,300),0.9124748317816517))


In [12]:
evaluations.sortBy(_._2).reverse.foreach(println)

((entropy,20,300),0.9124748317816517)
((gini,20,300),0.9044210019102032)
((entropy,20,10),0.8949904489838063)
((gini,20,10),0.8891393760002754)
((gini,1,10),0.6371302207919599)
((gini,1,300),0.6366999948372886)
((entropy,1,300),0.4891497014231875)
((entropy,1,10),0.4891497014231875)


In [13]:
val data = rawData.map { line =>
    val values = line.split(',').map(_.toDouble)
    val wilderness = values.slice(10, 14).indexOf(1.0).toDouble
    val soil = values.slice(14, 54).indexOf(1.0).toDouble
    val featureVector = Vectors.dense(values.slice(0, 10) :+ wilderness :+ soil)
    val label = values.last - 1
    LabeledPoint(label, featureVector)
}

data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[426] at map at <console>:47


In [16]:
val evaluations =
    for (
        impurity <- Array("gini", "entropy");
        depth <- Array(10, 20, 30);
        bins <- Array(40, 300)
    ) yield {
        val model = DecisionTree.trainClassifier(
            trainData, 7, Map[Int, Int](10 -> 4, 11 -> 40),
            impurity, depth, bins)
        val trainAccuracy = getMetrics(model, trainData).precision
        val cvAccuracy = getMetrics(model, cvData).precision
        ((impurity, depth, bins), (trainAccuracy, cvAccuracy))
    }
evaluations.sortBy(_._2).reverse.foreach(println)

((entropy,30,40),(0.9987142937000392,0.9372042196561634))
((entropy,30,300),(0.9985981931311464,0.9406288182553477))
((gini,30,40),(0.9974930877161298,0.9344851916226402))
((gini,30,300),(0.9952140765489751,0.9328847510712627))
((entropy,20,300),(0.9515086623924457,0.9124748317816517))
((entropy,20,40),(0.9489501498557343,0.9114594985286273))
((gini,20,300),(0.9436202737393413,0.9044210019102032))
((gini,20,40),(0.9410058609287185,0.9043005386428953))
((gini,10,300),(0.7841389422808172,0.781101034263195))
((gini,10,40),(0.7827607855278491,0.78003407389561))
((entropy,10,40),(0.7790262172284644,0.778846650260717))
((entropy,10,300),(0.774132793250687,0.7705691028928393))


evaluations: Array[((String, Int, Int), (Double, Double))] = Array(((gini,10,40),(0.7827607855278491,0.78003407389561)), ((gini,10,300),(0.7841389422808172,0.781101034263195)), ((gini,20,40),(0.9410058609287185,0.9043005386428953)), ((gini,20,300),(0.9436202737393413,0.9044210019102032)), ((gini,30,40),(0.9974930877161298,0.9344851916226402)), ((gini,30,300),(0.9952140765489751,0.9328847510712627)), ((entropy,10,40),(0.7790262172284644,0.778846650260717)), ((entropy,10,300),(0.774132793250687,0.7705691028928393)), ((entropy,20,40),(0.9489501498557343,0.9114594985286273)), ((entropy,20,300),(0.9515086623924457,0.9124748317816517)), ((entropy,30,40),(0.9987142937000392,0.9372042196561634)), ((entropy,30,300),(0.9985981931311464,0.9406288182553477)))


In [17]:
val forest = RandomForest.trainClassifier(
    trainData, 7, Map(10 -> 4, 11 -> 40), 20,
    "auto", "entropy", 30, 300
)

forest: org.apache.spark.mllib.tree.model.RandomForestModel =
TreeEnsembleModel classifier with 20 trees


In [21]:
val input = "2709,125,28,67,23,3224,253,207,61,6094,0,29"
val vector = Vectors.dense(input.split(',').map(_.toDouble))
forest.predict(vector)

java.lang.ArrayIndexOutOfBoundsException:  36