# Statistics in Scala

**Objectives**:
- learn statistical functions, and how they are calculated
- implement my own functions (trying to use as few built-in functions as possible)
- perhaps come back and optimize them
- perhaps write a generic mechanism which permits defining a function based on a formula or expression

In [310]:
// Sample inputs used to exercise the statistics below.
// data0: thirteen values in which 3.0 occurs more often than any other value.
val data0 = Array[Double](0, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 5, 6)
// data1: the ten values 0.0 through 9.0, each occurring exactly once.
val data1 = Array.tabulate(10)(_.toDouble)
// data2: a minimal three-element case.
val data2 = Array[Double](1, 2, 3)

[1.0, 2.0, 3.0]

In [322]:
/** Companion factory so a data set can be built as `DataSet(values)` without `new`. */
object DataSet {
    /** Wraps `data` in a [[DataSet]]. */
    def apply(data: Array[Double]): DataSet = new DataSet(data)
}

/**
 * Descriptive statistics over an array of doubles.
 *
 * Every statistic is a `lazy val`: it is computed on first access and cached.
 *
 * Empty input: `length` is 0 and `median` is NaN; `sum`, `min`, `max`, `mode`
 * (and everything derived from them) throw, because there is no element to
 * start from.
 *
 * @param data the observations; read but never modified by this class
 */
class DataSet(data: Array[Double]) {
    // "Delta Degrees of Freedom": the divisor used in the variance calculation is
    // ``N - ddof``, where ``N`` represents the number of elements (wording from
    // NumPy's docs). ddof = 1 yields the sample variance rather than the
    // population variance.
    private val ddof = 1.0

    // Sorted once and shared by min, max and median instead of re-sorting for each.
    private lazy val sorted: Array[Double] = data.sorted

    /** Number of observations, counted by hand with a fold. */
    lazy val length: Int = data.foldLeft(0) { (count, _) => count + 1 }
    /** Sum of all observations. */
    lazy val sum: Double = data.reduceLeft(_ + _)
    /** Smallest observation. */
    lazy val min: Double = sorted.head
    /** Largest observation. */
    lazy val max: Double = sorted.last

    // Describing The Center of The Data

    /** Arithmetic mean. */
    lazy val mean: Double = sum / length

    /**
     * Middle value of the sorted data: the central element for an odd count,
     * the average of the two central elements for an even count, NaN when empty.
     */
    lazy val median: Double =
        if (length == 0) Double.NaN
        else if (length % 2 == 1) sorted(length / 2)
        else (sorted(length / 2 - 1) + sorted(length / 2)) / 2.0

    /**
     * A most frequent value. When several values share the highest count, which
     * one is returned depends on the iteration order of the grouping map, so
     * treat the result as an arbitrary pick among the tied values.
     */
    lazy val mode: Double =
        data.groupBy(identity)
            .toSeq
            .map { case (value, occurrences) => (value, occurrences.length) }
            .sortBy(_._2) // stable sort: ties keep the map's iteration order
            .last
            ._1

    // Describing The Variability of The Data

    /** Distance between the largest and smallest observation. */
    lazy val range: Double = max - min

    /** Sum of squared deviations from the mean, divided by `length - ddof`. */
    lazy val variance: Double =
        data.foldLeft(0.0) { (acc, y) => acc + math.pow(y - mean, 2.0) } / (length - ddof)

    /** Square root of [[variance]]. */
    lazy val stddev: Double = math.sqrt(variance)

    // convenience functions

    /** Alias for [[mean]]. */
    def average: Double = mean

    /** Prints one `tag: value` line per statistic to standard output. */
    def summary: Unit = {
        // Typed as Any so `length` prints as an Int ("13") and is not widened to Double.
        val rows: List[(String, Any)] = List(
            "length" -> length,
            "sum" -> sum,
            "min" -> min,
            "max" -> max,
            "mean" -> mean,
            "median" -> median,
            "mode" -> mode,
            "range" -> range,
            "variance" -> variance,
            "stddev" -> stddev
        )
        rows.foreach { case (tag, value) => println(s"${tag}: ${value}") }
    }
}

defined object DataSet
defined class DataSet


In [323]:
// Print every statistic for data0 (thirteen values, with 3.0 the most frequent).
DataSet(data0).summary

length: 13
sum: 39.0
min: 0.0
max: 6.0
mean: 3.0
median: 3.0
mode: 3.0
range: 6.0
variance: 2.5
stddev: 1.5811388300841898


null

In [324]:
// Print every statistic for data1 (0.0 to 9.0, each once). Every value is tied
// for most frequent, so the reported mode is an arbitrary pick among them.
DataSet(data1).summary

length: 10
sum: 45.0
min: 0.0
max: 9.0
mean: 4.5
median: 4.5
mode: 4.0
range: 9.0
variance: 9.166666666666666
stddev: 3.0276503540974917


null

In [325]:
// Print every statistic for data2 (1.0, 2.0, 3.0). All three values are tied
// for most frequent, so the reported mode is an arbitrary pick among them.
DataSet(data2).summary

length: 3
sum: 6.0
min: 1.0
max: 3.0
mean: 2.0
median: 2.0
mode: 3.0
range: 2.0
variance: 1.0
stddev: 1.0


null