In [1]:
%AddDeps org.vegas-viz vegas_2.11 0.3.11 --transitive

Marking org.vegas-viz:vegas_2.11:0.3.11 for download
Obtained 42 files


# Data Exploration with Functional Programming using Jupyter Notebook, Scala and Vegas
## A Statistical Analysis of the Titanic Dataset

The Titanic survivor dataset captures various details about the people aboard and whether or not they survived the shipwreck. Using this data, we want to build a model which predicts the probability of someone's survival. It is a classification problem that maps attributes such as sex, fare and age onto the most probable state: survived or not.

![Titanic](Titanic.jpg)
(Source: https://commons.wikimedia.org/wiki/RMS_Titanic)


The dataset contains the following attributes (for more information: see Kaggle):

| **Variable** | **Definition**                                | **Key**                                           |
|--------------|-----------------------------------------------|---------------------------------------------------|
| survival     | Survival                                      | 1=Yes, 0= No                                      |
| pclass       | Ticket class                                  | 1 = 1st, 2 = 2nd, 3 = 3rd                         |
| sex          | Sex                                           |                                                   |
| age          | Age                                           | Age in years                                      |
| sibsp        |  # of siblings / spouses aboard   the Titanic |                                                   |
| parch        |  # of parents / children aboard   the Titanic |                                                   |
| ticket       | Ticket number                                 |                                                   |
| fare         | Passenger fare                                |                                                   |
| cabin        | Cabin number                                  |                                                   |
| embarked     | Port of Embarkation                           |  C = Cherbourg, Q = Queenstown,   S = Southampton |


The dataset is split into three files:
* A Training Dataset (train.csv)
* A Test Dataset (test.csv)
* A Set which contains sample data for the submission (gender_submission.csv).

First, we need to load the data, creating a list of maps (one map per passenger) for each set.

In [2]:
import vegas._
import vegas.data.External._
// Implicit renderer built from a function that sends its argument to the notebook kernel's
// display as "text/html" content. Presumably resolved implicitly by Vegas' `show` — TODO confirm.
implicit val render = vegas.render.ShowHTML(kernel.display.content("text/html", _))
import java.io.PrintWriter

// Regular expressions for extracting the fields of one CSV row.
// The train pattern captures, in order: passengerID, survived, pclass, "name", sex, age,
// sibsp, parch, ticket, fare, cabin, embarked. The test pattern is the same without the
// survived column, and the surv pattern only captures passengerID and survived.
// Age and fare accept a decimal number, an integer, or an empty field. The last alternative
// is `\d*`; it used to be `d*`, which matched runs of the letter 'd' instead of digits.
val DATA_ACCESS_PATTERN_test = """(\d+),(\d),"(.+)",(male|female),([0-9]*\.[0-9]+|[0-9]+|\d*),(\d*),(\d*),(.*),([0-9]*\.[0-9]+|[0-9]+|\d*),(.*),(\w*)""".r
val DATA_ACCESS_PATTERN_train=  """(\d+),(\d),(\d),"(.+)",(male|female),([0-9]*\.[0-9]+|[0-9]+|\d*),(\d*),(\d*),(.*),([0-9]*\.[0-9]+|[0-9]+|\d*),(.*),(\w*)""".r
val DATA_ACCESS_PATTERN_surv= """(\d+),(\d)""".r

/** Reads a CSV file and parses every data row into an attribute map.
  *
  * The first line (the header) is skipped. Every remaining line is passed to `readData`;
  * lines for which `readData` returns `None` are dropped from the result.
  *
  * @param filename path of the CSV file to read
  * @return one map (property name -> value) per successfully parsed row, in file order
  */
def loadDataCSV(filename:String):List[Map[String,Any]]= {

  val src = scala.io.Source.fromFile(filename)
  try {
    // getLines() is lazy, so the rows are materialised with toList before the source is closed.
    src.getLines().drop(1).map(row => readData(row)).toList.flatten
  } finally {
    // Close the file even when reading or parsing throws.
    src.close()
  }
}
  

/** Parses one CSV line into a map from property name to value.
  *
  * The line is tried against the test, train and survival patterns, in that order.
  * Passenger rows yield the keys passengerID, survived, pclass, name, sex, age, sibsp,
  * parch, ticket, fare, cabin and embarked; a key is left out when its field is empty or
  * cannot be converted. Rows matching the test pattern carry no survival column and get
  * survived = -1. Survival rows yield only passengerID and survived.
  *
  * @param line one data line of a CSV file (without the header)
  * @return the parsed properties, or `None` (after printing "None:" + line) if no pattern matches
  */
def readData(line:String):Option[Map[String,Any]]={

    // Numeric conversions return None for empty or malformed fields, so a missing
    // value simply leaves the property out of the resulting map.
    def toInteger(key:String,s:String):Option[(String,Int)]=
        scala.util.Try(s.toInt).toOption.map(value => (key,value))

    def toFloat(key:String,s:String):Option[(String,Float)]=
        scala.util.Try(s.toFloat).toOption.map(value => (key,value))

    // Empty strings count as missing.
    def toStr(key:String, s:String):Option[(String,String)]=
        if (s!="") Some((key,s)) else None

    // Builds the property map of one passenger from the twelve raw fields.
    def createPassengerMap(t1:String,t2:String,t3:String,t4:String,t5:String,t6:String,t7:String,
                           t8:String,t9:String,t10:String,t11:String,t12:String):Option[Map[String,Any]]={

        val fields=List[Option[(String,Any)]](
            toInteger("passengerID",t1),
            toInteger("survived",t2),
            toInteger("pclass",t3),
            toStr("name",t4),
            toStr("sex",t5),
            toFloat("age",t6),
            toInteger("sibsp",t7),
            toInteger("parch",t8),
            toStr("ticket",t9),
            toFloat("fare",t10),
            toStr("cabin",t11),
            // embarked is stored as a single Char (first character of the field)
            t12.headOption.map(port => ("embarked",port)))
        // flatten drops the missing (None) properties
        Some(fields.flatten.toMap)
    }

    line match{
       case DATA_ACCESS_PATTERN_test(t1,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12) =>
            // no survival column in this layout: use -1 as placeholder
            createPassengerMap(t1,"-1",t3,t4,t5,t6,t7,t8,t9,t10,t11,t12)

       case DATA_ACCESS_PATTERN_train(t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12) =>
            createPassengerMap(t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12)

       case DATA_ACCESS_PATTERN_surv(t1,t2) =>
            // both values are required; otherwise the row is rejected
            for {
                id <- toInteger("passengerID",t1)
                survived <- toInteger("survived",t2)
            } yield Map[String,Any](id,survived)

       case _ =>
            println("None:"+line)
            None
    }
}

/** Prints all properties of a passenger, one "key:value" line each, framed by separator lines.
  *
  * A property that is absent from the map is printed with its placeholder default.
  *
  * @param p property map of one passenger
  */
def printPassenger(p:Map[String,Any]):Unit={

    // Property keys in output order, each paired with the placeholder shown when it is missing.
    val keysWithDefaults=List[(String,Any)](
        ("passengerID",-1),
        ("survived",-1),
        ("pclass",-1),
        ("name","-"),
        ("sex","-"),
        ("age",-1),
        ("sibsp",-1),
        ("parch",-1),
        ("ticket","-"),
        ("fare",-1),
        ("cabin",-1),
        ("embarked",'-'))

    println("\n---------------------------------------------------------------------")
    for ((key,default) <- keysWithDefaults)
        println(key+":"+p.getOrElse(key,default))
    println("---------------------------------------------------------------------\n")
}

//def countAllMissingValues(passengers:List[Map[String,Any]],attList:List[String]):Map[String,Int]= ???
 


/** Applies a model function to every record of a data set.
  *
  * Note: this definition sometimes produces a "missing argument list" error in the
  * notebook, which can be ignored.
  *
  * @param model    function that receives a record and the id key and returns an (id, class) pair
  * @param testdata records to classify
  * @param idKey    key passed through unchanged to every `model` call
  * @return the model results, in the order of `testdata`
  */
def applyModel[CLASS,ID](model:(Map[String,Any],String)=> (ID,CLASS), 
            testdata: Seq[Map[String,Any]], idKey:String):Seq[(ID,CLASS)]=
    for (record <- testdata) yield model(record,idKey)

/** Writes (id, class) pairs to a text file, one "id,class" line per pair, after a header line.
  *
  * @param filename path of the file to create (an existing file is overwritten)
  * @param data     pairs to write, in order
  * @param header   first line of the file
  */
def createSubmitFile[ID,CLASS](filename:String, data:Seq[(ID,CLASS)],header:String):Unit= {
    
    val pw = new PrintWriter(filename)
    try {
        pw.println(header)
        data.foreach { case (id,cls) => pw.println(s"$id,$cls") }
    } finally {
        // Close (and thereby flush) the writer even if formatting an element throws.
        pw.close()
    }
}


render = <function1>
DATA_ACCESS_PATTERN_test = (\d+),(\d),"(.+)",(male|female),([0-9]*\.[0-9]+|[0-9]+|d*),(\d*),(\d*),(.*),([0-9]*\.[0-9]+|[0-9]+|d*),(.*),(\w*)
DATA_ACCESS_PATTERN_train = (\d+),(\d),(\d),"(.+)",(male|female),([0-9]*\.[0-9]+|[0-9]+|d*),(\d*),(\d*),(.*),([0-9]*\.[0-9]+|[0-9]+|d*),(.*),(\w*)
DATA_ACCESS_PATTERN_surv = (\d+),(\d)


loadDataCSV: (filename: String)List[Map[String,Any]]
readData: (line: String)Option[Map[String,Any]]
printPassenger: (p: Map[String,Any])Unit
countAllMissingValues: (passengers: List[Map[String,Any]], attList: List[String])Unit
applyModel: [CLASS, ID](model: (Map[String,Any]...


(\d+),(\d)

In [3]:
// Parse both passenger files and concatenate them (training rows first).
val train= loadDataCSV("train.csv")
val test= loadDataCSV("test.csv")
val all= train ::: test

// Report how many rows were parsed from each file.
println(s"Train Dataset:${train.size} Elements")
println(s"Test Dataset:${test.size} Elements")
println(s"whole Dataset:${all.size} Elements")


Train Dataset:891 Elements
Test Dataset:418 Elements
whole Dataset:1309 Elements


train = List(Map(name -> Braund, Mr. Owen Harris, fare -> 7.25, parch -> 0, age -> 22.0, ticket -> A/5 21171, sex -> male, passengerID -> 1, pclass -> 3, sibsp -> 1, embarked -> S, survived -> 0), Map(name -> Cumings, Mrs. John Bradley (Florence Briggs Thayer), fare -> 71.2833, parch -> 0, age -> 38.0, ticket -> PC 17599, cabin -> C85, sex -> female, passengerID -> 2, pclass -> 1, sibsp -> 1, embarked -> C, survived -> 1), Map(name -> Heikkinen, Miss. Laina, fare -> 7.925, parch -> 0, age -> 26.0, ticket -> STON/O2. 3101282, sex -> female, passengerID -> 3, pclass -> 3, sibsp -> 0, embarked -> S, survived -> 1), Map(name -> Futrelle, Mrs. Jacques Heath (Lily May Peel), fare -> 53.1, parch -> 0, age -> 35.0, ticket -> 113803, cabin -> C123, sex -> female, passenger...


List(Map(name -> Braund, Mr. Owen Harris, fare -> 7.25, parch -> 0, age -> 22.0, ticket -> A/5 21171, sex -> male, passengerID -> 1, pclass -> 3, sibsp -> 1, embarked -> S, survived -> 0), Map(name -> Cumings, Mrs. John Bradley (Florence Briggs Thayer), fare -> 71.2833, parch -> 0, age -> 38.0, ticket -> PC 17599, cabin -> C85, sex -> female, passengerID -> 2, pclass -> 1, sibsp -> 1, embarked -> C, survived -> 1), Map(name -> Heikkinen, Miss. Laina, fare -> 7.925, parch -> 0, age -> 26.0, ticket -> STON/O2. 3101282, sex -> female, passengerID -> 3, pclass -> 3, sibsp -> 0, embarked -> S, survived -> 1), Map(name -> Futrelle, Mrs. Jacques Heath (Lily May Peel), fare -> 53.1, parch -> 0, age -> 35.0, ticket -> 113803, cabin -> C123, sex -> female, passenger...

Now we can examine a small sample of the data set:

In [4]:
// Print the first two passengers of the combined data set.
for (passenger <- all.take(2)) printPassenger(passenger)


---------------------------------------------------------------------
passengerID:1
survived:0
pclass:3
name:Braund, Mr. Owen Harris
sex:male
age:22.0
sibsp:1
parch:0
ticket:A/5 21171
fare:7.25
cabin:-1
embarked:S
---------------------------------------------------------------------


---------------------------------------------------------------------
passengerID:2
survived:1
pclass:1
name:Cumings, Mrs. John Bradley (Florence Briggs Thayer)
sex:female
age:38.0
sibsp:1
parch:0
ticket:PC 17599
fare:71.2833
cabin:C85
embarked:C
---------------------------------------------------------------------



Count the missing values in a passenger set.

In [119]:
// Names of all properties that are checked for missing values.
val attList= List(
    "passengerID", "pclass", "survived",
    "name", "sex", "age",
    "sibsp", "parch", "ticket",
    "fare", "cabin", "embarked")

  
/** Counts, for every attribute, the number of records that have no entry for it.
  *
  * An attribute counts as missing in a record when the record's map does not contain
  * the attribute name as a key.
  *
  * @param data    records to inspect
  * @param attList attribute names to count
  * @return map from attribute name to the number of records lacking that attribute
  */
def countAllMissingValues(data:List[Map[String,Any]],attList:List[String]):Map[String,Int]= {

    attList.map { att =>
        // Map.contains is a direct key lookup; no need to scan the key set.
        att -> data.count(record => !record.contains(att))
    }.toMap

}

// Missing-value counts per attribute for both data sets.
val train_mv= countAllMissingValues(train,attList)
val test_mv= countAllMissingValues(test,attList)

// Sanity checks against the expected counts for the training set ...
assert(train_mv("cabin")== 687)
assert(train_mv("age")==177)
assert(train_mv("embarked")== 2)
// ... and for the test set.
assert(test_mv("cabin")== 327)
assert(test_mv("age")==86)
assert(test_mv("fare")== 1)

attList = List(passengerID, pclass, survived, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked)
train_mv = Map(name -> 0, fare -> 0, parch -> 0, age -> 177, ticket -> 0, cabin -> 687, sex -> 0, passengerID -> 0, pclass -> 0, sibsp -> 0, embarked -> 2, survived -> 0)
test_mv = Map(name -> 0, fare -> 1, parch -> 0, age -> 86, ticket -> 0, cabin -> 327, sex -> 0, passengerID -> 0, pclass -> 0, sibsp -> 0, embarked -> 0, survived -> 0)


countAllMissingValues: (data: List[Map[String,Any]], attList: List[String])Map[String,Int]


Map(name -> 0, fare -> 1, parch -> 0, age -> 86, ticket -> 0, cabin -> 327, sex -> 0, passengerID -> 0, pclass -> 0, sibsp -> 0, embarked -> 0, survived -> 0)

In [120]:
// Bar chart over the training data: one bar per value of "sex" (x axis),
// bar height = count of "passengerID" entries (y axis).
Vegas("Passengers splitted by sex" ).
    withData(train).
    mark(Bar).
    encodeX("sex", Ordinal,axis=Axis(title="Sex")).
    encodeY("passengerID", Quantitative,AggOps.Count,axis=Axis(title="Passengers")).
    show

In [122]:
// Share of training passengers whose "survived" property equals 1.
val passengers= train.size
val survivedPass= train.count(passenger => passenger("survived")==1)
val rate= survivedPass.toDouble/passengers
println(s"propability of surviving:$rate")

// Bar chart over the training data: the transform derives a "survival" label
// ("Dead" when survived == 0, otherwise "Alive"); x axis = that label,
// y axis = count of "passengerID" entries.
Vegas("Passengers classified by survival" ).
    withData(train).
    mark(Bar).
    addTransform("survival", "datum.survived == 0 ? \"Dead\" : \"Alive\"").
    encodeX("survival", Ordinal,axis=Axis(title="Survival")).
    encodeY("passengerID", Quantitative,AggOps.Count,axis=Axis(title="Passengers")).show

propability of surviving:0.3838383838383838


passengers = 891
survivedPass = 342
rate = 0.3838383838383838


0.3838383838383838

In [121]:
// Bar chart over the training data: x axis = "sex", y axis = count of "passengerID" entries,
// bars coloured by a derived "survival" label ("No" when survived == 0, otherwise "Yes")
// using the two colours given in rangeNominals.
Vegas("Survival splitted by sex").
      withData(train).
      mark(Bar).
      addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
      encodeY("passengerID",Quantitative, AggOps.Count, axis=Axis(title="Passengers")).
      encodeX("sex", Ord).
      encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
      show

In [123]:
// Bar chart over the training data: x axis = "sex", y axis = count of "passengerID" entries,
// bars coloured by a derived "survival" label ("No" when survived == 0, otherwise "Yes").
// Additionally sets stacked = StackOffset.Normalize on the marks — presumably so each bar
// shows proportions rather than absolute counts; verify against the Vegas documentation.
Vegas("Survival splitted by sex").
      withData(train).
      mark(Bar).
      addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
      encodeY("passengerID",Quantitative, AggOps.Count, axis=Axis(title="Passengers")).
      encodeX("sex", Ord).
      encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
      configMark(stacked = StackOffset.Normalize).
      show