In [1]:
import qualified DataFrame as D

-- Read the Titanic training and test CSV files; both paths are relative to the
-- directory the notebook kernel runs in.
train <- D.readCsv "./data/titanic_train.csv"
test <- D.readCsv "./data/titanic_test.csv"

In [2]:
import qualified Data.Text as T
import qualified DataFrame.Functions as F

import Data.Char
import DataFrame.Functions ((.=))
import DataFrame ((|>))

-- Presumably brings a typed reference into scope for each column of `train`
-- (ticket, name, cabin, survived and passengerid are used below without any
-- other definition) -- confirm against the library documentation.
F.declareColumns train

-- Train and test rows stacked together, with three text columns rewritten so
-- that both splits receive exactly the same feature engineering.
combined = D.deriveMany engineered (train <> test)
  where
    engineered =
        [ -- Capture the leading prefix of the ticket (it must start with a
          -- letter) and keep only the alphabetic characters of that prefix.
          F.name ticket .= F.whenPresent (T.filter isAlpha) (F.match "^([A-Za-z][A-Za-z0-9./]*)" ticket)
          -- Pull the title out of the name: a run of letters followed by a period.
        , F.name name   .= F.match "\\s*([A-Za-z]+)\\." name
          -- Reduce the cabin to its first character.
        , F.name cabin  .= F.whenPresent (T.take 1) cabin
        ]

In [3]:
import DataFrame.DecisionTree
import DataFrame.Functions ((.&&), (.==))
import System.Random

-- Take the rows that came from `train` (the first `D.nRows train` rows of
-- `combined`), shuffle them, then split them in two. Both generators use fixed
-- seeds. The 0.8 is presumably the share that goes to train' -- confirm against
-- the D.randomSplit documentation.
(train', validation) =
    D.randomSplit (mkStdGen 4232) 0.8
        . D.shuffle (mkStdGen 1894)
        . D.take (D.nRows train)
        $ combined

-- Everything after the training rows, i.e. the rows contributed by `test`.
test' = D.drop (D.nRows train) combined

-- Fit a decision tree with `survived` as the target on the training split.
model = fitDecisionTree treeConfig survived trainingFrame
  where
    -- Library defaults with three limits overridden.
    treeConfig =
        defaultTreeConfig
            { maxTreeDepth = 4
            , minSamplesSplit = 10
            , minLeafSize = 5
            }
    -- Unwrap the Maybe on the `survived` label column (not on the passenger
    -- IDs), then remove the passenger ID column from the training frame.
    trainingFrame =
        D.exclude [F.name passengerid] (D.filterJust (F.name survived) train')

In [4]:
-- Let's inspect the learned model: evaluating the bare binding makes the
-- notebook print the fitted tree as a nested ifThenElse expression.
model

(ifThenElse (eq (col @Maybe Text "Name") (lit (Just "Mr."))) (ifThenElse (eq (col @Maybe Text "Cabin") (lit (Nothing))) (lit (0)) (ifThenElse (eq (col @Maybe Text "Ticket") (lit (Nothing))) (lit (0)) (ifThenElse (leq (col @Maybe Double "Fare") (col @Maybe Double "Age")) (lit (1)) (lit (0))))) (ifThenElse (eq (col @Maybe Text "Cabin") (lit (Nothing))) (ifThenElse (eq (col @Text "Sex") (lit ("male"))) (ifThenElse (gt (toDouble (col @Int "SibSp")) (lit (1.0))) (lit (0)) (lit (1))) (lit (1))) (lit (1))))

In [5]:
-- | Typed reference to the Int column named "prediction"; the evaluation code
-- derives the model output under this name and 'computeAccuracy' reads it.
prediction :: D.Expr Int
prediction = F.col @Int "prediction"

-- | Accuracy of the @prediction@ column against the @survived@ column.
--
-- Counts the four confusion-matrix cells (true/false positives and negatives)
-- with one filter each and returns @(tp + tn) / (tp + tn + fp + fn)@. Rows
-- whose label or prediction is neither 0 nor 1 match none of the filters and
-- so are not counted. When no row is counted the result is 0/0, i.e. NaN.
computeAccuracy :: D.DataFrame -> Double
computeAccuracy df =
    let
        -- Number of rows of df that satisfy the predicate, converted so it can
        -- be used directly in the floating-point division below.
        countWhere p = fromIntegral $ D.nRows (D.filterWhere p df)
        tp = countWhere (survived .== 1 .&& prediction .== 1)
        tn = countWhere (survived .== 0 .&& prediction .== 0)
        fp = countWhere (survived .== 0 .&& prediction .== 1)
        fn = countWhere (survived .== 1 .&& prediction .== 0)
     in
        (tp + tn) / (tp + tn + fp + fn)

: 

In [6]:
-- Score the tree on the split it was fitted on: unwrap the label column,
-- attach the model output as the prediction column, then compute accuracy.
putStrLn "Training accuracy: "
print
    . computeAccuracy
    . D.derive (F.name prediction) model
    . D.filterJust (F.name survived)
    $ train'

-- Same steps on the held-out validation split.
putStrLn "Validation accuracy: "
print
    . computeAccuracy
    . D.derive (F.name prediction) model
    . D.filterJust (F.name survived)
    $ validation

-- Write the model output into the `survived` column of the test rows and
-- show the first 10 rows.
D.take 10 (D.derive (F.name survived) model test')

Training accuracy:

: 