In [1]:
# src: http://ajkl.github.io/2015/08/10/Titanic-with-Julia/

# DataFrame introduction
# https://en.wikibooks.org/wiki/Introducing_Julia/DataFrames

# Julia documentation index.html
# https://docs.julialang.org/en/v1/manual/documentation/index.html

# Plots.jl
# https://github.com/JuliaPlots/Plots.jl
    # backends: http://docs.juliaplots.org/latest/backends/
        # PyPlot https://github.com/JuliaPy/PyPlot.jl
            # examples: https://docs.juliaplots.org/latest/examples/pyplot/

In [12]:
# Load libraries
using DataArrays
using DataFrames
using CSV
using FreqTables
using StatsBase

using Gadfly

# Snippet to use show() and println() in one line
showln(x) = (show(x); println())
showln("Libraries loaded")

┌ Info: Precompiling DataArrays [0fe7c1db-08c2-54a3-a222-6d1d3b7a471a]
└ @ Base loading.jl:1186
ERROR: LoadError: LoadError: UndefVarError: start not defined
Stacktrace:
 [1] getproperty(::Module, ::Symbol) at ./sysimg.jl:13
 [2] top-level scope at none:0
 [3] include at ./boot.jl:326 [inlined]
 [4] include_relative(::Module, ::String) at ./loading.jl:1038
 [5] include at ./sysimg.jl:29 [inlined]
 [6] include(::String) at /home/max/.julia/packages/DataArrays/mcLPH/src/DataArrays.jl:3
 [7] top-level scope at none:0
 [8] include at ./boot.jl:326 [inlined]
 [9] include_relative(::Module, ::String) at ./loading.jl:1038
 [10] include(::Module, ::String) at ./sysimg.jl:29
 [11] top-level scope at none:2
 [12] eval at ./boot.jl:328 [inlined]
 [13] eval(::Expr) at ./client.jl:404
 [14] top-level scope at ./none:3
in expression starting at /home/max/.julia/packages/DataArrays/mcLPH/src/abstractdataarray.jl:27
in expression starting at /home/max/.julia/packages/DataArrays/mcLPH/src/DataArrays.jl

ErrorException: Failed to precompile DataArrays [0fe7c1db-08c2-54a3-a222-6d1d3b7a471a] to /home/max/.julia/compiled/v1.1/DataArrays/ZAWTY.ji.

In [3]:
# Load dataset

train_set = CSV.read("titanic/train.csv")
test_set = CSV.read("titanic/test.csv")

showln("Train and Test set loaded")

"Train and Test set loaded"


In [4]:
# Describe loaded train set
println("train_set head, 3 elements:")
showln(first(train_set, 3))
println("train_set tail, 3 elements:")
showln(last(train_set, 3))
println("train_set column names:")
showln(names(train_set))
println("train_set column types:")
showln(eltypes(train_set))
println("train_set shape:")
showln(size(train_set))
println("Describe train_set:")
show(describe(train_set), allcols=true)

train_set head, 3 elements:
3×12 DataFrame. Omitted printing of 9 columns
│ Row │ PassengerId │ Survived │ Pclass │
│     │ [90mInt64[39m       │ [90mInt64[39m    │ [90mInt64[39m  │
├─────┼─────────────┼──────────┼────────┤
│ 1   │ 1           │ 0        │ 3      │
│ 2   │ 2           │ 1        │ 1      │
│ 3   │ 3           │ 1        │ 3      │
train_set tail, 3 elements:
3×12 DataFrame. Omitted printing of 9 columns
│ Row │ PassengerId │ Survived │ Pclass │
│     │ [90mInt64[39m       │ [90mInt64[39m    │ [90mInt64[39m  │
├─────┼─────────────┼──────────┼────────┤
│ 1   │ 889         │ 0        │ 3      │
│ 2   │ 890         │ 1        │ 1      │
│ 3   │ 891         │ 0        │ 3      │
train_set column names:
Symbol[:PassengerId, :Survived, :Pclass, :Name, :Sex, :Age, :SibSp, :Parch, :Ticket, :Fare, :Cabin, :Embarked]
train_set column types:
Type[Int64, Int64, Int64, String, String, Union{Missing, Float64}, Int64, Int64, String, Float64, Union{Missing, String}, Union{M

In [5]:
# Describe loaded test set
println("test_set shape:")
showln(size(test_set))
println("Describe test_set:")
show(describe(test_set), allcols=true)

test_set shape:
(418, 11)
Describe test_set:
11×8 DataFrame
│ Row │ variable    │ mean     │ min                           │ median  │
│     │ [90mSymbol[39m      │ [90mUnion…[39m   │ [90mAny[39m                           │ [90mUnion…[39m  │
├─────┼─────────────┼──────────┼───────────────────────────────┼─────────┤
│ 1   │ PassengerId │ 1100.5   │ 892                           │ 1100.5  │
│ 2   │ Pclass      │ 2.26555  │ 1                             │ 3.0     │
│ 3   │ Name        │          │ Abbott, Master. Eugene Joseph │         │
│ 4   │ Sex         │          │ female                        │         │
│ 5   │ Age         │ 30.2726  │ 0.17                          │ 27.0    │
│ 6   │ SibSp       │ 0.447368 │ 0                             │ 0.0     │
│ 7   │ Parch       │ 0.392344 │ 0                             │ 0.0     │
│ 8   │ Ticket      │          │ 110469                        │         │
│ 9   │ Fare        │ 35.6272  │ 0.0                           │ 14.4542 │


In [6]:
# Exploratory data analysis

# Explore unique values in every column

# Way 0: custom
function show_unique_values(dataset::DataFrame, max_unique_vals::Int64=10)
    for column_symbol in names(dataset)
        unique_col_values = unique(dataset[column_symbol])
        if size(unique_col_values, 1) < max_unique_vals
            println(column_symbol, ", unique values are ", unique_col_values)
        else
            println(column_symbol, " has too many unique values to display")
        end
    end
end

println("Showing unique values for train_set:")
show_unique_values(train_set)

println()

println("Showing unique values for test_set:")
show_unique_values(test_set)

# Way 1
survive_zero = count(i -> (i == 0), train_set[:Survived])
survive_one = count(i -> (i == 1), train_set[:Survived])
display(survive_zero)
display(survive_one)

# Way 2: "using FreqTables"
survive_freq_table = freqtable(train_set[:Survived])
display(survive_freq_table)

# Way 3: "using StatsBase"
display(countmap(train_set[:Survived]))

# Way 4: "using StatsBase"
display(counts(train_set[:Survived]))

Showing unique values for train_set:
PassengerId has too many unique values to display
Survived, unique values are [0, 1]
Pclass, unique values are [3, 1, 2]
Name has too many unique values to display
Sex, unique values are ["male", "female"]
Age has too many unique values to display
SibSp, unique values are [1, 0, 3, 4, 2, 5, 8]
Parch, unique values are [0, 1, 2, 5, 3, 4, 6]
Ticket has too many unique values to display
Fare has too many unique values to display
Cabin has too many unique values to display
Embarked, unique values are Union{Missing, String}

549

342

2-element Named Array{Int64,1}
Dim1  │ 
──────┼────
0     │ 549
1     │ 342

Dict{Int64,Int64} with 2 entries:
  0 => 549
  1 => 342

2-element Array{Int64,1}:
 549
 342

["S", "C", "Q", missing]

Showing unique values for test_set:
PassengerId has too many unique values to display
Pclass, unique values are [3, 2, 1]
Name has too many unique values to display
Sex, unique values are ["male", "female"]
Age has too many unique values to display
SibSp, unique values are [0, 1, 2, 3, 4, 5, 8]
Parch, unique values are [0, 1, 3, 2, 4, 6, 5, 9]
Ticket has too many unique values to display
Fare has too many unique values to display
Cabin has too many unique values to display
Embarked, unique values are ["Q", "S", "C"]


In [7]:
# View proportions of people survived / not survived
display(proportions(train_set[:Survived]))
display(proportionmap(train_set[:Survived]))

# "counts" does not work for categorical variables. 
# Use "countmap()" instead
display(countmap(train_set[:Sex]))

2-element Array{Float64,1}:
 0.6161616161616162 
 0.38383838383838387

Dict{Int64,Float64} with 2 entries:
  0 => 0.616162
  1 => 0.383838

Dict{String,Int64} with 2 entries:
  "male"   => 577
  "female" => 314

In [11]:
# Create dimension: indicate if person was a child or not
train_set[:Child] = 1
train_set[isna(train_set[:Age]), :Child] = 1

train_set[train_set[:Age] .< 18, :Child] = 1
train_set[train_set[:Age] .> 18, :Child] = 0

display(head(train_set))

UndefVarError: UndefVarError: isna not defined

In [None]:
# NOTE: this notebook isn't finished