In [1]:
using DecisionTree
using DataFrames
using DataFramesMeta
using DataArrays
using ScikitLearn: DataFrameMapper, @sk_import, fit_transform!, Pipelines, fit!
using ScikitLearn.CrossValidation: cross_val_score
@sk_import preprocessing: (LabelBinarizer, StandardScaler)



In [None]:
# Load the training data from train.csv (path relative to the working directory).
train = readtable("train.csv")

In [3]:
# Print per-column summary statistics for the training data, including NA counts.
describe(train)

PassengerId
Min      1.0
1st Qu.  223.5
Median   446.0
Mean     446.0
3rd Qu.  668.5
Max      891.0
NAs      0
NA%      0.0%

Survived
Min      0.0
1st Qu.  0.0
Median   0.0
Mean     0.3838383838383838
3rd Qu.  1.0
Max      1.0
NAs      0
NA%      0.0%

Pclass
Min      1.0
1st Qu.  2.0
Median   3.0
Mean     2.308641975308642
3rd Qu.  3.0
Max      3.0
NAs      0
NA%      0.0%

Name
Length  891
Type    String
NAs     0
NA%     0.0%
Unique  891

Sex
Length  891
Type    String
NAs     0
NA%     0.0%
Unique  2

Age
Min      0.42
1st Qu.  20.125
Median   28.0
Mean     29.69911764705882
3rd Qu.  38.0
Max      80.0
NAs      177
NA%      19.87%

SibSp
Min      0.0
1st Qu.  0.0
Median   0.0
Mean     0.5230078563411896
3rd Qu.  1.0
Max      8.0
NAs      0
NA%      0.0%

Parch
Min      0.0
1st Qu.  0.0
Median   0.0
Mean     0.38159371492704824
3rd Qu.  0.0
Max      6.0
NAs      0
NA%      0.0%

Ticket
Length  891
Type    String
NAs     0
NA%     0.0%
Unique  681

Fare
Min      0.0
1st Qu.  7.9104


In [4]:
# View the rows whose Embarked value is NA
train[isna(train[:,:Embarked]),:]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
2,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [5]:
# Remove, in place, every row of train whose Embarked value is NA
deleterows!(train, find(isna(train[:, :Embarked])))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [6]:
# Extract the title from Name into a new CptTitle column.
# The regex uses a lookbehind for ", " and then lazily matches up to and
# including the first period, e.g. "Braund, Mr. Owen Harris" -> "Mr.".
# NOTE(review): match(...) is dereferenced with .match without a check, so this
# presumably errors on a name that has no ", <title>." part -- confirm for new data.
train = @transform(train,
CptTitle = map(s->match(r"(?<=, ).*?\.", s).match, :Name)
    )

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CptTitle
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.
6,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr.
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr.
8,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Master.
9,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs.
10,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs.


In [7]:
# Compute the median age for each (Pclass, Sex, CptTitle) group of the training
# data, ignoring NA ages. The median appears as column x1 in the result.
df_Agegroup = by(train, [:Pclass,:Sex,:CptTitle], df -> median(dropna(df[:Age])))

Unnamed: 0,Pclass,Sex,CptTitle,x1
1,1,female,Mrs.,40.0
2,1,female,Miss.,30.0
3,1,female,Dr.,49.0
4,1,female,Mme.,24.0
5,1,female,Lady.,48.0
6,1,female,Mlle.,24.0
7,1,female,the Countess.,33.0
8,1,male,Mr.,40.0
9,1,male,Master.,4.0
10,1,male,Don.,40.0


In [8]:
"""
    groupby_Age(in_Age, df_groupby, in_Pclass, tx_Sex, tx_Title)

Return `in_Age` unchanged when it is not `NA`. When it is `NA`, return the value
in column `:x1` of `df_groupby` for the row whose `:Pclass`, `:Sex` and
`:CptTitle` equal `in_Pclass`, `tx_Sex` and `tx_Title`.

The group table is consulted only for missing ages. `ifelse` evaluates all of
its arguments, so an `ifelse`-based version performs the lookup (and can fail
on a group that is absent from `df_groupby`) even for rows that already have an
age; the explicit branch below avoids that.
"""
function groupby_Age(in_Age,df_groupby,in_Pclass,tx_Sex,tx_Title)
    # Known age: nothing to impute, and df_groupby is never touched.
    if !isna(in_Age)
        return in_Age
    end
    # Row mask selecting the group that matches this passenger.
    rows = (df_groupby[:Pclass].==in_Pclass)&
           (df_groupby[:Sex].==tx_Sex)&
           (df_groupby[:CptTitle].==tx_Title)
    # Same extraction as before: reshape to a length-1 vector and take its only
    # element (this is expected to fail unless exactly one group row matched).
    return reshape(df_groupby[rows,:x1], 1)[1]
end

groupby_Age (generic function with 1 method)

In [9]:
# Build the CptAge column row by row: groupby_Age receives the row's Age together
# with its Pclass, Sex and CptTitle and the group medians in df_Agegroup, so a
# missing Age is replaced by the median of the row's group.
train = @byrow! train begin
    # Declare the new Float64 column that the row-wise assignment below fills.
    @newcol CptAge::DataArray{Float64}
    :CptAge = groupby_Age(:Age,df_Agegroup,:Pclass,:Sex,:CptTitle)
end

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CptTitle,CptAge
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,22.0
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,38.0
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,26.0
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,35.0
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,35.0
6,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr.,26.0
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr.,54.0
8,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Master.,2.0
9,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs.,27.0
10,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs.,14.0


In [10]:
# Step 1: copy Cabin into CptCabin, replacing NA with the label "Unknown".
tmp = @transform(train,
    CptCabin = map(c -> isna(c) ? "Unknown" : c, :Cabin))
# Step 2: reduce every known cabin to its first character; keep "Unknown" whole.
train = @transform(tmp,
    CptCabin = map(c -> c == "Unknown" ? c : c[1:1], :CptCabin))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CptTitle,CptAge,CptCabin
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,22.0,Unknown
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,38.0,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,26.0,Unknown
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,35.0,C
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,35.0,Unknown
6,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr.,26.0,Unknown
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr.,54.0,E
8,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Master.,2.0,Unknown
9,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs.,27.0,Unknown
10,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs.,14.0,Unknown


In [11]:
# Keep only the columns used as model features, plus the Survived label.
final_training = train[[:Pclass,:CptTitle,:Sex,:CptAge,:SibSp,:Parch,:Fare,:Embarked,:CptCabin,:Survived]]

Unnamed: 0,Pclass,CptTitle,Sex,CptAge,SibSp,Parch,Fare,Embarked,CptCabin,Survived
1,3,Mr.,male,22.0,1,0,7.25,S,Unknown,0
2,1,Mrs.,female,38.0,1,0,71.2833,C,C,1
3,3,Miss.,female,26.0,0,0,7.925,S,Unknown,1
4,1,Mrs.,female,35.0,1,0,53.1,S,C,1
5,3,Mr.,male,35.0,0,0,8.05,S,Unknown,0
6,3,Mr.,male,26.0,0,0,8.4583,Q,Unknown,0
7,1,Mr.,male,54.0,0,0,51.8625,S,E,0
8,3,Master.,male,2.0,3,1,21.075,S,Unknown,0
9,3,Mrs.,female,27.0,0,2,11.1333,S,Unknown,1
10,2,Mrs.,female,14.0,1,0,30.0708,C,Unknown,1


In [12]:
# Create the feature mapper: one (column, transformer) pair per input column.
# Categorical columns (CptTitle, Sex, Embarked, CptCabin) go through a
# LabelBinarizer; columns paired with `nothing` are presumably passed through
# unchanged -- confirm against the DataFrameMapper documentation.
mapper = DataFrameMapper([
    (:Pclass, nothing),
    (:CptTitle, LabelBinarizer()),
    (:Sex, LabelBinarizer()),
    (:CptAge, nothing),
    (:SibSp, nothing),
    (:Parch, nothing),
    (:Fare, nothing),
    (:Embarked, LabelBinarizer()),
    (:CptCabin, LabelBinarizer())
    ]);

In [13]:
# Preview the feature matrix: fit the mapper on a copy of the training table and
# show the transformed values rounded to 2 decimals. The result is not stored,
# and no classifier is fitted in this cell.
round(fit_transform!(mapper, copy(final_training)), 2)

889×35 Array{Float64,2}:
 3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     1.0  0.0  0.0  0.0  0.0  0.0  0.0
 3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     1.0  0.0  0.0  0.0  0.0  0.0  0.0
 3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  1.0  0.0  0.0  0.0  0.0
 3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 2.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  1.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     1.0  0.0  0.0  0.0  0.0  0.0  0.0
 3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0

In [14]:
# Chain the feature mapper ("featurize") with a random forest of 1000 trees
# ("forest") into a single pipeline.
pipe = Pipelines.Pipeline([
     ("featurize", mapper),
    ("forest", RandomForestClassifier(ntrees=1000))
    ])

ScikitLearn.Skcore.Pipeline(Tuple{Any,Any}[("featurize",ScikitLearn.DataFrameMapper(Tuple[(:Pclass,nothing),(:CptTitle,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(:Sex,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(:CptAge,nothing),(:SibSp,nothing),(:Parch,nothing),(:Fare,nothing),(:Embarked,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(:CptCabin,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False))],false,false,Array{Float64,2})),("forest",DecisionTree.RandomForestClassifier(0,1000,0.7,-1,MersenneTwister(UInt32[0x92073f52,0x23eb14ef,0xfe077ba5,0x89f1f576],Base.dSFMT.DSFMT_state(Int32[-37091267,1073274352,56258988,1073364806,-489946093,1073342629,-926597871,1073188115,1896361484,1073551563  …  1805625306,1072829776,1924793155,1073320977,1495839231,519142192,-378552959,1034623539,382,0]),[1.55418,1.64045,1.6193,1.47194,1.81855,1.13345,1.43756,1.27753,1.06885,1.1987  …  1.93029,1.14781,1.5

In [15]:
# Split the training table into the label vector and the feature table.
Y_train = convert(Array, final_training[:Survived])
# NOTE: delete! removes :Survived from final_training in place, so re-running
# this cell presumably fails because the column is already gone.
delete!(final_training, :Survived)
# X_train is bound to the same object as final_training (no copy is made).
X_train = final_training

Unnamed: 0,Pclass,CptTitle,Sex,CptAge,SibSp,Parch,Fare,Embarked,CptCabin
1,3,Mr.,male,22.0,1,0,7.25,S,Unknown
2,1,Mrs.,female,38.0,1,0,71.2833,C,C
3,3,Miss.,female,26.0,0,0,7.925,S,Unknown
4,1,Mrs.,female,35.0,1,0,53.1,S,C
5,3,Mr.,male,35.0,0,0,8.05,S,Unknown
6,3,Mr.,male,26.0,0,0,8.4583,Q,Unknown
7,1,Mr.,male,54.0,0,0,51.8625,S,E
8,3,Master.,male,2.0,3,1,21.075,S,Unknown
9,3,Mrs.,female,27.0,0,2,11.1333,S,Unknown
10,2,Mrs.,female,14.0,1,0,30.0708,C,Unknown


In [16]:
# Fit the whole pipeline (feature mapper, then forest) on the training data.
model = fit!(pipe, X_train, Y_train)

ScikitLearn.Skcore.Pipeline(Tuple{Any,Any}[("featurize",ScikitLearn.DataFrameMapper(Tuple[(:Pclass,nothing),(:CptTitle,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(:Sex,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(:CptAge,nothing),(:SibSp,nothing),(:Parch,nothing),(:Fare,nothing),(:Embarked,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(:CptCabin,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False))],false,false,Array{Float64,2})),("forest",DecisionTree.RandomForestClassifier(0,1000,0.7,-1,MersenneTwister(UInt32[0x92073f52,0x23eb14ef,0xfe077ba5,0x89f1f576],Base.dSFMT.DSFMT_state(Int32[1078926576,1073590573,1744961346,1073036864,-1700845634,1073142442,-1913707369,1073438898,300637961,1073642766  …  -1114153269,1073160617,-107824139,1072753197,-149164423,2037659154,953745926,204564514,382,0]),[1.85576,1.3277,1.42839,1.71111,1.90553,1.86096,1.10646,1.03939,1.78504,1.53605  …  1.20165,1.7334

In [17]:
# Load the test data from test.csv (path relative to the working directory)
test = readtable("test.csv")

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
2,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
3,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
4,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
5,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
6,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
7,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
8,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
9,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
10,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [18]:
# Print per-column summary statistics for the test data, including NA counts.
describe(test)

PassengerId
Min      892.0
1st Qu.  996.25
Median   1100.5
Mean     1100.5
3rd Qu.  1204.75
Max      1309.0
NAs      0
NA%      0.0%

Pclass
Min      1.0
1st Qu.  1.0
Median   3.0
Mean     2.2655502392344498
3rd Qu.  3.0
Max      3.0
NAs      0
NA%      0.0%

Name
Length  418
Type    String
NAs     0
NA%     0.0%
Unique  418

Sex
Length  418
Type    String
NAs     0
NA%     0.0%
Unique  2

Age
Min      0.17
1st Qu.  21.0
Median   27.0
Mean     30.272590361445783
3rd Qu.  39.0
Max      76.0
NAs      86
NA%      20.57%

SibSp
Min      0.0
1st Qu.  0.0
Median   0.0
Mean     0.4473684210526316
3rd Qu.  1.0
Max      8.0
NAs      0
NA%      0.0%

Parch
Min      0.0
1st Qu.  0.0
Median   0.0
Mean     0.3923444976076555
3rd Qu.  0.0
Max      9.0
NAs      0
NA%      0.0%

Ticket
Length  418
Type    String
NAs     0
NA%     0.0%
Unique  363

Fare
Min      0.0
1st Qu.  7.8958
Median   14.4542
Mean     35.627188489208635
3rd Qu.  31.5
Max      512.3292
NAs      1
NA%      0.24%

Cabin
Length  418


In [19]:
# Extract the title from Name into a new CptTitle column of the test data.
# The regex uses a lookbehind for ", " and then lazily matches up to and
# including the first period, e.g. "Kelly, Mr. James" -> "Mr.".
# NOTE(review): match(...) is dereferenced with .match without a check, so this
# presumably errors on a name that has no ", <title>." part -- confirm for new data.
test = @transform(test,
CptTitle = map(s->match(r"(?<=, ).*?\.", s).match, :Name)
    )

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CptTitle
1,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr.
2,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs.
3,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr.
4,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr.
5,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs.
6,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S,Mr.
7,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,Miss.
8,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S,Mr.
9,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,Mrs.
10,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S,Mr.


In [20]:
# View the test rows whose Age is NA
# NOTE(review): this is not the last expression of the cell, so its value is
# presumably never displayed.
test[isna(test[:,:Age]),:]

# Compute the median age depending on class, sex and title, this time over the
# training and test rows stacked together
v = vcat(train,test)
# describe(v)

# Seems like there is a particular :Sex, :CptTitle combination with no matching Age
# writetable("age.csv",v)
# "Dona." is not matched to anything, so assume it behaves like "Lady.":
# overwrite the Age of the "Dona." rows of v with the Age of the "Lady." rows.
# NOTE(review): assigns one row selection to another -- presumably relies on
# both selections having the same length; confirm.
v[v[:CptTitle].=="Dona.",:Age] = v[v[:CptTitle].=="Lady.",:Age]
# "Ms." in Pclass 3: assume the same Age as "Ms." in Pclass 2
v[(v[:CptTitle].=="Ms.")&(v[:Pclass].==3),:Age] = v[(v[:CptTitle].=="Ms.")&(v[:Pclass].==2),:Age]

# Median age per (Pclass, Sex, CptTitle) group of the patched combined table,
# ignoring NA ages
df_Agegroup_TEST = by(v, [:Pclass,:Sex,:CptTitle], df -> median(dropna(df[:Age])))



Unnamed: 0,Pclass,Sex,CptTitle,x1
1,1,female,Mrs.,45.0
2,1,female,Miss.,30.0
3,1,female,Dr.,49.0
4,1,female,Mme.,24.0
5,1,female,Lady.,48.0
6,1,female,Mlle.,24.0
7,1,female,the Countess.,33.0
8,1,female,Dona.,48.0
9,1,male,Mr.,41.5
10,1,male,Master.,6.0


In [21]:
# Estimate age: build CptAge row by row. groupby_Age receives the row's Age with
# its Pclass, Sex and CptTitle and the group medians in df_Agegroup_TEST, so a
# missing Age is replaced by the median of the row's group.
test = @byrow! test begin
    # Declare the new Float64 column that the row-wise assignment below fills.
    @newcol CptAge::DataArray{Float64}
    :CptAge = groupby_Age(:Age,df_Agegroup_TEST,:Pclass,:Sex,:CptTitle)
end

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CptTitle,CptAge
1,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr.,34.5
2,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs.,47.0
3,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr.,62.0
4,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr.,27.0
5,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs.,22.0
6,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S,Mr.,14.0
7,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,Miss.,30.0
8,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S,Mr.,26.0
9,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,Mrs.,18.0
10,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S,Mr.,21.0


In [22]:
# Step 1: copy Cabin into CptCabin, replacing NA with the label "Unknown".
tmp = @transform(test,
    CptCabin = map(c -> isna(c) ? "Unknown" : c, :Cabin))
# Step 2: reduce every known cabin to its first character; keep "Unknown" whole.
test = @transform(tmp,
    CptCabin = map(c -> c == "Unknown" ? c : c[1:1], :CptCabin))

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CptTitle,CptAge,CptCabin
1,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr.,34.5,Unknown
2,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs.,47.0,Unknown
3,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr.,62.0,Unknown
4,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr.,27.0,Unknown
5,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs.,22.0,Unknown
6,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S,Mr.,14.0,Unknown
7,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,Miss.,30.0,Unknown
8,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S,Mr.,26.0,Unknown
9,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,Mrs.,18.0,Unknown
10,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S,Mr.,21.0,Unknown


In [23]:
# View the test rows whose Fare is NA
test[isna(test[:,:Fare]),:]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CptTitle,CptAge,CptCabin
1,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,Mr.,60.5,Unknown


In [33]:
# Median Fare of third-class "Mr." passengers in the combined table v, with NA
# fares dropped.
median(dropna(v[(v[:CptTitle].=="Mr.")&(v[:Pclass].==3),:Fare]))

7.8958

In [34]:
# Fill every NA Fare in test with the median Fare of third-class "Mr."
# passengers in v (NA fares dropped before taking the median).
test[isna(test[:,:Fare]),:Fare] = median(dropna(v[(v[:CptTitle].=="Mr.")&(v[:Pclass].==3),:Fare]))

7.8958

In [35]:
# Keep only the feature columns of the test data; the column list matches the
# one declared in the mapper.
final_test = test[[:Pclass,:CptTitle,:Sex,:CptAge,:SibSp,:Parch,:Fare,:Embarked,:CptCabin]]

Unnamed: 0,Pclass,CptTitle,Sex,CptAge,SibSp,Parch,Fare,Embarked,CptCabin
1,3,Mr.,male,34.5,0,0,7.8292,Q,Unknown
2,3,Mrs.,female,47.0,1,0,7.0,S,Unknown
3,2,Mr.,male,62.0,0,0,9.6875,Q,Unknown
4,3,Mr.,male,27.0,0,0,8.6625,S,Unknown
5,3,Mrs.,female,22.0,1,1,12.2875,S,Unknown
6,3,Mr.,male,14.0,0,0,9.225,S,Unknown
7,3,Miss.,female,30.0,0,0,7.6292,Q,Unknown
8,2,Mr.,male,26.0,1,1,29.0,S,Unknown
9,3,Mrs.,female,18.0,0,0,7.2292,C,Unknown
10,3,Mr.,male,21.0,2,0,24.15,S,Unknown


In [38]:
# Assemble the submission table: one predicted Survived value per test PassengerId.
result=DataFrame()
result[:PassengerId] = test[:PassengerId]
# Predict with the fitted pipeline; @data wraps the predictions in a DataArray.
result[:Survived] = @data DecisionTree.predict(model,final_test)

418-element DataArrays.DataArray{Any,1}:
 0
 0
 0
 1
 1
 0
 0
 0
 1
 0
 0
 0
 1
 ⋮
 0
 0
 0
 1
 1
 1
 0
 0
 1
 0
 0
 1

In [39]:
# Display the submission table.
result

Unnamed: 0,PassengerId,Survived
1,892,0
2,893,0
3,894,0
4,895,1
5,896,1
6,897,0
7,898,0
8,899,0
9,900,1
10,901,0


In [40]:
# Write the submission table to julia-randomforests.csv in the working directory.
writetable("julia-randomforests.csv",result)