In [1]:
using DecisionTree
using DataFrames
using DataFramesMeta
using DataArrays
using Gadfly
using ScikitLearn: DataFrameMapper, @sk_import, fit_transform!, Pipelines, fit!, predict
using ScikitLearn.CrossValidation
# @sk_import linear_model: LogisticRegression
@sk_import preprocessing: (LabelBinarizer, RobustScaler, Binarizer, StandardScaler, FunctionTransformer)



In [2]:
# Note for debugging: changing something inside a function requires reloading the kernel :/

In [3]:
# Load the training and test sets from the CSV files in the working directory.
train, test = map(readtable, ("train.csv", "test.csv"))
# Preview the first rows of the training set.
head(train)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [4]:
####### Exploration phase #######

# Print per-column summary statistics of the training set.
# NOTE(review): the trailing `2` only makes this cell evaluate to the tuple
# `(nothing, 2)`; it looks unintentional - confirm before removing it.
(describe(train), 2)

PassengerId
Min      1.0
1st Qu.  223.5
Median   446.0
Mean     446.0
3rd Qu.  668.5
Max      891.0
NAs      0
NA%      0.0%

Survived
Min      0.0
1st Qu.  0.0
Median   0.0
Mean     0.3838383838383838
3rd Qu.  1.0
Max      1.0
NAs      0
NA%      0.0%

Pclass
Min      1.0
1st Qu.  2.0
Median   3.0
Mean     2.308641975308642
3rd Qu.  3.0
Max      3.0
NAs      0
NA%      0.0%

Name
Length  891
Type    String
NAs     0
NA%     0.0%
Unique  891

Sex
Length  891
Type    String
NAs     0
NA%     0.0%
Unique  2

Age
Min      0.42
1st Qu.  20.125
Median   28.0
Mean     29.69911764705882
3rd Qu.  38.0
Max      80.0
NAs      177
NA%      19.87%

SibSp
Min      0.0
1st Qu.  0.0
Median   0.0
Mean     0.5230078563411896
3rd Qu.  1.0
Max      8.0
NAs      0
NA%      0.0%

Parch
Min      0.0
1st Qu.  0.0
Median   0.0
Mean     0.38159371492704824
3rd Qu.  0.0
Max      6.0
NAs      0
NA%      0.0%

Ticket
Length  891
Type    String
NAs     0
NA%     0.0%
Unique  681

Fare
Min      0.0
1st Qu.  7.9104


(nothing,2)

In [5]:
# Somehow, adding the color visualization generates an error about `int` not being defined in Gadfly.
# Workaround: edit the Gadfly source code and replace `int` with `Int`.
# plot(train, x="Sex", y="Survived", color="Survived", Geom.histogram(position=:stack), Scale.color_discrete_manual("red","green"))

In [6]:
# plot(train, x=:Age, y=:Survived, color=:Survived, Geom.histogram(bincount=15,position=:dodge), Scale.color_discrete_manual("orange","green"))

# Feature Engineering

Use end-to-end Pipelines. (Open question: how can data from the train set be used on the test set when using pipelines?)
- Compute missing Age (or predict it via a simple regression)
- Bin Age into age groups (and predict that instead)
- Drop NA in Embarked (or predict it via a simple regression)
- Extract Title from Name
- Extract family name from Name and count occurrences (similar to family size, but may include people who merely share a name)
-? Extract family name (what if a family is split between train and test, or is irrelevant in the other set?)
- Extract Deck from Cabin
- Categorize titles per social rank
- Combine Siblings + Spouse + Parents + 1 to get the family size
-? Be inventive with tickets?
- Predict Fare for test data
- Bin Fare. Have a category 0 for VIPs who were invited
- Normalize Fare / Family Size in case one person paid for the whole family


In [7]:
# Compute a new Age column (CptAge) with missing values filled with a median.
# Other possibility: fit a regression and predict the age.

## Important: the test data computation for missing age should use train data info.
## How to do that cleanly?
"""
    pp_MissingAge(df::AbstractDataFrame)

Add a `CptAge` column to `df`: the passenger's `Age` when it is known, otherwise the
median of the known ages among passengers sharing the same `Pclass`, `Sex` and
`CptTitle`.

When a group does not contain a single known age, the median of all known ages in
`df` is used instead of taking the median of an empty collection.

`df` must already contain the `CptTitle` column (see `pp_Title`).
"""
function pp_MissingAge(df::AbstractDataFrame)
    # Fallback for groups without any known Age; only evaluated when needed.
    # Defined outside the `@linq` expression, where column symbols such as `:Age`
    # stand for the current group's column, so that `df[:Age]` means the whole table.
    overall_median() = median(dropna(df[:Age]))

    @linq df |>
    groupby([:Pclass,:Sex,:CptTitle]) |>
          transform(CptAge = ifelse(
              isna(:Age),
              isempty(dropna(:Age)) ? overall_median() : median(dropna(:Age)),
              :Age))
end

pp_MissingAge (generic function with 1 method)

In [8]:
# Alternative to dropping: predict the most likely value instead.
"""
    pp_dropNAembarked(df::AbstractDataFrame)

Return the rows of `df` whose `Embarked` value is not NA.
"""
function pp_dropNAembarked(df::AbstractDataFrame)
    keep = ~isna(df[:Embarked])
    return df[keep, :]
end

pp_dropNAembarked (generic function with 1 method)

In [9]:
"""
    pp_Title(df::AbstractDataFrame)

Add a `CptTitle` column holding the title found in each `Name`: the text that follows
the first ", " up to and including the next dot (for example "Mrs.").
"""
function pp_Title(df::AbstractDataFrame)
    title_rx = r"(?<=, ).*?\."
    return @transform(df, CptTitle = map(name -> match(title_rx, name).match, :Name))
end

pp_Title (generic function with 1 method)

In [10]:
"""
    pp_deck(df::AbstractDataFrame)

Add a `CptCabin` column with the deck of each passenger: the first character of
`Cabin`, or "Unknown" when `Cabin` is NA.
"""
function pp_deck(df::AbstractDataFrame)
    # Step 1: replace NA cabins by a placeholder so that step 2 can index into strings.
    filled = @transform(df,
        CptCabin = map(c -> isna(c) ? "Unknown" : c, :Cabin))
    # Step 2: keep only the leading deck letter of the known cabins.
    return @transform(filled,
        CptCabin = map(c -> c == "Unknown" ? c : c[1:1], :CptCabin))
end

pp_deck (generic function with 1 method)

In [11]:
# Socio-professional category of each title.
# Keys are the titles as they appear in the names, including the trailing dot.
#   0 : children (Master.)
#   1 : unmarried women (Miss., Mlle.)
#   2 : ordinary adults (Mr., Mrs., Mme., Ms.)
#   3 : honorifics, i.e. rich people
dicoRef = Dict(
    # 0 - children
    "Master." => 0,
    # 1 - unmarried
    "Miss." => 1,
    "Mlle." => 1,
    # 2 - normal
    "Mr." => 2,
    "Mrs." => 2,
    "Mme." => 2,
    "Ms." => 2,
    # 3 - honorifics
    "Don." => 3,
    "Dona." => 3,
    "Rev." => 3,
    "Dr." => 3,
    "Major." => 3,
    "Col." => 3,
    "Capt." => 3,
    "Lady." => 3,
    "Sir." => 3,
    "the Countess." => 3,
    "Jonkheer." => 3
)

"""
    pp_titlecat(df::AbstractDataFrame)

Add a `CptTitleCat` column: the category that `dicoRef` assigns to each `CptTitle`.
A title that is not a key of `dicoRef` raises a `KeyError`.
"""
function pp_titlecat(df::AbstractDataFrame)
    return @transform(df, CptTitleCat = map(title -> dicoRef[title], :CptTitle))
end

pp_titlecat (generic function with 1 method)

In [12]:
# Potential issue: what if a family is split between training and test data?
"""
    pp_namefreq(df::AbstractDataFrame)

Add a `CptNameFreq` column counting how many rows of `df` share the passenger's
family name, i.e. the part of `Name` up to and including the first comma.
"""
function pp_namefreq(df::AbstractDataFrame)
    # The column first holds the family name, which serves as the grouping key ...
    with_surname = @transform(df,
        CptNameFreq = map(name -> match(r"^.*?,", name).match, :Name))
    by_surname = groupby(with_surname, :CptNameFreq)
    # ... and is then overwritten by the size of its group.
    ## TODO : is there a count equivalent ?
    return @transform(by_surname, CptNameFreq = length(:CptNameFreq))
end

pp_namefreq (generic function with 1 method)

In [13]:
# Run the feature-engineering steps on the training set and keep the result under the
# name read by the modelling cells below (`train_004`).
# pp_Title runs first because pp_titlecat and pp_MissingAge both read :CptTitle.
# NOTE(review): pp_dropNAembarked is defined above but is not part of this chain -
# confirm whether rows with NA in Embarked should be dropped before modelling.
train_004 = @linq train |> pp_Title |> pp_titlecat |> pp_MissingAge |> pp_deck |> pp_namefreq

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CptTitle,CptTitleCat,CptAge,CptCabin,CptNameFreq
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,2,38.0,C,1
2,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,2,35.0,C,2
3,138,0,1,"Futrelle, Mr. Jacques Heath",male,37.0,1,0,113803,53.1,C123,S,Mr.,2,37.0,C,2
4,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C,Mrs.,2,41.5,B,1
5,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C,Mrs.,2,49.0,D,4
6,646,1,1,"Harper, Mr. Henry Sleeper",male,48.0,1,0,PC 17572,76.7292,D33,C,Mr.,2,48.0,D,4
7,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S,Miss.,1,6.0,Unknown,4
8,849,0,2,"Harper, Rev. John",male,28.0,0,1,248727,33.0,,S,Rev.,3,28.0,Unknown,4
9,152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22.0,1,0,113776,66.6,C2,S,Mrs.,2,22.0,C,2
10,337,0,1,"Pears, Mr. Thomas Clinton",male,29.0,1,0,113776,66.6,C2,S,Mr.,2,29.0,C,2


In [None]:
# Keep the complete engineered table: it is used to compute data combined with the
# test set.
full_train = train_004

# Columns used for modelling, together with the Survived target.
final_trainset = train_004[[
    :Pclass, :CptTitle, :Sex, :CptAge, :SibSp, :Parch, :Fare, :Embarked, :CptCabin,
    :Survived
]]
head(final_trainset)

In [None]:
# plot(final_trainset, x="CptCabin", y="Survived", color="Survived", Geom.histogram(position=:stack), Scale.color_discrete_manual("red","green"))

In [None]:
# Create model
# Column-wise feature preparation: each entry pairs a column (or a list of columns)
# with the scikit-learn transformer applied to it.
mapper = DataFrameMapper([
    # Pclass takes the values 1, 2 and 3. `Binarizer()` (default threshold 0) maps all
    # of them to 1, which leaves a constant feature, so the class is one-hot encoded
    # like the other categorical columns instead.
    (:Pclass, LabelBinarizer()),
    # Categorical columns -> indicator columns.
    (:CptTitle, LabelBinarizer()),
    (:Sex, LabelBinarizer()),
    # Numeric columns -> rescaled values.
    ([:CptAge], StandardScaler()),
    ([:SibSp], RobustScaler()),
    ([:Parch], RobustScaler()),
    ([:Fare], RobustScaler()),
    (:Embarked, LabelBinarizer()),
    (:CptCabin, LabelBinarizer())
    ]);

In [None]:
# Two-step pipeline: feature mapping followed by a random forest of 200 trees.
pipe = let forest = RandomForestClassifier(ntrees=200)
    Pipelines.Pipeline([("featurize", mapper), ("forest", forest)])
end

In [None]:
# Training inputs (the whole modelling table) and the target as a plain array.
X_train = final_trainset
Y_train = convert(Array, final_trainset[:Survived])

# Cross validation - check model accuracy (cv = 3), scores rounded to 2 digits
round(cross_val_score(pipe, X_train, Y_train, cv=3), 2)

In [None]:
# Fit the whole pipeline (feature mapping + random forest) on the full training set.
model = fit!(pipe, X_train, Y_train)

In [None]:
##########################
# TEST DATA
# Prepare the test set with the same derived columns as the training set
# (CptTitle, CptAge, CptCabin) and fill the missing Fare.

# describe test data
describe(test)

##########################
# Extract Title from Name (text after ", " up to and including the next dot)
test_001 = @transform(test,
    CptTitle = map(s->match(r"(?<=, ).*?\.", s).match, :Name)
    )


# Compute the median age depending on class, sex, title.
# `v` stacks the engineered training table and the test table so that statistics
# can be computed on both together.
v = vcat(full_train,test_001)
# describe(v)

# Seems like there is a particular :Sex, :CptTitle combination with no matching Age
# writetable("age.csv",v)
# Dona. is not matched to anything, assume Lady: copy the Age of the "Lady." rows
# onto the "Dona." rows.
# NOTE(review): assumes both selections have compatible sizes - TODO confirm.
v[v[:CptTitle].=="Dona.",:Age] = v[v[:CptTitle].=="Lady.",:Age]
# Ms. on Pclass 3: assume the same Age as Ms. on Pclass 2.
v[(v[:CptTitle].=="Ms.")&(v[:Pclass].==3),:Age] = v[(v[:CptTitle].=="Ms.")&(v[:Pclass].==2),:Age]


# Estimate age: fill a new Float64 column CptAge row by row.
# NOTE(review): `groupby_Age` is not defined in the visible part of this notebook -
# confirm where it comes from and what it returns.
test_002 = @byrow! test_001 begin
    @newcol CptAge::DataArray{Float64}
    :CptAge = groupby_Age(v,:Age,:Pclass,:Sex,:CptTitle)
end

# Map Cabin to its deck: the inner transform replaces NA by "Unknown", the outer one
# keeps only the first character of the known cabins.
test_003 = @transform(
                @transform(test_002,
                CptCabin = map(s->ifelse(isna(s),"Unknown",s), :Cabin)
                    ),
                CptCabin = map(s->ifelse(s=="Unknown",s,s[1:1]), :CptCabin)
    )
head(test_003)


# View the rows with NA in Fare
print(test[isna(test[:,:Fare]),:])
print("\n\n\n")
# View the median that may fill this NA: median known Fare of the "Mr." rows in
# Pclass 3 of the combined table
print(median(dropna(v[(v[:CptTitle].=="Mr.")&(v[:Pclass].==3),:Fare])))

# Fill the NA Fare with that median.
# Note: this assignment does not copy, so test_004 and test_003 are the same table and
# the fill below modifies both names.
test_004 = test_003
test_004[isna(test_003[:,:Fare]),:Fare] = median(dropna(v[(v[:CptTitle].=="Mr.")&(v[:Pclass].==3),:Fare]))

In [None]:
# Keep the same feature columns as the training table (there is no Survived column
# to select here).
final_test = test_004[[
    :Pclass, :CptTitle, :Sex, :CptAge, :SibSp, :Parch, :Fare, :Embarked, :CptCabin
]]
head(final_test)

In [None]:
# Assemble the submission table: the passenger ids of the test set next to the
# predictions of the fitted pipeline on the prepared test features.
result=DataFrame()
result[:PassengerId] = test[:PassengerId]
result[:Survived] = @data predict(model,final_test)

In [None]:
# Display the submission table.
result

In [None]:
# Write the submission table to a CSV file in the working directory.
writetable("julia-magicalforests.csv",result)