In [1]:
using DecisionTree
using DataFrames
using DataFramesMeta
using DataArrays
using Gadfly
using ScikitLearn: DataFrameMapper, @sk_import, Pipelines, fit!, predict
using ScikitLearn.CrossValidation
using ScikitLearnBase: @declare_hyperparameters, BaseEstimator, simple_get_params

# @sk_import linear_model: LogisticRegression
@sk_import preprocessing: (LabelBinarizer, RobustScaler, Binarizer, StandardScaler, FunctionTransformer)



In [None]:
# Note for debugging: changing something inside a function requires reloading the kernel :/

In [2]:
# Load the training and test sets from the CSV files in the working directory.
train = readtable("train.csv")
test = readtable("test.csv")
# Preview the first rows of the training set.
head(train)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [None]:
####### Exploration phase #######

# Print a per-column summary of the training set.
describe(train)

In [None]:
# Somehow, adding the color visualization generates an error about `int` not being defined in Gadfly.
# Workaround: edit the Gadfly source code and replace `int` with `Int`.
# plot(train, x="Sex", y="Survived", color="Survived", Geom.histogram(position=:stack), Scale.color_discrete_manual("red","green"))

In [None]:
# plot(train, x=:Age, y=:Survived, color=:Survived, Geom.histogram(bincount=15,position=:dodge), Scale.color_discrete_manual("orange","green"))

# Feature Engineering

Use end-to-end Pipelines. (Open question: how can statistics computed on the train set be reused on the test set when using pipelines?)
- Compute Missing Age (or predict it via a simple Regression)
- Bin Age into Age group (and predict that instead)
- Drop NA in Embarked (or predict it via a simple Regression)
- Extract Title from Name
- Extract family name from Name and count occurrences (similar to family size, but unrelated people may share a name)
-? Extract family name (what if a family is split between train and test, or is irrelevant in the other set ??)
- Extract Deck from Cabin
- Categorize titles per social rank
- Combine Siblings + Spouse + Parents + 1 to have family size
-? Be inventive with tickets ?
- Predict Fare for test data
- Bin Fare. Have a category 0 for VIP that were invited
- Normalize Fare/Family Size if someone paid for all


In [None]:
# Add a `CptAge` column: the passenger's own Age when it is known, otherwise
# the median of the known ages among the rows sharing the same
# (Pclass, Sex, CptTitle). The frame must already carry a `CptTitle` column.
#
# Alternative: fit a regression and predict the missing ages.
#
# Important open question: missing ages in the test data should be filled
# using information from the train data. How to do that cleanly?
#
# NOTE(review): a group in which no Age is known hands `median` an empty
# input — confirm this cannot happen for the frames passed in.
function pp_MissingAge(df::AbstractDataFrame)
    strata = groupby(df, [:Pclass, :Sex, :CptTitle])
    return @transform(strata,
                      CptAge = ifelse(isna(:Age), median(dropna(:Age)), :Age))
end


# by(train, [:Pclass,:Sex,:CptTitle], df -> median(dropna(df[:Age])))

In [None]:
# Return the rows of `df` whose Embarked value is not NA.
# Alternative: predict the most likely value instead of dropping the rows.
function pp_dropNAembarked(df::AbstractDataFrame)
    port_known = ~isna(df[:, :Embarked])
    return df[port_known, :]
end

In [None]:
# Add a `CptTitle` column holding the title found in each Name: the text
# after ", " up to and including the first "." (e.g. "Mr.").
# A Name without that pattern makes `match` return nothing, and the `.match`
# access then raises.
function pp_Title(df::AbstractDataFrame)
    title_of = name -> match(r"(?<=, ).*?\.", name).match
    return @transform(df, CptTitle = map(title_of, :Name))
end

# Estimator wrapper so the title extraction can be used as a pipeline step.
# It has no fields, and therefore no hyperparameters and no fitted state.
type PP_TitleTransformer <: ScikitLearnBase.BaseEstimator end

# Declare an empty hyperparameter list for the estimator.
@declare_hyperparameters(PP_TitleTransformer, [])

# Nothing to learn: fitting returns the transformer unchanged.
function ScikitLearnBase.fit!(self::PP_TitleTransformer, X::DataFrame, y=nothing)
    return self
end

# Pipeline `transform` step: return `X` with an extra `CptTitle` column
# holding the title extracted from each Name.
#
# Delegates to `pp_Title` (defined earlier in this notebook) so that the
# extraction regex lives in exactly one place and the standalone function and
# the pipeline step cannot drift apart.
function ScikitLearnBase.transform(self::PP_TitleTransformer, X::DataFrame)
    return pp_Title(X)
end


import ScikitLearnBase.simple_get_params
# Return a Dict mapping each name in `param_names` to the value of the field
# of that name on `estimator`.
#
# `param_names` is left untyped on purpose so that the empty `[]` passed to
# @declare_hyperparameters is accepted.
#
# Do NOT forward to `ScikitLearnBase.simple_get_params(estimator, param_names)`
# here: this definition extends that very function, so for these arguments the
# call would dispatch straight back to this method and never terminate.
function simple_get_params(estimator::PP_TitleTransformer, param_names)
    params = Dict{Symbol, Any}()
    for name in param_names
        params[name] = getfield(estimator, name)
    end
    return params
end

In [None]:
# Extract Deck from Cabin.
# Adds a `CptCabin` column: "Unknown" when Cabin is NA, otherwise the first
# character of the Cabin value.
function pp_deck(df::AbstractDataFrame)
    mark_unknown = cabin -> ifelse(isna(cabin), "Unknown", cabin)
    deck_letter = cabin -> ifelse(cabin == "Unknown", cabin, cabin[1:1])

    # Two passes are needed: doing it in one step complains that NA values
    # have no indexing method.
    with_placeholder = @transform(df, CptCabin = map(mark_unknown, :Cabin))
    return @transform(with_placeholder, CptCabin = map(deck_letter, :CptCabin))
end

In [None]:
# Lookup table from title to socio-professional category.
# Keys are the titles exactly as extracted from Name, trailing dot included;
# no case normalisation is applied.
#   0 : children (Master.)
#   1 : unmarried (Miss., Mlle.)
#   2 : normal (Mr., Mrs., Mme., Ms.)
#   3 : honorifics --> rich people
dicoRef = Dict(
    # 0 - children
    "Master." => 0,
    # 1 - unmarried
    "Miss." => 1,
    "Mlle." => 1,
    # 2 - normal
    "Mr." => 2,
    "Mrs." => 2,
    "Mme." => 2,
    "Ms." => 2,
    # 3 - honorifics
    "Don." => 3,
    "Dona." => 3,
    "Rev." => 3,
    "Dr." => 3,
    "Major." => 3,
    "Col." => 3,
    "Capt." => 3,
    "Sir." => 3,
    "Lady." => 3,
    "the Countess." => 3,
    "Jonkheer." => 3
)

# Add a `CptTitleCat` column: the category that `dicoRef` assigns to each
# row's `CptTitle`. A title missing from `dicoRef` raises a KeyError.
function pp_titlecat(df::AbstractDataFrame)
    category_of = title -> dicoRef[title]
    return @transform(df, CptTitleCat = map(category_of, :CptTitle))
end

In [None]:
# Occurrences of each family name.
# Adds a `CptNameFreq` column: the number of rows sharing the passenger's
# family name, i.e. the part of Name up to and including the first comma.
## Potential issue: what if a family is split between training and test data?
function pp_namefreq(df::AbstractDataFrame)
    surname_of = name -> match(r"^.*?,", name).match

    # First store the family name itself in CptNameFreq, then group on it and
    # overwrite it with the size of each group.
    with_surname = @transform(df, CptNameFreq = map(surname_of, :Name))
    by_surname = groupby(with_surname, :CptNameFreq)
    return @transform(by_surname,
                      CptNameFreq = length(:CptNameFreq)) ## TODO : is there a count equivalent ?
end

In [None]:
# Run the whole feature-engineering chain on the training set. pp_titlecat and
# pp_MissingAge both read the CptTitle column, so pp_Title has to come first.
# The result is bound to `train_004`, the name the following cells read; it was
# previously discarded, leaving `train_004` undefined in this notebook.
train_004 = @linq train |> pp_Title |> pp_titlecat |> pp_MissingAge |> pp_deck |> pp_namefreq

In [None]:
# Keep a handle on the fully engineered frame: it is used to compute data
# combined with the test set.
full_train = train_004

# Columns retained for modelling, plus the Survived label.
final_trainset = train_004[[:Pclass, :CptTitle, :Sex, :CptAge, :SibSp,
                            :Parch, :Fare, :Embarked, :CptCabin, :Survived]]
head(final_trainset)

In [None]:
# plot(final_trainset, x="CptCabin", y="Survived", color="Survived", Geom.histogram(position=:stack), Scale.color_discrete_manual("red","green"))

In [4]:
# Create model
# Feature mapping: each entry pairs DataFrame column(s) with the scikit-learn
# preprocessor applied to them. Commented-out entries are features currently
# left out of the model.
mapper = DataFrameMapper([
    # NOTE(review): this notebook's output shows Binarizer(threshold=0.0), and the
    # Pclass values visible in the data preview are all positive, so this looks
    # like it maps every row to the same value — confirm this is intended.
    ([:Pclass], Binarizer()),
#    (:CptTitle, LabelBinarizer()),
    (:Sex, LabelBinarizer()),
#    ([:CptAge], StandardScaler()),
    ([:SibSp], RobustScaler()),
    ([:Parch], RobustScaler()),
#    ([:Fare], RobustScaler()),
#    (:Embarked, LabelBinarizer()),
#    (:CptCabin, LabelBinarizer())
    ]);

In [5]:
#using ScikitLearnBase

# Identity transformer used to try out a custom Julia step in a pipeline.
# It has no fields, hence no hyperparameters and no fitted state.
type DumbTransformer <: ScikitLearnBase.BaseEstimator end

# Declare an empty hyperparameter list for the estimator.
@declare_hyperparameters(DumbTransformer, [])

# Nothing to learn: fitting returns the transformer itself.
function ScikitLearnBase.fit!(dt::DumbTransformer, X, y=nothing)
    return dt
end

# Pass the data frame through untouched.
ScikitLearnBase.transform(dt::DumbTransformer, X::DataFrame) = X

import ScikitLearnBase.simple_get_params
# Return a Dict mapping each name in `param_names` to the value of the field
# of that name on `estimator`.
function simple_get_params(estimator::DumbTransformer, param_names)
    # Explicit loop rather than a comprehension, for 0.3/0.5 compatibility
    params = Dict{Symbol, Any}()
    for field in param_names
        params[field] = getfield(estimator, field)
    end
    return params
end



simple_get_params (generic function with 2 methods)

In [6]:
# End-to-end pipeline: a pass-through test step, the column featurizer
# (`mapper`), then a random forest with 200 trees. The title-extraction step
# is currently disabled.
pipe = Pipelines.Pipeline([
  #  ("extract_title", PP_TitleTransformer()),
    ("test",DumbTransformer()),
     ("featurize", mapper),
    ("forest", RandomForestClassifier(ntrees=200))
    ])

ScikitLearn.Skcore.Pipeline(Tuple{Any,Any}[("test",DumbTransformer()),("featurize",ScikitLearn.DataFrameMapper(Tuple[(Symbol[:Pclass],PyObject Binarizer(copy=True, threshold=0.0)),(:Sex,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(Symbol[:SibSp],PyObject RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)),(Symbol[:Parch],PyObject RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True))],false,false,Array{Float64,2})),("forest",DecisionTree.RandomForestClassifier(0,200,0.7,-1,MersenneTwister(UInt32[0xd888d4aa,0xa57503c4,0x738c5f62,0xb63459b9],Base.dSFMT.DSFMT_state(Int32[-18070980,1072959855,577544238,1073720163,-1834079697,1073340310,667658300,1072876264,636465048,1073055055  …  -2074888886,1073688714,1399013506,1072825869,-1075750892,17800495,-2095711411,-1013039817,382,0]),[1.25426,1.97934,1.61709,1.17454,1.34505,1.03234,1.96326,1.10096,1.4386,1.07424  …  1.99121,1

In [None]:
# Debugging aid: show what kind of object the name RandomForestClassifier is bound to.
typeof(RandomForestClassifier)

In [7]:
# Fit the pipeline on the raw training frame; the Survived column, converted
# to a plain Array, is the target.
model=fit!(pipe, train, convert(Array,train[:Survived]))

ScikitLearn.Skcore.Pipeline(Tuple{Any,Any}[("test",DumbTransformer()),("featurize",ScikitLearn.DataFrameMapper(Tuple[(Symbol[:Pclass],PyObject Binarizer(copy=True, threshold=0.0)),(:Sex,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(Symbol[:SibSp],PyObject RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)),(Symbol[:Parch],PyObject RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True))],false,false,Array{Float64,2})),("forest",DecisionTree.RandomForestClassifier(0,200,0.7,-1,MersenneTwister(UInt32[0xd888d4aa,0xa57503c4,0x738c5f62,0xb63459b9],Base.dSFMT.DSFMT_state(Int32[-1309901869,1073416276,-921730591,1073373075,1529900540,1073015615,1517778765,1073206374,-1170945360,1073451701  …  1961810414,1073675739,569910745,1072974157,91640941,593495506,-1323685051,-1552640212,382,0]),[1.68953,1.64833,1.30743,1.48936,1.72332,1.60886,1.59464,1.41771,1.989,1.47344  …  1.53024,1

In [8]:
# Target as a plain Array; the features are the raw training frame.
Y_train = convert(Array, train[:Survived])
X_train = train


#Cross Validation - check model accuracy
# One score per fold (cv = 2 folds), rounded to 2 decimals.
round(cross_val_score(pipe, X_train, Y_train, cv =2), 2)

2-element Array{Float64,1}:
 0.79
 0.76

In [None]:
# Target as a plain Array; the features are the engineered training frame.
Y_train = convert(Array, final_trainset[:Survived])
X_train = final_trainset


#Cross Validation - check model accuracy
# k-fold cross-validation needs at least 2 folds: with a single fold the whole
# data set is the held-out part and nothing is left to train on. Use 2 folds,
# like the cross-validation run on the raw training frame.
round(cross_val_score(pipe, X_train, Y_train, cv=2), 2)

In [None]:
# Refit the pipeline on the current X_train / Y_train.
model = fit!(pipe, X_train, Y_train)

In [None]:
# Predict directly on the raw test frame, before any of the test-data
# preparation done in the cell below.
predict(model,test)

In [None]:
##########################
# TEST DATA
#
# Prepare the test set step by step (test_001 ... test_004): title, estimated
# age, deck letter, then the missing Fare.

# describe test data
describe(test)

##########################
# Extract Title from Name
# (same regex as the one used for the training set)
test_001 = @transform(test,
    CptTitle = map(s->match(r"(?<=, ).*?\.", s).match, :Name)
    )


#Compute the median age depending on class, sex, title
# `v` stacks the engineered training frame on top of the titled test frame so
# that the statistics below are taken over both sets.
v = vcat(full_train,test_001)
# describe(v)

# Seems like there is a particular :Sex, :CptTitle combination with no matching Age
# writetable("age.csv",v)
#Dona. is not matched to anything, assume Lady
# NOTE(review): this copies the Age values of the "Lady." rows onto the "Dona."
# rows; presumably both selections must have compatible sizes — TODO confirm.
v[v[:CptTitle].=="Dona.",:Age] = v[v[:CptTitle].=="Lady.",:Age]
#Ms. on Pclass3 assume = to Pclass2
# NOTE(review): same size caveat as above — TODO confirm.
v[(v[:CptTitle].=="Ms.")&(v[:Pclass].==3),:Age] = v[(v[:CptTitle].=="Ms.")&(v[:Pclass].==2),:Age]


# Estimate age
# NOTE(review): `groupby_Age` is not defined in the visible part of this
# notebook — confirm where it comes from and what it returns.
test_002 = @byrow! test_001 begin
    @newcol CptAge::DataArray{Float64}
    :CptAge = groupby_Age(v,:Age,:Pclass,:Sex,:CptTitle)
end

# Map Cabin
# Inner step: replace NA cabins by "Unknown"; outer step: keep only the first
# character of the known cabins.
test_003 = @transform(
                @transform(test_002,
                CptCabin = map(s->ifelse(isna(s),"Unknown",s), :Cabin)
                    ),
                CptCabin = map(s->ifelse(s=="Unknown",s,s[1:1]), :CptCabin)
    )
head(test_003)


# view NA in Fare
print(test[isna(test[:,:Fare]),:])
print("\n\n\n")
# View median that may fill this NA
# (median of the known fares of "Mr." passengers in Pclass 3, taken over `v`)
print(median(dropna(v[(v[:CptTitle].=="Mr.")&(v[:Pclass].==3),:Fare])))

# `test_004` is another name for the same frame as `test_003` (no copy is
# made), so the fill below modifies `test_003` as well.
test_004 = test_003
# Fill the missing Fare values with the median printed above.
test_004[isna(test_003[:,:Fare]),:Fare] = median(dropna(v[(v[:CptTitle].=="Mr.")&(v[:Pclass].==3),:Fare]))

In [None]:
# Keep the same feature columns as `final_trainset`, minus the Survived label.
final_test = test_004[[:Pclass,:CptTitle,:Sex,:CptAge,:SibSp,:Parch,:Fare,:Embarked,:CptCabin]]
head(final_test)

In [None]:
# Build the submission frame: the passenger ids of the test set next to the
# model's predictions for the prepared test features.
result=DataFrame()
result[:PassengerId] = test[:PassengerId]
result[:Survived] = @data predict(model,final_test)

In [None]:
# Display the submission frame.
result

In [None]:
# Write the submission frame to a CSV file in the working directory.
writetable("julia-magicalforests.csv",result)