In [1]:
using DecisionTree
using DataFrames
using DataFramesMeta
using DataArrays
using Gadfly
using ScikitLearn: DataFrameMapper, @sk_import, Pipelines, fit!, predict
using ScikitLearn.CrossValidation
using ScikitLearnBase: @declare_hyperparameters, BaseEstimator
import ScikitLearnBase.simple_get_params

# @sk_import linear_model: LogisticRegression
@sk_import preprocessing: (LabelBinarizer, RobustScaler, Binarizer, StandardScaler)



In [2]:
# Note for debugging, changing samething inside a function require kernel reloading :/

In [3]:
train = readtable("train.csv")
test = readtable("test.csv")
head(train)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [4]:
####### Exploration phase #######

# describe(train)

In [5]:
# Somehow adding the color visualization generate an error about int not defined in Gadfly
# Need to edit source code and replace int by Int
# plot(train, x="Sex", y="Survived", color="Survived", Geom.histogram(position=:stack), Scale.color_discrete_manual("red","green"))

In [6]:
# plot(train, x=:Age, y=:Survived, color=:Survived, Geom.histogram(bincount=15,position=:dodge), Scale.color_discrete_manual("orange","green"))

# Feature Engineering

Use end to end Pipelines. (questions how to use data from train set in the test set if using pipelines ?)

- Extract Title from Name
- Extract family Name from Name and count occurences (similar to family size but may have people that share a name)
- Count occurences of ticket (might detected nannies and friends)
- Extract Deck from Cabin
- Categorize titles per social rank
- Combine Siblings + Spouse + Parents + 1 to have family size
- Bin Fare. Have a category 0 for VIP that were invited
- Normalize Fare/Family size if someone paid for all

#### Missing values :
- Compute Missing Age (or predict it via a simple Regression)
- Bin Age into Age group (and predict that instead)
- Drop NA in Embarked (or predict it via a simple Regression)

- Predict Fare and FareGroup for test data

To keep the algorithm as general as possible and not overfit onedge cases I didn't use Names, Ticket ID, Cabin Number as is. What if the data is split between train and test data, what if it doesn't appear at all ?


In [7]:
## Optimus Title, the transformer that gets title from the name field
type PP_TitleTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_TitleTransformer, Symbol[]) ##Symbol is a temp mesure while waiting for new release of ScikitLearn

ScikitLearnBase.fit!(self::PP_TitleTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_TitleTransformer, X::DataFrame)
    @linq X |>
    transform(CptTitle = map(s->match(r"(?<=, ).*?\.", s).match, :Name))
end

function pp_title(df::DataFrame) ##For debugging the transformation
    @linq df |>
    transform(CptTitle = map(s->match(r"(?<=, ).*?\.", s).match, :Name))
end

pp_title (generic function with 1 method)

In [8]:
## Optimus Fare bucket transformer, the transformer that bins Fares
## This transformer would not have seen the light without the magical searchsortedfirst.
## (DataFrames.jl needs a cut function)
type PP_FareGroupTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_FareGroupTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_FareGroupTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_FareGroupTransformer, X::DataFrame)
    @linq X |>
    transform(CptFareGroup = map(s->
    if isna(s) return s
    else ifelse(s==0,0,
        searchsortedfirst(10.0:10.0:100.0,s)
        ) end,:Fare)
    )
end


# Magical searchsortedfirst for binning bucketing. (DataFrames.jl needs a cut function)
# However if s==0 is poisoned by NAtype
function pp_FareGroup(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |> transform(CptFareGroup = map(s->
    if isna(s) return s
    else ifelse(s==0,0,
        searchsortedfirst(10.0:10.0:100.0,s)
        ) end,:Fare)
    )
end

pp_FareGroup (generic function with 1 method)

In [9]:
## Optimus Deck, the transformer that gets deck from the Cabin field
type PP_DeckTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_DeckTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_DeckTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_DeckTransformer, X::DataFrame)
    @linq X |>
    transform(CptDeck = map(s->ifelse(isna(s),"Unknown",s), :Cabin)) |>
        ## Need two step otherwise Julia complains about no index method for NAtypes, pesky Julia
        transform(CptDeck = map(s->ifelse(s=="Unknown",s,s[1:1]), :CptDeck))
end

function pp_deck(df::DataFrame) ##For debugging the transformation
    @linq df |>
        transform(CptDeck = map(s->ifelse(isna(s),"Unknown",s), :Cabin)) |>
        ## Need two step otherwise Julia complains about no index method for NAtypes, pesky Julia
        transform(CptDeck = map(s->ifelse(s=="Unknown",s,s[1:1]), :CptDeck))
end

pp_deck (generic function with 1 method)

In [10]:
#Dictionary for socio-prof categories
#Dictionary for consistent referencing (transform input to lower case for insensitive use)
#Master. --> children 0
#Miss. Mlle --> unmarried 1
#Mr. Mrs. Ms. --> normal 2
#Honorifics --> rich people
dicoRef = Dict(
            "Mr." => 2,
            "Mrs."=> 2,
            "Miss." => 1,
            "Master." => 0,
            "Don."=> 3,
            "Rev."=>3,
            "Dr."=>3,
            "Mme."=>2,
            "Ms."=>2,
            "Major."=>3,
            "Lady."=>3,
            "Sir."=>3,
            "Mlle."=>1,
            "Col."=>3,
            "Capt."=>3,
            "the Countess."=>3,
            "Jonkheer."=>3,
            "Dona."=>3
    )

## Optimus Title Social Category, the transformer that gets the social standing
## from the CptTitle field
type PP_TitleCatTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_TitleCatTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_TitleCatTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_TitleCatTransformer, X::DataFrame)
    @linq X |>
    transform(CptTitleCat = map(s->dicoRef[s], :CptTitle))
end

function pp_titlecat(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |> transform(CptTitleCat = map(s->dicoRef[s], :CptTitle))
end

pp_titlecat (generic function with 1 method)

In [11]:
## Optimus Family Name frequency, the transformer that gets the family name frequency
## potential issue, if family are split between training and test data?
## 
## Todo use fit to save between training and test data
type PP_FamNameFreqTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_FamNameFreqTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_FamNameFreqTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_FamNameFreqTransformer, X::DataFrame)
    @linq X |>
        transform(CptNameFreq = map(s->match(r"^.*?,", s).match, :Name))|>
        groupby(:CptNameFreq)|>
        transform(CptNameFreq = length(:CptNameFreq))
end

function pp_namefreq(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |>
        transform(CptNameFreq = map(s->match(r"^.*?,", s).match, :Name))|>
        groupby(:CptNameFreq)|>
        transform(CptNameFreq = length(:CptNameFreq)) ## TODO : is length really a count equivalent ?
end

pp_namefreq (generic function with 1 method)

In [12]:
## Optimus ticket frequency, the transformer that gets the ticket frequency
## potential issue, if family are split between training and test data?
## 
## Todo use fit to save between training and test data
type PP_TicketFreqTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_TicketFreqTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_TicketFreqTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_TicketFreqTransformer, X::DataFrame)
    @linq X |>
        groupby(:Ticket)|>
        transform(CptTicketFreq = length(:Ticket))
end

function pp_ticketfreq(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |>
        groupby(:Ticket)|>
        transform(CptTicketFreq = length(:Ticket)) ## TODO : is length really a count equivalent ?
end

pp_ticketfreq (generic function with 1 method)

In [13]:
## Optimus Family Size
type PP_FamSizeTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_FamSizeTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_FamSizeTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_FamSizeTransformer, X::DataFrame)
    @linq X |>
    transform(CptFamSize = :SibSp + :Parch + 1 )
end

function pp_familysize(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |>
    transform(CptFamSize = :SibSp + :Parch + 1 )
end

pp_familysize (generic function with 1 method)

In [14]:
## Optimus Embarked transformer (to be replaced by a Regressor?)

type PP_EmbarkedImputer <: ScikitLearnBase.BaseEstimator
    df_Embarked::DataFrame
    PP_EmbarkedImputer() = new()
end

@declare_hyperparameters(PP_EmbarkedImputer, Symbol[])

function ScikitLearnBase.fit!(self::PP_EmbarkedImputer, X::DataFrame, y=nothing)
    self.df_Embarked = by(X, [:Pclass,:Sex,:CptTitleCat], df -> mode(dropna(df[:Embarked])))
    return self
end

function fillNA_Embarked(EI::PP_EmbarkedImputer,tx_Embarked, in_Pclass, tx_Sex, in_Titlecat)
    df = EI.df_Embarked
    ifelse(
        isna(tx_Embarked),
        reshape(df[
            (df[:Pclass].==in_Pclass)&
            (df[:Sex].==tx_Sex)&
            (df[:CptTitleCat].==in_Titlecat)&
        ,:x1])[1],
    tx_Embarked)
end

function ScikitLearnBase.transform(self::PP_EmbarkedImputer, X::DataFrame)
    result = @byrow! X begin
        @newcol CptEmbarked::DataArray{String}
        :CptEmbarked = fillNA_Embarked(self,:Embarked,:Pclass,:Sex,:CptTitleCat)
    end
    return result
end

LoadError: LoadError: syntax: unexpected ,
while loading In[14], in expression starting on line 23

In [15]:
## Optimus Age bucket transformer, the transformer that bins Age
## This transformer would not have seen the light without the magical searchsortedfirst.
## (DataFrames.jl needs a cut function)
type PP_AgeGroupTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_AgeGroupTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_AgeGroupTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_AgeGroupTransformer, X::DataFrame)
    @linq X |> transform(CptAgeGroup = map(s->
    ifelse(~isna(s),searchsortedfirst(4.0:12.0:64.0,s),s),:Age))
end


# Magical searchsortedfirst for binning bucketing. (DataFrames.jl needs a cut function)
function pp_AgeGroup(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |> transform(CptAgeGroup = map(s->
    ifelse(~isna(s),searchsortedfirst(4.0:12.0:64.0,s),s),:Age))
end



pp_AgeGroup (generic function with 1 method)

In [16]:
## Optimus Fare per person
type PP_FarePersonTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_FarePersonTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_FarePersonTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_FarePersonTransformer, X::DataFrame)
    @linq X |>
    transform(CptFarePerson = :Fare ./ :CptFamSize )
end

function pp_fareperson(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |>
    transform(CptFarePerson = :Fare ./ :CptFamSize )
end

pp_fareperson (generic function with 1 method)

In [17]:
## Optimus Age group imputer transformer (to be replaced by a Regressor?)
## Note ! Can fail during Cross validation because the filter by group by returns empty :/


type PP_AgeGroupImputer <: ScikitLearnBase.BaseEstimator
    df_AgeGroup::DataFrame
    PP_AgeGroupImputer() = new()
end

@declare_hyperparameters(PP_AgeGroupImputer, Symbol[])

function ScikitLearnBase.fit!(self::PP_AgeGroupImputer, X::DataFrame, y=nothing)
    self.df_AgeGroup = by(X, [:Pclass,:Sex,:CptTitleCat], df -> mode(dropna(df[:CptAgeGroup])))
    return self
end

function fillNA_AgeCat(AGI::PP_AgeGroupImputer,in_AgeGroup, in_Pclass, tx_Sex, in_Titlecat)
    df = AGI.df_AgeGroup
    ifelse(
        isna(in_AgeGroup),
        reshape(df[
            (df[:Pclass].==in_Pclass)&
            (df[:Sex].==tx_Sex)&
            (df[:CptTitleCat].==in_Titlecat)
        ,:x1])[1],
    in_AgeGroup)
end

function ScikitLearnBase.transform(self::PP_AgeGroupImputer, X::DataFrame)
    result = @byrow! X begin
        :CptAgeGroup = fillNA_AgeCat(self,:CptAgeGroup,:Pclass,:Sex,:CptTitleCat)
    end
    return result
end

In [18]:
## Optimus Age imputer transformer (to be replaced by a Regressor?)
## Note ! Can fail during Cross validation because the filter by group by returns empy :/
## For missing values, only infer from non-missing field
## to avoid accumulate approx from actual data
## ==> Don't use CptAgeGroup

type PP_AgeTransformer <: ScikitLearnBase.BaseEstimator
    df_Age::DataFrame
    PP_AgeTransformer() = new()
end

@declare_hyperparameters(PP_AgeTransformer, Symbol[])

function ScikitLearnBase.fit!(self::PP_AgeTransformer, X::DataFrame, y=nothing)
    self.df_Age = by(X, [:Pclass,:Sex,:CptTitleCat], df -> median(dropna(df[:Age])))
    return self
end

function fillNA_Age(AGI::PP_AgeTransformer,in_Age, in_Pclass, tx_Sex, in_Titlecat)
    df = AGI.df_Age
    ifelse(
        isna(in_Age),
        reshape(df[
            (df[:Pclass].==in_Pclass)&
            (df[:Sex].==tx_Sex)&
            (df[:CptTitleCat].==in_Titlecat)
        ,:x1])[1],
    in_Age)
end

function ScikitLearnBase.transform(self::PP_AgeTransformer, X::DataFrame)
    result = @byrow! X begin
        @newcol CptAge::DataArray{Float64}
        :CptAge = fillNA_Age(self,:Age,:Pclass,:Sex,:CptTitleCat)
    end
    return result
end




## For testing only. cannot automatically test data in a pipeline
## because groupby will be different between train and test
function pp_MissingAge(df::AbstractDataFrame)
    @linq df |>
    groupby([:Pclass,:Sex,:CptTitle]) |>
          transform(CptAge = ifelse(isna(:Age),median(dropna(:Age)),:Age))
end

pp_MissingAge (generic function with 1 method)

In [19]:
# Check Pipeline before NA prediction steps
Z = @linq train |> pp_title |> pp_titlecat |> pp_deck |>
pp_namefreq |> pp_familysize |> pp_FareGroup |> pp_fareperson |> pp_AgeGroup |>
pp_ticketfreq |> pp_MissingAge

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CptTitle,CptTitleCat,CptDeck,CptNameFreq,CptFamSize,CptFareGroup,CptFarePerson,CptAgeGroup,CptTicketFreq,CptAge
1,258,1,1,"Cherry, Miss. Gladys",female,30.0,0,0,110152,86.5,B77,S,Miss.,1,B,1,1,9,86.5,4,3,30.0
2,505,1,1,"Maioni, Miss. Roberta",female,16.0,0,0,110152,86.5,B79,S,Miss.,1,B,1,1,9,86.5,2,3,16.0
3,586,1,1,"Taussig, Miss. Ruth",female,18.0,0,2,110413,79.65,E68,S,Miss.,1,E,3,3,8,26.55,3,3,18.0
4,330,1,1,"Hippach, Miss. Jean Gertrude",female,16.0,0,1,111361,57.9792,B18,C,Miss.,1,B,2,2,6,28.9896,2,2,16.0
5,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,Miss.,1,B,3,1,3,30.0,3,1,19.0
6,357,1,1,"Bowerman, Miss. Elsie Edith",female,22.0,0,1,113505,55.0,E33,S,Miss.,1,E,1,2,6,27.5,3,2,22.0
7,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,Miss.,1,B,1,1,8,80.0,4,2,38.0
8,436,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0,B96 B98,S,Miss.,1,B,6,4,11,30.0,2,4,14.0
9,298,0,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,Miss.,1,C,3,4,11,37.8875,1,4,2.0
10,709,1,1,"Cleaver, Miss. Alice",female,22.0,0,0,113781,151.55,,S,Miss.,1,Unknown,1,1,11,151.55,3,4,22.0


In [20]:
## Optimus Debug
type InspectTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(InspectTransformer, Symbol[])

ScikitLearnBase.fit!(self::InspectTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::InspectTransformer, X::DataFrame)
    writetable("debug.csv",X)
    return X
end

In [21]:
# Create model
mapper = DataFrameMapper([
    ([:Pclass], Binarizer()),
    ([:CptAge], RobustScaler()),
    (:CptTitle, LabelBinarizer()),
    (:Sex, LabelBinarizer()),
    ([:SibSp], RobustScaler()),
    ([:Parch], RobustScaler()),
    ([:Fare], RobustScaler()),
    (:CptEmbarked, LabelBinarizer()),
    (:CptDeck, LabelBinarizer()),
    ([:CptTitleCat], Binarizer()),
    ([:CptNameFreq], StandardScaler()), #lots of extremes
    ([:CptFamSize], Binarizer()), #lots of extremes
    ([:CptFareGroup], Binarizer()), #lots of extremes
    ([:CptAgeGroup], Binarizer()),
    ([:CptFarePerson], StandardScaler()),
    ([:CptTicketFreq], StandardScaler())
    
    ]);

In [22]:
pipe = Pipelines.Pipeline([
    ("extract_deck",PP_DeckTransformer()),
    ("extract_title", PP_TitleTransformer()),
    ("extract_titlecat",PP_TitleCatTransformer()),
    ("extract_namefreq",PP_FamNameFreqTransformer()),
    ("extract_famsize",PP_FamSizeTransformer()),
    ("extract_faregroup",PP_FareGroupTransformer()),
    ("extract_fareperson",PP_FarePersonTransformer()),
    ("extract_AgeGroup",PP_AgeGroupTransformer()),
    ("fillNA_AgeGroup",PP_AgeGroupImputer()),
    ("fillNA_Age",PP_AgeTransformer()),
    ("fillNA_Embarked",PP_EmbarkedImputer()),
    ("extract_ticketfreq",PP_TicketFreqTransformer()),
    ("DEBUG",InspectTransformer()),
     ("featurize", mapper),
    ("forest", RandomForestClassifier(ntrees=100))
    ])

ScikitLearn.Skcore.Pipeline(Tuple{Any,Any}[("extract_deck",PP_DeckTransformer()),("extract_title",PP_TitleTransformer()),("extract_titlecat",PP_TitleCatTransformer()),("extract_namefreq",PP_FamNameFreqTransformer()),("extract_famsize",PP_FamSizeTransformer()),("extract_faregroup",PP_FareGroupTransformer()),("extract_fareperson",PP_FarePersonTransformer()),("extract_AgeGroup",PP_AgeGroupTransformer()),("fillNA_AgeGroup",PP_AgeGroupImputer(#undef)),("fillNA_Age",PP_AgeTransformer(#undef)),("fillNA_Embarked",PP_EmbarkedImputer(#undef)),("extract_ticketfreq",PP_TicketFreqTransformer()),("DEBUG",InspectTransformer()),("featurize",ScikitLearn.DataFrameMapper(Tuple[(Symbol[:Pclass],PyObject Binarizer(copy=True, threshold=0.0)),(Symbol[:CptAge],PyObject RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)),(:CptTitle,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(:Sex,PyObject LabelBinarizer(neg_label=0, pos_label=1, spar

In [23]:
X_train = train
Y_train = convert(Array, train[:Survived])


891-element Array{Int64,1}:
 0
 1
 1
 1
 0
 0
 0
 0
 1
 1
 1
 1
 0
 ⋮
 1
 1
 0
 0
 0
 0
 0
 0
 1
 0
 1
 0

In [24]:
#Cross Validation - check model accuracy
round(cross_val_score(pipe, X_train, Y_train, cv =10), 2)

LoadError: LoadError: MethodError: no method matching transform(::PP_EmbarkedImputer, ::DataFrames.DataFrame)
Closest candidates are:
  transform(!Matched::ScikitLearn.DataFrameMapper, ::DataFrames.DataFrame) at /Users/tesuji/.julia/v0.5/ScikitLearn/src/dataframes.jl:143
  transform(!Matched::ScikitLearn.Skcore.FitBit, ::Any...; kwargs...) at /Users/tesuji/.julia/v0.5/ScikitLearn/src/sk_utils.jl:73
  transform(!Matched::PyCall.PyObject, ::Any...; kwargs...) at /Users/tesuji/.julia/v0.5/ScikitLearn/src/Skcore.jl:95
  ...
while loading In[24], in expression starting on line 2

In [None]:
model = fit!(pipe, X_train, Y_train)

In [None]:
result=DataFrame()
result[:PassengerId] = test[:PassengerId]
result[:Survived] = @data predict(model,final_test)

In [None]:
result

In [None]:
writetable("julia-magicalforests.csv",result)