In [1]:
using DecisionTree
using DataFrames
using DataFramesMeta
using DataArrays
using Gadfly
using ScikitLearn: DataFrameMapper, @sk_import, Pipelines, fit!, predict
using ScikitLearn.CrossValidation
using ScikitLearnBase: @declare_hyperparameters, BaseEstimator
import ScikitLearnBase.simple_get_params

# @sk_import linear_model: LogisticRegression
@sk_import preprocessing: (LabelBinarizer, RobustScaler, Binarizer, StandardScaler)



In [2]:
# Note for debugging, changing samething inside a function require kernel reloading :/

In [3]:
train = readtable("train.csv")
test = readtable("test.csv")
head(train)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [4]:
####### Exploration phase #######

# describe(train)

In [5]:
# Somehow adding the color visualization generate an error about int not defined in Gadfly
# Need to edit source code and replace int by Int
# plot(train, x="Sex", y="Survived", color="Survived", Geom.histogram(position=:stack), Scale.color_discrete_manual("red","green"))

In [6]:
# plot(train, x=:Age, y=:Survived, color=:Survived, Geom.histogram(bincount=15,position=:dodge), Scale.color_discrete_manual("orange","green"))

# Feature Engineering

Use end to end Pipelines. (questions how to use data from train set in the test set if using pipelines ?)

- Extract Title from Name
- Extract family Name from Name and count occurences (similar to family size but may have people that share a name)
- Count occurences of ticket (might detected nannies and friends)
- Extract Deck from Cabin
- Categorize titles per social rank
- Combine Siblings + Spouse + Parents + 1 to have family size
- Bin Fare. Have a category 0 for VIP that were invited
- Normalize Fare/Family size if someone paid for all

####Missing values :
- Compute Missing Age (or predict it via a simple Regression)
- Bin Age into Age group (and predict that instead)
- Drop NA in Embarked (or predict it via a simple Regression)

- Predict FareGroup for test data

To keep the algorithm as general as possible and not overfit onedge cases I didn't use Names, Ticket ID, Cabin Number as is. What if the data is split between train and test data, what if it doesn't appear at all ?


In [7]:
## Optimus Title, the transformer that gets title from the name field
type PP_TitleTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_TitleTransformer, Symbol[]) ##Symbol is a temp mesure while waiting for new release of ScikitLearn

ScikitLearnBase.fit!(self::PP_TitleTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_TitleTransformer, X::DataFrame)
    @linq X |>
    transform(CptTitle = map(s->match(r"(?<=, ).*?\.", s).match, :Name))
end

function pp_title(df::DataFrame) ##For debugging the transformation
    @linq df |>
    transform(CptTitle = map(s->match(r"(?<=, ).*?\.", s).match, :Name))
end

pp_title (generic function with 1 method)

In [8]:
## Optimus Deck, the transformer that gets deck from the Cabin field
type PP_DeckTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_DeckTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_DeckTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_DeckTransformer, X::DataFrame)
    @linq X |>
        transform(CptCabin = map(s->ifelse(isna(s),"Unknown",s), :Cabin)) |>
        ## Need two step otherwise Julia complains about no index method for NAtypes, pesky Julia
        transform(CptCabin = map(s->ifelse(s=="Unknown",s,s[1:1]), :CptCabin))
end

function pp_deck(df::DataFrame) ##For debugging the transformation
    @linq df |>
        transform(CptCabin = map(s->ifelse(isna(s),"Unknown",s), :Cabin)) |>
        ## Need two step otherwise Julia complains about no index method for NAtypes, pesky Julia
        transform(CptCabin = map(s->ifelse(s=="Unknown",s,s[1:1]), :CptCabin))
end

pp_deck (generic function with 1 method)

In [9]:
#Dictionary for socio-prof categories
#Dictionary for consistent referencing (transform input to lower case for insensitive use)
#Master. --> children 0
#Miss. Mlle --> unmarried 1
#Mr. Mrs. Ms. --> normal 2
#Honorifics --> rich people
dicoRef = Dict(
            "Mr." => 2,
            "Mrs."=> 2,
            "Miss." => 1,
            "Master." => 0,
            "Don."=> 3,
            "Rev."=>3,
            "Dr."=>3,
            "Mme."=>2,
            "Ms."=>2,
            "Major."=>3,
            "Lady."=>3,
            "Sir."=>3,
            "Mlle."=>1,
            "Col."=>3,
            "Capt."=>3,
            "the Countess."=>3,
            "Jonkheer."=>3,
            "Dona."=>3
    )

## Optimus Title Social Category, the transformer that gets the social standing
## from the CptTitle field
type PP_TitleCatTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_TitleCatTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_TitleCatTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_TitleCatTransformer, X::DataFrame)
    @linq X |>
    transform(CptTitleCat = map(s->dicoRef[s], :CptTitle))
end

function pp_titlecat(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |> transform(CptTitleCat = map(s->dicoRef[s], :CptTitle))
end

pp_titlecat (generic function with 1 method)

In [10]:
## Optimus Family Name frequency, the transformer that gets the family name frequency
## potential issue, if family are split between training and test data?
## 
## There might be cousins/uncles relationship that are not captured in sibsp or Parch
type PP_FamNameFreqTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_FamNameFreqTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_FamNameFreqTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_FamNameFreqTransformer, X::DataFrame)
    @linq X |>
        transform(CptNameFreq = map(s->match(r"^.*?,", s).match, :Name))|>
        groupby(:CptNameFreq)|>
        transform(CptNameFreq = length(:CptNameFreq))
end

function pp_namefreq(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |>
        transform(CptNameFreq = map(s->match(r"^.*?,", s).match, :Name))|>
        groupby(:CptNameFreq)|>
        transform(CptNameFreq = length(:CptNameFreq)) ## TODO : is length really a count equivalent ?
end

pp_namefreq (generic function with 1 method)

In [11]:
## Optimus Family Size
type PP_FamSizeTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_FamSizeTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_FamSizeTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_FamSizeTransformer, X::DataFrame)
    @linq X |>
    transform(CptFamSize = :SibSp + :Parch + 1 )
end

function pp_familysize(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |>
    transform(CptFamSize = :SibSp + :Parch + 1 )
end

pp_familysize (generic function with 1 method)

In [12]:
## Optimus Fare bucket transformer, the transformer that bins Fares
## This transformer would not have seen the light without the magical searchsortedfirst.
## (DataFrames.jl needs a cut function)
type PP_FareGroupTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_FareGroupTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_FareGroupTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_FareGroupTransformer, X::DataFrame)
    @linq X |>
    transform(FareGroup = map(s->
    if isna(s) return s
    else ifelse(s==0,0,
        searchsortedfirst(10.0:10.0:100.0,s)
        ) end,:Age)
    )
end


# Magical searchsortedfirst for binning bucketing. (DataFrames.jl needs a cut function)
# However if s==0 is poisoned by NAtype
function pp_FareGroup(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |> transform(FareGroup = map(s->
    if isna(s) return s
    else ifelse(s==0,0,
        searchsortedfirst(10.0:10.0:100.0,s)
        ) end,:Age)
    )
end

pp_FareGroup (generic function with 1 method)

In [13]:
# Check Pipeline before NA prediction steps
@linq train |> pp_title |> pp_titlecat |> pp_deck |>
pp_namefreq |> pp_familysize |>  pp_ticketfreq |> pp_FareGroup |> pp_fareperson

LoadError: LoadError: UndefVarError: pp_ticketfreq not defined
while loading In[13], in expression starting on line 2

In [14]:
# drop NA in Embarked column
# Alternative predict the mot likely value
function pp_dropNAembarked(df::AbstractDataFrame)
    df[~isna(df[:,:Embarked]),:]
end

pp_dropNAembarked (generic function with 1 method)

In [15]:
## Optimus Age bucket transformer, the transformer that bins Age
## This transformer would not have seen the light without the magical searchsortedfirst.
## (DataFrames.jl needs a cut function)
type PP_AgeGroupTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_AgeGroupTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_AgeGroupTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_AgeGroupTransformer, X::DataFrame)
    @linq X |> transform(CptAgeGroup = map(s->
    ifelse(~isna(s),searchsortedfirst(4.0:12.0:64.0,s),s),:Age))
end


# Magical searchsortedfirst for binning bucketing. (DataFrames.jl needs a cut function)
function pp_AgeGroup(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |> transform(CptAgeGroup = map(s->
    ifelse(~isna(s),searchsortedfirst(4.0:12.0:64.0,s),s),:Age))
end



pp_AgeGroup (generic function with 1 method)

In [16]:
## Optimus Fare per person
type PP_FarePersonTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_FarePersonTransformer, Symbol[])

ScikitLearnBase.fit!(self::PP_FarePersonTransformer, X::DataFrame, y=nothing) = return self

function ScikitLearnBase.transform(self::PP_FarePersonTransformer, X::DataFrame)
    @linq X |>
    transform(CptFareperson = :Fare ./ :CptFamSize )
end

function pp_fareperson(df::AbstractDataFrame) ## For debugging the transformation
    @linq df |>
    transform(CptFareperson = :Fare ./ :CptFamSize )
end

pp_fareperson (generic function with 1 method)

In [17]:
# Create model
mapper = DataFrameMapper([
    ([:Pclass], Binarizer()),
    (:CptTitle, LabelBinarizer()),
    (:Sex, LabelBinarizer()),
#    ([:CptAge], StandardScaler()),
    ([:SibSp], RobustScaler()),
    ([:Parch], RobustScaler()),
#    ([:Fare], RobustScaler()),
#    (:Embarked, LabelBinarizer()),
    (:CptCabin, LabelBinarizer())
    ]);

In [18]:
## Optimus Age group imputer transformer (to be replaced by a Regressor?)
type PP_AgeGroupImputer <: ScikitLearnBase.BaseEstimator
    df_AgeGroup::DataFrame
    PP_AgeGroupImputer() = new()
end

@declare_hyperparameters(PP_AgeGroupImputer, Symbol[])

function ScikitLearnBase.fit!(self::PP_AgeGroupImputer, X::DataFrame, y=nothing)
    self.df_AgeGroup = by(X, [:Pclass,:Sex,:CptTitleCat], df -> mode(dropna(df[:CptAgeGroup])))
    print(self.df_AgeGroup)
    return self
end

function ScikitLearnBase.transform(self::PP_AgeGroupImputer, X::DataFrame)
#    @byrow! X begin
#        :CptAgeGroup = 
#        if isna(:CptAgeGroup)
#            reshape(df_AgeGroup[(df_Agegroup[:Pclass].==:Pclass)&
#            (df_AgeGroup[:Sex].==:Sex)&
            (df_AgeGroup[:CptTitle].==:CptTitleCat)
                ,df_AgeGroup[:x1], 1)[1] #Magical :x1 is the name of the column created by "by"
        end
    end
    return X
end




## For testing only. cannot automatically test data in a pipeline
## because groupby will be different between train and test
function pp_MissingAge(df::AbstractDataFrame)
    @linq df |>
    groupby([:Pclass,:Sex,:CptTitle]) |>
          transform(CptAge = ifelse(isna(:Age),median(dropna(:Age)),:Age))
end

LoadError: LoadError: syntax: missing separator in array expression
while loading In[18], in expression starting on line 22

In [19]:
pipe = Pipelines.Pipeline([
    ("extract_deck",PP_DeckTransformer()),
    ("extract_title", PP_TitleTransformer()),
    ("extract_titlecat",PP_TitleCatTransformer()),
    ("extract_namefreq",PP_FamNameFreqTransformer()),
    ("extract_famsize",PP_FamSizeTransformer()),
    ("extract_faregroup",PP_FareGroupTransformer()),
    ("extract_fareperson",PP_FarePersonTransformer()),
    ("extract_AgeGroup",PP_AgeGroupTransformer()),
    ("fillNA_AgeGroup",PP_AgeGroupImputer()),
     ("featurize", mapper),
    ("forest", RandomForestClassifier(ntrees=200))
    ])

ScikitLearn.Skcore.Pipeline(Tuple{Any,Any}[("extract_deck",PP_DeckTransformer()),("extract_title",PP_TitleTransformer()),("extract_titlecat",PP_TitleCatTransformer()),("extract_namefreq",PP_FamNameFreqTransformer()),("extract_famsize",PP_FamSizeTransformer()),("extract_faregroup",PP_FareGroupTransformer()),("extract_fareperson",PP_FarePersonTransformer()),("extract_AgeGroup",PP_AgeGroupTransformer()),("fillNA_AgeGroup",PP_AgeGroupImputer(#undef)),("featurize",ScikitLearn.DataFrameMapper(Tuple[(Symbol[:Pclass],PyObject Binarizer(copy=True, threshold=0.0)),(:CptTitle,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(:Sex,PyObject LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),(Symbol[:SibSp],PyObject RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)),(Symbol[:Parch],PyObject RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)),(:CptCabin,PyObject LabelB

In [20]:
Y_train = convert(Array, train[:Survived])
X_train = train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [21]:



#Cross Validation - check model accuracy
round(cross_val_score(pipe, X_train, Y_train, cv =2), 2)

15×4 DataFrames.DataFrame
│ Row │ Pclass │ Sex      │ CptTitleCat │ x1 │
├─────┼────────┼──────────┼─────────────┼────┤
│ 1   │ 1      │ "female" │ 2           │ 5  │
│ 2   │ 1      │ "female" │ 1           │ 4  │
│ 3   │ 1      │ "female" │ 3           │ 5  │
│ 4   │ 1      │ "male"   │ 2           │ 4  │
│ 5   │ 1      │ "male"   │ 0           │ 1  │
│ 6   │ 1      │ "male"   │ 3           │ 5  │
│ 7   │ 2      │ "female" │ 2           │ 4  │
│ 8   │ 2      │ "female" │ 1           │ 3  │
│ 9   │ 2      │ "male"   │ 2           │ 3  │
│ 10  │ 2      │ "male"   │ 0           │ 1  │
│ 11  │ 2      │ "male"   │ 3           │ 3  │
│ 12  │ 3      │ "female" │ 2           │ 4  │
│ 13  │ 3      │ "female" │ 1           │ 3  │
│ 14  │ 3      │ "male"   │ 2           │ 3  │
│ 15  │ 3      │ "male"   │ 0           │ 2  │

LoadError: LoadError: MethodError: no method matching transform(::PP_AgeGroupImputer, ::DataFrames.DataFrame)
Closest candidates are:
  transform(!Matched::ScikitLearn.DataFrameMapper, ::DataFrames.DataFrame) at /Users/tesuji/.julia/v0.5/ScikitLearn/src/dataframes.jl:143
  transform(!Matched::ScikitLearn.Skcore.FitBit, ::Any...; kwargs...) at /Users/tesuji/.julia/v0.5/ScikitLearn/src/sk_utils.jl:73
  transform(!Matched::PyCall.PyObject, ::Any...; kwargs...) at /Users/tesuji/.julia/v0.5/ScikitLearn/src/Skcore.jl:95
  ...
while loading In[21], in expression starting on line 5

In [None]:
model = fit!(pipe, X_train, Y_train)

In [None]:
result=DataFrame()
result[:PassengerId] = test[:PassengerId]
result[:Survived] = @data predict(model,final_test)

In [None]:
result

In [None]:
writetable("julia-magicalforests.csv",result)