In [1]:
using DecisionTree
using DataFrames
using DataFramesMeta
using DataArrays
using Gadfly
using ScikitLearn: DataFrameMapper, @sk_import, Pipelines, fit!, predict
using ScikitLearn.CrossValidation
using ScikitLearnBase: @declare_hyperparameters, BaseEstimator
import ScikitLearnBase.simple_get_params

# @sk_import linear_model: LogisticRegression
@sk_import preprocessing: (LabelBinarizer, RobustScaler, Binarizer, StandardScaler)



In [None]:
# Note for debugging: changing something inside a function requires reloading the kernel :/

In [2]:
# Load the train and test CSV files from the working directory into DataFrames.
train = readtable("train.csv")
test = readtable("test.csv")
# Show the first rows of the training set as a quick sanity check.
head(train)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [None]:
####### Exploration phase #######

# Print a per-column summary of the training set.
describe(train)

In [None]:
# Somehow, adding the color visualization generates an error about `int` not being defined in Gadfly.
# Need to edit the Gadfly source code and replace `int` with `Int`.
# plot(train, x="Sex", y="Survived", color="Survived", Geom.histogram(position=:stack), Scale.color_discrete_manual("red","green"))

In [None]:
# plot(train, x=:Age, y=:Survived, color=:Survived, Geom.histogram(bincount=15,position=:dodge), Scale.color_discrete_manual("orange","green"))

# Feature Engineering

Use end-to-end Pipelines. (Question: how to use data from the train set on the test set when using pipelines?)
- Compute Missing Age (or predict it via a simple Regression)
- Bin Age into Age group (and predict that instead)
- Drop NA in Embarked (or predict it via a simple Regression)
- Extract Title from Name
- Extract family name from Name and count occurrences (similar to family size, but may include unrelated people who share a name)
-? Extract family name (what if a family is split between train and test, or is irrelevant in the other set ??)
- Extract Deck from Cabin
- Categorize titles per social rank
- Combine Siblings + Spouse + Parents + 1 to have family size
-? Be inventive with tickets ?
- Predict Fare for test data
- Bin Fare. Have a category 0 for VIP that were invited
- Normalize Fare/Family Size if someone paid for all


In [None]:
## Optimus Title, the transformer that gets the title from the Name field
type PP_TitleTransformer <: ScikitLearnBase.BaseEstimator
end

## Symbol[] is a temporary measure while waiting for a new release of ScikitLearn
@declare_hyperparameters(PP_TitleTransformer, Symbol[])

## Add a CptTitle column holding the text found in :Name between ", " and the
## first following "." (dot included), e.g. "Braund, Mr. Owen Harris" -> "Mr.".
## A name without that pattern raises an error, because `match` then returns
## `nothing`, which has no `.match` field.
## Kept as a free function so the transformation can be debugged outside a pipeline.
function pp_title(df::DataFrame)
    @linq df |>
    transform(CptTitle = map(s->match(r"(?<=, ).*?\.", s).match, :Name))
end

## Stateless transformer: fitting learns nothing and returns the transformer itself.
ScikitLearnBase.fit!(self::PP_TitleTransformer, X::DataFrame, y=nothing) = return self

## Delegate to pp_title so the pipeline step and the debug helper cannot drift apart.
ScikitLearnBase.transform(self::PP_TitleTransformer, X::DataFrame) = pp_title(X)

In [None]:
## Optimus Deck, the transformer that gets the deck from the Cabin field
type PP_DeckTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_DeckTransformer, Symbol[])

## Add a CptCabin column: "Unknown" when :Cabin is NA, otherwise the first
## character of the cabin string (e.g. "C85" -> "C").
## `ifelse` evaluates both of its value arguments before choosing one, so
## indexing was attempted on NA as well; that is why the previous version needed
## two passes. The ternary only evaluates the branch it takes, so one pass suffices.
## Kept as a free function so the transformation can be debugged outside a pipeline.
function pp_deck(df::DataFrame)
    @linq df |>
        transform(CptCabin = map(s -> (isna(s) ? "Unknown" : s[1:1]), :Cabin))
end

## Stateless transformer: fitting learns nothing and returns the transformer itself.
ScikitLearnBase.fit!(self::PP_DeckTransformer, X::DataFrame, y=nothing) = return self

## Delegate to pp_deck so the pipeline step and the debug helper cannot drift apart.
ScikitLearnBase.transform(self::PP_DeckTransformer, X::DataFrame) = pp_deck(X)

In [None]:
# Socio-professional category for each title.
# Keys are looked up as-is (case-sensitive, trailing dot included); lower-casing
# both keys and input would be needed for a case-insensitive lookup.
#   0 -> children     (Master.)
#   1 -> unmarried    (Miss., Mlle.)
#   2 -> normal       (Mr., Mrs., Mme., Ms.)
#   3 -> honorifics   (rich people)
dicoRef = Dict(
    # 0: children
    "Master." => 0,
    # 1: unmarried
    "Miss." => 1,
    "Mlle." => 1,
    # 2: normal
    "Mr." => 2,
    "Mrs." => 2,
    "Mme." => 2,
    "Ms." => 2,
    # 3: honorifics
    "Don." => 3,
    "Dona." => 3,
    "Rev." => 3,
    "Dr." => 3,
    "Major." => 3,
    "Col." => 3,
    "Capt." => 3,
    "Sir." => 3,
    "Lady." => 3,
    "the Countess." => 3,
    "Jonkheer." => 3
)

## Optimus Title Social Category, the transformer that gets the social standing
## from the CptTitle field
type PP_TitleCatTransformer <: ScikitLearnBase.BaseEstimator
end

## Use a typed Symbol[] (not an untyped []) like the other transformers in this
## notebook, which note that Symbol[] is needed with the current ScikitLearn release.
@declare_hyperparameters(PP_TitleCatTransformer, Symbol[])

## Add a CptTitleCat column by looking each :CptTitle value up in dicoRef.
## The frame must already contain CptTitle. A title that is not a key of dicoRef
## raises a KeyError.
## Kept as a free function so the transformation can be debugged outside a pipeline.
function pp_titlecat(df::AbstractDataFrame)
    @linq df |> transform(CptTitleCat = map(s->dicoRef[s], :CptTitle))
end

## Stateless transformer: fitting learns nothing and returns the transformer itself.
ScikitLearnBase.fit!(self::PP_TitleCatTransformer, X::DataFrame, y=nothing) = return self

## Delegate to pp_titlecat so the pipeline step and the debug helper cannot drift apart.
ScikitLearnBase.transform(self::PP_TitleCatTransformer, X::DataFrame) = pp_titlecat(X)

In [None]:
## Optimus Family Name frequency, the transformer that gets the family name frequency
## Potential issue: what if families are split between training and test data?
##
## There might be cousin/uncle relationships that are not captured in SibSp or Parch
type PP_FamNameFreqTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_FamNameFreqTransformer, Symbol[])

## Add a CptNameFreq column: first the family name (everything in :Name up to and
## including the first comma), then, per family-name group, the length of that
## group's column. Presumably `length` acts as a count here because inside a
## grouped transform the column symbol refers to the group's rows (the original
## TODO asked exactly this) -- confirm.
## NOTE(review): a grouped transform may return the rows in group order rather
## than in the input order; verify before using this step in a pipeline, where X
## must stay aligned with y.
## Kept as a free function so the transformation can be debugged outside a pipeline.
function pp_namefreq(df::AbstractDataFrame)
    @linq df |>
        transform(CptNameFreq = map(s->match(r"^.*?,", s).match, :Name))|>
        groupby(:CptNameFreq)|>
        transform(CptNameFreq = length(:CptNameFreq))
end

## Stateless transformer: fitting learns nothing and returns the transformer itself.
ScikitLearnBase.fit!(self::PP_FamNameFreqTransformer, X::DataFrame, y=nothing) = return self

## Delegate to pp_namefreq so the pipeline step and the debug helper cannot drift apart.
ScikitLearnBase.transform(self::PP_FamNameFreqTransformer, X::DataFrame) = pp_namefreq(X)

In [None]:




## For testing only. Does not scale automatically to the test data.
## Add a CptAge column: Age where it is present, otherwise the median of the
## non-NA ages within the row's (Pclass, Sex, CptTitle) group.
## The frame must already contain CptTitle.
## Alternative: fit a regression and predict the missing ages.
##
## Important: missing ages in the test data should be filled using information
## from the train data. How to do that cleanly?
function pp_MissingAge(df::AbstractDataFrame)
    groups = groupby(df, [:Pclass, :Sex, :CptTitle])
    return @transform(groups, CptAge = ifelse(isna(:Age), median(dropna(:Age)), :Age))
end


# by(train, [:Pclass,:Sex,:CptTitle], df -> median(dropna(df[:Age])))

In [None]:
# Keep only the rows whose Embarked value is present (rows where it is NA are dropped).
# Alternative: predict the most likely value instead of dropping the row.
function pp_dropNAembarked(df::AbstractDataFrame)
    embarked_present = ~isna(df[:, :Embarked])
    return df[embarked_present, :]
end

In [None]:
# Check the output of the chained debug helpers on the training data.
# Order matters: pp_titlecat and pp_MissingAge both read the CptTitle column
# that pp_title adds.
@linq train |> pp_title |> pp_titlecat |> pp_deck |> pp_namefreq |> pp_MissingAge

In [None]:
# Create model
# Feature mapping: each entry pairs input column(s) with the scikit-learn
# preprocessor applied to them. A bare symbol passes the column as 1-D (label
# encoders); a one-element vector passes it as a 2-D matrix (scalers).
mapper = DataFrameMapper([
    # Pclass is a small positive integer class code. Binarizer() thresholds at 0
    # by default, which maps every such value to 1 (a constant feature), so
    # one-hot encode the classes instead.
    (:Pclass, LabelBinarizer()),
    (:CptTitle, LabelBinarizer()),
    (:Sex, LabelBinarizer()),
#    ([:CptAge], StandardScaler()),
    ([:SibSp], RobustScaler()),
    ([:Parch], RobustScaler()),
#    ([:Fare], RobustScaler()),
#    (:Embarked, LabelBinarizer()),
    (:CptCabin, LabelBinarizer())
    ]);

In [None]:
# End-to-end pipeline. The two extract_* steps add the CptCabin and CptTitle
# columns that the mapper consumes, so they must come before "featurize";
# the random forest is then trained on the mapper's output.
pipe = Pipelines.Pipeline([
    ("extract_deck",PP_DeckTransformer()),
    ("extract_title", PP_TitleTransformer()),
     ("featurize", mapper),
    ("forest", RandomForestClassifier(ntrees=200))
    ])

In [None]:
# Fit the whole pipeline on the training frame, with the Survived column as target.
model=fit!(pipe, train, convert(Array,train[:Survived]))

In [None]:
# Target vector (Survived) as a plain Array.
Y_train = convert(Array, train[:Survived])
# The full training frame is passed as X; the mapper inside the pipeline lists
# the columns that are actually used as features.
X_train = train


#Cross Validation - check model accuracy
# Scores from cross-validation with cv = 2, rounded to 2 decimals.
round(cross_val_score(pipe, X_train, Y_train, cv =2), 2)

In [None]:
# Fit the pipeline on the full training data; `model` is what the later predict calls use.
model = fit!(pipe, X_train, Y_train)

In [7]:
## Optimus Age bucket transformer, the transformer that bins Age
## This transformer would not have seen the light without the magical searchsortedfirst.
## (DataFrames.jl needs a cut function)
type PP_AgeGroupTransformer <: ScikitLearnBase.BaseEstimator
end

@declare_hyperparameters(PP_AgeGroupTransformer, Symbol[])

## Add an AgeGroup column by binning :Age on the edges 4, 16, 28, 40, 52, 64.
## searchsortedfirst returns the index of the first edge >= age, so:
##   age <= 4 -> 1, 4 < age <= 16 -> 2, ..., 52 < age <= 64 -> 6, age > 64 -> 7.
## NA ages stay NA. The ternary only evaluates the branch it takes, so
## searchsortedfirst is never called on NA (ifelse evaluated it for every row).
## Kept as a free function so the transformation can be debugged outside a pipeline.
function pp_AgeGroup(df::AbstractDataFrame)
    @linq df |> transform(AgeGroup = map(s ->
        (isna(s) ? s : searchsortedfirst(4.0:12.0:64.0, s)), :Age))
end

## Stateless transformer: fitting learns nothing and returns the transformer itself.
ScikitLearnBase.fit!(self::PP_AgeGroupTransformer, X::DataFrame, y=nothing) = return self

## Delegate to pp_AgeGroup so the pipeline step and the debug helper cannot drift apart.
ScikitLearnBase.transform(self::PP_AgeGroupTransformer, X::DataFrame) = pp_AgeGroup(X)





pp_AgeGroup (generic function with 1 method)

In [5]:
# Dump the training data with the added AgeGroup column to a CSV file for inspection.
writetable("toto.csv",@linq train |> pp_AgeGroup)

In [None]:
# Predict on the raw test frame with the fitted pipeline.
predict(model,test)

In [None]:
##########################
# TEST DATA
# Step-by-step preparation of the test set: title extraction, age
# estimation, deck extraction and filling of the missing Fare.

# describe test data
describe(test)

##########################
# Extract Title from Name
test_001 = @transform(test,
    CptTitle = map(s->match(r"(?<=, ).*?\.", s).match, :Name)
    )


# Compute the median age depending on class, sex, title
# NOTE(review): full_train is not defined in the visible part of this notebook;
# presumably it is the training frame with a CptTitle column added -- confirm.
v = vcat(full_train,test_001)
# describe(v)

# Seems like there is a particular :Sex, :CptTitle combination with no matching Age
# writetable("age.csv",v)
# Dona. is not matched to anything, assume Lady
# NOTE(review): this assigns the Age values of the "Lady." rows to the "Dona."
# rows; the two selections need compatible row counts -- confirm.
v[v[:CptTitle].=="Dona.",:Age] = v[v[:CptTitle].=="Lady.",:Age]
# Ms. in Pclass 3: assume equal to Pclass 2
# NOTE(review): same row-count caveat as for the Dona./Lady. assignment.
v[(v[:CptTitle].=="Ms.")&(v[:Pclass].==3),:Age] = v[(v[:CptTitle].=="Ms.")&(v[:Pclass].==2),:Age]


# Estimate age
# NOTE(review): groupby_Age is not defined in the visible part of this notebook;
# its contract cannot be checked from here.
test_002 = @byrow! test_001 begin
    @newcol CptAge::DataArray{Float64}
    :CptAge = groupby_Age(v,:Age,:Pclass,:Sex,:CptTitle)
end

# Map Cabin: the inner transform replaces NA with "Unknown", the outer one keeps
# "Unknown" as is and otherwise keeps only the first character of the cabin.
test_003 = @transform(
                @transform(test_002,
                CptCabin = map(s->ifelse(isna(s),"Unknown",s), :Cabin)
                    ),
                CptCabin = map(s->ifelse(s=="Unknown",s,s[1:1]), :CptCabin)
    )
head(test_003)


# view NA in Fare
print(test[isna(test[:,:Fare]),:])
print("\n\n\n")
# View the median that may fill this NA (median Fare of "Mr." rows in Pclass 3)
print(median(dropna(v[(v[:CptTitle].=="Mr.")&(v[:Pclass].==3),:Fare])))

# NOTE(review): plain assignment binds test_004 to the same DataFrame as
# test_003, so the fill below also modifies test_003.
test_004 = test_003
# Fill the missing Fare values with that median.
test_004[isna(test_003[:,:Fare]),:Fare] = median(dropna(v[(v[:CptTitle].=="Mr.")&(v[:Pclass].==3),:Fare]))

In [None]:
# Keep only the listed feature columns of the prepared test frame.
final_test = test_004[[:Pclass,:CptTitle,:Sex,:CptAge,:SibSp,:Parch,:Fare,:Embarked,:CptCabin]]
head(final_test)

In [None]:
# Build the output frame: passenger ids plus the predicted Survived value.
result=DataFrame()
result[:PassengerId] = test[:PassengerId]
# NOTE(review): if `model` is the fitted `pipe`, its first steps read the :Cabin
# and :Name columns, which final_test does not contain; predicting on `test`
# (as done in an earlier cell) may be what is needed here -- confirm.
result[:Survived] = @data predict(model,final_test)

In [None]:
# Display the output frame.
result

In [None]:
# Write the predictions to a CSV file.
writetable("julia-magicalforests.csv",result)