In [1]:
using Pkg;
Pkg.activate(".");
Pkg.instantiate();

[32m[1m  Activating[22m[39m project at `~/Repos/DoingRightNow-Analysis`


│ It is recommended to `Pkg.resolve()` or consider `Pkg.update()` if necessary.
└ @ Pkg.API /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-4.0/build/default-macmini-aarch64-4-0/julialang/julia-release-1-dot-8/usr/share/julia/stdlib/v1.8/Pkg/src/API.jl:1535


Pkg.Types.PkgError: `Arrow` is a direct dependency, but does not appear in the manifest. If you intend `Arrow` to be a direct dependency, run `Pkg.resolve()` to populate the manifest. Otherwise, remove `Arrow` with `Pkg.rm("Arrow")`. Finally, run `Pkg.instantiate()` again.

In [109]:
using DataFrames, Arrow, CategoricalArrays, ScientificTypes, MLJ

In [103]:
DATA_FILE_PATH = "./data/model_data.arrow";
df = DataFrame(Arrow.Table(DATA_FILE_PATH));
df = copy(df)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Any,Any,Int64,Type
1,TUCASEID,20210600000000.0,20210101210037,2.02106e13,20211212212591,0,Int64
2,TUACTIVITY_N,10.4928,1,9.0,67,0,Int64
3,TUSTARTTIM,,00:00:00,13:40:00,23:59:00,0,Time
4,TUSTOPTIME,,00:00:00,11:30:00,23:59:00,0,Time
5,start_time_int,611.477,0,630.0,1435,0,Int64
6,stop_time_int,828.101,1,810.0,1440,0,Int64
7,TRTIER2,603.055,101,301.0,5001,0,Int64
8,snap_time_int,717.5,0,717.5,1435,0,Int64
9,TULINENO,1.0,1,1.0,1,0,"Union{Missing, Int64}"
10,GESTFIPS_label,,AK,,WY,0,"Union{Missing, String}"


In [104]:

function clean_data!(df)
    
    # Fix machine types.
    HEFAMINC_ordered_set = [
        "Less than 5,000",
        "5,000 to 7,499",
        "7,500 to 9,999",
        "10,000 to 12,499",
        "12,500 to 14,999",
        "15,000 to 19,999",
        "20,000 to 24,999",
        "25,000 to 29,999",
        "30,000 to 34,999",
        "35,000 to 39,999",
        "40,000 to 49,999",
        "50,000 to 59,999",
        "60,000 to 74,999",
        "75,000 to 99,999",
        "100,000 to 149,999",
        "150,000 and over"
    ]

    df.TRTIER2 = categorical(df.TRTIER2)
    df.GESTFIPS_label = categorical(df.GESTFIPS_label)
    df.HEFAMINC_label = categorical(df.HEFAMINC_label; levels=HEFAMINC_ordered_set, ordered=true)
    df.PEMARITL_label = categorical(df.PEMARITL_label)
    df.HETENURE_label = categorical(df.HETENURE_label)
    df.TUDIARYDAY_label = categorical(df.TUDIARYDAY_label)

    # drop columns and disallow missing.
    drop_cols = [
        :TUCASEID,:TUACTIVITY_N,:TUSTARTTIM,:TUSTOPTIME,
        :start_time_int,:stop_time_int,:TULINENO, :TUDIARYDAY
        ]
    select!(df, Not(drop_cols))
    disallowmissing!(df)

    # Define scientific types.
    coerce!(df, :snap_time_int => Continuous, :PRTAGE => Continuous)
end

clean_data!(df);

In [144]:
y, X = unpack(df, ==(:TRTIER2));

## Find the right model to use

We'll take a look at what type of models are available to MLJ to predict on our target.

In [145]:
for m in models(matching(X,y))
    if m.prediction_type == :probabilistic
        println(rpad(m.name, 30), "($(m.package_name))")
    end
end

CatBoostClassifier            (CatBoost)
ConstantClassifier            (MLJModels)
DecisionTreeClassifier        (BetaML)
RandomForestClassifier        (BetaML)


The only models showing are tree-based models. We're prodicting a multi-class category. And this is how it is encoded in the data. Tree-based models will handle this explicitly.

But we _should_ be able to use something like a multivariate logistic regression, shouldn't we? Likely, the reason is typing. A regression isn't going to work on non-encoded predictors. According to the documentation, it _should_ properly interpret the multivariate target though.

In [167]:
# One Hot Encode X into a new object called X2.
ohe = OneHotEncoder(drop_last=true)
mach = fit!(machine(ohe, X))
X2 = MLJ.transform(mach, X)

# Search for the available models.
for m in models(matching(X2,y))
    if m.prediction_type == :probabilistic
        println(rpad(m.name, 30), "($(m.package_name))")
    end
end

┌ Info: Training machine(OneHotEncoder(features = Symbol[], …), …).
└ @ MLJBase /Users/mph/.julia/packages/MLJBase/g5E7V/src/machines.jl:492
┌ Info: Spawning 50 sub-features to one-hot encode feature :GESTFIPS_label.
└ @ MLJModels /Users/mph/.julia/packages/MLJModels/UM8fF/src/builtins/Transformers.jl:878
┌ Info: Spawning 15 sub-features to one-hot encode feature :HEFAMINC_label.
└ @ MLJModels /Users/mph/.julia/packages/MLJModels/UM8fF/src/builtins/Transformers.jl:878
┌ Info: Spawning 1 sub-features to one-hot encode feature :PEMARITL_label.
└ @ MLJModels /Users/mph/.julia/packages/MLJModels/UM8fF/src/builtins/Transformers.jl:878
┌ Info: Spawning 2 sub-features to one-hot encode feature :HETENURE_label.
└ @ MLJModels /Users/mph/.julia/packages/MLJModels/UM8fF/src/builtins/Transformers.jl:878
┌ Info: Spawning 6 sub-features to one-hot encode feature :TUDIARYDAY_label.
└ @ MLJModels /Users/mph/.julia/packages/MLJModels/UM8fF/src/builtins/Transformers.jl:878


AdaBoostClassifier            (MLJScikitLearnInterface)
AdaBoostStumpClassifier       (DecisionTree)
BaggingClassifier             (MLJScikitLearnInterface)
BayesianLDA                   (MLJScikitLearnInterface)
BayesianLDA                   (MultivariateStats)
BayesianQDA                   (MLJScikitLearnInterface)
BayesianSubspaceLDA           (MultivariateStats)
CatBoostClassifier            (CatBoost)
ConstantClassifier            (MLJModels)
DecisionTreeClassifier        (BetaML)
DecisionTreeClassifier        (DecisionTree)
DummyClassifier               (MLJScikitLearnInterface)
EvoTreeClassifier             (EvoTrees)
ExtraTreesClassifier          (MLJScikitLearnInterface)
GaussianNBClassifier          (MLJScikitLearnInterface)
GaussianNBClassifier          (NaiveBayes)
GaussianProcessClassifier     (MLJScikitLearnInterface)
GradientBoostingClassifier    (MLJScikitLearnInterface)
KNNClassifier                 (NearestNeighborModels)
KNeighborsClassifier          (MLJScikitLearnI

In [168]:
LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels
mach = machine(LogisticClassifier(), X2, y)

import MLJLinearModels ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /Users/mph/.julia/packages/MLJModels/UM8fF/src/loading.jl:159


untrained Machine; caches model-specific representations of data
  model: LogisticClassifier(lambda = 2.220446049250313e-16, …)
  args: 
    1:	Source @792 ⏎ Table{AbstractVector{Continuous}}
    2:	Source @994 ⏎ AbstractVector{Multiclass{99}}


In [169]:
fit!(mach)

┌ Info: Training machine(LogisticClassifier(lambda = 2.220446049250313e-16, …), …).
└ @ MLJBase /Users/mph/.julia/packages/MLJBase/g5E7V/src/machines.jl:492
┌ Info: Solver: MLJLinearModels.LBFGS{Optim.Options{Float64, Nothing}, NamedTuple{(), Tuple{}}}
│   optim_options: Optim.Options{Float64, Nothing}
│   lbfgs_options: NamedTuple{(), Tuple{}} NamedTuple()
└ @ MLJLinearModels /Users/mph/.julia/packages/MLJLinearModels/9ZsIv/src/mlj/interface.jl:72


┌ Error: Problem fitting the machine machine(LogisticClassifier(lambda = 2.220446049250313e-16, …), …). 
└ @ MLJBase /Users/mph/.julia/packages/MLJBase/g5E7V/src/machines.jl:682
┌ Info: Running type checks... 
└ @ MLJBase /Users/mph/.julia/packages/MLJBase/g5E7V/src/machines.jl:688
┌ Info: Type checks okay. 
└ @ MLJBase /Users/mph/.julia/packages/MLJBase/g5E7V/src/machines.jl:692
