In [1]:
using Pkg;
Pkg.activate(".");
Pkg.instantiate();

[32m[1m  Activating[22m[39m project at `/DoingRightNow`


In [2]:
using DataFrames, Arrow, CategoricalArrays, ScientificTypes, MLJ, MLJBase

In [4]:
# Path to the Arrow file holding the modelling data.
DATA_FILE_PATH = "./data/model_data.arrow";
# Read the Arrow table into a DataFrame and copy it in the same expression, so the
# frame used from here on owns its columns (presumably needed because the later
# in-place cleaning mutates `df` -- confirm against Arrow.jl's column semantics).
df = copy(DataFrame(Arrow.Table(DATA_FILE_PATH)));

In [5]:

"""
    clean_data!(df)

Prepare `df` in place for modelling and return it.

Steps, in order:
1. Convert the label columns (and the target `TRTIER2`) to categorical vectors;
   `HEFAMINC_label` becomes an ordered factor with explicit income-bracket levels.
2. Drop identifier / timing columns that are not used as predictors.
3. Disallow `missing` in every remaining column.
4. Coerce `snap_time_int` and `PRTAGE` to the `Continuous` scientific type.
"""
function clean_data!(df)
    # Income brackets in ascending order; this is the level order of the ordered factor.
    income_levels = [
        "Less than 5,000",
        "5,000 to 7,499",
        "7,500 to 9,999",
        "10,000 to 12,499",
        "12,500 to 14,999",
        "15,000 to 19,999",
        "20,000 to 24,999",
        "25,000 to 29,999",
        "30,000 to 34,999",
        "35,000 to 39,999",
        "40,000 to 49,999",
        "50,000 to 59,999",
        "60,000 to 74,999",
        "75,000 to 99,999",
        "100,000 to 149,999",
        "150,000 and over"
    ]

    # Columns that become plain (unordered) categoricals.
    unordered_cols = (
        :TRTIER2, :GESTFIPS_label, :PEMARITL_label, :HETENURE_label, :TUDIARYDAY_label,
    )
    for col in unordered_cols
        df[!, col] = categorical(df[!, col])
    end
    df[!, :HEFAMINC_label] = categorical(
        df[!, :HEFAMINC_label]; levels=income_levels, ordered=true
    )

    # Remove the columns that are not used as predictors, then forbid missing values.
    unused_cols = [
        :TUCASEID, :TUACTIVITY_N, :TUSTARTTIM, :TUSTOPTIME,
        :start_time_int, :stop_time_int, :TULINENO, :TUDIARYDAY,
    ]
    select!(df, Not(unused_cols))
    disallowmissing!(df)

    # Define scientific types for the two numeric predictors.
    return coerce!(df, :snap_time_int => Continuous, :PRTAGE => Continuous)
end

clean_data!(df);

In [6]:
# Split the cleaned frame: `y` is the column selected by the predicate (:TRTIER2),
# `X` receives the columns that are left over.
y, X = unpack(df, ==(:TRTIER2));

In [7]:
# 80/20 split of the row indices of `y` into `train` and `test`. No shuffle or rng
# is requested here; the printed result shows two contiguous index ranges.
train, test = partition(eachindex(y), 0.8)

([1, 2, 3, 4, 5, 6, 7, 8, 9, 10  …  2093636, 2093637, 2093638, 2093639, 2093640, 2093641, 2093642, 2093643, 2093644, 2093645], [2093646, 2093647, 2093648, 2093649, 2093650, 2093651, 2093652, 2093653, 2093654, 2093655  …  2617047, 2617048, 2617049, 2617050, 2617051, 2617052, 2617053, 2617054, 2617055, 2617056])

In [8]:
# Set the held-out rows aside first, then rebind `X` and `y` to the training rows only.
# From here on `X`/`y` no longer contain the test rows.
X_test, y_test = X[test, :], y[test]
X, y = X[train, :], y[train];

2093645-element CategoricalArray{Int64,1,UInt32}:
 101
 101
 101
 101
 101
 101
 101
 101
 101
 101
 ⋮
 501
 1203
 101
 1203
 101
 101
 101
 101
 101

## Find the right model to use

We'll take a look at what type of models are available to MLJ to predict on our target.

In [None]:
# Print every probabilistic model compatible with `X` and `y`: the model name padded
# to 30 columns, followed by the providing package in parentheses.
foreach(
    Iterators.filter(m -> m.prediction_type == :probabilistic, models(matching(X, y)))
) do m
    println(rpad(m.name, 30), "($(m.package_name))")
end

The only models showing are tree-based models. We're predicting a multi-class category, and this is how it is encoded in the data. Tree-based models will handle this explicitly.

But we _should_ be able to use something like a multivariate logistic regression, shouldn't we? Likely, the reason is typing. A regression isn't going to work on non-encoded predictors. According to the documentation, it _should_ properly interpret the multivariate target though.

In [None]:
# One-hot encode `X` into a new table `X2` (the last level of each feature is dropped).
ohe = OneHotEncoder(drop_last=true)
mach = machine(ohe, X)
fit!(mach)
X2 = MLJ.transform(mach, X)

# Search again for compatible probabilistic models, now against the encoded table.
foreach(
    Iterators.filter(m -> m.prediction_type == :probabilistic, models(matching(X2, y)))
) do m
    println(rpad(m.name, 30), "($(m.package_name))")
end

That's a big variety of models to choose from.

We'll start from the smaller list of tree-based models. The random forest is a good one. We can do this two ways -- by using the default `RandomForestClassifier` or by composing our own bagging of a set of `DecisionTreeClassifier` models.

The easy, fast thing to do would be to use the default. But I'd like to get some practice in. So I'm going to do the bagging from scratch.

Before I continue, I'm going to partition the data into testing and training.

## Random Forest Classifier

Note, a lot of this is adapted from [this MLJ documentation](https://alan-turing-institute.github.io/MLJ.jl/dev/tuning_models/#Tuning-multiple-nested-hyperparameters) and to a lesser extent from [this slightly outdated tutorial](https://juliaai.github.io/DataScienceTutorials.jl/getting-started/ensembles-2/).

The `DecisionTreeClassifier` from the `BetaML` package works with no encoding or transformation. But it takes a very long time to run. We'll try setting up a pipeline to transform the data and run the `DecisionTreeClassifier` from the `DecisionTree` package.

In [None]:
# Load models from packages.
# The value returned by `@load` is bound to a name so that instances can be
# constructed below with `DecisionTreeClassifier(...)`.
DecisionTreeClassifier = @load DecisionTreeClassifier pkg=DecisionTree

### Step 1 -- Define a new model struct.

Likely this is a probabilistic model. We'll need to confirm this and define a probabilistic network composite model.

In [None]:
DecisionTreeClassifier() |> typeof |> supertype    # Should be "Probabilistic"

In [None]:
# Define a new model struct.
# Composite model with two component models. The field names are the symbols
# that `prefit` uses (`:preprocessor`, `:classifier`) to refer to the components.
mutable struct CompositeA <: ProbabilisticNetworkComposite
    preprocessor    # This part does the pre-processing.
    classifier    # This part does the classifying
end

### Step 2 -- Create and wrap the learning network in `prefit`

In [None]:
# Wrap the above steps into a function called `prefit`
import MLJBase    # We need to import in order to overload `MLJBase.prefit`

"""
    MLJBase.prefit(composite::CompositeA, verbosity, X, y)

Build the learning network for `CompositeA` and return its learning network interface.

`X` and `y` are the data supplied by the machine the composite is bound to. They are
wrapped as source nodes unchanged: row selection (for example `fit!(mach, rows=...)`
or the folds generated by `evaluate!` / `TunedModel`) has already been applied by the
caller, so the network must not index them again with a global index vector.

The network one-hot encodes via the `:preprocessor` component, trains the
`:classifier` component on the encoded table, and exposes its predictions under
`predict`.
"""
function MLJBase.prefit(composite::CompositeA, verbosity, X, y)
    # Define data input nodes from exactly the rows the machine handed over.
    Xs = source(X)
    ys = source(y)

    # First machine -- the symbols refer to the fields of the struct defined above and
    # are substituted by the actual model objects when the composite is fitted.
    mach1 = machine(:preprocessor, Xs)
    x = MLJ.transform(mach1, Xs)    # `transform` has duplicated namespace. So we specify `MLJ.transform`
    mach2 = machine(:classifier, x, ys)
    ŷ = predict(mach2, x)

    verbosity > 0 && @info "I'm a noisy fellow!"

    # Return the "learning network interface".
    return (; predict=ŷ)
end

`prefit` always returns a _learning network interface_. Here, the interface dictates that calling `predict(mach, Xnew)` on a machine `mach` bound to some instance of `CompositeA` should internally call `ŷ(Xnew)`.


This means we can use the above like any other model.

In [None]:
using MLJ

# Preprocessing component: one-hot encoder with its default settings.
one_hot_encoder = OneHotEncoder()
# Base learner for the bag; `n_subfeatures` is the hyperparameter tuned further below.
tree = DecisionTreeClassifier(n_subfeatures=3)
# Ensemble built from 20 copies of `tree`.
ensemble_model = EnsembleModel(model=tree, n=20)

# Positional arguments follow the field order of `CompositeA`: (preprocessor, classifier).
composite_a = CompositeA(one_hot_encoder,ensemble_model)

In [None]:
# Bind the composite to the training data (`X`, `y` were rebound to the training rows above).
mach = machine(composite_a, X, y)
#fit!(mach, rows=train, verbosity=0)
# NOTE(review): no `resampling` is passed, so MLJ's default resampling strategy is used;
# this is presumably repeated fit/predict over several folds rather than one fit -- confirm.
estimates = evaluate!(mach, measure=cross_entropy)    # Fits, predicts and applies the measure for each resampling split.

### Tuning Hyperparameters

Let's start by tuning the `tree.n_subfeatures` parameter.

In [None]:
# Candidate values for `n_subfeatures` of the tree nested inside the ensemble.
r_n_subfeatures = range(composite_a, :(classifier.model.n_subfeatures),lower=1, upper=6)
# The range is an integer range with only six distinct values (1 through 6). Drawing
# 100 random candidates from it re-fits duplicates many times (100 candidates x 6 folds),
# so an exhaustive grid is used instead: 6 candidates x 6 folds covers the same space.
tuned_composite_a = TunedModel(
    composite_a,
    range=r_n_subfeatures,
    tuning=Grid(resolution=6, rng=123),
    measure=cross_entropy,
    resampling=CV(nfolds=6),
)
mach = machine(tuned_composite_a, X, y) |> fit!
report(mach).best_model
# estimates2 = evaluate!(mach, measure=cross_entropy)    # Would resample the whole tuning procedure (nested resampling).

That takes way too long. I even tried it on my gaming PC and throwing compute at it doesn't fix the problem.

Let's try the out-of-the-box RandomForest model.

# Out-of-the-box Random Forest

In [9]:
# Load models from packages.
# `@load` evaluates to the model type (see the cell output), bound here for later construction.
RandomForestClassifier = @load RandomForestClassifier pkg=DecisionTree

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /root/.julia/packages/MLJModels/UM8fF/src/loading.jl:159


import MLJDecisionTreeInterface ✔




MLJDecisionTreeInterface.RandomForestClassifier

In [10]:
# Define a new model struct.
# Composite of a preprocessing model and a classifier. The field names are the
# symbols (`:preprocessor`, `:classifier`) referenced by the matching `prefit` method.
mutable struct ATUSRandomForest <: ProbabilisticNetworkComposite
    preprocessor    # This part does the pre-processing.
    classifier    # This part does the classifying
end

In [11]:
# Create prefit
import MLJBase

"""
    MLJBase.prefit(composite::ATUSRandomForest, verbosity, X, y)

Build the learning network for `ATUSRandomForest`: transform `X` with the
`:preprocessor` component, train the `:classifier` component on the transformed
table and `y`, and expose the classifier's predictions as the `predict` operation
of the returned learning network interface. Logs an info message when
`verbosity > 0`.
"""
function MLJBase.prefit(composite::ATUSRandomForest, verbosity, X, y)
    # Source nodes wrapping the data supplied by the bound machine.
    features = source(X)
    target = source(y)

    # Preprocessing stage.
    encoder_mach = machine(:preprocessor, features)
    encoded = MLJ.transform(encoder_mach, features)

    # Classification stage.
    forest_mach = machine(:classifier, encoded, target)
    prediction = predict(forest_mach, encoded)

    if verbosity > 0
        @info "I sure am noisy"
    end

    # Learning network interface.
    return (; predict=prediction)
end

In [12]:
# Preprocessing component: one-hot encoder with default settings.
one_hot_encoder = OneHotEncoder()
# Classifier component. `rng=71` fixes the seed so repeated runs are comparable;
# the remaining hyperparameters keep their defaults (see the printed model below).
forest = RandomForestClassifier(
    n_subfeatures=12,
    sampling_fraction=0.3,    # We have lots of data. Only use 30%.
    max_depth=10,
    rng=71
    )

# Positional arguments follow the field order of `ATUSRandomForest`: (preprocessor, classifier).
atus_random_forest = ATUSRandomForest(one_hot_encoder,forest)

ATUSRandomForest(
  preprocessor = OneHotEncoder(
        features = Symbol[], 
        drop_last = false, 
        ordered_factor = true, 
        ignore = false), 
  classifier = RandomForestClassifier(
        max_depth = 10, 
        min_samples_leaf = 1, 
        min_samples_split = 2, 
        min_purity_increase = 0.0, 
        n_subfeatures = 12, 
        n_trees = 100, 
        sampling_fraction = 0.3, 
        feature_importance = :impurity, 
        rng = 71))

In [13]:
# Bind the composite to the training data and train it; `fit!` returns the machine.
mach = fit!(machine(atus_random_forest, X, y))

┌ Info: Training machine(ATUSRandomForest(preprocessor = OneHotEncoder(features = Symbol[], …), …), …).
└ @ MLJBase /root/.julia/packages/MLJBase/g5E7V/src/machines.jl:492
┌ Info: I sure am noisy
└ @ Main /DoingRightNow/atus_ml_model.ipynb:13


┌ Info: Training machine(:preprocessor, …).
└ @ MLJBase /root/.julia/packages/MLJBase/g5E7V/src/machines.jl:492
┌ Info: Spawning 51 sub-features to one-hot encode feature :GESTFIPS_label.
└ @ MLJModels /root/.julia/packages/MLJModels/UM8fF/src/builtins/Transformers.jl:878


┌ Info: Spawning 16 sub-features to one-hot encode feature :HEFAMINC_label.
└ @ MLJModels /root/.julia/packages/MLJModels/UM8fF/src/builtins/Transformers.jl:878
┌ Info: Spawning 2 sub-features to one-hot encode feature :PEMARITL_label.
└ @ MLJModels /root/.julia/packages/MLJModels/UM8fF/src/builtins/Transformers.jl:878
┌ Info: Spawning 3 sub-features to one-hot encode feature :HETENURE_label.
└ @ MLJModels /root/.julia/packages/MLJModels/UM8fF/src/builtins/Transformers.jl:878
┌ Info: Spawning 7 sub-features to one-hot encode feature :TUDIARYDAY_label.
└ @ MLJModels /root/.julia/packages/MLJModels/UM8fF/src/builtins/Transformers.jl:878


┌ Info: Training machine(:classifier, …).
└ @ MLJBase /root/.julia/packages/MLJBase/g5E7V/src/machines.jl:492


trained Machine; does not cache data
  model: ATUSRandomForest(preprocessor = OneHotEncoder(features = Symbol[], …), …)
  args: 
    1:	Source @887 ⏎ Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{51}}, AbstractVector{Multiclass{2}}, AbstractVector{Multiclass{3}}, AbstractVector{Multiclass{7}}, AbstractVector{OrderedFactor{16}}}}
    2:	Source @137 ⏎ AbstractVector{Multiclass{99}}


In [15]:
# Probabilistic predictions for the held-out rows (one distribution over the target
# classes per row, as shown in the cell output).
ŷ = predict(mach, X_test)
# NOTE(review): the disabled line below indexes `y` with `test`, but `y` was rebound to
# the training rows earlier; the held-out target is `y_test`.
#MulticlassFScore()(ŷ, y[test])

523411-element UnivariateFiniteVector{Multiclass{99}, Int64, UInt32, Float64}:
 UnivariateFinite{Multiclass{99}}(101=>0.35, 102=>0.0, 103=>0.0, 104=>0.0, 105=>0.0, 201=>0.0, 202=>0.0, 203=>0.0, 204=>0.0, 205=>0.0, 206=>0.0, 207=>0.0, 208=>0.0, 209=>0.0, 299=>0.0, 301=>0.0, 302=>0.0, 303=>0.0, 304=>0.0, 305=>0.0, 399=>0.0, 401=>0.0, 402=>0.0, 403=>0.0, 404=>0.0, 405=>0.0, 499=>0.0, 501=>0.0, 502=>0.0, 503=>0.0, 504=>0.0, 599=>0.0, 601=>0.0, 602=>0.0, 603=>0.0, 604=>0.0, 699=>0.0, 701=>0.0, 702=>0.0, 801=>0.0, 802=>0.0, 803=>0.0, 804=>0.0, 805=>0.0, 806=>0.0, 807=>0.0, 899=>0.0, 901=>0.0, 902=>0.0, 903=>0.0, 904=>0.0, 905=>0.0, 999=>0.0, 1001=>0.0, 1002=>0.0, 1003=>0.0, 1004=>0.0, 1101=>0.0, 1102=>0.0, 1201=>0.0, 1202=>0.0, 1203=>0.65, 1204=>0.0, 1205=>0.0, 1301=>0.0, 1302=>0.0, 1303=>0.0, 1401=>0.0, 1499=>0.0, 1501=>0.0, 1502=>0.0, 1503=>0.0, 1504=>0.0, 1505=>0.0, 1506=>0.0, 1507=>0.0, 1508=>0.0, 1599=>0.0, 1601=>0.0, 1602=>0.0, 1801=>0.0, 1802=>0.0, 1803=>0.0, 1804=>0.0, 1805=>0.0, 180

In [17]:
# Mean cross-entropy of the probabilistic predictions against the held-out target.
mean(cross_entropy(ŷ, y_test))

1.9817923871429786

The cross_entropy for this is 1.98179

Let's try clustering the predictors first.

In [32]:
# Load models from packages.
# `verbosity=0` silences the loading messages seen for the earlier `@load` calls.
DBSCAN = @load DBSCAN pkg=Clustering verbosity=0

# Define a new model struct.
# Three-component composite. The field names are the symbols (`:one_hot_encoder`,
# `:clusterer`, `:classifier`) referenced by the matching `prefit` method.
mutable struct ATUSClusterClassifier <: ProbabilisticNetworkComposite
    one_hot_encoder    # This part does the pre-processing.
    clusterer    # This part clusters the predictors.
    classifier    # This part does the classifying
end

# Create prefit
"""
    MLJBase.prefit(composite::ATUSClusterClassifier, verbosity, X, y)

Build the learning network for `ATUSClusterClassifier` and return its learning
network interface.

Pipeline: one-hot encode `X` -> cluster the encoded predictors -> one-hot encode
the cluster labels -> train the classifier on the encoded cluster labels and `y`.
The classifier's predictions are exposed as the `predict` operation.
"""
function MLJBase.prefit(composite::ATUSClusterClassifier, verbosity, X, y)

    verbosity > 0 && @info "Running ATUSClusterClassifier composite model."

    # Learning network
    Xs = source(X)
    ys = source(y)
    ## Transform categoricals using one-hot-encoding.
    mach_ohe = machine(:one_hot_encoder, Xs)
    x_proc = MLJ.transform(mach_ohe, Xs)
    ## Cluster predictors. Produces a single categorical vector.
    ## NOTE(review): if the clusterer is a static (non-trainable) model, confirm whether
    ## its machine should be constructed without training arguments.
    mach_clust = machine(:clusterer, x_proc)
    x_clust = predict(mach_clust, x_proc)
    ## The encoder works on tables, so wrap the label vector as a one-column table.
    x_clust_tbl = node(labels -> (; cluster=labels), x_clust)
    ## One hot encode the cluster labels. This is a separate machine, so it learns its
    ## own encoding while sharing hyperparameters with the first encoder.
    mach_clust_ohe = machine(:one_hot_encoder, x_clust_tbl)
    x_clust_ohe = MLJ.transform(mach_clust_ohe, x_clust_tbl)
    ## Run the classifier on the *encoded* cluster labels and predict.
    mach_class = machine(:classifier, x_clust_ohe, ys)
    yhat = predict(mach_class, x_clust_ohe)

    # return "learning network interface":
    return (; predict=yhat)

end

# Encoder shared (as hyperparameters) by both encoding steps of the composite:
# ordered factors are left un-encoded and the last level of each feature is dropped.
one_hot_encoder = OneHotEncoder(ordered_factor=false, drop_last=true)
dbscan = DBSCAN(min_cluster_size=288)    # Each individual has 288 observations. A little arbitrary.
# Same forest settings as the plain random-forest model above, for comparability.
forest = RandomForestClassifier(
    n_subfeatures=12,
    sampling_fraction=0.3,    # We have lots of data. Only use 30%.
    max_depth=10,
    rng=71
    )

# Positional arguments follow the field order: (one_hot_encoder, clusterer, classifier).
atus_cluster_classifier = ATUSClusterClassifier(one_hot_encoder, dbscan,forest)

ATUSClusterClassifier(
  one_hot_encoder = OneHotEncoder(
        features = Symbol[], 
        drop_last = true, 
        ordered_factor = false, 
        ignore = false), 
  clusterer = DBSCAN(
        radius = 1.0, 
        leafsize = 20, 
        min_neighbors = 1, 
        min_cluster_size = 288), 
  classifier = RandomForestClassifier(
        max_depth = 10, 
        min_samples_leaf = 1, 
        min_samples_split = 2, 
        min_purity_increase = 0.0, 
        n_subfeatures = 12, 
        n_trees = 100, 
        sampling_fraction = 0.3, 
        feature_importance = :impurity, 
        rng = 71))

In [33]:
# Bind the cluster-then-classify composite to the training data and train it.
mach = fit!(machine(atus_cluster_classifier, X, y))

┌ Info: Training machine(ATUSClusterClassifier(one_hot_encoder = OneHotEncoder(features = Symbol[], …), …), …).
└ @ MLJBase /root/.julia/packages/MLJBase/g5E7V/src/machines.jl:492
┌ Info: I sure am noisy
└ @ Main /DoingRightNow/atus_ml_model.ipynb:30


In [None]:
# Predictions of the cluster-based composite for the held-out rows.
ŷ = predict(mach, X_test)