In [None]:
using Random
using Statistics
using ProgressMeter
using DataFrames
using Suppressor
using JuMP
using Ipopt
using Combinatorics
import MathOptInterface as MOI

The following notebook reproduces the tests in our original manuscript for the quadratic separability of the best subsets under various objective and lifting functions.

## Helper functions

We first write a method that uses Ipopt and JuMP to separate the convex hulls of the inliers and outliers for different objective and lift functions as described in the original manuscript:
> We will map $\mathcal I$ and $\mathcal O$ into higher dimensional vector spaces such that $\operatorname{conv}\left( \mathbf V_{\mathcal I} \right) \cap \operatorname{conv}\left( \mathbf V_{\mathcal O} \right) = \emptyset$, where $\mathbf V_{\mathcal I}$ and $\mathbf V_{\mathcal O}$ are the two sets of points concatenated into separate matrices. Intersections of the two hulls can be detected by computing the smallest distance $d^*$ between any two points $\mathbf v_1 \in \operatorname{conv}\left( \mathbf V_{\mathcal I} \right)$ and $\mathbf v_2 \in \operatorname{conv}\left( \mathbf V_{\mathcal O} \right)$. If $d^* = 0$, then the points are identical and the intersection is non-empty. We can compute $d^*$ by solving the following quadratic program:
\begin{aligned}
\min \ & \lVert \boldsymbol{\lambda}^\top \mathbf V_{\mathcal I} - \boldsymbol{\mu}^\top \mathbf V_{\mathcal O} \rVert_2^2 \\
\text{subject to} \ & \sum_{i=1}^{\lvert \mathcal I \rvert} \lambda_i = 1, \\
& \lambda_i \geq 0 \quad \forall i \in [\lvert \mathcal I \rvert], \\
& \sum_{j=1}^{\lvert \mathcal O \rvert} \mu_j = 1, \\
& \mu_j \geq 0 \quad \forall j \in [\lvert \mathcal O \rvert].
\end{aligned}

In [None]:
"""
    lp_intersect(A, B, tol = 1e-10)

Build and solve the quadratic program that minimizes the squared Euclidean distance
between a convex combination of the rows of `A` and a convex combination of the rows
of `B`.

# Arguments
- `A::Matrix{Float64}`: `m × d` matrix whose rows are the first set of points.
- `B::Matrix{Float64}`: `n × d` matrix whose rows are the second set of points.
- `tol::Float64`: not used by this function; kept so existing calls that pass it keep
  working. Callers compare the objective value against their own tolerance.

# Returns
The JuMP `Model` after `optimize!` has been called on it. The function does not itself
decide whether the hulls intersect; query the returned model (e.g. its objective value)
to do so.

# Throws
- `DimensionMismatch` if `A` and `B` do not have the same number of columns.
"""
function lp_intersect(A::Matrix{Float64}, B::Matrix{Float64}, tol::Float64 = 1e-10)
    # Validate with an explicit exception: `@assert` is meant for internal invariants,
    # not for checking caller input.
    size(A, 2) == size(B, 2) || throw(DimensionMismatch(
        "Points must have same dimension: A has $(size(A, 2)) columns, " *
        "B has $(size(B, 2))"))

    m = size(A, 1)
    n = size(B, 1)

    # Output emitted while building and solving the model is hidden by `@suppress`.
    @suppress begin
        model = Model(Ipopt.Optimizer)
        set_silent(model)

        # Convex-combination weights: nonnegative and summing to one for each point set
        @variable(model, lambda[1:m]>=0)
        @variable(model, mu[1:n]>=0)
        @constraint(model, sum(lambda)==1)
        @constraint(model, sum(mu)==1)

        # Squared distance between the two convex combinations (a 1 × d row each)
        diff = lambda' * A .- mu' * B
        @objective(model, Min, sum(diff .^ 2))

        optimize!(model)

        # Hand back the solved model; the caller inspects objective value, status, etc.
        return model
    end
end;

Next, we write a simple method that partitions the points via brute-force combinatorial search according to an arbitrary scoring function.

In [None]:
"""
    brute_force_partition(x, y, k, score)

Exhaustively evaluate `score(x[idxs], y[idxs])` for every size-`k` subset `idxs` of
`1:n`, where `n = length(x)`, and return the highest-scoring subset together with its
complement in `1:n`.

A subset replaces the current best only when its score is strictly greater, so ties
keep the first subset encountered and `NaN` scores are never selected.

# Arguments
- `x`, `y`: coordinate vectors of equal length.
- `k`: size of the subsets to enumerate.
- `score`: callable taking the selected `x` and `y` values and returning a number.

# Returns
A tuple `(best_idxs, comp_idxs)` of index vectors.

# Throws
- `DimensionMismatch` if `x` and `y` differ in length.
- `ArgumentError` if no subset attains a score greater than `-Inf` (for example when
  there are no subsets of size `k`, or every score is `NaN`).
"""
function brute_force_partition(x::Vector{Float64}, y::Vector{Float64}, k::Int,
        score::Function)::Tuple{Vector{Int64}, Vector{Int64}}
    n = length(x)
    length(y) == n || throw(DimensionMismatch(
        "x and y must have the same length, got $n and $(length(y))"))

    best_val = -Inf
    best_idxs = nothing

    # Try each subset of size k
    for idxs in combinations(1:n, k)
        val = score(x[idxs], y[idxs])
        if val > best_val
            best_val = val
            best_idxs = idxs
        end
    end

    # Without this guard the complement computation below fails on `nothing`
    # with an error that hides the real cause.
    isnothing(best_idxs) && throw(ArgumentError(
        "no subset of size $k out of $n points produced a score greater than -Inf"))

    # Get complement of best set
    comp_idxs = setdiff(1:n, best_idxs)

    return best_idxs, comp_idxs
end;

## Experiment parameters

Here, we define our score and lift functions.

In [None]:
# Objective functions evaluated on a candidate subset's coordinates (u, v).
scores = [
    cor,                            # correlation
    (u, v) -> cor(u, v)^2,          # squared correlation
    cov,                            # covariance
    (u, v) -> var(u) + var(v),      # sum of the two variances
    (u, v) -> var(u) - var(v)       # difference of the two variances
]

# Lifting maps: each returns a matrix with one row per point and one column per
# monomial feature.
lifts = [
    (u, v) -> hcat(u .^ 2, u .* v, v .^ 2, u, v),
    (u, v) -> hcat(u .^ 2, v .^ 2, u, v),
    (u, v) -> hcat(u .* v, u, v)
];

Now, we seed our experiment and set the parameters.

In [None]:
# Problem size: n points per trial, of which k form the selected subset
n = 15
k = 8

# Number of random trials per (score, lift) combination
n_iter = 1_000

# Threshold the solver's objective value is compared against
tol = 1e-10

# Master RNG; each trial draws its own data seed from it
init_seed = 1234
seed_rng = MersenneTwister(init_seed);

## Run experiment

We now initialize our dataframe, run the experiment, and view the results.

In [None]:
# Empty summary table: two label columns followed by a (mean, std) pair of Float64
# columns for each recorded metric, in the order rows are pushed later on.
summary_df = let metrics = (:success, :objective_value, :dual_objective_value,
        :barrier_iterations, :solve_time)
    label_cols = [:score_func => String[], :lift_func => String[]]
    stat_cols = [Symbol(stat, "_", metric) => Float64[]
                 for metric in metrics for stat in (:mean, :std)]
    DataFrame(vcat(label_cols, stat_cols))
end;

In [None]:
# Sweep every (score, lift) pairing; the score index varies slowest.
for (score_idx, score) in enumerate(scores), (lift_idx, lift) in enumerate(lifts)
    # Per-trial solver diagnostics for this pairing
    trial_df = DataFrame(
        success_value = Bool[],
        objective_value = Float64[],
        dual_objective_value = Float64[],
        barrier_iterations = Int[],
        solve_time = Float64[]
    )

    @showprogress for trial in 1:n_iter
        # Each trial gets its own data RNG, seeded from the shared seed RNG
        data_rng = MersenneTwister(rand(seed_rng, UInt128))
        x = rand(data_rng, n)
        y = rand(data_rng, n)

        # Best size-k subset under the current score, and its complement
        inlier_idxs, outlier_idxs = brute_force_partition(x, y, k, score)

        # Lift all points, then split the rows into the two groups
        features = lift(x, y)
        solved = lp_intersect(features[inlier_idxs, :], features[outlier_idxs, :])

        # A trial counts as a success when the objective value exceeds tol
        primal = MOI.get(solved, MOI.ObjectiveValue())
        push!(trial_df,
            (
                success_value = primal > tol,
                objective_value = primal,
                dual_objective_value = MOI.get(solved, MOI.DualObjectiveValue()),
                barrier_iterations = MOI.get(solved, MOI.BarrierIterations()),
                solve_time = MOI.get(solved, MOI.SolveTimeSec())
            ))
    end

    # (mean, std) of every diagnostic column, interleaved in column order to match
    # the layout of summary_df
    moments = Iterators.flatten((mean(col), std(col)) for col in eachcol(trial_df))

    # Label the row with the indices of the score and lift functions used
    push!(summary_df, ("score_$score_idx", "lift_$lift_idx", moments...))
end

In [None]:
# Show the summary table: one row per (score, lift) combination pushed by the loop above
summary_df