In [1]:
using DataFrames, CSV
using JuMP, Gurobi
using LinearAlgebra, Random, Printf, StatsBase, CategoricalArrays
using Distributions

In [27]:
# Read `dataclean.csv` (path relative to the working directory) into a DataFrame.
# The trailing `;` suppresses the automatic display of the result in the notebook.
df = CSV.read("dataclean.csv", DataFrame);

In [28]:
# Show the loaded table (column names, element types, first rows) as the cell output.
df

Row,Column1,id,time,status,trt,age,sex,ascites,hepato,spiders,edema,bili,chol,albumin,copper,alk.phos,ast,trig,platelet,protime,stage
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Float64,Int64,Int64,Int64,Int64,Float64,Float64,Int64?,Float64,Int64?,Float64,Float64,Int64?,Int64?,Float64,Int64
1,0,1,400,2,1,58.7652,1,1,1,1,1.0,14.5,261,2.6,156,1718.0,137.95,172,190,12.2,4
2,1,2,4500,0,1,56.4463,1,0,1,1,0.0,1.1,302,4.14,54,7394.8,113.52,88,221,10.6,3
3,2,3,1012,2,1,70.0726,0,0,0,0,0.5,1.4,176,3.48,210,516.0,96.1,55,151,12.0,4
4,3,4,1925,2,1,54.7406,1,0,1,1,0.5,1.8,244,2.54,64,6121.8,60.63,92,183,10.3,4
5,4,5,1504,1,2,38.1054,1,0,1,1,0.0,3.4,279,3.53,143,671.0,113.15,72,136,10.9,3
6,5,6,2503,2,2,66.2587,1,0,1,0,0.0,0.8,248,3.98,50,944.0,93.0,63,missing,11.0,3
7,6,7,1832,0,2,55.5346,1,0,1,0,0.0,1.0,322,4.09,52,824.0,60.45,213,204,9.7,3
8,7,8,2466,2,2,53.0568,1,0,0,0,0.0,0.3,280,4.0,52,4651.2,28.38,189,373,11.0,3
9,8,9,2400,2,1,42.5079,1,0,0,1,0.0,3.2,562,3.08,79,2276.0,144.15,88,251,11.0,2
10,9,10,51,2,2,70.5599,1,1,0,1,1.0,12.6,200,2.74,140,918.0,147.25,143,302,11.5,4


In [29]:
# Feature matrix: all columns of `df` except `Column1`, `id`, `time`, `status` and `trt`,
# one row per record. Columns typed `Int64?` keep their `missing` entries, so the
# matrix element type admits `missing`.
X = Matrix(df[:, Not([:Column1, :id, :time, :status, :trt])]);

In [None]:
# Problem dimensions taken from the feature matrix.
N = size(X, 1)  # number of rows (observations) in X
S = size(X, 2)  # number of columns (features) in X
# Second dimension of the binary cut variables L[s, k] and U[s, k] in the model.
K = 10
# Upper bound on the number of features s with q[s] = 1 in the model.
S_0 = 5

In [None]:
# Cut-index range for each of the 16 feature columns of X, every range starting at 0.
# Layout, in column order: one 0:K, four 0:2, one 0:3, nine 0:K and a final 0:6.
Ks = vcat([0:K], fill(0:2, 4), [0:3], fill(0:K, 9), [0:6])

In [None]:
"""
    get_value_for_cut(s, k)

Return the threshold value of cut `k` for feature (column) `s` of the global matrix `X`.

The observed range of the column is split into `maximum(Ks[s])` equal steps, so `k = 0`
yields the column minimum and `k = maximum(Ks[s])` yields the column maximum.

`missing` entries are ignored when determining the range; without this, a single
`missing` in the column would make the result `missing` and break every comparison
against the threshold. Throws if the column has no non-missing entry.
"""
function get_value_for_cut(s, k)
    # Smallest and largest observed (non-missing) value of feature s.
    min_s, max_s = extrema(skipmissing(view(X, :, s)))
    # Linear interpolation between the two extremes.
    return min_s + (max_s - min_s) * k / maximum(Ks[s])
end

In [None]:
"""
    get_k_L(i, s)

Return the cut indices `k` of feature `s` for which observation `i` lies strictly
below the `k`-th cut, i.e. `X[i, s] < get_value_for_cut(s, k)`.

Cut indices run over `1:maximum(Ks[s])`, matching the 1-based second index of the
model variables `L[s, k]`. (`Ks[s]` is itself a range starting at 0, so it cannot be
used directly as the upper end of `1:...`.)

Returns a `Vector{Int}`; it is empty when `X[i, s]` is `missing`.
"""
function get_k_L(i, s)
    x = X[i, s]
    # NOTE(review): a missing observation cannot be compared with a threshold, so it is
    # treated as lying below no cut -- confirm this is the intended modelling choice.
    ismissing(x) && return Int[]
    return [k for k in 1:maximum(Ks[s]) if x < get_value_for_cut(s, k)]
end

"""
    get_k_U(i, s)

Return the cut indices `k` of feature `s` for which observation `i` lies strictly
above the `k`-th cut, i.e. `X[i, s] > get_value_for_cut(s, k)`.

Cut indices run over `1:maximum(Ks[s])`, matching the 1-based second index of the
model variables `U[s, k]`. (`Ks[s]` is itself a range starting at 0, so it cannot be
used directly as the upper end of `1:...`.)

Returns a `Vector{Int}`; it is empty when `X[i, s]` is `missing`.
"""
function get_k_U(i, s)
    x = X[i, s]
    # NOTE(review): a missing observation cannot be compared with a threshold, so it is
    # treated as lying above no cut -- confirm this is the intended modelling choice.
    ismissing(x) && return Int[]
    return [k for k in 1:maximum(Ks[s]) if x > get_value_for_cut(s, k)]
end

"""
    get_i_L(k, s)

Return the row indices `i` in `1:N` whose value for feature `s` is strictly smaller
than the `k`-th cut, i.e. `X[i, s] < get_value_for_cut(s, k)`.

Note the argument order: cut index first, feature index second.

Returns a `Vector{Int}`. Rows whose value for feature `s` is `missing` are never
included.
"""
function get_i_L(k, s)
    # The threshold does not depend on i; compute it once instead of once per row
    # (each evaluation scans the whole column).
    cut = get_value_for_cut(s, k)
    # NOTE(review): rows with a missing value are skipped rather than compared --
    # confirm this is the intended modelling choice.
    return [i for i in 1:N if !ismissing(X[i, s]) && X[i, s] < cut]
end

"""
    get_i_U(k, s)

Return the row indices `i` in `1:N` whose value for feature `s` is strictly larger
than the `k`-th cut, i.e. `X[i, s] > get_value_for_cut(s, k)`.

Note the argument order: cut index first, feature index second.

Returns a `Vector{Int}`. Rows whose value for feature `s` is `missing` are never
included.
"""
function get_i_U(k, s)
    # The threshold does not depend on i; compute it once instead of once per row
    # (each evaluation scans the whole column).
    cut = get_value_for_cut(s, k)
    # NOTE(review): rows with a missing value are skipped rather than compared --
    # confirm this is the intended modelling choice.
    return [i for i in 1:N if !ismissing(X[i, s]) && X[i, s] > cut]
end

In [None]:
# Build the mixed-integer model on top of Gurobi.
model = Model(Gurobi.Optimizer)
set_optimizer_attribute(model, "OutputFlag", 1)   # print the solver log
set_optimizer_attribute(model, "Threads", 20)
#set_optimizer_attribute(model, "MIPGap", 0.005)
set_optimizer_attribute(model, "TimeLimit", 600)  # seconds

# variables
# z[i]    : binary indicator per observation i
# L[s, k] : binary, "lower" cut k is chosen for feature s
# U[s, k] : binary, "upper" cut k is chosen for feature s
# q[s]    : binary indicator per feature s (counted against S_0 below)
@variable(model, z[i=1:N], Bin)
#@variable(model, subloops >= 1)
@variable(model, L[s=1:S, k=1:K], Bin)
@variable(model, U[s=1:S, k=1:K], Bin)
@variable(model, q[s=1:S], Bin)

# constraints
# Each observation either has z[i] = 1 or is hit by at least one chosen cut
# (a lower cut above its value or an upper cut below its value).
@constraint(model, [i=1:N], z[i] + sum(sum(L[s, k] for k in get_k_L(i, s)) + sum(U[s, k] for k in get_k_U(i, s)) for s=1:S) >= 1)

# An observation hit by a chosen cut must have z[i] = 0.
# Cut indices run over 1:maximum(Ks[s]) because Ks[s] is a range starting at 0 and
# cannot serve as the end point of `1:...`. The helpers take (k, s), in that order.
@constraint(model, [s=1:S, k=1:maximum(Ks[s]), i in get_i_L(k, s)], z[i] + L[s, k] <= 1)
@constraint(model, [s=1:S, k=1:maximum(Ks[s]), i in get_i_U(k, s)], z[i] + U[s, k] <= 1)

# Exactly one lower cut and exactly one upper cut is chosen per feature.
@constraint(model, [s=1:S], sum(L[s, k] for k=1:maximum(Ks[s])) == 1)
@constraint(model, [s=1:S], sum(U[s, k] for k=1:maximum(Ks[s])) == 1)

# Link q[s] to the first cut of each kind: q[s] is forced to 1 whenever L[s, 1] or
# U[s, 1] is 0, and forced to 0 when both are 1.
@constraint(model, [s=1:S], q[s] + L[s, 1] >= 1)
@constraint(model, [s=1:S], q[s] + U[s, 1] >= 1)
@constraint(model, [s=1:S], q[s] + L[s, 1] + U[s, 1] <= 2)

# At most S_0 features may have q[s] = 1.
@constraint(model, sum(q[s] for s=1:S) <= S_0)