In [29]:
using DataFrames, CSV
using JuMP, Gurobi
using LinearAlgebra, Random, Printf, StatsBase, CategoricalArrays
using Distributions

In [30]:
data = CSV.read("datasubset.csv", DataFrame);

In [31]:
data

Row,Column1,id,time,status,trt,age,sex,ascites,hepato,spiders,edema,bili,chol,albumin,copper,alk.phos,ast,trig,platelet,protime,stage
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Float64,Float64,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0,1,400,2,1.0,58.7652,1,1.0,1.0,1.0,1.0,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
2,1,2,4500,0,1.0,56.4463,1,0.0,1.0,1.0,0.0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
3,2,3,1012,2,1.0,70.0726,0,0.0,0.0,0.0,0.5,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
4,3,4,1925,2,1.0,54.7406,1,0.0,1.0,1.0,0.5,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
5,4,5,1504,1,2.0,38.1054,1,0.0,1.0,1.0,0.0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0
6,6,7,1832,0,2.0,55.5346,1,0.0,1.0,0.0,0.0,1.0,322.0,4.09,52.0,824.0,60.45,213.0,204.0,9.7,3.0
7,7,8,2466,2,2.0,53.0568,1,0.0,0.0,0.0,0.0,0.3,280.0,4.0,52.0,4651.2,28.38,189.0,373.0,11.0,3.0
8,8,9,2400,2,1.0,42.5079,1,0.0,0.0,1.0,0.0,3.2,562.0,3.08,79.0,2276.0,144.15,88.0,251.0,11.0,2.0
9,9,10,51,2,2.0,70.5599,1,1.0,0.0,1.0,1.0,12.6,200.0,2.74,140.0,918.0,147.25,143.0,302.0,11.5,4.0
10,10,11,3762,2,2.0,53.7139,1,0.0,1.0,1.0,0.0,1.4,259.0,4.16,46.0,1104.0,79.05,79.0,258.0,12.0,4.0


In [32]:
X = Matrix(select(data, Not([:Column1, :id, :time, :status, :trt])));

In [99]:
#Define constants

#Number of patients
N = size(data)[1]

#Min _N and Max N_ number of patients allowed in an interpretable subset
_N = 30
N_ = 100

100

In [100]:
#Define a list of patient numbers that are in each of the two treatment groups
#Note that 2 in the data set means placebo, and 1 means experimental group

T0 = findall(data[!,"trt"].==2) #indices of placebo patients
T1 = findall(data[!,"trt"].==1) #indices of experimental patients

T = [T0, T1]

2-element Vector{Vector{Int64}}:
 [5, 6, 7, 9, 10, 11, 12, 14, 15, 18  …  258, 261, 263, 265, 267, 269, 270, 271, 273, 276]
 [1, 2, 3, 4, 8, 13, 16, 17, 20, 22  …  256, 259, 260, 262, 264, 266, 268, 272, 274, 275]

In [101]:
#Patient-wise (from patient i=N) whether the patient is in placebo (1) or experimental group (2)
Ti = (data[!,"trt"].==1).+1;

In [102]:
Ti[5]

1

In [140]:
N, S = size(X)
K = 3
S_0 = 2

2

In [141]:
# cut ranges for variables (make them start at 0)
Ks = [K, 3, 3, 3, 3, 4, K, K, K, K, K, K, K, K, K, 7];

In [142]:
function get_value_for_cut(s, k)
    # get max of X for feature s
    max_s = maximum(X[:, s])
    # get min of X for feature s
    min_s = minimum(X[:, s])
    # get cut value
    return min_s + (max_s - min_s) * (k - 1) / (Ks[s] - 1) 
end

get_value_for_cut (generic function with 1 method)

In [143]:
# find the cuts k for variable i in feature s for which X[i,s] is is smaller than the k-th cut
function get_k_L(i, s)
    cuts = []
    for k in 1:Ks[s]
        if X[i, s] < get_value_for_cut(s, k)
            push!(cuts, k)
        end
    end
    return cuts
end

# find the cuts k for variable i in feature s for which X[i,s] is is larger than the k-th cut
function get_k_U(i, s)
    cuts = []
    for k in 1:Ks[s]
        if X[i, s] > get_value_for_cut(s, k)
            push!(cuts, k)
        end
    end
    return cuts
end

# find the patients i for cut k in feature s for which X[i,s] is is smaller than the k-th cut
function get_i_L(s, k)
    patients = []
    for i in 1:N
        if X[i, s] < get_value_for_cut(s, k)
            push!(patients, i)
        end
    end
    return patients
end

# find the patients i for cut k in feature s for which X[i,s] is is larger than the k-th cut
function get_i_U(s, k)
    patients = []
    for i in 1:N
        if X[i, s] > get_value_for_cut(s, k)
            push!(patients, i)
        end
    end
    return patients
end

get_i_U (generic function with 1 method)

In [144]:
# treatment effect
# set multiplier to 10 for survival, 3 for transplant and 1 for death
function multiplier(patient)
    if data[patient, :status] == 2
        return 1
    elseif data[patient, :status] == 1
        return 3
    else
        return 10
    end
end

# define treatment effect as time * multiplier for each patient
v = [data[patient, :time] * multiplier(patient) for patient in 1:N];

In [145]:
model = Model(Gurobi.Optimizer)
set_optimizer_attribute(model, "OutputFlag", 1)
set_optimizer_attribute(model, "Threads", 20)
#set_optimizer_attribute(model, "MIPGap", 0.005)
set_optimizer_attribute(model, "TimeLimit", 600)

# variables
#Variables
@variable(model, z[1:N], Bin) #Indicator variable - if each patient i is contained within the interpretable subgroup
@variable(model, theta[_N:N_, 1:2], Bin) #Indicator variable - if j between _N and N_ is equal to the number of patients from treatment group t within the interpretable subgroup
@variable(model, zeta[1:N,_N:N_]) #Indicator variable that is 1 iff both zi = 1 and thetaj = 1
@variable(model, L[s=1:S, k=1:maximum(Ks)], Bin)
@variable(model, U[s=1:S, k=1:maximum(Ks)], Bin)
@variable(model, q[s=1:S], Bin)

# constraints
@constraint(model, [i=1:N], z[i] + sum(sum(L[s, k] for k in get_k_L(i, s)) + sum(U[s, k] for k in get_k_U(i, s)) for s=1:S) >= 1)

@constraint(model, [s=1:S, k=1:Ks[s], i in get_i_L(s, k)], z[i] + L[s, k] <= 1)
@constraint(model, [s=1:S, k=1:Ks[s], i in get_i_U(s, k)], z[i] + U[s, k] <= 1)

@constraint(model, [s=1:S], sum(L[s, k] for k=1:Ks[s]) == 1)
@constraint(model, [s=1:S], sum(U[s, k] for k=1:Ks[s]) == 1)

# @constraint(model, [s=1:S], sum(L[s, k] for k=1:K) == 1)
# @constraint(model, [s=1:S], sum(U[s, k] for k=1:K) == 1)

@constraint(model, [s=1:S], q[s] + L[s, 1] >= 1)
@constraint(model, [s=1:S], q[s] + U[s, Ks[s]] >= 1)
@constraint(model, [s=1:S], q[s] + L[s, 1] + U[s, Ks[s]] <= 2)

@constraint(model, sum(q[s] for s=1:S) <= S_0)

@constraint(model, [t=1:2], _N <= sum(z[i] for i in T[t]) <= N_) #The number of patients within the interpretable subgroup from EACH treatment group must be within the bounds _N and N_

@constraint(model, [i=1:N, j=_N:N_], zeta[i,j] <= theta[j,Ti[i]]) #Ensuring z works as indicator variable (see variable section)
@constraint(model, [i=1:N, j=_N:N_], zeta[i,j] <= z[i])
@constraint(model, [i=1:N, j=_N:N_], zeta[i,j] >= theta[j,Ti[i]] + z[i] - 1)

@constraint(model, [t=1:2], sum(sum((1/j)*zeta[i,j] for j in _N:N_) for i in T[t]) == 1) #Confirming that the sum of the patients in the interpretable cluster equals j for each treatment group

@constraint(model, [t=1:2], sum(theta[j,t] for j=_N:N_) == 1) #Ensuring theta works as indicator variable (see variable section)

@constraint(model, [i=1:N, j=_N:N_], 0<=zeta[i,j]<=1) #Zeta bounds

@objective(model, Max, sum(sum((1/j) * v[i] * zeta[i,j] for j in _N:N_) for i in T1) - sum(sum((1/j) * v[i] * zeta[i,j] for j in _N:N_) for i in T0)); #Objective function

Set parameter Username
Academic license - for non-commercial use only - expires 2023-11-08
Set parameter Threads to value 20
Set parameter TimeLimit to value 600


In [146]:
optimize!(model)

Set parameter Threads to value 20
Set parameter TimeLimit to value 600
Gurobi Optimizer version 10.0.0 build v10.0.0rc2 (win64)

CPU model: 12th Gen Intel(R) Core(TM) i9-12900HK, instruction set [SSE2|AVX|AVX2]
Thread count: 14 physical cores, 20 logical processors, using up to 20 threads

Optimize a model with 91720 rows, 39852 columns and 235809 nonzeros
Model fingerprint: 0x363634be
Variable types: 39194 continuous, 658 integer (658 binary)
Coefficient statistics:
  Matrix range     [1e-02, 1e+00]
  Objective range  [4e-01, 2e+03]
  Bounds range     [1e+00, 1e+02]
  RHS range        [1e+00, 2e+00]
Presolve removed 30914 rows and 19785 columns
Presolve time: 0.92s
Presolved: 60806 rows, 20067 columns, 162358 nonzeros
Variable types: 19596 continuous, 471 integer (451 binary)

Deterministic concurrent LP optimizer: primal and dual simplex (primal and dual model)
Showing first log only...

Root relaxation presolved: 60806 rows, 20067 columns, 162358 nonzeros

Concurrent spin time: 0.00

In [147]:
L_opt = value.(L)

16×7 Matrix{Float64}:
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 0.0   1.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 0.0   1.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 1.0  -0.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0   0.0  0.0  0.0  0.0  0.0
 1.0   0.0  -0.0  0.0  0.0  0.0  0.0

In [148]:
U_opt = value.(U)

16×7 Matrix{Float64}:
 0.0  -0.0   1.0   0.0  0.0  0.0  0.0
 0.0   0.0   1.0   0.0  0.0  0.0  0.0
 0.0  -0.0   1.0   0.0  0.0  0.0  0.0
 0.0  -0.0   1.0   0.0  0.0  0.0  0.0
 0.0   0.0   1.0   0.0  0.0  0.0  0.0
 0.0   0.0  -0.0   1.0  0.0  0.0  0.0
 0.0  -0.0   1.0   0.0  0.0  0.0  0.0
 0.0  -0.0   1.0   0.0  0.0  0.0  0.0
 0.0   0.0   1.0   0.0  0.0  0.0  0.0
 0.0  -0.0   1.0   0.0  0.0  0.0  0.0
 0.0   0.0   1.0   0.0  0.0  0.0  0.0
 0.0  -0.0   1.0   0.0  0.0  0.0  0.0
 0.0  -0.0   1.0   0.0  0.0  0.0  0.0
 0.0   0.0   1.0   0.0  0.0  0.0  0.0
 0.0   0.0   1.0   0.0  0.0  0.0  0.0
 0.0   0.0   0.0  -0.0  0.0  0.0  1.0