In [2]:
using CSV, DataFrames, Statistics, Random, Plots, JuMP, Gurobi

In [404]:
# Load the training predictors and targets. The files carry no header row.
X1 = CSV.read("X_train", DataFrame; header=false)
y1 = CSV.read("y_train", DataFrame; header=false)

# Sequential (non-random) hold-out: the first `split` observations are used for
# training and the remainder for testing. `X1` is sliced by rows while `y1` is
# sliced by columns, so the target slices are transposed to line up with the rows
# of the predictor slices.
split = 800
train_X = X1[1:split, :]
train_y = permutedims(y1[:, 1:split])
test_X = X1[(split + 1):end, :]
test_y = permutedims(y1[:, (split + 1):end])

X2 = CSV.read("X_test", DataFrame; header=false)

# "y_test" is parsed once and reused for both `y2` and `weights`.
y_test_raw = CSV.read("y_test", DataFrame; header=false)
y2 = permutedims(y_test_raw)
# NOTE(review): the weights are the first `split` columns of "y_test" -- confirm
# that this file is the intended source of the weights.
weights = permutedims(y_test_raw[:, 1:split]);

## split train into train and val


In [377]:
# Problem dimensions and constants handed to `opt_split`.
n, p = size(train_X)      # number of rows and columns of the training frame
train_fraction = 0.8      # share of the rows kept for training
k = train_fraction * n    # passed to `opt_split` as `k`
lambda = 0.001            # passed to `opt_split` as `lambda`

0.001

In [20]:
"""
    opt_split(X, y, n, p, k, lambda, weight = nothing)

Build and solve, with Gurobi, the linear program

    min   k * theta + sum(u) + lambda * sum(w)
    s.t.  w[j] >=  beta[j]                                   j = 1:p
          w[j] >= -beta[j]                                   j = 1:p
          theta + u[i] >=  s[i] * (y[i] - sum_j X[i, j] * beta[j])   i = 1:n
          theta + u[i] >= -s[i] * (y[i] - sum_j X[i, j] * beta[j])   i = 1:n
          u[i] >= 0                                          i = 1:n

where `s[i]` is `weight[i]` when `weight` is supplied and `1` otherwise.

# Arguments
- `X`: predictor array read as `X[i, j]` for `i in 1:n`, `j in 1:p`.
- `y`: responses read through linear indexing `y[i]` for `i in 1:n`.
- `n`, `p`: number of rows and columns of `X` to use.
- `k`: coefficient of `theta` in the objective.
- `lambda`: coefficient of `sum(w)` in the objective.
- `weight`: optional per-row multipliers read as `weight[i]`; `nothing` means
  every row is unweighted.

# Returns
The tuple `(theta, u, beta, w)` holding the values of the decision variables.

# Throws
- `DimensionMismatch` if `weight` is given with fewer than `n` entries.
- `ErrorException` if the solver does not terminate with status `OPTIMAL`.
"""
function opt_split(X, y, n, p, k, lambda, weight = nothing)
    if !isnothing(weight) && length(weight) < n
        throw(DimensionMismatch(
            "weight has $(length(weight)) entries but n = $n rows were requested"))
    end

    model = Model(Gurobi.Optimizer)
    set_optimizer_attribute(model, "OutputFlag", 0)  # silence the solver log

    @variable(model, theta)
    @variable(model, u[1:n] >= 0)
    @variable(model, beta[1:p])
    @variable(model, w[1:p])

    @objective(model, Min, k * theta + sum(u) + lambda * sum(w))

    # Together these two constraints force w[j] >= |beta[j]|.
    for j in 1:p
        @constraint(model, w[j] >= beta[j])
        @constraint(model, w[j] >= -beta[j])
    end

    for i in 1:n
        scale = isnothing(weight) ? 1 : weight[i]
        # Build the (optionally weighted) residual of row i once and reuse it for
        # both sides of the absolute-value bound.
        residual = @expression(
            model, scale * (y[i] - sum(X[i, j] * beta[j] for j in 1:p)))
        @constraint(model, theta + u[i] >= residual)
        @constraint(model, theta + u[i] >= -residual)
    end

    optimize!(model)

    # Querying values from a model that was not solved to optimality either throws
    # an opaque error or returns meaningless numbers, so fail with a clear message.
    status = termination_status(model)
    status == MOI.OPTIMAL ||
        error("opt_split: solver terminated with status $status, no optimal solution")

    return value(theta), value.(u), value.(beta), value.(w)
end

opt_split (generic function with 2 methods)

In [48]:
"""
    split_train_val(X, y, beta_star, train_fraction = 0.8, weights = nothing)

Partition the rows of `X` / `y` into a training and a validation part according to
the size of their residuals under the coefficients `beta_star`.

The residual of row `i` is `y[i] - (X * beta_star)[i]`, multiplied by `weights[i]`
when `weights` is given. Rows are ranked by absolute (weighted) residual, largest
first; the first `round(Int, train_fraction * n)` rows of that ranking form the
training part (in ranking order) and the remaining rows form the validation part
(in their original order).

# Arguments
- `X`: predictor matrix with one row per observation.
- `y`: responses, either a vector or an `n x 1` matrix.
- `beta_star`: coefficients used to compute the residuals.
- `train_fraction`: fraction of the rows assigned to the training part, in `[0, 1]`.
- `weights`: optional per-row multipliers read as `weights[i]`; `nothing` disables
  weighting.

# Returns
`(X_train, y_train, X_val, y_val)`, each obtained by row-indexing with a trailing
colon, so the `y` parts are always matrices.

# Throws
- `ArgumentError` if `train_fraction` lies outside `[0, 1]`.
- `DimensionMismatch` if `weights` does not have one entry per observation.
"""
function split_train_val(X, y, beta_star, train_fraction = 0.8, weights = nothing)
    0 <= train_fraction <= 1 || throw(ArgumentError(
        "train_fraction must lie in [0, 1], got $train_fraction"))

    # Flatten both sides: `y` may be an n x 1 matrix, and `sortperm` needs a plain
    # vector (on a matrix it requires a `dims` keyword).
    residuals = vec(y) .- vec(X * beta_star)

    if isnothing(weights)
        scores = abs.(residuals)
    else
        # A shorter weight collection would otherwise silently drop observations
        # from the ranking.
        length(weights) == length(residuals) || throw(DimensionMismatch(
            "got $(length(weights)) weights for $(length(residuals)) observations"))
        scores = [abs(residuals[i] * weights[i]) for i in eachindex(residuals)]
    end

    sorted_indices = sortperm(scores, rev=true)
    num_train_points = round(Int, train_fraction * length(sorted_indices))

    train_indices = sorted_indices[1:num_train_points]
    val_indices = setdiff(eachindex(scores), train_indices)

    X_train = X[train_indices, :]
    y_train = y[train_indices, :]

    X_val = X[val_indices, :]
    y_val = y[val_indices, :]

    return X_train, y_train, X_val, y_val
end


split_train_val (generic function with 3 methods)

In [389]:
# Optimised split: solve the split LP on the whole training set, then partition the
# rows by the size of their residual under the resulting coefficients.
_, _, betas_star, _ = opt_split(Array(train_X), Array(train_y), n, p, k, lambda)
X_train_opt, y_train_opt, X_val_opt, y_val_opt =
    split_train_val(Array(train_X), Array(train_y), betas_star)

# Non-optimised baseline split with the same 80/20 proportion.
# The target is handed over as a plain vector. It is extracted once as the first
# column of `train_y`, instead of converting the whole frame to an array again for
# every single element.
(X_train, y_train), (X_val, y_val) =
    IAI.split_data(:regression, train_X, train_y[:, 1], train_proportion=0.8);

Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11


In [390]:
using ScikitLearn
@sk_import linear_model: LinearRegression
@sk_import model_selection: train_test_split


[33m[1m└ [22m[39m[90m@ ScikitLearn.Skcore ~/.julia/packages/ScikitLearn/sqLdT/src/Skcore.jl:259[39m


PyObject <function train_test_split at 0x2c750dea0>

In [57]:
"""
    mse(y_true, y_pred)

Return the mean of the squared element-wise differences between `y_true` and
`y_pred`. The difference is formed by broadcasting.
"""
function mse(y_true, y_pred)
    errors = y_true .- y_pred
    return mean(errors .^ 2)
end


mse (generic function with 1 method)

In [398]:
# Linear regression fitted on the training part of the optimised split.
model_opt = LinearRegression()
fit!(model_opt, X_train_opt, y_train_opt)

# Predictions for the validation part and for the held-out test set.
y_pred_val_opt = predict(model_opt, X_val_opt)
y_pred_test_opt = predict(model_opt, Matrix(test_X))

# Report the mean squared error on both sets.
for (label, truth, predicted) in (
    ("val score: ", y_val_opt, y_pred_val_opt),
    ("test score: ", Array(test_y), y_pred_test_opt),
)
    println(label, mse(truth, predicted))
end

val score: 0.005515166992537046
test score: 0.005961225192939648


In [399]:
# Linear regression fitted on the training part of the baseline split.
model = LinearRegression()
fit!(model, Matrix(X_train), Array(y_train))

# Predictions for the validation part and for the held-out test set.
y_pred_val = predict(model, Matrix(X_val))
y_pred_test = predict(model, Matrix(test_X))

# Report the mean squared error on both sets.
for (label, truth, predicted) in (
    ("val score: ", y_val, y_pred_val),
    ("test score: ", Array(test_y), y_pred_test),
)
    println(label, mse(truth, predicted))
end

val score: 0.00528208518066815
test score: 0.005869018814113406


## try with weights

In [412]:
# Optimised split with per-row weights: the same weights enter both the split LP
# and the residual ranking that partitions the rows.
_, _, betas_star, _ =
    opt_split(Array(train_X), Array(train_y), n, p, k, lambda, Matrix(weights))
X_train_opt, y_train_opt, X_val_opt, y_val_opt =
    split_train_val(Array(train_X), Array(train_y), betas_star, 0.8, Matrix(weights))

# Non-optimised baseline split with the same 80/20 proportion.
# The target is handed over as a plain vector. It is extracted once as the first
# column of `train_y`, instead of converting the whole frame to an array again for
# every single element.
(X_train, y_train), (X_val, y_val) =
    IAI.split_data(:regression, train_X, train_y[:, 1], train_proportion=0.8);

Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11


In [413]:
# Linear regression fitted on the training part of the optimised split.
model_opt = LinearRegression()
fit!(model_opt, X_train_opt, y_train_opt)

# Predictions for the validation part and for the held-out test set.
y_pred_val_opt = predict(model_opt, X_val_opt)
y_pred_test_opt = predict(model_opt, Matrix(test_X))

# Report the mean squared error on both sets.
for (label, truth, predicted) in (
    ("val score: ", y_val_opt, y_pred_val_opt),
    ("test score: ", Array(test_y), y_pred_test_opt),
)
    println(label, mse(truth, predicted))
end

val score: 0.005703388444933003
test score: 0.005958287219605079


In [414]:
# Linear regression fitted on the training part of the baseline split.
model = LinearRegression()
fit!(model, Matrix(X_train), Array(y_train))

# Predictions for the validation part and for the held-out test set.
y_pred_val = predict(model, Matrix(X_val))
y_pred_test = predict(model, Matrix(test_X))

# Report the mean squared error on both sets.
for (label, truth, predicted) in (
    ("val score: ", y_val, y_pred_val),
    ("test score: ", Array(test_y), y_pred_test),
)
    println(label, mse(truth, predicted))
end

val score: 0.0057961063578372465
test score: 0.005882404456655677


## abalone dataset

In [54]:
using ScikitLearn
@sk_import linear_model: Lasso

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mmkl not found, proceeding to installing non-mkl versions of sci-kit learn via Conda
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `conda install -y -c conda-forge 'scikit-learn>=1.2,<1.3'` in root environment


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.





  current version: 23.3.1
  latest version: 23.10.0

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.10.0


[33m[1m└ [22m[39m[90m@ ScikitLearn.Skcore ~/.julia/packages/ScikitLearn/sqLdT/src/Skcore.jl:259[39m


PyObject <function train_test_split at 0x2ca185ea0>

In [33]:
# Load the abalone data once. The predictors are every column except the target
# column "rings" and the "sex" column.
df = CSV.read("abalone_original.csv", DataFrame)
selected_columns = setdiff(names(df), ["rings", "sex"])

X = df[:, selected_columns]
# Take the target from the frame that is already in memory rather than parsing the
# CSV file a second time. `df[:, ...]` copies, so `y` stays independent of `df`.
y = df[:, "rings"];

# Hold out 20% of the rows as the test set.
(train_X, train_y), (test_X, test_y) =
    IAI.split_data(:regression, X, y, train_proportion=0.8);

In [49]:
# Dimensions and constants for the optimised split of the abalone training rows.
n, p = size(train_X)
train_fraction = 0.8
k = train_fraction * n
lambda = 0.001

# Solve the split LP on the whole training set, then partition the rows by the size
# of their residual under the resulting coefficients.
_, _, betas_star, _ = opt_split(Array(train_X), Array(train_y), n, p, k, lambda)
X_train_opt, y_train_opt, X_val_opt, y_val_opt =
    split_train_val(Array(train_X), Array(train_y), betas_star)

Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11


([91.0 71.0 … 23.2 26.7; 140.0 117.0 … 64.3 95.0; … ; 110.0 86.0 … 34.7 71.0; 117.0 84.0 … 44.5 64.0], [8; 29; … ; 13; 11;;], [95.0 74.0 … 22.5 33.0; 107.0 81.0 … 34.2 41.0; … ; 81.0 60.0 … 10.1 17.6; 100.0 76.0 … 25.3 30.7], [9; 10; … ; 7; 9;;])

In [53]:
# Baseline (non-optimised) split of the training rows into a training and a
# validation part, using the same `train_fraction` as the optimised split.
# NOTE(review): no seed is passed here; if `IAI.split_data` shuffles the rows, the
# scores reported below will differ between runs -- confirm and pin a seed if so.
(X_train_noopt, y_train_noopt), (X_val_noopt, y_val_noopt) =
    IAI.split_data(:regression, train_X, train_y, train_proportion=train_fraction);

In [58]:
# Lasso, with `lambda` as its `alpha`, fitted on the training part of the
# optimised split.
model_opt = Lasso(alpha = lambda)
fit!(model_opt, X_train_opt, y_train_opt)

# Predictions for the validation part and for the held-out test set.
y_pred_val_opt = predict(model_opt, X_val_opt)
y_pred_test_opt = predict(model_opt, Matrix(test_X))

# Report the mean squared error on both sets.
for (label, truth, predicted) in (
    ("val score: ", y_val_opt, y_pred_val_opt),
    ("test score: ", Array(test_y), y_pred_test_opt),
)
    println(label, mse(truth, predicted))
end

val score: 0.6367145372906455
test score: 4.284529695239996


In [60]:
# Lasso, with `lambda` as its `alpha`, fitted on the training part of the
# baseline split.
model = Lasso(alpha = lambda)
fit!(model, Matrix(X_train_noopt), Array(y_train_noopt))

# Predictions for the validation part and for the held-out test set.
y_pred_val = predict(model, Matrix(X_val_noopt))
y_pred_test = predict(model, Matrix(test_X))

# Report the mean squared error on both sets.
for (label, truth, predicted) in (
    ("val score: ", y_val_noopt, y_pred_val),
    ("test score: ", Array(test_y), y_pred_test),
)
    println(label, mse(truth, predicted))
end

val score: 4.802199413453855
test score: 4.262317384595217


  model = cd_fast.enet_coordinate_descent(
