In [50]:
using CSV, DataFrames, Statistics, Random, Plots, JuMP, Gurobi

In [51]:
using ScikitLearn
@sk_import linear_model: LinearRegression
@sk_import linear_model: Lasso
@sk_import model_selection: train_test_split

[33m[1m└ [22m[39m[90m@ ScikitLearn.Skcore ~/.julia/packages/ScikitLearn/sqLdT/src/Skcore.jl:259[39m


PyObject <function train_test_split at 0x2b93d5ea0>

In [52]:
"""
    get_abalone_data()

Load the abalone dataset from `data/abalone/abalone_original.csv`.

# Returns
- `X`: a `DataFrame` with every column of the file except `rings` and `sex`.
- `y`: the `rings` column, used as the regression target.
"""
function get_abalone_data()
    # Read the file once; both the features and the target come from this table.
    df = CSV.read("data/abalone/abalone_original.csv", DataFrame)
    selected_columns = setdiff(names(df), ["rings", "sex"])

    X = df[:, selected_columns]
    y = df[!, "rings"]
    return X, y
end

"""
    get_comp_hard_data()

Load the header-less `data/comp_hard/machine.data` file, label its ten columns and
split it into features and target.

# Returns
- `X`: a `DataFrame` with the columns `MYCT`, `MMIN`, `MMAX`, `CACH`, `CHMIN`, `CHMAX`
  and `PRP`.
- `y`: the `ERP` column.
"""
function get_comp_hard_data()
    # The raw file carries no header row, so the column labels are supplied here.
    column_labels = [
        "Vendor_Name", "Model_Name", "MYCT", "MMIN", "MMAX",
        "CACH", "CHMIN", "CHMAX", "PRP", "ERP",
    ]
    predictors = ["MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "PRP"]

    raw = CSV.read("data/comp_hard/machine.data", DataFrame; header = false)
    rename!(raw, Symbol.(column_labels))

    return raw[:, predictors], raw[!, "ERP"]
end

"""
    get_concrete_data()

Load `data/concrete/concrete_data.csv` and separate the target column
`concrete_compressive_strength` from the remaining feature columns.

# Returns
- `X`: a `DataFrame` with every column except the target.
- `y`: the `concrete_compressive_strength` column.
"""
function get_concrete_data()
    target = "concrete_compressive_strength"
    table = CSV.read("data/concrete/concrete_data.csv", DataFrame)

    # Everything that is not the target is treated as a feature.
    feature_names = setdiff(names(table), [target])

    return table[:, feature_names], table[!, target]
end

get_concrete_data (generic function with 1 method)

In [53]:
"""
    mse(y_true, y_pred)

Return the mean squared error between `y_true` and `y_pred`: the sum of squared
element-wise differences divided by `length(y_true)`.
"""
function mse(y_true, y_pred)
    squared_error_total = sum(abs2, y_true .- y_pred)
    return squared_error_total / length(y_true)
end

mse (generic function with 1 method)

In [54]:
"""
    opt_split(X, y, lambda, train_fraction = 0.7, weight = nothing)

Build and solve, with Gurobi, the linear program

    min  k * theta + sum(u) + lambda * sum(w)
    s.t. w[j] >= |beta[j]|                          for every coefficient j
         theta + u[i] >= |weight[i] * residual_i|   for every row i
         u >= 0

where `k = n * train_fraction` and `residual_i = y[i] - [1, X[i, :]] ⋅ beta`.

# Arguments
- `X`: feature matrix (one row per observation); an intercept column of ones is
  prepended internally.
- `y`: target vector.
- `lambda`: weight of the `sum(w)` term in the objective. Note that `w` also bounds the
  intercept coefficient `beta[1]`, so the intercept is penalised too.
- `train_fraction`: fraction used to compute `k`.
- `weight`: optional per-row multipliers applied to the residuals; `nothing` means
  unweighted.

# Returns
`(theta, u, beta, w)` evaluated at the solver's solution. `beta[1]` is the intercept.

# Throws
An `ErrorException` when the solver finishes without a primal solution.
"""
function opt_split(X, y, lambda, train_fraction = 0.7, weight = nothing)
    # Prepend a column of ones so that beta[1] plays the role of the intercept.
    X = hcat(ones(Int, size(X, 1)), X)

    n, p = size(X)
    k = n * train_fraction

    model = Model(Gurobi.Optimizer)
    set_optimizer_attribute(model, "OutputFlag", 0)

    @variable(model, theta)
    @variable(model, u[1:n] >= 0)
    @variable(model, beta[1:p])
    @variable(model, w[1:p])

    @objective(model, Min, k * theta + sum(u) + lambda * sum(w))

    # w[j] >= |beta[j]|, written as two linear inequalities.
    @constraint(model, [j in 1:p], w[j] >= beta[j])
    @constraint(model, [j in 1:p], w[j] >= -beta[j])

    # theta + u[i] >= |scale_i * residual_i|, again as two linear inequalities.
    for i in 1:n
        scale = isnothing(weight) ? 1 : weight[i]
        residual = @expression(model, y[i] - sum(X[i, j] * beta[j] for j in 1:p))
        @constraint(model, theta + u[i] >= scale * residual)
        @constraint(model, theta + u[i] >= -scale * residual)
    end

    optimize!(model)

    # Fail with a readable message instead of an opaque error from `value`.
    if !has_values(model)
        error("opt_split: no solution available (status: $(termination_status(model)))")
    end

    return value(theta), value.(u), value.(beta), value.(w)
end

opt_split (generic function with 3 methods)

In [55]:
"""
    split_train_val(X, y, beta_opt, train_fraction = 0.7, weights = nothing)

Split `(X, y)` into a training and a validation part according to the residuals of the
linear model `beta_opt`.

Rows are ranked by decreasing absolute residual `|y - [1 X] * beta_opt|` (multiplied
element-wise by `weights` when given). The first
`round(Int, train_fraction * n)` rows of that ranking form the training set; all
remaining rows, in their original order, form the validation set.

# Arguments
- `X`: feature matrix without an intercept column.
- `y`: target vector.
- `beta_opt`: coefficient vector whose first entry is the intercept.
- `train_fraction`: fraction of rows assigned to the training set.
- `weights`: optional per-row residual multipliers; must have one entry per row.

# Returns
`(X_train, y_train, X_val, y_val)`; the feature matrices carry no intercept column.
"""
function split_train_val(X, y, beta_opt, train_fraction=0.7, weights = nothing)
    # Add the intercept column only to evaluate the residuals.
    X = hcat(ones(Int, size(X, 1)), X)

    residuals = y - X * beta_opt

    # Broadcasting raises a DimensionMismatch if `weights` does not match the data,
    # instead of silently ranking only the first `length(weights)` rows.
    scores = isnothing(weights) ? abs.(residuals) : abs.(residuals .* weights)
    sorted_indices = sortperm(scores, rev=true)

    num_train_points = round(Int, train_fraction * length(sorted_indices))

    # The rows with the largest residuals are the ones kept for training.
    train_indices = sorted_indices[1:num_train_points]
    val_indices = setdiff(1:length(y), train_indices)

    X_train = X[train_indices, 2:end] # drop the column of ones again
    y_train = y[train_indices]

    X_val = X[val_indices, 2:end]
    y_val = y[val_indices]

    return X_train, y_train, X_val, y_val
end


split_train_val (generic function with 3 methods)

In [56]:
"""
    get_optimised_test_score(X_full_train, y_full_train, X_test, y_test, lambdas,
                             train_fraction = 0.7, print_result = false)

Select a Lasso model using the optimised train/validation split and return its test MSE.

For every `lambda` in `lambdas` the function
1. solves `opt_split` to obtain coefficients `beta_opt`,
2. uses `split_train_val` with those coefficients to build the train/validation split,
3. fits `Lasso(alpha = lambda, max_iter = 20000)` on the training part, and
4. scores it with `mse` on the validation part.

The model with the lowest validation MSE is then evaluated on `(X_test, y_test)`.

# Arguments
- `lambdas`: candidate values, used both in `opt_split` and as the Lasso `alpha`.
- `train_fraction`: forwarded to `opt_split` and `split_train_val`.
- `print_result`: when `true`, print the selected lambda and the scores.

# Returns
The test-set MSE of the selected model.

# Throws
An `ErrorException` if no candidate could be selected (empty `lambdas`, or no candidate
with a validation MSE below `Inf`).
"""
function get_optimised_test_score(
    X_full_train, y_full_train, X_test, y_test, lambdas,
    train_fraction=0.7, print_result=false,
)
    best_lambda = Inf
    best_val_mse = Inf
    best_model = nothing

    for lambda in lambdas
        # Optimised split: coefficients from the LP decide which rows are used to train.
        _, _, beta_opt, _ = opt_split(X_full_train, y_full_train, lambda, train_fraction)
        X_train, y_train, X_val, y_val =
            split_train_val(X_full_train, y_full_train, beta_opt, train_fraction)

        model = Lasso(alpha=lambda, max_iter=20000)
        fit!(model, X_train, y_train)
        val_mse_i = mse(y_val, predict(model, X_val))

        if best_val_mse > val_mse_i
            best_lambda = lambda
            best_val_mse = val_mse_i
            best_model = model
        end
    end

    if isnothing(best_model)
        error("get_optimised_test_score: no model selected; check `lambdas` and the data")
    end

    # Test MSE of the model that performed best on the validation set.
    y_pred_test = predict(best_model, X_test)
    mse_test_mse = mse(y_test, y_pred_test)

    if print_result
        println("Best lambda: ", best_lambda)
        println("Validation score: ", best_val_mse)
        println("Test score: ", mse_test_mse)
        println("Number of betas: ", length(best_model.coef_) + length(best_model.intercept_))
    end

    return mse_test_mse
end


get_optimised_test_score (generic function with 3 methods)

In [116]:
"""
    get_random_test_score(X_full_train, y_full_train, X_test, y_test, lambdas,
                          train_fraction = 0.7, print_result = false)

Select a Lasso model using a single random train/validation split and return its test
MSE.

The split is produced once by `IAI.split_data` with `train_proportion = train_fraction`
and the fixed seed `209`. For every `lambda` in `lambdas` a
`Lasso(alpha = lambda, max_iter = 20000)` model is fitted on the training part and
scored with `mse` on the validation part; the model with the lowest validation MSE is
evaluated on `(X_test, y_test)`.

NOTE(review): `IAI` is not imported in the cells visible here — confirm it is loaded
before this function is called.

# Arguments
- `lambdas`: candidate Lasso `alpha` values.
- `train_fraction`: proportion of rows placed in the training part of the split.
- `print_result`: when `true`, print the selected lambda and the scores.

# Returns
The test-set MSE of the selected model.

# Throws
An `ErrorException` if no candidate could be selected (empty `lambdas`, or no candidate
with a validation MSE below `Inf`).
"""
function get_random_test_score(
    X_full_train, y_full_train, X_test, y_test, lambdas,
    train_fraction=0.7, print_result=false,
)
    best_lambda = Inf
    best_val_mse = Inf
    best_model = nothing

    # One random split shared by every candidate lambda.
    (X_train, y_train), (X_val, y_val) =
        IAI.split_data(:regression, X_full_train, y_full_train, train_proportion=train_fraction, seed = 209)

    for lambda in lambdas
        model = Lasso(alpha=lambda, max_iter=20000)
        fit!(model, X_train, y_train)
        val_mse_i = mse(y_val, predict(model, X_val))

        if best_val_mse > val_mse_i
            best_lambda = lambda
            best_val_mse = val_mse_i
            best_model = model
        end
    end

    if isnothing(best_model)
        error("get_random_test_score: no model selected; check `lambdas` and the data")
    end

    # Test MSE of the model that performed best on the validation set.
    y_pred_test = predict(best_model, X_test)
    mse_test_mse = mse(y_test, y_pred_test)

    if print_result
        println("Best lambda: ", best_lambda)
        println("Validation score: ", best_val_mse)
        println("Test score: ", mse_test_mse)
        println("Number of betas: ", length(best_model.coef_) + length(best_model.intercept_))
    end

    return mse_test_mse
end


get_random_test_score (generic function with 3 methods)

In [117]:
"""
    get_normalized_data(X_train, X_test)

Standardise both matrices column-wise using the mean and standard deviation of
`X_train` only, so no information from `X_test` enters the scaling.

Columns whose training standard deviation is zero (constant column) or `NaN` (for
example a single training row) are divided by one instead, so they are centred but not
turned into `NaN`/`Inf`.

# Returns
`(X_train_norm, X_test_norm)`.
"""
function get_normalized_data(X_train, X_test)
    # Statistics come from the training data only.
    mean_vals = mean(X_train, dims=1)
    std_vals = std(X_train, dims=1)

    # Avoid dividing by zero (or NaN) for degenerate columns.
    safe_std = map(s -> (iszero(s) || isnan(s)) ? one(s) : s, std_vals)

    X_train_norm = (X_train .- mean_vals) ./ safe_std
    X_test_norm = (X_test .- mean_vals) ./ safe_std

    return X_train_norm, X_test_norm
end

get_normalized_data (generic function with 1 method)

In [118]:
"""
    prepare_data(X::DataFrame, y, random_seed = 1, train_proportion = 0.9)

Split `(X, y)` into a train and a test part with `IAI.split_data` (regression task,
seeded with `random_seed`), convert both feature tables to matrices and standardise
them with `get_normalized_data`.

NOTE(review): `IAI` is not imported in the cells visible here — confirm it is loaded
before this function is called.

# Returns
`(train_X_norm, train_y, test_X_norm, test_y)`.
"""
function prepare_data(X::DataFrame, y, random_seed = 1, train_proportion = 0.9)
    train_part, test_part = IAI.split_data(
        :regression, X, y; train_proportion = train_proportion, seed = random_seed
    )
    raw_train_X, train_targets = train_part
    raw_test_X, test_targets = test_part

    # Scaling statistics are taken from the training features only.
    scaled_train_X, scaled_test_X =
        get_normalized_data(Matrix(raw_train_X), Matrix(raw_test_X))

    return scaled_train_X, train_targets, scaled_test_X, test_targets
end


prepare_data (generic function with 3 methods)

In [119]:
"""
    repeat_stable_reg(X, y, train_fraction = 0.7, num_repeats = 100,
                      optimised_method = true;
                      lambda_grid = lambdas, outer_train_proportion = 0.9)

Repeat the train/test experiment `num_repeats` times and collect the test MSE of each
run.

Run `i` calls `prepare_data(X, y, i, outer_train_proportion)`, i.e. the seed of the
outer train/test split is the run index. The resulting data is scored with
`get_optimised_test_score` when `optimised_method` is `true` and with
`get_random_test_score` otherwise.

# Keywords
- `lambda_grid`: candidate regularisation values handed to the scoring function.
  Defaults to the global `lambdas`, which is what this function always used before, so
  existing calls behave the same.
- `outer_train_proportion`: share of the data kept for training in the outer split.

# Returns
A `Vector{Float64}` with one test MSE per repetition.
"""
function repeat_stable_reg(
    X, y, train_fraction = 0.7, num_repeats = 100, optimised_method = true;
    lambda_grid = lambdas, outer_train_proportion = 0.9,
)
    # Pick the scoring routine once instead of branching in every iteration.
    score_fn = optimised_method ? get_optimised_test_score : get_random_test_score

    test_mse_scores = Float64[]

    for random_seed in 1:num_repeats
        train_X, train_y, test_X, test_y =
            prepare_data(X, y, random_seed, outer_train_proportion)

        mse_test_score =
            score_fn(train_X, train_y, test_X, test_y, lambda_grid, train_fraction)
        push!(test_mse_scores, mse_test_score)
    end

    return test_mse_scores
end

repeat_stable_reg (generic function with 4 methods)

In [120]:
"""
    calculate_percentage_improvement(opt_scores, rand_scores)

Return the mean, over all paired entries, of the relative improvement
`(rand_score - opt_score) / rand_score * 100`.

A positive result means the entries of `opt_scores` are on average lower than the
matching entries of `rand_scores`.

# Throws
- `ErrorException` if the two collections differ in length.
- `ArgumentError` if they are empty.
"""
function calculate_percentage_improvement(opt_scores, rand_scores)
    if length(opt_scores) != length(rand_scores)
        error("Old scores and new scores must have the same length.")
    end
    if isempty(opt_scores)
        throw(ArgumentError("score collections must not be empty"))
    end

    # Element-wise relative improvement in percent, computed without an untyped buffer.
    percentage_improvements = ((rand_scores .- opt_scores) ./ rand_scores) .* 100

    return mean(percentage_improvements)
end

calculate_percentage_improvement (generic function with 1 method)

## Single test 

In [121]:
## Hyperparameters which are common to all datasets
# Value passed as `train_fraction` to the scoring functions below.
train_fraction = 0.7
# Candidate regularisation values; `repeat_stable_reg` reads this global directly.
lambdas = [0.00001, 0.0001, 0.001, 0.01, 0.1];

In [122]:
# Load the concrete dataset and build one normalised train/test split
# (prepare_data defaults: random_seed = 1, train_proportion = 0.9).
X_c, y_c = get_concrete_data();
train_X, train_y, test_X, test_y = prepare_data(X_c, y_c);

In [123]:
# Test MSE when the train/validation split is chosen by the optimisation model.
test_mse_opt = get_optimised_test_score(train_X, train_y, test_X, test_y, lambdas, train_fraction)
print("optimised split test mse: ", test_mse_opt)

Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
optimised split test mse: 112.69140148548713

In [124]:
# Test MSE when the train/validation split is random (baseline for comparison).
test_mse_rand = get_random_test_score(train_X, train_y, test_X, test_y, lambdas, train_fraction)
print("randoom split test mse: ", test_mse_rand)

randoom split test mse: 113.451214856934

## Do it several times

In [125]:
# Load the concrete dataset
X_c, y_c = get_concrete_data()

# 100 repetitions per method; repetition i uses seed i for the outer train/test split,
# so the two score vectors are paired run by run.
opt_test_mse_scores = repeat_stable_reg(X_c, y_c, train_fraction, 100, true);
rand_test_mse_scores = repeat_stable_reg(X_c, y_c, train_fraction, 100, false);
# Mean relative improvement (in percent) of the optimised split over the random split.
meam_percentage_diff = calculate_percentage_improvement(opt_test_mse_scores, rand_test_mse_scores)
println("percentage improvment for concrete dataset: ", meam_percentage_diff)

In [None]:
# Load the abalone dataset
X_ad, y_ad = get_abalone_data()

# 100 repetitions per method; repetition i uses seed i for the outer train/test split,
# so the two score vectors are paired run by run.
opt_test_mse_scores = repeat_stable_reg(X_ad, y_ad, train_fraction, 100, true);
rand_test_mse_scores = repeat_stable_reg(X_ad, y_ad, train_fraction, 100, false);
# Mean relative improvement (in percent) of the optimised split over the random split.
meam_percentage_diff = calculate_percentage_improvement(opt_test_mse_scores, rand_test_mse_scores)
println("percentage improvment for abalone dataset: ", meam_percentage_diff)

Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11

In [None]:
# Load the computer hardware dataset
X_ch, y_ch = get_comp_hard_data()

# 100 repetitions per method; repetition i uses seed i for the outer train/test split,
# so the two score vectors are paired run by run.
opt_test_mse_scores = repeat_stable_reg(X_ch, y_ch, train_fraction, 100, true);
rand_test_mse_scores = repeat_stable_reg(X_ch, y_ch, train_fraction, 100, false);
# Mean relative improvement (in percent) of the optimised split over the random split.
meam_percentage_diff = calculate_percentage_improvement(opt_test_mse_scores, rand_test_mse_scores)
println("percentage improvment for comp_hard dataset: ", meam_percentage_diff)

Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11
Set parameter Username
Academic license - for non-commercial use only - expires 2024-09-11