In [120]:
using CSV, DataFrames, Statistics, Random, Distributions, ScikitLearn
@sk_import linear_model: LogisticRegression

PyObject <class 'sklearn.linear_model._logistic.LogisticRegression'>

### Util functions

In [114]:
function standardize_data(df, df_test)
    numerical_cols = names(df, Number)
    mean_df = [mean(df[!, col]) for col in numerical_cols]
    std_df = [std(df[!, col]) for col in numerical_cols]

    for (col, mean_value, std_value) in zip(numerical_cols, mean_df, std_df)
        df[!, col] = (df[!, col] .- mean_value) / std_value
        df_test[!, col] = (df_test[!, col] .- mean_value) / std_value
    end

    return df, df_test
end

standardize_data (generic function with 2 methods)

### Load data

In [92]:
dataset = CSV.read("data/abalone/abalone_original.csv", DataFrame);
y = dataset[:, "rings"];

categorical_var = ["sex", "rings"]
dataset_numerical = select!(dataset, Not(categorical_var))
X_normalized = standardize_data(dataset_numerical);

### Covariate shift

- think about the shift values more
- abalone original has been normalized

In [99]:
function shift_dataset(df, features_to_shift, shift_type="normal",seed_value=123)
    mean_df = [mean(df[!, col]) for col in features_to_shift]
    std_df = [std(df[!, col]) for col in features_to_shift]
    
    df_shifted = copy(df)
    column_indices = [findfirst(isequal(col), names(df)) for col in features_to_shift]
        
    Random.seed!(seed_value)
    
    if shift_type == "normal"
        dist = MvNormal(mean_df, std_df)
    else shift_type == "uniform"
        dist = Uniform(0, 1)
    end
            
    shift_values = Array(rand(d, (size(df, 1), length(features_to_shift))))
    
    for i in 1:size(df, 1)
        for j in 1:length(features_to_shift)
            df_shifted[i, j] += shift_values[i, j]
        end
    end
        
    return df_shifted
end

shift_dataset (generic function with 4 methods)

In [101]:
features_to_shift = ["length", "diameter", "height", "whole-weight", "shucked-weight", "viscera-weight", "shell-weight"]
X_shifted = shift_dataset(X_normalized, features_to_shift, "normal");

### Split data into train & set

In [109]:
function split_data(X, X_shifted, y, train_test_split)
    num_train_indices = round(Int, train_test_split*size(X, 1))
        
    train_indices = randperm(size(X, 1))[1:num_train_indices]
    test_indices = setdiff(1:size(X, 1), train_indices)
    
    X_train, y_train = X[train_indices, :], y[train_indices]
    X_test, y_test = X_shifted[test_indices, :], y[test_indices]
    
    return X_train, y_train, X_test, y_test
end

split_data (generic function with 1 method)

In [113]:
X_train, y_train, X_test, y_test = split_data(dataset_numerical, X_shifted, y, 0.8);

### Normalize data

In [None]:
X_train_norm, X_test_norm = standardize_data(X_train, X_test)

### Calculate covariate weights

In [121]:
function check_accuracy(y_pred, y_true)
    accuracy = sum(y_pred .== y_true) / length(y_true)
    println("Accuracy: $accuracy")
end

check_accuracy (generic function with 1 method)

In [None]:
X_combined = vcat(X, X2)
y_combined = vcat(zeros(size(X1, 1)), ones(size(X2, 1)))
    
lr = LogisticRegression()
fit!(lr, X_combined, y_combined)

In [None]:
function covariate_weights(X1, X2)
    X_combined = vcat(X1, X2)
    y_combined = vcat(zeros(size(X1, 1)), ones(size(X2, 1)))
    
    lr = LogisticRegression()
    fit!(lr, X_combined, y_combined)
    
    y_pred_combined = predict(lr, X_combined)
    check_accuracy(y_pred_combined, y_combined)
    y_pred_combined_prob = predict_proba(lr, X1)
    weights = y_pred_combined_prob[:, 2] ./ y_pred_combined_prob[:, 1]
    
    return weights

In [116]:

    
    y_pred_combined = predict(lr, X_combined)
    check_accuracy(y_pred_combined, y_combined)
    
    y_pred_combined_prob = predict_proba(lr, X1)
    weights = y_pred_combined_prob[:, 2] ./ y_pred_combined_prob[:, 1]
    CSV.write("weights.csv", DataFrame(weights=weights))
    
    return weights
end

# Example usage
X1 = rand(5, 3)
X2 = rand(5, 3)
weights = covariate_weights(X1, X2)

LoadError: UndefVarError: `LogisticClassifier` not defined

### Save arrays