# LASSO

In [1]:
using CSV, Tables, LinearAlgebra, Random, Gurobi, JuMP, Statistics, DataFrames, Metrics
#read train and validation data and split it into X and y
X_train_solar=Matrix(DataFrame(CSV.File("X_train_solar.csv")))
y_train_solar=Matrix(DataFrame(CSV.File("y_train_solar.csv")))
X_train_wind=Matrix(DataFrame(CSV.File("X_train_wind.csv")))
y_train_wind=Matrix(DataFrame(CSV.File("y_train_wind.csv")))

X_valid_solar=Matrix(DataFrame(CSV.File("X_valid_solar.csv")))
y_valid_solar=Matrix(DataFrame(CSV.File("y_valid_solar.csv")))
X_valid_wind=Matrix(DataFrame(CSV.File("X_valid_wind.csv")))
y_valid_wind=Matrix(DataFrame(CSV.File("y_valid_wind.csv")));

### NORMALIZE DATA
We normalize the data with min max scaling. We apply normalization for the numerical features only and then we use alpha to balance between numerical and binary variables.

In [2]:
function min_max_scaling(X, num_feature_indices)
    for i in num_feature_indices
        X[:, i] = (X[:, i] .- minimum(X[:, i])) ./ (maximum(X[:, i]) .- minimum(X[:, i]))
    end
    return X
end

min_max_scaling (generic function with 1 method)

#### SOLAR

In [3]:
#scale SOLAR TRAIN only numerical
X_num_solar=min_max_scaling(Matrix(X_train_solar), 1:15)

# Binary features as they are
X_bin_solar = X_train_solar[:,16:end]

# Numerical vs Categorical features scaling
alpha = 0.75
X_num_solar = alpha*X_num_solar
X_bin_solar = (1-alpha)*X_bin_solar;

# Append data
X_train_solar_norm = [X_num_solar X_bin_solar];

In [4]:
#scale SOLAR VALID only numerical
X_num_solar=min_max_scaling(Matrix(X_valid_solar), 1:15)

# Binary features as they are
X_bin_solar = X_valid_solar[:,16:end]

# Numerical vs Categorical features scaling
alpha = 0.75
X_num_solar = alpha*X_num_solar
X_bin_solar = (1-alpha)*X_bin_solar;

# Append data
X_valid_solar_norm = [X_num_solar X_bin_solar];

In [5]:
#scale Y 
y_train_solar_norm = min_max_scaling(Matrix(y_train_solar), 1:1)
y_valid_solar_norm = min_max_scaling(Matrix(y_valid_solar), 1:1);

#### WIND

In [6]:
X_num_wind=min_max_scaling(Matrix(X_train_wind), 1:15)

# Binary features as they are
X_bin_wind = X_train_wind[:,16:end]

# Numerical vs Categorical features scaling
alpha = 0.75
X_num_wind = alpha*X_num_wind
X_bin_wind = (1-alpha)*X_bin_wind;

# Append data
X_train_wind_norm = [X_num_wind X_bin_wind];

In [7]:
X_num_wind=min_max_scaling(Matrix(X_valid_wind), 1:15)

# Binary features as they are
X_bin_wind = X_valid_wind[:,16:end]

# Numerical vs Categorical features scaling
alpha = 0.75
X_num_wind = alpha*X_num_wind
X_bin_wind = (1-alpha)*X_bin_wind;

# Append data
X_valid_wind_norm = [X_num_wind X_bin_wind];

In [8]:
#scale Y 
y_train_wind_norm=min_max_scaling(Matrix(y_train_wind), 1:1)
y_valid_wind_norm=min_max_scaling(Matrix(y_valid_wind), 1:1);

## LASSO

In [9]:
function lasso(X,y,lambda)
    
    #Build model
    model = Model(Gurobi.Optimizer)#we have defined the model, pass Gurobi optimizer into the model
    set_optimizer_attribute(model,"OutputFlag",0)
    
    #Insert variables
    n, p = size(X)
    
    @variable(model, beta[j=1:p])
    @variable(model, beta_abs[j=1:p])    
    #Insert constraints
    @constraint(model, beta_abs .>= beta) #put the dot is like doing the loop over all j
    @constraint(model, beta_abs .>= -beta)
    
    #Insert objective
    @objective(model, Min, sum((y-X*beta).^2) + lambda*sum(beta_abs))
    
    # Optimize
    optimize!(model)
    
    # Return estimated betas
    return (value.(beta))
    
end

lasso (generic function with 1 method)

In [10]:
#CROSS VALIDATION: find best lambda 
function find_best_lambda(X,y,X_valid,y_valid)
    lowest_mse=Inf
    best_lambda=0
    for lambda in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
        beta_lasso=lasso(X, y, lambda)
        mse_temp= mse(y_valid, X_valid*beta_lasso)
        if mse_temp<lowest_mse
            lowest_mse=mse_temp
            best_lambda=lambda
        end
    end
    return best_lambda
end

find_best_lambda (generic function with 1 method)

### SOLAR

#### CV

In [11]:
#Not normalized
find_best_lambda(X_train_solar,y_train_solar, X_valid_solar, y_valid_solar)

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19


0.9

In [12]:
#X normalized
find_best_lambda(X_train_solar_norm,y_train_solar, X_valid_solar_norm, y_valid_solar)

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19


0.1

In [13]:
#Not normalized
find_best_lambda(X_train_solar_norm,y_train_solar_norm, X_valid_solar_norm, y_valid_solar_norm)

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19


0.1

#### EVALUATION 

In [13]:
#Not normalized
beta_lasso=lasso(X_train_solar,y_train_solar, 0.9);

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [14]:
println("MSE: ", mse(y_valid_solar, X_valid_solar*beta_lasso))
println("MAE: ", mae(y_valid_solar, X_valid_solar*beta_lasso))
println("R2: ", r2_score(y_valid_solar, X_valid_solar*beta_lasso))

MSE: 9472.655407769904
MAE: 78.9299646645766
R2: 0.35395894537563455


### Test Evaluation 

In [15]:
X_test_solar=Matrix(DataFrame(CSV.File("X_test_solar.csv")))
y_test_solar=Matrix(DataFrame(CSV.File("y_test_solar.csv")));

In [16]:
println("MSE: ", mse(y_test_solar, X_test_solar*beta_lasso))
println("MAE: ", mae(y_test_solar, X_test_solar*beta_lasso))
println("R2: ", r2_score(y_test_solar, X_test_solar*beta_lasso))

MSE: 265742.13620059553
MAE: 374.2676699339018
R2: -13.631600566108197


In [19]:
#X normalized
beta_lasso=lasso(X_train_solar_norm,y_train_solar, 0.1);

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [20]:
println("MSE: ", mse(y_valid_solar, X_valid_solar_norm*beta_lasso))
println("MAE: ", mae(y_valid_solar, X_valid_solar_norm*beta_lasso))
println("R2: ", r2_score(y_valid_solar, X_valid_solar_norm*beta_lasso))

MSE: 9665.677442833918
MAE: 80.45169548608108
R2: 0.3756043846968983


In [21]:
#scale SOLAR TEST only numerical
X_test_num_solar=min_max_scaling(Matrix(X_test_solar), 1:15)

# Binary features as they are
X_test_bin_solar = X_test_solar[:,16:end]

# Numerical vs Categorical features scaling
alpha = 0.75
X_test_num_solar = alpha*X_test_num_solar
X_test_bin_solar = (1-alpha)*X_test_bin_solar;

# Append data
X_test_solar_norm = [X_test_num_solar X_test_bin_solar];

In [22]:
println("MSE: ", mse(y_test_solar, X_test_solar_norm*beta_lasso))
println("MAE: ", mae(y_test_solar, X_test_solar_norm*beta_lasso))
println("R2: ", r2_score(y_test_solar, X_test_solar_norm*beta_lasso))

MSE: 260020.4497584387
MAE: 367.5065623207428
R2: -20.001909034715098


In [18]:
#X,y normalized
beta_lasso=lasso(X_train_solar_norm,y_train_solar_norm, 0.1);

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19


In [19]:
println("MSE: ", mse(y_valid_solar_norm, X_valid_solar_norm*beta_lasso))
println("MAE: ", mae(y_valid_solar_norm, X_valid_solar_norm*beta_lasso))
println("R2: ", r2_score(y_valid_solar_norm, X_valid_solar_norm*beta_lasso))

MSE: 0.050119619394356364
MAE: 0.18146498772482905
R2: -43.337798875564424


### WIND

#### CV

In [20]:
#Not normalized
find_best_lambda(X_train_wind,y_train_wind, X_valid_wind, y_valid_wind)

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19


0.9

In [21]:
#X normalized
find_best_lambda(X_train_wind_norm,y_train_wind, X_valid_wind_norm, y_valid_wind)

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19


0.1

In [22]:
#Not normalized
find_best_lambda(X_train_wind_norm,y_train_wind_norm, X_valid_wind_norm, y_valid_wind_norm)

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19
Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19


0.1

#### EVALUATION 

In [23]:
#Not normalized
beta_lasso=lasso(X_train_wind,y_train_wind, 0.9);

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [25]:
println("MSE: ", mse(y_valid_wind, X_valid_wind*beta_lasso))
println("MAE: ", mae(y_valid_wind, X_valid_wind*beta_lasso))
println("R2: ", r2_score(y_valid_wind, X_valid_wind*beta_lasso))

MSE: 1.3286934600135155e7
MAE: 2630.9089982001233
R2: -0.3187260316225953


### Test Evaluation

In [27]:
X_test_wind=Matrix(DataFrame(CSV.File("X_test_wind.csv")))
y_test_wind=Matrix(DataFrame(CSV.File("y_test_wind.csv")));

In [28]:
println("MSE: ", mse(y_test_wind, X_test_wind*beta_lasso))
println("MAE: ", mae(y_test_wind, X_test_wind*beta_lasso))
println("R2: ", r2_score(y_test_wind, X_test_wind*beta_lasso))

MSE: 1.5379191371068548e7
MAE: 3033.255336044892
R2: -0.08107911385694666


In [31]:
#X normalized
beta_lasso=lasso(X_train_wind_norm,y_train_wind, 0.1);

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-18


In [26]:
println("MSE: ", mse(y_valid_wind, X_valid_wind_norm*beta_lasso))
println("MAE: ", mae(y_valid_wind, X_valid_wind_norm*beta_lasso))
println("R2: ", r2_score(y_valid_wind, X_valid_wind_norm*beta_lasso))

MSE: 1.3295536354377085e8
MAE: 9143.913566975465
R2: -1.4381435392020148


### Test Evaluation

In [29]:
#scale SOLAR TRAIN only numerical
X_test_num_wind=min_max_scaling(Matrix(X_test_wind), 1:15)

# Binary features as they are
X_test_bin_wind = X_test_wind[:,16:end]

# Numerical vs Categorical features scaling
alpha = 0.75
X_test_num_wind = alpha*X_test_num_wind
X_test_bin_wind = (1-alpha)*X_test_bin_wind;

# Append data
X_test_wind_norm = [X_test_num_wind X_test_bin_wind];

In [34]:
println("MSE: ", mse(y_test_wind, X_test_wind_norm*beta_lasso))
println("MAE: ", mae(y_test_wind, X_test_wind_norm*beta_lasso))
println("R2: ", r2_score(y_test_wind, X_test_wind_norm*beta_lasso))

MSE: 1.1719328007854867e8
MAE: 9266.731888031982
R2: -0.7605837604250911


In [27]:
#X,y normalized
beta_lasso=lasso(X_train_wind_norm,y_train_wind_norm, 0.1);

Set parameter Username
Academic license - for non-commercial use only - expires 2023-08-19


In [28]:
println("MSE: ", mse(y_valid_wind_norm, X_valid_wind_norm*beta_lasso))
println("MAE: ", mae(y_valid_wind_norm, X_valid_wind_norm*beta_lasso))
println("R2: ", r2_score(y_valid_wind_norm, X_valid_wind_norm*beta_lasso))

MSE: 0.08995528338662923
MAE: 0.22082103384027854
R2: -0.2798452192046488
