In [35]:
############################################################
# SINDy – SOLUTION 1 FIXÉE (Float64 + STLSQ)
############################################################

using NPZ
using LinearAlgebra
using Statistics
using DataDrivenDiffEq
using DataDrivenSparse
using ModelingToolkit
using Printf

############################################################
# CONFIG
############################################################

FILEPATH = "data/processed/sstReducedState2COPERNICUS20102019.npz"
POLY_DEGREE = 2
LAMBDA_LIST = [1e-4, 1e-3, 5e-3, 1e-2]

############################################################
# LOAD DATA (FORCE Float64)
############################################################

data = npzread(FILEPATH)

Z_raw  = Float64.(data["Z"])     # (time, state)
dZ_raw = Float64.(data["dZ"])
split  = Int.(data["split"])

Z  = permutedims(Z_raw)          # (state, time)
dZ = permutedims(dZ_raw)

n_state, _ = size(Z)

train_idx = findall(split .== 0)
test_idx  = findall(split .== 1)

Z_train  = Z[:, train_idx]
dZ_train = dZ[:, train_idx]

Z_test  = Z[:, test_idx]
dZ_test = dZ[:, test_idx]

############################################################
# BASIS (Float64)
############################################################

@variables x[1:n_state]

raw_basis = polynomial_basis(x, POLY_DEGREE)
basis = Basis(raw_basis, x)

############################################################
# METRICS
############################################################

metrics(pred, truth) = (
    sqrt(mean((pred .- truth).^2)),
    1 - sum((truth .- pred).^2) / sum((truth .- mean(truth)).^2)
)

############################################################
# STLSQ PAR ÉTAT (ROBUSTE)
############################################################

results = Dict()

for λ in LAMBDA_LIST
    println("\nλ = $λ")

    rmses = Float64[]
    sparsities = Float64[]
    models = Any[]

    for i in 1:n_state
        prob_i = ContinuousDataDrivenProblem(
            Z_train,
            dZ_train[i:i, :]
        )

        alg = STLSQ(λ)

        res = solve(prob_i, basis, STLSQ(λ))

        dZ_pred = res(prob_i)
        dZ_true = dZ_train[i:i, :]

        rmse, r2 = metrics(dZ_pred, dZ_true)

        # ✅ accès correct aux équations
        model = DataDrivenDiffEq.symbolic_model(res)

        eq = equations(model)[1]

        rhs = eq.rhs
        active_terms =
            rhs.operation === (+) ? length(rhs.args) :
            rhs isa Num ? 1 :
            0

        sparsity = 100 * (1 - active_terms / length(raw_basis))

        push!(rmses, rmse)
        push!(sparsities, sparsity)
        push!(models, model)

    end

    results[λ] = (
        mean_rmse = mean(rmses),
        mean_sparsity = mean(sparsities),
        models = models
    )

    @printf "→ RMSE = %.5f | Sparsity = %.1f%%\n" results[λ].mean_rmse results[λ].mean_sparsity
end

############################################################
# BEST MODEL
############################################################

best_λ = argmin(λ -> results[λ].mean_rmse, keys(results))
best = results[best_λ]

println("\n=== BEST λ ===")
println("λ = $best_λ")
@printf "Mean RMSE = %.5f\n" best.mean_rmse
@printf "Mean sparsity = %.1f%%\n" best.mean_sparsity

println("\nFirst 5 equations:")
for i in 1:min(5, n_state)
    println("dx$i/dt = ", best.models[i].equations[1])
end



λ = 0.0001


LoadError: DimensionMismatch: Both inputs should have the same number of columns

In [42]:
############################################################
# PURE SINDy – STLSQ MANUEL (ROBUSTE)
############################################################

using NPZ
using LinearAlgebra
using Statistics
using ModelingToolkit
using Printf

############################################################
# LOAD DATA
############################################################

data = npzread("data/processed/sstReducedState2COPERNICUS20102019.npz")

Z  = Float64.(data["Z"])     # (time, state)
dZ = Float64.(data["dZ"])

Z  = permutedims(Z)          # (state, time)
dZ = permutedims(dZ)

n_state, T = size(Z)

############################################################
# BUILD LIBRARY Θ
############################################################

POLY_DEGREE = 2

@variables x[1:n_state]
basis = polynomial_basis(x, POLY_DEGREE)

Θ_fun = ModelingToolkit.build_function(
    basis, x, expression=Val(false)
)[1]

Θ = zeros(Float64, T, length(basis))
for t in 1:T
    Θ[t, :] .= Θ_fun(Z[:, t])
end

############################################################
# STLSQ IMPLEMENTATION
############################################################

function stlsq(Θ, y; λ=1e-3, n_iter=10)
    ξ = Θ \ y                # least squares
    for _ in 1:n_iter
        small = abs.(ξ) .< λ
        ξ[small] .= 0.0
        big = .!small
        ξ[big] = Θ[:, big] \ y
    end
    ξ
end

############################################################
# RUN SINDy
############################################################

λ = 1e-3
Ξ = zeros(Float64, length(basis), n_state)

for i in 1:n_state
    println("Learning equation $i / $n_state")
    Ξ[:, i] = stlsq(Θ, dZ[i, :]; λ=λ)
end

############################################################
# METRICS
############################################################

dZ_pred = (Θ * Ξ)'   # (state, time)

rmse_total = sqrt(mean((dZ_pred .- dZ).^2))
println("\nRMSE total = ", rmse_total)


############################################################
# DISPLAY FIRST EQUATIONS
############################################################

println("\nDiscovered equations:")
for i in 1:min(5, n_state)
    println("\ndx$i/dt =")
    for j in findall(!iszero, Ξ[:, i])
        println("  + $(Ξ[j,i]) * $(basis[j])")
    end
end


Learning equation 1 / 17
Learning equation 2 / 17
Learning equation 3 / 17
Learning equation 4 / 17
Learning equation 5 / 17
Learning equation 6 / 17
Learning equation 7 / 17
Learning equation 8 / 17
Learning equation 9 / 17
Learning equation 10 / 17
Learning equation 11 / 17
Learning equation 12 / 17
Learning equation 13 / 17
Learning equation 14 / 17
Learning equation 15 / 17
Learning equation 16 / 17
Learning equation 17 / 17

RMSE total = 0.27254526010248076

Discovered equations:

dx1/dt =
  + -0.15307929825380137 * x[1]
  + -0.9138327355477529 * x[1]^2
  + 0.007705415900856475 * x[2]
  + 0.1788425060410267 * x[1]*x[2]
  + -0.003146743880923973 * x[2]^2
  + -0.04664705375528293 * x[3]
  + -0.5198602971442043 * x[1]*x[3]
  + 0.057752162144082346 * x[2]*x[3]
  + -1.3481745567135217 * x[3]^2
  + 0.2146968469176186 * x[4]
  + 2.82667471028666 * x[1]*x[4]
  + -0.198883804109463 * x[2]*x[4]
  + 0.1058711278602489 * x[3]*x[4]
  + -1.0155242823196435 * x[4]^2
  + 0.08365811184051904 * x[5

In [43]:
############################################################
# PURE SINDy – STLSQ MANUEL (ROBUSTE + NORMALISÉ)
############################################################

using NPZ
using LinearAlgebra
using Statistics
using ModelingToolkit
using Printf

############################################################
# LOAD DATA
############################################################

data = npzread("data/processed/sstReducedState2COPERNICUS20102019.npz")

Z  = Float64.(data["Z"])     # (time, state)
dZ = Float64.(data["dZ"])

Z  = permutedims(Z)          # (state, time)
dZ = permutedims(dZ)

n_state, T = size(Z)

############################################################
# BUILD LIBRARY Θ
############################################################

POLY_DEGREE = 2

@variables x[1:n_state]
basis = polynomial_basis(x, POLY_DEGREE)

Θ_fun = ModelingToolkit.build_function(
    basis, x, expression=Val(false)
)[1]

Θ = zeros(Float64, T, length(basis))
for t in 1:T
    Θ[t, :] .= Θ_fun(Z[:, t])
end

############################################################
# NORMALISE Θ (CRUCIAL)
############################################################

Θ_norm = copy(Θ)
col_norms = zeros(size(Θ, 2))

for j in 1:size(Θ, 2)
    col_norms[j] = norm(Θ[:, j])
    if col_norms[j] > 0
        Θ_norm[:, j] ./= col_norms[j]
    end
end

############################################################
# STLSQ IMPLEMENTATION (ROBUSTE)
############################################################

function stlsq(Θ, y; λ=5e-2, n_iter=15)
    ξ = Θ \ y

    for _ in 1:n_iter
        small = abs.(ξ) .< λ
        ξ[small] .= 0.0
        big = .!small

        if any(big)
            ξ[big] = Θ[:, big] \ y
        else
            break
        end
    end
    ξ
end

############################################################
# RUN SINDy
############################################################

λ = 5e-2
Ξ_norm = zeros(Float64, length(basis), n_state)

for i in 1:n_state
    println("Learning equation $i / $n_state")
    Ξ_norm[:, i] = stlsq(Θ_norm, dZ[i, :]; λ=λ)
end

############################################################
# DENORMALISE COEFFICIENTS
############################################################

Ξ = copy(Ξ_norm)
for j in 1:size(Ξ, 1)
    if col_norms[j] > 0
        Ξ[j, :] ./= col_norms[j]
    end
end

############################################################
# METRICS
############################################################

dZ_pred = (Θ * Ξ)'   # (state, time)

rmse_eq = [sqrt(mean((dZ_pred[i, :] .- dZ[i, :]).^2)) for i in 1:n_state]
rmse_total = mean(rmse_eq)

println("\nRMSE total = ", rmse_total)
@printf "RMSE min = %.4f | RMSE max = %.4f\n" minimum(rmse_eq) maximum(rmse_eq)

############################################################
# DISPLAY FIRST EQUATIONS
############################################################

println("\nDiscovered equations:")
for i in 1:min(5, n_state)
    println("\ndx$i/dt =")
    for j in findall(!iszero, Ξ[:, i])
        @printf "  %+ .4e * %s\n" Ξ[j,i] basis[j]
    end
end


Learning equation 1 / 17
Learning equation 2 / 17
Learning equation 3 / 17
Learning equation 4 / 17
Learning equation 5 / 17
Learning equation 6 / 17
Learning equation 7 / 17
Learning equation 8 / 17
Learning equation 9 / 17
Learning equation 10 / 17
Learning equation 11 / 17
Learning equation 12 / 17
Learning equation 13 / 17
Learning equation 14 / 17
Learning equation 15 / 17
Learning equation 16 / 17
Learning equation 17 / 17

RMSE total = 0.27246819720879756
RMSE min = 0.2629 | RMSE max = 0.2911

Discovered equations:

dx1/dt =
  -8.9613e-04 * 1
  -1.5266e-01 * x[1]
  -9.0883e-01 * x[1]^2
  +7.7151e-03 * x[2]
  +1.7817e-01 * x[1]*x[2]
  -2.6572e-03 * x[2]^2
  -4.6613e-02 * x[3]
  -5.2091e-01 * x[1]*x[3]
  +5.7083e-02 * x[2]*x[3]
  -1.3425e+00 * x[3]^2
  +2.1438e-01 * x[4]
  +2.8194e+00 * x[1]*x[4]
  -1.9880e-01 * x[2]*x[4]
  +1.0702e-01 * x[3]*x[4]
  -1.0098e+00 * x[4]^2
  +8.3892e-02 * x[5]
  +9.5568e-01 * x[1]*x[5]
  +1.8511e-01 * x[2]*x[5]
  +1.6360e+00 * x[3]*x[5]
  -6.9374e-01

In [47]:
############################################################
# PURE SINDy – STLSQ MANUEL (SPARSE + ROBUSTE)
############################################################

using NPZ
using LinearAlgebra
using Statistics
using ModelingToolkit
using Printf

############################################################
# LOAD DATA
############################################################

data = npzread("data/processed/sstReducedState2COPERNICUS20102019.npz")

Z  = Float64.(data["Z"])     # (time, state)
dZ = Float64.(data["dZ"])

Z  = permutedims(Z)          # (state, time)
dZ = permutedims(dZ)

n_state, T = size(Z)

############################################################
# BUILD LIBRARY Θ (NO CONSTANT TERM)
############################################################

POLY_DEGREE = 2

@variables x[1:n_state]
full_basis = polynomial_basis(x, POLY_DEGREE)

# remove constant term
basis = full_basis[2:end]
n_terms = length(basis)

Θ_fun = ModelingToolkit.build_function(
    basis, x, expression = Val(false)
)[1]

Θ = zeros(Float64, T, n_terms)
for t in 1:T
    Θ[t, :] .= Θ_fun(Z[:, t])
end

############################################################
# STLSQ
############################################################

function stlsq(Θ, y; λ=1.0, n_iter=15)
    ξ = Θ \ y
    for _ in 1:n_iter
        small = abs.(ξ) .< λ
        ξ[small] .= 0.0
        big = .!small
        if any(big)
            ξ[big] = Θ[:, big] \ y
        end
    end
    ξ
end

############################################################
# SWEEP λ
############################################################

λ_list = [0.1, 0.2, 0.5, 1.0, 2.0]
results = Dict()

for λ in λ_list
    println("\nλ = $λ")

    Ξ = zeros(Float64, n_terms, n_state)
    for i in 1:n_state
        Ξ[:, i] = stlsq(Θ, dZ[i, :]; λ=λ)
    end

    dZ_pred = (Θ * Ξ)'
    rmse_val = sqrt(mean((dZ_pred .- dZ).^2))
    sparsity = 100 * count(iszero, Ξ) / length(Ξ)

    results[λ] = (rmse=rmse_val, sparsity=sparsity, Ξ=Ξ)

    @printf "RMSE = %.4f | Sparsity = %.1f%%\n" rmse_val sparsity
end

############################################################
# BEST λ (EXCLUDE TRIVIAL MODELS)
############################################################

MAX_SPARSITY = 95.0

valid_λ = filter(λ -> results[λ].sparsity < MAX_SPARSITY, keys(results))

@assert !isempty(valid_λ) "No valid model found (all too sparse)"

best_λ = argmin(λ -> results[λ].rmse, valid_λ)
best = results[best_λ]

println("\nBEST λ (non-trivial) = $best_λ")
@printf "RMSE = %.4f | Sparsity = %.1f%%\n" best.rmse best.sparsity


############################################################
# DISPLAY FIRST EQUATIONS
############################################################

Ξ = best.Ξ

println("\nDiscovered equations (first 5 states):")

for i in 1:min(5, n_state)
    println("\ndx$i/dt =")
    for j in findall(!iszero, Ξ[:, i])
        @printf "  %+8.4f * %s\n" Ξ[j, i] basis[j]
    end
end



λ = 0.1
RMSE = 0.2732 | Sparsity = 18.3%

λ = 0.2
RMSE = 0.2741 | Sparsity = 31.3%

λ = 0.5
RMSE = 0.2763 | Sparsity = 53.7%

λ = 1.0
RMSE = 0.2835 | Sparsity = 91.7%

λ = 2.0
RMSE = 0.2863 | Sparsity = 100.0%

BEST λ (non-trivial) = 0.1
RMSE = 0.2732 | Sparsity = 18.3%

Discovered equations (first 5 states):

dx1/dt =
   -0.1071 * x[1]
   -0.9286 * x[1]^2
   +0.1943 * x[1]*x[2]
   -0.6450 * x[1]*x[3]
   -1.1772 * x[3]^2
   +0.2054 * x[4]
   +2.9292 * x[1]*x[4]
   -0.1867 * x[2]*x[4]
   +0.1617 * x[3]*x[4]
   -1.0680 * x[4]^2
   +0.9196 * x[1]*x[5]
   +0.1774 * x[2]*x[5]
   +2.0847 * x[3]*x[5]
   -0.5994 * x[4]*x[5]
   +0.3148 * x[5]^2
   +0.4148 * x[1]*x[6]
   +1.7093 * x[3]*x[6]
   +0.2705 * x[4]*x[6]
   -0.8558 * x[6]^2
   +0.2396 * x[1]*x[7]
   +0.1862 * x[2]*x[7]
   -1.0171 * x[3]*x[7]
   +0.4086 * x[4]*x[7]
   +1.3458 * x[5]*x[7]
   -1.1565 * x[6]*x[7]
   +0.5407 * x[7]^2
   -0.1145 * x[8]
   -1.1038 * x[1]*x[8]
   +0.2502 * x[2]*x[8]
   -2.8095 * x[3]*x[8]
   +0.2911 * x[5]*x[8

In [48]:
############################################################
# PURE SINDy – STLSQ MANUEL (VERSION FINALE NORMALISÉE)
############################################################

using NPZ
using LinearAlgebra
using Statistics
using ModelingToolkit
using Printf

############################################################
# LOAD DATA
############################################################

data = npzread("data/processed/sstReducedState2COPERNICUS20102019.npz")

Z  = Float64.(data["Z"])     # (time, state)
dZ = Float64.(data["dZ"])    # (time, state)

Z  = permutedims(Z)          # (state, time)
dZ = permutedims(dZ)

n_state, T = size(Z)

############################################################
# BUILD POLYNOMIAL LIBRARY Θ
############################################################

POLY_DEGREE = 2

@variables x[1:n_state]
basis = polynomial_basis(x, POLY_DEGREE)

Θ_fun = ModelingToolkit.build_function(
    basis, x, expression = Val(false)
)[1]

Θ = zeros(Float64, T, length(basis))
for t in 1:T
    Θ[t, :] .= Θ_fun(Z[:, t])
end

############################################################
# NORMALISE Θ (COLUMN-WISE)
############################################################

Θ_norm = similar(Θ)
scales = zeros(size(Θ, 2))

for j in 1:size(Θ,2)
    scales[j] = norm(Θ[:,j])
    Θ_norm[:,j] = Θ[:,j] / scales[j]
end

############################################################
# STLSQ IMPLEMENTATION (ROBUST)
############################################################

function stlsq(Θ, y; λ=1e-3, n_iter=10)
    ξ = Θ \ y
    for _ in 1:n_iter
        small = abs.(ξ) .< λ
        ξ[small] .= 0.0
        big = .!small
        if any(big)
            ξ[big] = Θ[:, big] \ y
        end
    end
    return ξ
end

############################################################
# GRID SEARCH ON λ
############################################################

LAMBDA_LIST = [0.1, 0.2, 0.5, 1.0]
results = Dict()

for λ in LAMBDA_LIST
    println("\nλ = $λ")

    Ξ = zeros(Float64, length(basis), n_state)

    for i in 1:n_state
        Ξ[:, i] = stlsq(Θ_norm, dZ[i, :]; λ=λ)
    end

    # Undo normalisation
    Ξ .= Ξ ./ scales

    dZ_pred = (Θ * Ξ)'   # (state, time)

    rmse = sqrt(mean((dZ_pred .- dZ).^2))
    sparsity = 100 * count(iszero, Ξ) / length(Ξ)

    results[λ] = (rmse=rmse, sparsity=sparsity, Xi=Ξ)

    @printf "RMSE = %.4f | Sparsity = %.1f%%\n" rmse sparsity
end

############################################################
# SELECT BEST NON-TRIVIAL λ
############################################################

valid_λ = filter(λ -> results[λ].sparsity < 95.0, LAMBDA_LIST)
best_λ = argmin(λ -> results[λ].rmse, valid_λ)
best = results[best_λ]

println("\nBEST λ (non-trivial) = $best_λ")
@printf "RMSE = %.4f | Sparsity = %.1f%%\n" best.rmse best.sparsity

############################################################
# DISPLAY DISCOVERED EQUATIONS
############################################################

Ξ = best.Xi

println("\nDiscovered equations (first 5 states):")

for i in 1:min(5, n_state)
    println("\ndx$i/dt =")
    for j in findall(!iszero, Ξ[:, i])
        @printf "  %+8.4f * %s\n" Ξ[j,i] string(basis[j])
    end
end



λ = 0.1


LoadError: invalid redefinition of constant Main.rmse