In [1]:
# Set up a chain rule to calculate the loss function @ saturation w/out propagating derivatives through saturation solver
# This was done in the Language of Molecules paper.

# 1_differentiable_saft works when no solvers are involved, i.e. for properties specified with a given (V, T)
# if, instead, we want to solve for saturation conditions, then I have the old problem of 

In [2]:
import Pkg; Pkg.activate(".")

using Revise
using Clapeyron
includet("./saftvrmienn.jl")
# These are functions we're going to overload for SAFTVRMieNN
import Clapeyron: a_res, saturation_pressure, pressure

using Flux
using Plots
using ForwardDiff, DiffResults

using Zygote, ChainRulesCore
using ImplicitDifferentiation

using CSV, DataFrames
using MLUtils
using RDKitMinimalLib 
using Statistics

[32m[1m  Activating[22m[39m project at `~/SAFT_ML`




In [3]:
X = [16.04, 1.0, 3.737, 6.0, 12.504, 152.58]
T = 100.0

X = [114.14099884033203, 1.6983753442764282, 7.993192672729492, 8.710663437843323, 18.967856884002686, 248.48995435237885]
# T = 345.38165

p = saturation_pressure_NN(X, T)
∂p∂X = Zygote.gradient(X -> saturation_pressure_NN(X, T), X)
∂p∂T = Zygote.gradient(T -> saturation_pressure_NN(X, T), T)
@show p
@show ∂p∂X, ∂p∂T;

p = 16.267432682904257
(∂p∂X, ∂p∂T) = 

(([-0.0, -109.93750954832143, -6.105482508288177, 21.53097976724112, 4.416594844872478, -1.3639866231328097],), (3.5520440640281374,))


In [34]:
# Set up data creation & loading here 
# Currently random
# nsamples = 10
# nfeatures = 4
# nout = 5
# X = rand(Float32, nfeatures, nsamples)
# y = rand(Float32, nout, nsamples)

df = CSV.read("./pcpsaft_params/SI_pcp-saft_parameters.csv", DataFrame, header=1)
filter!(row -> occursin("Alkane", row.family), df)
mol_data = zip(df.common_name, df.isomeric_smiles, df.molarweight)
@info "generating data for $(length(mol_data)) molecules"

function make_fingerprint(s::String)::Vector{Float32}
    mol = get_mol(s)
    @assert !isnothing(mol)

    fp = []
    fp_details = Dict{String,Any}("nBits" => 512, "radius" => 4)
    fp_str = get_morgan_fp(mol, fp_details)
    append!(fp, [parse(Float32, string(c)) for c in fp_str])

    desc = get_descriptors(mol)
    relevant_keys = [
        "CrippenClogP",
        "NumHeavyAtoms",
        "amw",
        "FractionCSP3",
    ]
    relevant_desc = [desc[k] for k in relevant_keys]
    append!(fp, last.(relevant_desc))

    return fp
end

T = Float32
X_data = Vector{Tuple{Vector{T},T,T}}([])
Y_data = Vector{Vector{T}}()

n = 30
for (name, smiles, Mw) in mol_data
    saft_model = PPCSAFT([name])
    Tc, pc, Vc = crit_pure(saft_model)

    fp = make_fingerprint(smiles)

    T_range = range(0.5 * Tc, 0.975 * Tc, n)
    for T in T_range
        (p₀, V_vec...) = saturation_pressure(saft_model, T)
        push!(X_data, (fp, T, Mw))
        push!(Y_data, Float32[p₀])
    end
end

# Shuffle all samples into a random order
# Package into data loaders
# batchsize = 10
# train_data = DataLoader((X_data, y_data), batchsize=batchsize, shuffle=true)
#* shuffle=true randomises observation order every iteration

#* Remove zero columns from fingerprints
# Identify Zero Columns
num_cols = length(X_data[1][1])
zero_cols = trues(num_cols)
for (vec, _, _) in X_data
    zero_cols .&= (vec .== 0)
end

# Create a Mask
keep_cols = .!zero_cols

# Apply Mask
X_data = [(vec[keep_cols], val1, val2) for (vec, val1, val2) in X_data]

train_data, test_data = splitobs((X_data, Y_data), at=0.8, shuffle = true)

train_loader = DataLoader(train_data, batchsize=32, shuffle=true)
test_loader = DataLoader(test_data, batchsize=32, shuffle=false)

┌ Info: generating data for 80 molecules
└ @ Main /home/luc/SAFT_ML/6_sat_solver_NN.ipynb:12


15-element DataLoader(::Tuple{SubArray{Tuple{Vector{Float32}, Float32, Float32}, 1, Vector{Tuple{Vector{Float32}, Float32, Float32}}, Tuple{Vector{Int64}}, false}, SubArray{Vector{Float32}, 1, Vector{Vector{Float32}}, Tuple{Vector{Int64}}, false}}, batchsize=32)
  with first element:
  (32-element Vector{Tuple{Vector{Float32}, Float32, Float32}}, 32-element Vector{Vector{Float32}},)

In [40]:
# Base NN architecture from "Fitting Error vs Parameter Performance"
nfeatures = length(X_data[1][1])
nout = 5
unbounded_model = Chain(
    Dense(nfeatures, 2048, selu),
    Dense(2048, 1024, selu),
    Dense(1024, 512, selu),
    Dense(512, 128, selu),
    Dense(128, 32, selu),
    Dense(32, nout, selu),
)
# model(x) = m, σ, λ_a, λ_r, ϵ

opt = ADAM(1e-3)

# Add constant bias to the model output
b = [
    3.0,
    3.5,
    7.0,
    12.5,
    250.0,
]
nn_model(x) = unbounded_model(x)/100.0 .+ b

# Training loop
@info "Beginning training..."
epochs = 10
epoch_percent_loss_vec = Float32[]
loss_vec = Float32[]
mean_loss_vec = Float32[]

loss_fn(X_batch, y_batch) = begin
    n = 0
    batch_loss = 0.0
    for (X, y) in zip(X_batch, y_batch)
        fp, T, Mw = X
        y = y[1]

        X_pred = nn_model(fp)
        X_saft = vcat(Mw, X_pred)
        Tc = critical_temperature_NN(X_saft)
        if T < Tc
            ŷ = saturation_pressure_NN(X_saft, T)
            if !isnan(ŷ)
                n += 1
                batch_loss += ((ŷ - y) / y)^2
            end
        end
    end
    if n != 0
        batch_loss /= n
    end
    batch_loss
end

for epoch in 1:epochs
    epoch_loss_vec = Float32[]
    epoch_loss = 0.0

    for (X_batch, y_batch) in train_loader
        # @show loss_fn(X_batch, y_batch) 

        batch_loss = 0.0
        grads = Zygote.gradient(Flux.params(unbounded_model)) do
            batch_loss = loss_fn(X_batch, y_batch)
        end

        # Update model parameters
        Flux.update!(opt, Flux.params(unbounded_model), grads)

        append!(epoch_loss_vec, batch_loss)
        append!(epoch_percent_loss_vec, 100 * sqrt(batch_loss))
    end
    mean_loss = mean(epoch_loss_vec)
    mean_percent_loss = mean(epoch_percent_loss_vec)
    append!(loss_vec, epoch_loss_vec)
    append!(mean_loss_vec, mean_loss)

    if epoch in [1, 2, 3, 4, 5, 10] || epoch % 5 == 0 || epoch == epochs
        println("Epoch: $epoch, Loss: (μ=$mean_loss, σ=$(std(epoch_loss_vec))), Percent Error: $mean_percent_loss")
    end
end

┌ Info: Beginning training...
└ @ Main /home/luc/SAFT_ML/6_sat_solver_NN.ipynb:27


Epoch: 1, Loss: (μ=2230.331, σ=14829.467), Percent Error: 1082.3358


Epoch: 2, Loss: (μ=0.98204464, σ=0.009505792), Percent Error: 590.7165


Epoch: 3, Loss: (μ=0.98219174, σ=0.010176442), Percent Error: 426.8456


Epoch: 4, Loss: (μ=0.9820813, σ=0.009836892), Percent Error: 344.90903


Epoch: 5, Loss: (μ=0.9816503, σ=0.008563999), Percent Error: 295.74265


Epoch: 10, Loss: (μ=0.9807997, σ=0.011182162), Percent Error: 197.39743


ErrorException: syntax: unexpected "end"

In [41]:
epochs2 = 50
for epoch in epochs:epochs2
    epoch_loss_vec = Float32[]
    epoch_loss = 0.0

    for (X_batch, y_batch) in train_loader
        # @show loss_fn(X_batch, y_batch) 

        batch_loss = 0.0
        grads = Zygote.gradient(Flux.params(unbounded_model)) do
            batch_loss = loss_fn(X_batch, y_batch)
        end

        # Update model parameters
        Flux.update!(opt, Flux.params(unbounded_model), grads)

        append!(epoch_loss_vec, batch_loss)
        append!(epoch_percent_loss_vec, 100 * sqrt(batch_loss))
    end
    mean_loss = mean(epoch_loss_vec)
    mean_percent_loss = mean(epoch_percent_loss_vec)
    append!(loss_vec, epoch_loss_vec)
    append!(mean_loss_vec, mean_loss)

    if epoch in [1, 2, 3, 4, 5, 10] || epoch % 5 == 0 || epoch == epochs
        println("Epoch: $epoch, Loss: (μ=$mean_loss, σ=$(std(epoch_loss_vec))), Percent Error: $mean_percent_loss")
    end
end

Epoch: 10, Loss: (μ=0.98062664, σ=0.009668167), Percent Error: 188.4545


Epoch: 15, Loss: (μ=0.9798928, σ=0.01088991), Percent Error: 160.5026


Epoch: 20, Loss: (μ=0.9787283, σ=0.010470124), Percent Error: 145.84695


In [None]:
#! The model doesn't start with any thermodynamic knowledge. Is there any database we could pre-train on?