In [1]:
import Pkg; Pkg.activate("..")

[32m[1m  Activating[22m[39m project at `~/SAFT_ML`


In [2]:
using CSV, DataFrames, DelimitedFiles, Clapeyron, PyCall
import PyPlot; const plt = PyPlot ;

In [4]:
# Load the Esper et al. statistics table and give every column a concrete type.
# "-" entries are rewritten to -1 so the text columns can be parsed as numbers
# (all valid values are positive, so -1 marks "no value").

raw_data_statistics = CSV.read("Esper et al Statistics (CSV).csv", DataFrame, header=1)
processed_data_statistics = raw_data_statistics[1:1842, 1:15]

for row in 1:nrow(processed_data_statistics)
    # Remove any '(' or ')' characters from the outlier columns.
    for col in (:p_sat_AAD_outliers, :rho_vap_AAD_outliers,
                :p_sat_n_points_outliers, :rho_vap_n_points_outliers)
        processed_data_statistics[row, col] =
            replace(processed_data_statistics[row, col], r"[()]" => "")
    end

    # Rewrite every "-" as the -1 sentinel in the columns that are still text.
    for col in (:p_sat_AAD_outliers, :rho_liq_sat_AAD, :rho_liq_sp_AAD,
                :rho_vap_AAD_outliers, :p_sat_n_points_outliers,
                :rho_liq_sat_n_points, :rho_liq_sp_n_points,
                :rho_vap_n_points_outliers)
        processed_data_statistics[row, col] =
            replace(processed_data_statistics[row, col], r"-" => -1)
    end
end

# Identifier columns.
processed_data_statistics.Name = String.(processed_data_statistics.Name)
processed_data_statistics.CAS = String15.(processed_data_statistics.CAS)

# Columns that are already numeric only need converting to Float64.
for col in (:molarweight, :p_sat_AAD, :rho_liq_AAD)
    processed_data_statistics[!, col] = Float64.(processed_data_statistics[!, col])
end

# Cleaned text columns holding deviations are parsed as Float64.
for col in (:p_sat_AAD_outliers, :rho_liq_sat_AAD, :rho_liq_sp_AAD, :rho_vap_AAD_outliers)
    processed_data_statistics[!, col] = parse.(Float64, processed_data_statistics[!, col])
end

# Columns that are already numeric point counts are converted to Int64.
for col in (:p_sat_n_points, :rho_liq_n_points)
    processed_data_statistics[!, col] = Int64.(processed_data_statistics[!, col])
end

# Cleaned text columns holding point counts are parsed as Int64.
for col in (:p_sat_n_points_outliers, :rho_liq_sat_n_points,
            :rho_liq_sp_n_points, :rho_vap_n_points_outliers)
    processed_data_statistics[!, col] = parse.(Int64, processed_data_statistics[!, col])
end

processed_data_statistics ;

[33m[1m└ [22m[39m[90m@ CSV ~/.julia/packages/CSV/OnldF/src/file.jl:577[39m


In [53]:
# Load the Esper et al. SAFT parameter table, dump its column names to "out.txt",
# zero-fill the optional parameters, then sort by molar weight and drop three species.
raw_data_parameters = CSV.read("SI_pcp-saft_parameters.csv", DataFrame, header=1) ;
fieldnames(typeof(raw_data_parameters))
#display(names(df)[1:20]) ;
writedlm("out.txt", names(raw_data_parameters), ' ') ;

# Replace missing entries with 0 in place, then store each column as Float64.
for col in (:mu, :kappa_ab, :epsilon_k_ab)
    replace!(raw_data_parameters[!, col], missing => 0)
    raw_data_parameters[!, col] = Float64.(raw_data_parameters[!, col])
end

raw_data_parameters = sort(raw_data_parameters, :molarweight)

# Remove the rows whose common name is one of the three excluded species.
raw_data_parameters = filter(
    row -> !(row.common_name in ("cis-2-butene", "(cis/trans)-2-butene", "cis-2-pentene")),
    raw_data_parameters,
);

In [54]:
# Scaffold `processed_data`: one all-`missing` column per output field, in the
# order listed here, with as many rows as the parameter table.
num_rows = nrow(raw_data_parameters)
processed_data = DataFrame([
    name => fill(missing, num_rows) for name in (
        :common_name,
        :iupac_name,
        :CAS,
        :inchi,
        :canonical_SMILES,
        :isomeric_SMILES,
        :family,
        :Mw,
        :segment,
        :sigma,
        :epsilon,
        :dipole,
        :kappa_ab,
        :epsilon_k_ab,
        :na,
        :nb,
        :expt_p_sat_T_min,
        :expt_p_sat_T_max,
        :expt_density_T_min,
        :expt_density_T_max,
        :interaction,
        :bounds_violation,
        :source,
    )
]) ;

In [55]:
# Fill `processed_data` from the parameter table.
# Each entry maps a destination column to the source column it is taken from.
for (dest, src) in (
    :common_name => :common_name,
    :iupac_name => :iupac_name,
    :CAS => :cas,
    :inchi => :inchi,
    :canonical_SMILES => :canonical_smiles,
    :Mw => :molarweight,
    :isomeric_SMILES => :isomeric_smiles,
    :family => :family,
    :segment => :m,
    :sigma => :sigma,
    :epsilon => :epsilon_k,
    :dipole => :mu,
    :kappa_ab => :kappa_ab,
    :epsilon_k_ab => :epsilon_k_ab,
    :na => :na,
    :nb => :nb,
    :expt_p_sat_T_min => :t_min_psat,
    :expt_p_sat_T_max => :t_max_psat,
    :expt_density_T_min => :t_min_density,
    :expt_density_T_max => :t_max_density,
    :interaction => :opt,
    :bounds_violation => :bounds_violation,
)
    # `df[!, col]` stores/returns the column vector itself, exactly like `df.col`.
    processed_data[!, dest] = raw_data_parameters[!, src]
end

# Every row carries the same literature source string.
processed_data[!, :source] .= "10.1021/acs.iecr.3c02255" ;

species_names = processed_data.common_name;

In [56]:
# Build `training_data`: identification/metadata columns copied from
# `processed_data`, plus zero-initialised statistics columns that are filled in
# later from the statistics table.
num_rows = nrow(processed_data)
training_data = DataFrame(
    # Metadata placeholders (overwritten below).
    common_name = fill(missing, num_rows),
    iupac_name = fill(missing, num_rows),
    CAS = fill(missing,num_rows),
    inchi = fill(missing,num_rows),
    isomeric_SMILES = fill(missing, num_rows), 
    canonical_SMILES = fill(missing, num_rows), 
    family = fill(missing, num_rows),
    Mw = fill(missing, num_rows),
    interaction = fill(missing,num_rows),
    source = fill(missing, num_rows),    
    # Experimental temperature ranges (overwritten below).
    expt_p_sat_T_min = fill(0,num_rows),
    expt_p_sat_T_max = fill(0,num_rows),
    expt_density_T_min = fill(0,num_rows),
    expt_density_T_max = fill(0,num_rows),
    
    # Deviation statistics, stored as Float64.
    p_sat_AAD = fill(0.0,num_rows),
    p_sat_AAD_outliers = fill(0.0,num_rows),
    rho_liq_AAD = fill(0.0,num_rows),
    rho_liq_sat_AAD = fill(0.0,num_rows),
    rho_liq_sp_AAD = fill(0.0,num_rows),
    rho_vap_AAD_outliers = fill(0.0,num_rows),

    # Data-point counts, stored as Int.
    p_sat_n_points = fill(0,num_rows),
    p_sat_n_points_outliers = fill(0,num_rows),
    rho_liq_n_points = fill(0,num_rows),
    rho_liq_sat_n_points = fill(0,num_rows),
    rho_liq_sp_n_points = fill(0,num_rows),
    rho_vap_n_points_outliers = fill(0,num_rows),
) ;

# Copy the metadata columns across.
training_data.common_name = processed_data.common_name
training_data.iupac_name = processed_data.iupac_name
training_data.CAS = processed_data.CAS
training_data.inchi = processed_data.inchi
training_data.isomeric_SMILES = processed_data.isomeric_SMILES
training_data.canonical_SMILES = processed_data.canonical_SMILES
training_data.family = processed_data.family
training_data.Mw = processed_data.Mw
training_data.interaction = processed_data.interaction
training_data.source = processed_data.source 

# Copy the experimental temperature ranges across.
training_data.expt_p_sat_T_min = processed_data.expt_p_sat_T_min ;
training_data.expt_p_sat_T_max = processed_data.expt_p_sat_T_max ;
training_data.expt_density_T_min = processed_data.expt_density_T_min ;
# Fixed: this previously copied expt_density_T_min into the T_max column.
training_data.expt_density_T_max = processed_data.expt_density_T_max ;


In [57]:
# # This cell generates data for all ~1800 species in Esper et al.
# num_rows = nrow(processed_data)

# n = 50

# for i in species_names

#     row_number = findall(processed_data.species .== i)[1]
#     println(row_number," ",i)

#     model = PPCSAFT(i) ;
#     critical_props = crit_pure(model)

#     temp_range = collect(range(0.5*critical_props[1],critical_props[1],n)) ;
#     sat_props = [saturation_pressure(model,temp_range[i]) for i ∈ 1:n] ;

#     sat_pressures = [sat_props[i][1] for i ∈ 1:n] ;
#     sat_vols_liq = [sat_props[i][2] for i ∈ 1:n] ;
#     sat_vols_vap = [sat_props[i][3] for i ∈ 1:n] ;

#     for i in 1:length(critical_props)
#         training_data[row_number, 7 + i] = critical_props[i]
#     end

#     training_data[row_number,:sat_temperatures_K] = temp_range
#     training_data[row_number,:sat_pressures_MPa] = sat_pressures
#     training_data[row_number,:sat_volumes_liq_m3_per_mol] = sat_vols_liq
#     training_data[row_number,:sat_volumes_vap_m3_per_mol] = sat_vols_vap

# end


In [58]:
# Add statistics to training data dataframe.
# Each training row is matched to the first statistics row with the same CAS
# number, and all statistics columns are copied across from that row.
for i in 1:nrow(training_data)

    cas_lookup = training_data[i, :CAS]
    # Predicate form avoids allocating a Boolean mask for every row.
    stats_index = findfirst(==(cas_lookup), processed_data_statistics.CAS)

    # `findfirst` returns `nothing` when there is no match; fail with a clear
    # message instead of an opaque indexing error on the next line.
    stats_index === nothing &&
        error("No statistics entry found for CAS $(cas_lookup) (training_data row $(i))")

    for col in (
        :p_sat_AAD,
        :p_sat_AAD_outliers,
        :rho_liq_AAD,
        :rho_liq_sat_AAD,
        :rho_liq_sp_AAD,
        :rho_vap_AAD_outliers,
        :p_sat_n_points,
        :p_sat_n_points_outliers,
        :rho_liq_n_points,
        :rho_liq_sat_n_points,
        :rho_liq_sp_n_points,
        :rho_vap_n_points_outliers,
    )
        training_data[i, col] = processed_data_statistics[stats_index, col]
    end

end

In [59]:
# Data storage
# CSV.write("training_data.csv", training_data) ;