### This workbook is used for generating activity coefficient and phase diagram predictions across the composition and temperature range using a loaded UFR model

**Oliver Xie - Olsen Lab, Massachusetts Institute of Technology, 2025**

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import pandas as pd
import numpy as np
from scipy import optimize
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import regex as re
import warnings

from ufr.util.analysis import TrainedModel
import ufr.util.data_processing as data_processing

# Notebook-wide matplotlib defaults: one font size and family for all figures
plt.rcParams['font.size'] = 18
plt.rcParams['font.family'] = 'Arial'


### Open model result file, small molecule property file, and data file (if desired) with IDAC

In [None]:
# Locations of the evaluation (IDAC) data and the small-molecule property table
eval_data_file = "./data/opensource_IDAC_data.csv"
prop_file = "./data/small_molecule_prop.csv"

# Both CSV files use their first column as the DataFrame index
df_eval, df_prop = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (eval_data_file, prop_file)
)

# Names of the df_eval columns holding the solute and solvent SMILES
solute_smiles, solvent_smiles = 'Solute SMILES', 'Solvent SMILES'

# Model specification. Together these values select which trained model file is loaded.
combinatorial_layer = 'mod_FH'  # One of: FH, FV, SG, GK-FV, mod_FH, mod_FV
residual_layer = 'mod_UNIQUAC'  # One of: UNIQUAC, mod_UNIQUAC, Wilson, NRTL
association_layer = 'wertheim'  # One of: none, wertheim
dimension, temp_type, temp_dim = 12, 'invT', 2
sobolev, trial = 0, 5

# File name of the trained model is assembled from the specification above
model_name = f'UFR_{combinatorial_layer}_{residual_layer}_{association_layer}_{dimension}D_{temp_type}_{temp_dim}_sobolev_{sobolev}_{trial}'
model_file = f"./trained_models/models_from_paper/{model_name}.h5"

# Column labels tied to the chosen combinatorial layer:
#   ln_y_label - the IDAC with the combinatorial part removed (the one to use for the slope with 1/T calculation)
#   comb_label - the column containing the calculated combinatorial value
ln_y_label = f'ln_gamma_res_{combinatorial_layer}'  # Only this one for now
comb_label = f'comb_{combinatorial_layer}'

**Clean data if desired**

In [None]:
# Some datasets report deuterated water, which RDKit converts to ordinary water.
# IDAC values for D2O differ slightly, so give those rows their own SMILES.
deuterated_water = 'Deuterium oxide <Heavy water>'
for name_col, smiles_col in (('Solute', solute_smiles), ('Solvent', solvent_smiles)):
    df_eval.loc[df_eval[name_col] == deuterated_water, smiles_col] = '[2H]O[2H]'

# Remove self-edges, i.e. rows whose solute and solvent are the same species
is_self_edge = df_eval[solute_smiles] == df_eval[solvent_smiles]
self_edges = df_eval.loc[is_self_edge].index
df_eval = df_eval.drop(index=self_edges)

# Remove duplicated entries
df_eval = data_processing.drop_duplicates(
    df_eval,
    solute_smiles=solute_smiles,
    solvent_smiles=solvent_smiles,
)

# Switch for the optional rule-based cleaning and temperature-outlier removal
clean_temp_outliers = True  # False or True

if clean_temp_outliers:
    # Cleaning rules live in the sheet named 'cleaning rules' of this workbook
    cleaning_rules_file = './data/cleaning_rules.xlsx'
    df_clean_rules = pd.read_excel(cleaning_rules_file, sheet_name='cleaning rules')

    # Apply the cleaning rules to the evaluation data
    df_eval = data_processing.apply_cleaning(
        df_eval,
        df_clean_rules,
        solute_smiles=solute_smiles,
        solvent_smiles=solvent_smiles,
    )

    # Drop outliers; std_dev is how many standard deviations away counts as an outlier (default 3).
    # df_dropped holds the rows that were removed.
    df_eval, df_dropped = data_processing.remove_temperature_outliers(
        df_eval,
        std_dev=3,
        solute_smiles=solute_smiles,
        solvent_smiles=solvent_smiles,
    )

**Extract model parameters**

In [None]:
# Read the four tables stored in the trained-model HDF5 file
with pd.HDFStore(model_file) as store:
    df_results, df_temp, df_loss, df_distance = (
        store[key]
        for key in ('df_chemical_parameters', 'df_temp_parameters', 'df_loss', 'df_distance')
    )

# The column labels of the temperature table are the exponents used by this model
temp_exponents = np.array(df_temp.columns, dtype=np.float64)

# Chemical-parameter table as a plain array (values are taken as stored)
x_results = df_results.to_numpy()

# Temperature and distance tables as numpy arrays to send into the model
T = df_temp.to_numpy()
O = df_distance.to_numpy()

# Parameter arrays, selected programmatically by column-name substring
A, Q, Alpha = (
    df_results.filter(like=tag).to_numpy()
    for tag in ('ua_', 'q_', 'alpha_')
)
# Association columns: any column whose name contains 'acceptor' or 'donor'
Delta = df_results.loc[:, df_results.columns.str.contains('acceptor|donor')].to_numpy()

# Keep only the parameter groups that are actually present (non-empty)
arrays = [block for block in (A, Alpha, Q, Delta) if block.size]

### Create a model and mixture for calculation of phase properties

Currently supported methods in TrainedModel:
1) `plot_idac` Calculates and plots IDAC across a specified temperature range
2) `calc_param` Calculates the model parameters including those akin to the UNIQUAC, Wilson, NRTL parameters
3) `calc_activity` Calculates the activity coefficients at a specified temperature across the composition window for both species
4) `calc_activity_parts` Does the same as `calc_activity` except the output is broken up into the contribution of each model layer (combinatorial, residual, association)
5) `calc_VLE` Calculates the vapor-liquid equilibrium for the mixture. Either gives a T-x-y plot or P-x-y plot depending on if you specify the pressure (in bars - T-x-y) or temperature (in Kelvin - P-x-y)
6) `calc_LLE` Calculates the liquid-liquid equilibrium for the mixture using the isopotential and minimum energy criteria outlined in the text

In [None]:
# Vapor pressure correlations are required. The paper uses the DIPPR correlations (Eqn. 101)
# extracted from Aspen Plus; here only open-source Antoine-equation data gathered from NIST
# are used. Whatever correlation is used, its result needs to be converted to bar.
# See the method _antoine for equation details.
df_vapor_pressure = pd.read_csv(
    './data/vapor_pressure_nist.csv',
    index_col='Canonical SMILES',
).assign(Equation='antoine')  # Options include 101, 'antoine', 'extended_101'

# Build the model from the layer specification and the loaded parameters
model_spec = (combinatorial_layer, residual_layer, association_layer)  # combinatorial, residual, association
Model = TrainedModel(
    model_spec,
    df_results,
    df_prop,
    Tmat=T,
    Omat=O,
    temp_exponent=temp_exponents,
    df_antoine=df_vapor_pressure,
)  # If learn is specified, supply O_mat as last variable

**Specify the components of the mixture**

In [None]:
# SMILES of the two mixture components: component A first, component B second
A_SMILES, B_SMILES = 'CO', 'c1ccccc1'

# Binary solution object used by all the calculations below
solution = Model.solution(A_SMILES, B_SMILES)

**Calculate the IDAC and compare to the dataset**

In [None]:
# Plot the model IDAC against the dataset.
# df_eval must also contain a column named 'Temp (K)' so the method can extract the temperatures.
fig = solution.plot_idac(
    df_eval,     # IDAC DataFrame
    'ln gamma',  # Column name holding the experimental IDAC
    298.15,      # Low temperature
    398.15,      # High temperature
)

**Calculate the activity coefficient and its parts**

In [None]:
# Temperature of interest. This must always be supplied in Kelvin as a numpy array.
# A dedicated name is used so that the global `T` (the temperature-parameter matrix
# passed to TrainedModel as Tmat) is not overwritten when this cell is run.
T_activity = np.array([273.15 + 30])

# Mole fractions of component A (first array) and component B (second array).
# The two arrays must be the same length.
x = (np.linspace(0, 1, 101), np.linspace(1, 0, 101))

# ln gamma of component A and B respectively
ln_y_A, ln_y_B = solution.calc_activity(x, T_activity)
# Dictionary with the contribution of each model layer
activity_dict = solution.calc_activity_parts(x, T_activity)

plt.figure(figsize=(6, 6))

# Both curves are drawn against the mole fraction of component A
plt.plot(x[0], ln_y_A, label=f'ln gamma of {A_SMILES}', color='blue')
plt.plot(x[0], ln_y_B, label=f'ln gamma of {B_SMILES}', color='red')

# Plot options
plt.title(f'Plot of activity coefficients of {A_SMILES} and {B_SMILES}')
plt.xlabel(f'Mole fraction of {A_SMILES}')
plt.ylabel(r'ln $\gamma$')
plt.legend()

plt.tight_layout()
plt.show()

**Calculate a VLE curve**

In [None]:
# If you have converged VLE results to compare against, you can import them (here named df_Aspen_bubble and df_Aspen_dew)
# To use the Aspen Plus results, df_vapor_pressure must have a column called 'Aspen Name' specifying what each component is called in Aspen Plus
# The index column (first column) of each bubble/dew CSV should be the mixture in the form "('Aspen_A', 'Aspen_B')"
# Currently disabled: the snippet is kept inside a string literal so it does not run

aspen_data = None
'''
df_Aspen_bubble = pd.read_csv('Aspen_bubble.csv', index_col=0)
df_Aspen_dew = pd.read_csv('Aspen_dew.csv', index_col=0)
column_num = df_Aspen_bubble.shape[1]

Aspen_A = df_vapor_pressure.loc[A_SMILES]['Aspen Name']
Aspen_B = df_vapor_pressure.loc[B_SMILES]['Aspen Name']

mixture_aspen = f"('{Aspen_A}', '{Aspen_B}')" # Aspen mixture name

try:
    aspen_data = {'x': np.linspace(0, 1, column_num), 'T_bubble': df_Aspen_bubble.loc[mixture_aspen].values, 'T_dew': df_Aspen_dew.loc[mixture_aspen].values, 'P_bubble': None, 'P_dew': None}
except KeyError:
    print(f'Mixture {mixture_aspen} not found in the Aspen dataset')
    aspen_data = None
'''

# Conditions for the two diagrams. A dedicated temperature name is used so that the
# global `T` (the temperature-parameter matrix passed to TrainedModel as Tmat) is not
# overwritten when this cell is run.
T_vle = 298.15  # Kelvin
p = 1.01325  # bars

# Inputs to calc_VLE are temperature, pressure, number of x points, aspen data (or any literature data)
# Specify only temperature for P-x-y and leave pressure none
# Specify only pressure for T-x-y and leave temperature none
xA_p, P_bubble, P_dew = solution.calc_VLE(np.array([T_vle]), None, 101, aspen_data)  # P-x-y
xA_T, T_bubble, T_dew = solution.calc_VLE(None, np.array([p]), 101, aspen_data)  # T-x-y


def _plot_phase_envelope(x_model, bubble, dew, reference, bubble_key, dew_key, title, ylabel):
    """Draw the model bubble/dew curves plus any available dashed reference curves.

    `reference` is either None or a dict with an 'x' entry and the entries named
    by `bubble_key` / `dew_key`. A reference entry that is missing or None is
    skipped, because matplotlib rejects None as plot data.
    """
    plt.figure(figsize=(6, 6))
    plt.plot(x_model, bubble, label='bubble', color='blue')
    plt.plot(x_model, dew, label='dew', color='red')

    if reference is not None:
        reference_curves = (
            (bubble_key, 'Aspen bubble', 'blue'),
            (dew_key, 'Aspen dew', 'red'),
        )
        for key, label, color in reference_curves:
            if reference.get(key) is not None:
                plt.plot(reference['x'], reference[key], label=label, color=color, linestyle='dashed')

    # Plot options
    plt.title(title)
    plt.xlabel(f'Mole fraction of {A_SMILES}')
    plt.ylabel(ylabel)
    plt.legend()

    plt.tight_layout()
    plt.show()


# Plot the P-x-y and T-x-y plots
_plot_phase_envelope(
    xA_p, P_bubble, P_dew, aspen_data, 'P_bubble', 'P_dew',
    title=f'P-x-y plot at {T_vle - 273.15} C of {A_SMILES} and {B_SMILES}',
    ylabel='Pressure (bar)',
)
_plot_phase_envelope(
    xA_T, T_bubble, T_dew, aspen_data, 'T_bubble', 'T_dew',
    title=f'T-x-y plot at {p} bar of {A_SMILES} and {B_SMILES}',
    ylabel='Temperature (Kelvin)',
)

**Calculate an LLE curve**

In [None]:
# A range of temperatures (in Kelvin) must be specified for LLE calculations. At each temperature, the free energy is checked for phase splitting
T_range = np.arange(273 - 20, 273 + 260, 1)  # From low to high temperature, in Kelvin. Increments determine resolution of the LLE plot

# Inputs are the temperature range (in Kelvin) and the number of x points.
# Standard is 201 points (0.005 increments) but can be made finer
xA_I, xA_II, T_lle = solution.calc_LLE(T_range, 201)

# Font settings are applied before the figure is created so that they affect it
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['font.size'] = 18

fig, ax = plt.subplots(figsize=(6, 6))

# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap accepts
# the same (name, lut) arguments and is still available.
viridis = plt.get_cmap('viridis', 5)
envelope_color = viridis(2)

# Plot both branches of the phase envelope, with the temperature converted from Kelvin to Celsius
T_lle_celsius = np.array(T_lle) - 273.15
ax.plot(xA_I, T_lle_celsius, label='xA_I', color=envelope_color, lw=3)
ax.plot(xA_II, T_lle_celsius, label='xA_II', color=envelope_color, lw=3)

ax.set_xlabel('Mole fraction of component A')
ax.set_ylabel('Temperature (C)')
ax.set_title(f'LLE for (A) {A_SMILES} and (B) {B_SMILES}')
ax.set_xlim([0, 1])
ax.tick_params(labelsize=18)

# Draw all four axes borders in black
for side in ('bottom', 'top', 'right', 'left'):
    ax.spines[side].set_color('black')

plt.grid(False)

plt.tight_layout()
plt.show()

**Example: Predict the activity coefficients at all compositions using a UFR model trained on COSMO-RS IDAC (Qin et al.)**

With a UFR model calibrated solely on infinite dilution activity coefficients published by Qin et al. as part of SolvGNN, we can calculate the composition dependent activity coefficients

In [None]:
# Load only the Zavala non-infinite dilution data and evaluate
eval_data_file = "./data/Qin_Zavala_AllConcentrations_data.csv"
prop_file = "./data/Qin_Zavala_small_molecule_prop.csv"  # A separate properties file for all the molecules in the dataset is provided.

df_non_inf = pd.read_csv(eval_data_file)
df_prop = pd.read_csv(prop_file, index_col=0)

# Index the rows by the (species 1, species 2) SMILES pair and add empty result columns
df_non_inf.set_index(['Species_1_CanonicalSMILES', 'Species_2_CanonicalSMILES'], inplace=True)
df_non_inf['Calc ln_y_1'] = np.nan
df_non_inf['Calc ln_y_2'] = np.nan

# Sorting groups the rows of each pair together
df_non_inf = df_non_inf.sort_index()

# Specify parts of the model
combinatorial_layer = 'mod_FH'  # Choose between FH, FV, SG, GK-FV, mod_FH, mod_FV
residual_layer = 'mod_UNIQUAC'  # Choose between UNIQUAC, mod_UNIQUAC, Wilson, NRTL
association_layer = 'wertheim'  # Choose between none, wertheim
dimension = 70
temp_type = 'invT'
temp_dim = 1
sobolev = 0
trial = 4

model_name = f'COSMO_UFR_{combinatorial_layer}_{residual_layer}_{association_layer}_{dimension}D_{temp_type}_{temp_dim}_sobolev_{sobolev}_{trial}'  # Use the COSMO prefix
model_file = f"./trained_models/{model_name}.h5"

# Open the model
with pd.HDFStore(model_file) as store:
    df_results = store['df_chemical_parameters']
    df_temp = store['df_temp_parameters']
    df_loss = store['df_loss']
    df_distance = store['df_distance']

# Get the exponents used for this model
temp_exponents = np.array(df_temp.columns, dtype=np.float64)

# Chemical-parameter table as a plain array (values are taken as stored)
x_results = df_results.to_numpy()

# Extract everything as numpy arrays to send into model
T = df_temp.to_numpy()
O = df_distance.to_numpy()

# Set up the parameter arrays, selected programmatically by column-name substring
A = df_results.filter(like='ua_').to_numpy()
Q = df_results.filter(like='q_').to_numpy()
Alpha = df_results.filter(like='alpha_').to_numpy()
Delta = df_results.loc[:, df_results.columns.str.contains('acceptor') | df_results.columns.str.contains('donor')].to_numpy()

# Keep only the parameter groups that are actually present (non-empty)
arrays = [arr for arr in (A, Alpha, Q, Delta) if arr.size > 0]

# Load the solution model
# We don't need vapor pressures for this task, set as None
model_spec = (combinatorial_layer, residual_layer, association_layer)  # combinatorial, residual, association
Model = TrainedModel(model_spec, df_results, df_prop, Tmat=T, Omat=O, temp_exponent=temp_exponents, df_antoine=None)  # If learn is specified, supply O_mat as last variable

# Each pair occupies several rows (one per composition), and every iteration below
# already evaluates and stores all rows of its pair. Iterating over the unique pairs
# therefore gives the same result without recomputing a pair once per row.
unique_pairs = df_non_inf.index.unique()
pairs = len(unique_pairs)

# Evaluation temperature (room temperature). A dedicated name is used so that `T`
# keeps holding the temperature-parameter matrix passed to the model above.
T_eval = 298.15

# For every pair: get the SMILES for A and B, then calculate the activity coefficients at the tabulated concentrations
with warnings.catch_warnings():
    # Suppress overflow errors
    warnings.simplefilter("ignore", RuntimeWarning)
    for idx, (A_smiles, B_smiles) in enumerate(unique_pairs, start=1):
        solution = Model.solution(A_smiles, B_smiles)
        pair_rows = df_non_inf.loc[(A_smiles, B_smiles)]
        xA = pair_rows['Species_1_x'].to_numpy()
        xB = pair_rows['Species_2_x'].to_numpy()
        x = (xA, xB)
        ln_y_A, ln_y_B = solution.calc_activity(x, np.array([T_eval]))
        df_non_inf.loc[(A_smiles, B_smiles), 'Calc ln_y_1'] = ln_y_A
        df_non_inf.loc[(A_smiles, B_smiles), 'Calc ln_y_2'] = ln_y_B
        print(f'Completed {idx} / {pairs} entries')

# Absolute error between the calculated ln gamma and the dataset columns.
# NOTE(review): this assumes 'Species_1_gamma' / 'Species_2_gamma' store ln(gamma) - confirm against the data file.
df_non_inf['AE_Species_1'] = np.abs(df_non_inf['Calc ln_y_1'] - df_non_inf['Species_1_gamma'])
df_non_inf['AE_Species_2'] = np.abs(df_non_inf['Calc ln_y_2'] - df_non_inf['Species_2_gamma'])