### This workbook is used for training UFR models on infinite dilution activity coefficients (IDAC)

**Oliver Xie - Olsen Lab, Massachusetts Institute of Technology, 2025**

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import torch
import warnings
import ufr.model.idac_model as idac_model
import ufr.util.data_processing as data_processing
import ufr.util.model_launch as model_launch

# Silence UserWarning messages (e.g. prototype-feature warnings) so they do not clutter the notebook output
warnings.filterwarnings(action='ignore', category=UserWarning)
# Silence future deprecation warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Select the compute device. GPU index 1 is preferred, but machines with a single GPU only expose index 0;
# requesting a non-existent index would only fail later, when tensors are first moved to the device.
gpu_index = 1
if torch.cuda.is_available():
    if gpu_index >= torch.cuda.device_count():
        gpu_index = 0 # Fall back to the first GPU when the preferred one does not exist
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

### Data Loading

Two files are required, one containing the IDAC data (data_file) and another containing the small molecule properties (prop_file). Each should be loaded as a pandas DataFrame.

The IDAC data should have each unique IDAC entry be its own row. Columns should be named as follows and contain:
1) 'Solute': The name of the solute molecule
2) 'Solvent': The name of the solvent molecule
3) 'Solute SMILES': The canonical SMILES name of the solute molecule
4) 'Solvent SMILES': The canonical SMILES name of the solvent molecule
5) 'Temp (K)': The temperature of the measurement in Kelvin
6) 'ln gamma': The IDAC measurement reported as ln $\gamma^\infty_i$

The small molecule properties file should contain a unique chemical in each row. Columns should be named as follows and contain:
1) 'IUPAC Name': The IUPAC name of the molecule
2) 'Canonical SMILES': The canonical SMILES name of the molecule
3) 'van der waals volume (m3/kmol)': The van der Waals volume of each molecule in m3/kmol. Water has a modified van der Waals volume reflecting the value used in UNIQUAC.
4) 'van der waals area (m2/kmol)': The van der Waals surface area of each molecule in m2/kmol. Water has a modified van der Waals area reflecting the value used in UNIQUAC.
5) 'H donor sites': The number of hydrogen bond donor sites per molecule as calculated using RDKit Lipinski module
6) 'H acceptor sites': The number of hydrogen bond acceptor sites per molecule as calculated using RDKit Lipinski module

There are 5 additional columns ('A', 'B', 'C', 'D', 'Eqn') that are required for calculating the free volume of each molecule using the DIPPR Project 801 correlations for molar volume with temperature. As this data cannot be freely distributed, all entries are left blank. These columns are required for any combinatorial layer choice that requires free volume (Elbro-FV, mod-FV, GK-FV)

In [None]:
# Column names in df_inf that hold the SMILES strings of each solute and solvent
solute_smiles, solvent_smiles = 'Solute SMILES', 'Solvent SMILES'

# Locations of the IDAC data and the small molecule property table
data_file = "./data/opensource_IDAC_data.csv"
prop_file = "./data/small_molecule_prop.csv"

# Read both CSV files, using the first column of each file as the DataFrame index
df_inf, df_prop = (pd.read_csv(csv_path, index_col=0) for csv_path in (data_file, prop_file))

### Data Cleaning and Processing

Several steps are used to clean the data (if desired). Experimental IDAC data is inherently noisy and contains errors. Cleaning out significant outliers is necessary to train the UFRs on real relationships rather than wrong ones induced by the data.

The data cleaning steps include:
1) Change all D2O entries to have a unique SMILES that is not the same as water's.
2) Filter out duplicate entries, keeping only one copy. Duplicates defined as having the exact same solute, solvent, temperature and IDAC value.
3) Remove any self-edges (solute and solvent are the same)
4) Clean data according to cleaning rules (optional). Load and use attached cleaning rules.
5) Remove outliers from lines of best fit with inverse temperature. For aqueous mixtures use a quadratic polynomial fit and for non-aqueous mixtures use a linear fit with inverse temperature. For a given mixture, outliers are classified as points with absolute residuals more than 3 times greater than the standard deviation of the residuals. Only do this filtering on mixtures containing more than 5 points at 4 unique temperatures.


In [None]:
# Certain datasets report deuterated water but RDKit converts it to water. There are slight differences in IDAC for D2O. Manually convert these to a different SMILES name.
deuterated_water = 'Deuterium oxide <Heavy water>'
df_inf.loc[df_inf['Solute'] == deuterated_water, solute_smiles] = '[2H]O[2H]'
df_inf.loc[df_inf['Solvent'] == deuterated_water, solvent_smiles] = '[2H]O[2H]'

# Remember the row count before any filtering; df_inf is overwritten below, so the
# summary at the end of this cell would otherwise compare against an already-reduced dataset
n_rows_original = df_inf.shape[0]

# Drop self-edges (rows where the solute and the solvent are the same molecule)
self_edges = df_inf[df_inf[solute_smiles] == df_inf[solvent_smiles]].index
df_inf = df_inf.drop(self_edges)

# Drop duplicates
df_inf_no_duplicates = data_processing.drop_duplicates(df_inf)

# Set flag on whether to clean the data or not. This flag controls both the cleaning rules and the temperature outlier removal
clean_temp_outliers = True # False or True

# If True, clean the data
if clean_temp_outliers:
    # Load the cleaning rules
    cleaning_rules_file = './data/cleaning_rules.xlsx'
    df_clean_rules = pd.read_excel(cleaning_rules_file, sheet_name = 'cleaning rules')

    # Clean according to the cleaning rules
    df_inf_no_duplicates = data_processing.apply_cleaning(df_inf_no_duplicates, df_clean_rules)

    # Drop all outliers. Can specify how many times away from the standard deviation to consider an outlier (default 3)
    df_inf_clean, df_dropped = data_processing.remove_temperature_outliers(df_inf_no_duplicates, std_dev = 3) # df_dropped is what were dropped

else:
    # No cleaning requested: continue with an independent copy of the de-duplicated data
    df_inf_clean = df_inf_no_duplicates.copy()

print(f'We now have {df_inf_clean.shape[0]} rows in the dataset from the original {n_rows_original} rows')

Use the df_prop DataFrame to get the required small molecule properties for each solute and solvent. Enter it into the df_inf_clean DataFrame. There are two modes for the addition, 'FH' for all combinatorial models that do not require free volume, and 'FV' for all combinatorial models that do.

Because the combinatorial term does not require any regressed parameters, it is directly calculated in molecular_property_addition and stored in df_inf_regress. The PyTorch UFR models train against the remaining activity coefficient after subtraction of the combinatorial term.

In [None]:
# Add the molecular properties from df_prop to every solute/solvent row of the cleaned IDAC data.
# The mode can be changed between 'FH' and 'FV'. Note: 'FV' requires the DIPPR correlations
df_inf_regress, v0, s0 = data_processing.molecular_property_addition(
    df_inf_clean,
    df_prop,
    mode='FH',
    solute_smiles=solute_smiles,
    solvent_smiles=solvent_smiles,
)

### Train the UFR model

The UFR models can now be trained. The following model choices need to be specified.

1) Dimensions: The total number of dimensions to use for each node's (chemical) thermodynamic embedding. This is $N_{embedding}$ as specified in the paper. Depending on the model chosen, different parts of the embedding may be reserved for different model layers.
2) Trials: How many trials of each model to run. Each trial number serves as the seed for the random number generator, ensuring different initial conditions each time.
3) Combinatorial model: The choice of combinatorial layer to use specifies which column of df_inf_regress to gather $\ln{\gamma}^{\infty}$ from. The combinatorial contribution is calculated for every model and subtracted from the data. 
    - Choices without free volume include: FH (Flory-Huggins), mod-FH (2/3 modified Flory-Huggins), SG (Staverman-Guggenheim)
    - Choices with free volume include FV (Elbro free volume), mod-FV (2/3 modified Elbro free volume), GK-FV (Staverman-Guggenheim with free volume)
4) Residual model: The choice of the residual layer as described in the main text
    - Choices include: UNIQUAC, mod-UNIQUAC, Wilson, NRTL
5) Association model: The choice of the association layer as described in the main text
    - Choices include: None, Wertheim
6) Temperature layer: The exponents for the temperature dependence must be specified. We work with inverse temperature, so the exponents correspond to that for inverse temperature.
7) Sobolev loss: To penalize for incorrect temperature correlations, the Sobolev loss can be turned on. In the paper, the parameter weighting the Sobolev loss was studied at 0.2 and 1.
8) Hyperparameters: Learning rate. Total number of epochs and number of epochs for ramping up, holding, and ramping down the learning rate. The number of epochs to separately regress the residual and association layers (if both are used); keep this number low (~500)
9) Truncation mode: For data truncation to prevent overfitting, the graph of interconnected IDACs is recursively checked until no more truncations occur. This can be turned on or off.


In [None]:
# Specify a model savepath
save_path = './trained_models/'
# Make sure the save directory exists before starting a long training run
os.makedirs(save_path, exist_ok = True)

# Set the desired dimensionality of the thermodynamic embedding. This affects how much data is truncated. If set too high, a lot of the data will be truncated.
dimension = 6

# Set number of trials to run. This number serves as the seed to the random number generator.
trials = 1

# Specify parts of the model
combinatorial_layer = 'mod_FH' # Choose between FH, FV, SG, GK-FV, mod_FH, mod_FV
residual_layer = 'mod_UNIQUAC' # Choose between UNIQUAC, mod_UNIQUAC, Wilson, NRTL
association_layer = 'wertheim' # Choose between none, wertheim

# Temperature layer - Always go from smallest to largest. These are powers and correspond to inverse temperature
# To use the Taylor series expansion in temperature, negative exponents are needed. Due to the division by T, there will always be a 1/T component
# The choices investigated in the model include:
# Original dependence (original UNIFAC, Wilson, NRTL dependence) : [1]
# First-degree expansion (either in 1/T or T): [0, 1]
# Second-degree expansion in 1/T: [0, 1, 2]
# Second-degree expansion in T: [-1, 0, 1]
temp_type = 'invT' # Specify whether the expansion is in invT or T. This is just for the model filename
temp_exponents = np.array([0, 1], dtype = float) # Must set as numpy array and as float

# Sobolev loss
sobolev = 0 # 0 turns this off. A positive value turns on the Sobolev loss (see the check below) and becomes the weight for the term.

# Hyperparameters
lr = 0.01 # Learning rate
total_epochs = 30000 # Total number of epochs to run
up_epochs = 1000 # Number of epochs for ramping up the model
hold_epochs = 20000 # Number of epochs for holding at the maximum learning rate
pre_train_epoch = 500 # Number of epochs for separately regressing the residual and association layers if both are used.

# Truncation to prevent overfitting
truncation = 'chemical_connections' # Set to temp_connections if we want to count each pairing's temperature as unique. Set to chemical_connections if we want to count each pairing's temperature as one entry. Set to 'keep_all' if we don't want it to truncate

# Set up a savename for the model. The name encodes the layer choices, the embedding dimension, the number of temperature exponents and the Sobolev weight
model_name = f'UFR_{combinatorial_layer}_{residual_layer}_{association_layer}_{dimension}D_{temp_type}_{temp_exponents.size}_sobolev_{sobolev}'
# os.path.join gives the same result as plain concatenation when save_path ends with a separator, and still works when it does not
save_name = os.path.join(save_path, model_name)

# Set up dictionaries of model parameters for passing into the model
ln_y_data = f'ln_gamma_res_{combinatorial_layer}' # This specifies which IDAC with combinatorial removed to regress on.
model_layer_options = {
    'ln_y_data': ln_y_data,
    'combinatorial_layer': combinatorial_layer,
    'residual_layer': residual_layer,
    'association_layer': association_layer,
    'temp_exponents': temp_exponents,
    'reference_volume': v0,
    'reference_area': s0,
}
model_opt_options = {
    'sobolev': sobolev,
    'lr': lr,
    'total_epochs': total_epochs,
    'up_epochs': up_epochs,
    'hold_epochs': hold_epochs,
}
model_run_options = {
    'truncation': truncation,
    'smile_labels': (solute_smiles, solvent_smiles),
    'pre_train_epoch': pre_train_epoch,
    'save_name': save_name,
}

# We need to calculate the gradient if we are using Sobolev loss. This is done in the invtemp_gradient_calc function.
# We can specify what solute-solvent pairs to consider in the calculation
if sobolev > 0:
    print('Starting calculation for Sobolev regularization, this might take a while')
    df_inf_regress = data_processing.invtemp_gradient_calc(df_inf_regress, ln_y_data, min_points = 4, min_delta_T = 30, std_residuals_tol = 0.1, rel_std_residuals_tol = 0.1)

# Launch the model. Copies of the DataFrames are passed so the ones held in this notebook are left untouched
model_launch.launch_model(dimension, trials, df_inf_regress.copy(), df_prop.copy(), model_layer_options, model_opt_options, model_run_options, device)