# Dynamic Vertical Triplet Energy Benchmarking (DvTEBench)

## Import Dependencies

In [11]:
from pathlib import Path
import csv, math
import pandas as pd

import numpy as np
from numpy.random import seed
from numpy.random import randn

from scipy.stats import norm
from scipy.stats import t
from scipy.optimize import curve_fit

from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.mixture import GaussianMixture

## Load Datased and Experimental Values

Provide the names of the molecules you want to be included in the benchmark in the "species" list. Note: make sure that the names are consistent with the names of string before the underscore "_" delimiter in the output.csv files. Additionally the .csv files should be located in the same working directory.

Provide the experimental triplet energy value in "exptl_values" list.

In [12]:
# List of molecules
species = ['Naphthalene', 'Benzene', 'Dimethyl_Diazene', 'Teramethylethene', 'Isoprene', 'Azulene', 
           'Trans-Stilbene', 'Biphenyl', 'Cis-Stilbene', 'Styrene', 'Norbornene', 'Indene', 'Cyclopentene', 
           'Cyclohexenone', 'Cyclohexene', 'Trans-b-Methylstyrene', '1,1-Dichloroethene', 'Ethene', 
           'Cis-b-Methylstyrene', 'Diphenylacetylene']

# List of experimentally determined triplet energies (kcal/mol)
exptl_values = [60.5, 83.6, 54.0, 75.8, 60.0, 39.0, 51.0, 69.5, 55.5, 60.8, 72.3, 64.1, 79.4, 67.0, 
                81.2, 60.5, 73.3, 84.0, 66.5, 62.7]

# CSV outputs containing DFT data
output_list = [f"{entry}_output.csv" for entry in species]

mol_list = zip(output_list, species, exptl_values)

#print(output_files)

# Normal Distribution Models

This code utilises the compiled .csv dataset of vertical triplet energies generated using DvTEProc (i.e. {MoleculeName}_output.csv). Then using the mean and standard deviation from fitted normla distributions it construct cumulative distribution functions for each species. It then stores predicted triplet energies using a range of populations starting from 0.001 to 0.050 in steps of 0.001 as a new .csv file called "df_proc_normal.csv". The code then proceed to construct linear regressions of predicted triplet energy against experimental values for all selected populations within the 0.001 and 0.050 range, followed by reporting all R2s, MAEs and RMSDs. The model with the lowest RMSD is then printed out at the end.

In [13]:
# Get the current working directory
current_directory = Path.cwd()

# Create an empty dataframe df_proc
df_proc = pd.DataFrame()

# Add species names
df_proc['Name'] = species

# Define the range of desired populations
desired_populations = np.arange(0.001, 0.051, 0.001)

# Add columns to df_proc for each desired population
for pop in desired_populations:
    df_proc[f'{pop:.3f}'] = np.nan

# Add experimental values
df_proc['Experimental'] = exptl_values

# Loop over the CSV filenames
for file, name in zip(output_list,species):
    
    # Construct the full path to the CSV file in the current directory
    csv_path = current_directory / file
    df = pd.read_csv(csv_path)
    
    # Calculate mean and standard deviation
    mu = df['dE'].mean()
    sigma = df['dE'].std()

    # Loop over desired populations
    for pop in desired_populations:
        # Find corresponding x value for desired y value
        x_value = norm.ppf(pop, loc=mu, scale=sigma)

        # Update df_proc with the calculated x_value
        df_proc.at[df_proc[df_proc['Name'] == name].index[0], f'{pop:.3f}'] = x_value

# Assuming df_proc is your dataframe
df_proc.to_csv('df_proc_normal.csv', index=False)

# Extract the 'Experimental' column
exptl_values = df_proc['Experimental'].values

# Initialize variables to keep track of the column with the lowest MAE
min_rmsd = float('inf')
best_column = None

# Loop over columns in df_proc (excluding 'Name' and 'Experimental' columns)
for column in df_proc.columns[1:-1]:
    # Extract the column values
    column_values = df_proc[column].values

    # Remove NaN values from both arrays
    mask = ~np.isnan(exptl_values) & ~np.isnan(column_values)
    experimental_values_cleaned = exptl_values[mask]
    column_values_cleaned = column_values[mask]

    # Calculate Mean Squared Error using scikit-learn's function
    mse = mean_squared_error(experimental_values_cleaned, column_values_cleaned)

    # Calculate RMSD
    rmsd = math.sqrt(mse)

    # Calculate Mean Absolute Error
    mae = mean_absolute_error(experimental_values_cleaned, column_values_cleaned)

    # Calculate R-squared
    r2 = r2_score(experimental_values_cleaned, column_values_cleaned)
    
    # Check if the current column has a lower MAE
    if rmsd < min_rmsd:
        min_rmsd = rmsd
        best_column = column

    # Print results
    print(f"Population: {column}")
    print(f"MAE: {mae:.4f}")
    print(f"R2: {r2:.4f}")
    print(f"RMSD: {rmsd:.4f}")
    print("--------------------")

# Print the column with the lowest MAE
print(f"Best Model Population: {best_column}")
print(f"Lowest RMSD: {min_rmsd:.4f}")

Population: 0.001
MAE: 5.4016
R2: 0.7256
RMSD: 5.9644
--------------------
Population: 0.002
MAE: 3.7255
R2: 0.8476
RMSD: 4.4456
--------------------
Population: 0.003
MAE: 2.8536
R2: 0.9001
RMSD: 3.5998
--------------------
Population: 0.004
MAE: 2.4314
R2: 0.9277
RMSD: 3.0621
--------------------
Population: 0.005
MAE: 2.2087
R2: 0.9431
RMSD: 2.7164
--------------------
Population: 0.006
MAE: 2.1245
R2: 0.9515
RMSD: 2.5085
--------------------
Population: 0.007
MAE: 2.1078
R2: 0.9554
RMSD: 2.4045
--------------------
Population: 0.008
MAE: 2.1180
R2: 0.9564
RMSD: 2.3784
--------------------
Population: 0.009
MAE: 2.1604
R2: 0.9552
RMSD: 2.4093
--------------------
Population: 0.010
MAE: 2.2071
R2: 0.9526
RMSD: 2.4800
--------------------
Population: 0.011
MAE: 2.2770
R2: 0.9488
RMSD: 2.5776
--------------------
Population: 0.012
MAE: 2.3415
R2: 0.9441
RMSD: 2.6926
--------------------
Population: 0.013
MAE: 2.4015
R2: 0.9387
RMSD: 2.8184
--------------------
Population: 0.014
MAE: 2.

# 2GMM Models

Similar to the code above, this cell tries to fit a Guassian Mixture Model (GMM) to the dynamics data utilizing two guassians, which will then be used to construct the cumulative distribution function to compile predicted triplet energies using a range of populations. The code then proceed to construct linear regressions of predicted triplet energy against experimental values for all selected populations within the 0.001 and 0.050 range, followed by reporting all R2s, MAEs and RMSDs. The model with the lowest RMSD is then printed out at the end.

In [15]:
# Get the current working directory
current_directory = Path.cwd()

# Create an empty dataframe df_proc
df_proc = pd.DataFrame()

# Add species names
df_proc['Name'] = species

# Define the range of desired populations
desired_populations = np.arange(0.001, 0.051, 0.001)

# Add columns to df_proc for each desired population
for pop in desired_populations:
    df_proc[f'{pop:.3f}'] = np.nan

# Add experimental values
df_proc['Experimental'] = exptl_values

# Loop over the CSV filenames
for file, name in zip(output_list,species):

    # Construct the full path to the CSV file in the current directory
    csv_path = current_directory / file
    df = pd.read_csv(csv_path)

    # Extract the data as a 1D array
    data = df['dE'].values

    # Create a Gaussian Mixture Model with 2 components
    model = GaussianMixture(n_components=2)

    # Fit the model to the data
    model.fit(data.reshape(-1, 1))

    # Get the parameters of the fitted distributions
    means = model.means_
    covariances = model.covariances_
    weights = model.weights_

    # Generate data points for plotting the distributions
    x = np.linspace(data.min(), data.max(), 1000).reshape(-1, 1)
    pdf1 = weights[0] * np.exp(-(x - means[0])**2 / (2 * covariances[0])) / np.sqrt(2 * np.pi * covariances[0])
    pdf2 = weights[1] * np.exp(-(x - means[1])**2 / (2 * covariances[1])) / np.sqrt(2 * np.pi * covariances[1])

    # Sum of the two distributions
    pdf_sum = pdf1 + pdf2
    cdf = np.cumsum(pdf_sum) * (x[1] - x[0])

    # Loop over desired populations
    for pop in desired_populations:
        # Find corresponding x value for desired y value
        x_value = np.interp(pop, cdf, x.flatten())

        # Update df_proc with the calculated x_value
        df_proc.at[df_proc[df_proc['Name'] == name].index[0], f'{pop:.3f}'] = x_value

# Assuming df_proc is your dataframe
df_proc.to_csv('df_proc_2GMM.csv', index=False)

# Extract the 'Experimental' column
experimental_values = df_proc['Experimental'].values

# Initialize variables to keep track of the column with the lowest MAE
min_rmsd = float('inf')
best_column = None

# Loop over columns in df_proc (excluding 'Name' and 'Experimental' columns)
for column in df_proc.columns[1:-1]:
    # Extract the column values
    column_values = df_proc[column].values

    # Remove NaN values from both arrays
    mask = ~np.isnan(experimental_values) & ~np.isnan(column_values)
    experimental_values_cleaned = experimental_values[mask]
    column_values_cleaned = column_values[mask]

    # Calculate Mean Squared Error using scikit-learn's function
    mse = mean_squared_error(experimental_values_cleaned, column_values_cleaned)

    # Calculate RMSD
    rmsd = math.sqrt(mse)

    # Calculate Mean Absolute Error
    mae = mean_absolute_error(experimental_values_cleaned, column_values_cleaned)

    # Calculate R-squared
    r2 = r2_score(experimental_values_cleaned, column_values_cleaned)
    
    # Check if the current column has a lower RMSD
    if rmsd < min_rmsd:
        min_rmsd = rmsd
        best_column = column


    # Print results
    print(f"Population: {column}")
    print(f"MAE: {mae:.4f}")
    print(f"R2: {r2:.4f}")
    print(f"RMSD: {rmsd:.4f}")
    print("--------------------")

# Print the column with the lowest MAE
print(f"Best Model Population: {best_column}")
print(f"Lowest RMSD: {min_rmsd:.4f}")

Population: 0.001
MAE: 3.1148
R2: 0.8922
RMSD: 3.7381
--------------------
Population: 0.002
MAE: 2.4425
R2: 0.9342
RMSD: 2.9214
--------------------
Population: 0.003
MAE: 2.2253
R2: 0.9490
RMSD: 2.5718
--------------------
Population: 0.004
MAE: 2.1229
R2: 0.9538
RMSD: 2.4472
--------------------
Population: 0.005
MAE: 2.1046
R2: 0.9538
RMSD: 2.4486
--------------------
Population: 0.006
MAE: 2.1571
R2: 0.9510
RMSD: 2.5213
--------------------
Population: 0.007
MAE: 2.2430
R2: 0.9465
RMSD: 2.6335
--------------------
Population: 0.008
MAE: 2.3422
R2: 0.9410
RMSD: 2.7664
--------------------
Population: 0.009
MAE: 2.4419
R2: 0.9347
RMSD: 2.9093
--------------------
Population: 0.010
MAE: 2.5330
R2: 0.9280
RMSD: 3.0559
--------------------
Population: 0.011
MAE: 2.6245
R2: 0.9209
RMSD: 3.2028
--------------------
Population: 0.012
MAE: 2.7149
R2: 0.9136
RMSD: 3.3478
--------------------
Population: 0.013
MAE: 2.8278
R2: 0.9061
RMSD: 3.4899
--------------------
Population: 0.014
MAE: 2.

# 3GMM Models

Same as above but using 3 gaussian functions

In [17]:
# Get the current working directory
current_directory = Path.cwd()

# Create an empty dataframe df_proc
df_proc = pd.DataFrame()

# Add species names
df_proc['Name'] = species

# Define the range of desired populations
desired_populations = np.arange(0.001, 0.051, 0.001)

# Add columns to df_proc for each desired population
for pop in desired_populations:
    df_proc[f'{pop:.3f}'] = np.nan

# Add experimental values
df_proc['Experimental'] = exptl_values

# Loop over the CSV filenames
for csv_filename in output_list:
    # Construct the full path to the CSV file in the current directory
    csv_path = current_directory / csv_filename
    
    df = pd.read_csv(csv_path)

    # Extract the file name without "_output.csv"
    file_name = csv_filename.replace("_output.csv", "")

    # Extract the data as a 1D array
    data = df['dE'].values

    # Create a Gaussian Mixture Model with 2 components
    model = GaussianMixture(n_components=3)

    # Fit the model to the data
    model.fit(data.reshape(-1, 1))

    # Get the parameters of the fitted distributions
    means = model.means_
    covariances = model.covariances_
    weights = model.weights_

    # Generate data points for plotting the distributions
    x = np.linspace(data.min(), data.max(), 1000).reshape(-1, 1)
    pdf1 = weights[0] * np.exp(-(x - means[0])**2 / (2 * covariances[0])) / np.sqrt(2 * np.pi * covariances[0])
    pdf2 = weights[1] * np.exp(-(x - means[1])**2 / (2 * covariances[1])) / np.sqrt(2 * np.pi * covariances[1])
    pdf3 = weights[2] * np.exp(-(x - means[2])**2 / (2 * covariances[2])) / np.sqrt(2 * np.pi * covariances[2])
    
    # Sum of the two distributions
    pdf_sum = pdf1 + pdf2 + pdf3
    cdf = np.cumsum(pdf_sum) * (x[1] - x[0])

    # Loop over desired populations
    for pop in desired_populations:
        # Find corresponding x value for desired y value
        x_value = np.interp(pop, cdf, x.flatten())

        # Update df_proc with the calculated x_value
        df_proc.at[df_proc[df_proc['Name'] == file_name].index[0], f'{pop:.3f}'] = x_value

# Assuming df_proc is your dataframe
df_proc.to_csv('df_proc_3GMM.csv', index=False)

# Extract the 'Experimental' column
experimental_values = df_proc['Experimental'].values

# Initialize variables to keep track of the column with the lowest MAE
min_rmsd = float('inf')
best_column = None

# Loop over columns in df_proc (excluding 'Name' and 'Experimental' columns)
for column in df_proc.columns[1:-1]:
    # Extract the column values
    column_values = df_proc[column].values

    # Remove NaN values from both arrays
    mask = ~np.isnan(experimental_values) & ~np.isnan(column_values)
    experimental_values_cleaned = experimental_values[mask]
    column_values_cleaned = column_values[mask]

    # Calculate Mean Squared Error using scikit-learn's function
    mse = mean_squared_error(experimental_values_cleaned, column_values_cleaned)

    # Calculate RMSD
    rmsd = math.sqrt(mse)

    # Calculate Mean Absolute Error
    mae = mean_absolute_error(experimental_values_cleaned, column_values_cleaned)

    # Calculate R-squared
    r2 = r2_score(experimental_values_cleaned, column_values_cleaned)
    
    # Check if the current column has a lower RMSD
    if rmsd < min_rmsd:
        min_rmsd = rmsd
        best_column = column


    # Print results
    print(f"Population: {column}")
    print(f"MAE: {mae:.4f}")
    print(f"R2: {r2:.4f}")
    print(f"RMSD: {rmsd:.4f}")
    print("--------------------")

# Print the column with the lowest MAE
print(f"Best Model Population: {best_column}")
print(f"Lowest RMSD: {min_rmsd:.4f}")

Population: 0.001
MAE: 2.8757
R2: 0.9049
RMSD: 3.5120
--------------------
Population: 0.002
MAE: 2.3486
R2: 0.9395
RMSD: 2.8006
--------------------
Population: 0.003
MAE: 2.1557
R2: 0.9511
RMSD: 2.5184
--------------------
Population: 0.004
MAE: 2.0703
R2: 0.9544
RMSD: 2.4321
--------------------
Population: 0.005
MAE: 2.1039
R2: 0.9537
RMSD: 2.4506
--------------------
Population: 0.006
MAE: 2.1765
R2: 0.9508
RMSD: 2.5267
--------------------
Population: 0.007
MAE: 2.2555
R2: 0.9465
RMSD: 2.6341
--------------------
Population: 0.008
MAE: 2.3263
R2: 0.9413
RMSD: 2.7580
--------------------
Population: 0.009
MAE: 2.4190
R2: 0.9356
RMSD: 2.8897
--------------------
Population: 0.010
MAE: 2.5056
R2: 0.9295
RMSD: 3.0243
--------------------
Population: 0.011
MAE: 2.5960
R2: 0.9230
RMSD: 3.1589
--------------------
Population: 0.012
MAE: 2.6845
R2: 0.9164
RMSD: 3.2919
--------------------
Population: 0.013
MAE: 2.8128
R2: 0.9097
RMSD: 3.4224
--------------------
Population: 0.014
MAE: 2.