In [2]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
# read dataset and set variables
df = pd.read_csv('reduced_dim_space_ddG.csv')
# We have lot of rows that have Enantiomeric Excess == 0. These ones we will exclude from the dataset.
df_cleaned = df[df['ddG (% ee)'] != 0]
X = df_cleaned[['x', 'y', 'z']]
y = df_cleaned['ddG (% ee)']

In [4]:
# Set KNN with the entire dataset
knn_train_dataset = X
# Perform k-nearest neighbors analysis
num_neighbors = 1  # Number of neighbors to consider
knn = NearestNeighbors(n_neighbors=num_neighbors)
knn.fit(knn_train_dataset.values)  # Using the remaining 150 catalysts for validation


In [5]:
from sklearn.cross_decomposition import PLSRegression
model = PLSRegression(n_components=2)
X_train = pd.DataFrame()
y_train = pd.DataFrame()
def add_data_remodel(X, y):
    global X_train
    X_train = pd.concat([X_train, X])
    global y_train
    y_train = pd.concat([y_train, y])
    # fitting the model (refresh of PLS with new rows included
    model.fit(X_train.values, y_train.values)
    print("We refreshed the PLS model with {0} rows".format(len(X_train)))

In [6]:
def optimize_catalysts(catalysts, iterations=100, cd_iterations=10, step_size=0.01):
    optimized_catalysts = np.copy(catalysts)
    for _ in range(iterations):
        for i in range(len(optimized_catalysts)):
            original_ee = pls_predict_ee(optimized_catalysts[i])  # Use your regression model
            for x in range(len(optimized_catalysts[i])):
                for cd in range(cd_iterations):
                    old_value = optimized_catalysts[i, x]
                    optimized_catalysts[i, x] = old_value + step_size
                    new_ee = pls_predict_ee(optimized_catalysts[i])
                    if new_ee < original_ee:
                        optimized_catalysts[i, x] = old_value - step_size
                        new_ee = pls_predict_ee(optimized_catalysts[i])
                    if new_ee < original_ee:
                        optimized_catalysts[i, x] = old_value
    return optimized_catalysts

def pls_predict_ee(properties):
    return model.predict(properties.reshape(1,-1))

In [7]:
# Approach 1: we iterate over all the catalysts in the original dataset
# start with N rows
chunk_size = 3

# Iterate over the DataFrame in chunks of size 'chunk_size'
for i in range(0, len(X), chunk_size):
    X_chunk = X.iloc[i:i + chunk_size]
    y_chunk = y.iloc[i:i + chunk_size]
    # create a new response surface. initial response surface from the chunk.
    # next response surface will be from the optimized dataset
    if i < 1:
        add_data_remodel(X_chunk, y_chunk)
    # perform cd on this chunk
    optimized_catalysts = optimize_catalysts(X_chunk)
    all_indices = []
    # knn on the optimized catalysts
    for i in range(len(optimized_catalysts)):
        distances, indices = knn.kneighbors([optimized_catalysts[i]])
        all_indices.extend(indices[0][:])
        neighbors_ee = [y[y.index[idx]] for idx in indices[0]]
        optimized_ee = pls_predict_ee(optimized_catalysts[i])
        average_neighbors_ee = np.mean(neighbors_ee)
        print(f"Original EE: {y[y.index[i]]:.4f} vs. Average Neighbors' EE: {average_neighbors_ee:.4f}")
    # refresh pls model with the newly found data
    add_data_remodel(X.iloc[all_indices], y.iloc[all_indices])
    

We refreshed the PLS model with 3 rows
Original EE: 1.2263 vs. Average Neighbors' EE: 0.2513
Original EE: 0.7190 vs. Average Neighbors' EE: 0.2513
Original EE: 0.6319 vs. Average Neighbors' EE: 0.9518
We refreshed the PLS model with 6 rows
Original EE: 1.2263 vs. Average Neighbors' EE: 0.8665
Original EE: 0.7190 vs. Average Neighbors' EE: 0.8665
Original EE: 0.6319 vs. Average Neighbors' EE: 0.1960
We refreshed the PLS model with 9 rows
Original EE: 1.2263 vs. Average Neighbors' EE: 0.2095
Original EE: 0.7190 vs. Average Neighbors' EE: 0.2095
Original EE: 0.6319 vs. Average Neighbors' EE: 0.3459
We refreshed the PLS model with 12 rows
Original EE: 1.2263 vs. Average Neighbors' EE: 0.4451
Original EE: 0.7190 vs. Average Neighbors' EE: 0.5579
Original EE: 0.6319 vs. Average Neighbors' EE: 0.5579
We refreshed the PLS model with 15 rows
Original EE: 1.2263 vs. Average Neighbors' EE: 0.3876
Original EE: 0.7190 vs. Average Neighbors' EE: 0.4587
Original EE: 0.6319 vs. Average Neighbors' EE: 