In [1]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
df = pd.read_csv('data/reduced_dim_space_ddG.csv')
df = df[df['ddG (% ee)'] != 0]
df.set_index('Catalyst', inplace=True)
# start with a few randomly selected catalyst
START_SAMPLE_SIZE = 3
random_catalyst_df = df.sample(START_SAMPLE_SIZE)

In [3]:
random_catalyst_df

Unnamed: 0_level_0,x,y,z,ddG (% ee)
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13_3_2,-74.59328,20.131638,10.048185,0.068781
4_1_1,6.574696,-21.259307,-18.91918,1.555161
18_1_4,5.224752,-42.014744,-28.599824,1.518518


In [4]:
df

Unnamed: 0_level_0,x,y,z,ddG (% ee)
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_1_1,5.100125,-27.742489,-17.922393,1.226289
1_1_2,5.153813,-27.571266,-17.950507,0.719002
1_11_1,24.913739,-0.494436,-8.842801,0.631877
1_2_1,12.039033,-10.467777,-6.094131,0.216792
1_2_2,11.859191,-10.604692,-6.049362,0.111677
...,...,...,...,...
9_3_4,36.077375,-7.875347,-14.939623,0.128432
9_4_1,25.574518,-17.376362,4.663637,0.912247
9_7_1,22.777582,-19.792283,2.750864,0.056902
9_7_4,22.721846,-19.736566,2.713267,0.408758


In [5]:
# method to make a regression model from the provided catalysts
from sklearn.cross_decomposition import PLSRegression
def make_pls_model(df_catalysts):   
    X = df_catalysts[['x','y','z']]
    y = df_catalysts['ddG (% ee)']
    model = PLSRegression(n_components=2)
    # fitting the model
    model.fit(X, y)
    return model

In [6]:
pls_model = make_pls_model(random_catalyst_df)

In [7]:
def pls_predict_ee(properties):
    return pls_model.predict(properties.reshape(1,-1))

# Function to optimize catalyst properties using coordinate descent
def optimize_catalysts(catalysts, iterations=100, cd_iterations=10, step_size=0.01, ):
    optimized_catalysts = np.copy(catalysts)
    for _ in range(iterations):
        for i in range(len(optimized_catalysts)):
            original_ee = df.iloc[df.index.get_loc(catalysts[i:i+1].index[0])]['ddG (% ee)']
            for x in range(len(optimized_catalysts[i])):
                for cd in range(cd_iterations):
                    old_value = optimized_catalysts[i, x]
                    optimized_catalysts[i, x] = old_value + step_size
                    new_ee = pls_predict_ee(optimized_catalysts[i])
                    if new_ee < original_ee:
                        optimized_catalysts[i, x] = old_value - step_size
                        new_ee = pls_predict_ee(optimized_catalysts[i])
                    if new_ee < original_ee:
                        optimized_catalysts[i, x] = old_value
    return optimized_catalysts

new_optimized_catalysts = optimize_catalysts(random_catalyst_df[['x', 'y', 'z']])



In [8]:
random_catalyst_df

Unnamed: 0_level_0,x,y,z,ddG (% ee)
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13_3_2,-74.59328,20.131638,10.048185,0.068781
4_1_1,6.574696,-21.259307,-18.91918,1.555161
18_1_4,5.224752,-42.014744,-28.599824,1.518518


In [9]:
new_optimized_catalysts

array([[-64.59328023,  30.13163784,  20.04818489],
       [ 16.5746956 , -11.25930653,  -8.91918004],
       [ 15.22475178, -32.01474444, -18.59982409]])

In [10]:
# Perform k-nearest neighbors analysis
num_neighbors = 1  # Number of neighbors to consider
knn = NearestNeighbors(n_neighbors=num_neighbors)
knn.fit(df[['x', 'y', 'z']])  # Using all the catalysts 

In [11]:
original_ee = df['ddG (% ee)']

for i in range(len(new_optimized_catalysts)):
    distances, indices = knn.kneighbors([new_optimized_catalysts[i]])
    print(indices)

[[60]]
[[3]]
[[268]]




In [14]:
df.iloc[[60,3,268]]

Unnamed: 0_level_0,x,y,z,ddG (% ee)
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13_2_1,-56.691717,15.811833,17.375917,0.147644
1_2_1,12.039033,-10.467777,-6.094131,0.216792
6_1_2,16.136957,-28.808699,-17.891692,0.655428


In [15]:
new_optimized_catalysts

array([[-64.59328023,  30.13163784,  20.04818489],
       [ 16.5746956 , -11.25930653,  -8.91918004],
       [ 15.22475178, -32.01474444, -18.59982409]])