In [20]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [21]:
df = pd.read_csv('data/reduced_dim_space_ddG.csv')
df = df[df['ddG (% ee)'] != 0]
df.set_index('Catalyst', inplace=True)
# start with a few randomly selected catalyst
START_SAMPLE_SIZE = 3
random_catalyst_df = df.sample(START_SAMPLE_SIZE)

In [22]:
random_catalyst_df

Unnamed: 0_level_0,x,y,z,ddG (% ee)
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8_1_2,19.436941,-26.275921,-13.75235,0.660195
21_2_4,-16.355725,-1.058726,8.262735,0.563795
3_2_1,15.386325,-1.180619,3.423324,0.033176


In [23]:
df

Unnamed: 0_level_0,x,y,z,ddG (% ee)
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_1_1,5.100125,-27.742489,-17.922393,1.226289
1_1_2,5.153813,-27.571266,-17.950507,0.719002
1_11_1,24.913739,-0.494436,-8.842801,0.631877
1_2_1,12.039033,-10.467777,-6.094131,0.216792
1_2_2,11.859191,-10.604692,-6.049362,0.111677
...,...,...,...,...
9_3_4,36.077375,-7.875347,-14.939623,0.128432
9_4_1,25.574518,-17.376362,4.663637,0.912247
9_7_1,22.777582,-19.792283,2.750864,0.056902
9_7_4,22.721846,-19.736566,2.713267,0.408758


In [24]:
# method to make a regression model from the provided catalysts
from sklearn.cross_decomposition import PLSRegression
def make_pls_model(df_catalysts):   
    X = df_catalysts[['x','y','z']]
    y = df_catalysts['ddG (% ee)']
    model = PLSRegression(n_components=2)
    # fitting the model
    model.fit(X, y)
    return model

In [25]:
pls_model = make_pls_model(random_catalyst_df)

In [None]:
def pls_predict_ee(properties):
    return pls_model.predict(properties.reshape(1,-1))

# Function to optimize catalyst properties using coordinate descent
def optimize_catalysts(catalysts, iterations=100, cd_iterations=10, step_size=0.01, ):
    optimized_catalysts = np.copy(catalysts)
    for _ in range(iterations):
        for i in range(len(optimized_catalysts)):
            original_ee = df.iloc[df.index.get_loc(catalysts[i:i+1].index[0])]['ddG (% ee)']
            for x in range(len(optimized_catalysts[i])):
                for cd in range(cd_iterations):
                    old_value = optimized_catalysts[i, x]
                    optimized_catalysts[i, x] = old_value + step_size
                    new_ee = pls_predict_ee(optimized_catalysts[i])
                    if new_ee < original_ee:
                        optimized_catalysts[i, x] = old_value - step_size
                        new_ee = pls_predict_ee(optimized_catalysts[i])
                    if new_ee < original_ee:
                        optimized_catalysts[i, x] = old_value
                        break
    return optimized_catalysts

new_optimized_catalysts = optimize_catalysts(random_catalyst_df[['x', 'y', 'z']])

In [46]:
random_catalyst_df

Unnamed: 0_level_0,ddG (% ee),x,y,z
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9_11_1,0.826636,33.390888,-7.294826,-1.650258
5_1_2,1.097592,-4.069721,-28.597117,-27.74654
2_4_3,0.734203,11.051511,-9.860034,-0.614863
3_1_2,1.067805,6.33648,-21.55471,-22.309178


In [47]:
new_optimized_catalysts

array([[ 33.39088794,  -7.29482568,  -1.65025781],
       [  0.33027921, -24.29711677, -23.44654025],
       [ 11.05151076,  -9.8600342 ,  -0.61486284],
       [  6.33648013, -21.55470973, -22.30917789]])

In [29]:
# Perform k-nearest neighbors analysis
num_neighbors = 1  # Number of neighbors to consider
knn = NearestNeighbors(n_neighbors=num_neighbors)
knn.fit(df[['x', 'y', 'z']])  # Using all the catalysts 

In [30]:
original_ee = df['ddG (% ee)']
def get_neighbors(knn, for_points):
    knn_indices = []
    for i in range(len(for_points)):
        distances, indices = knn.kneighbors([for_points[i]])
        knn_indices.append(indices[0][0])
    return knn_indices

knn_indices = get_neighbors(knn, new_optimized_catalysts)
knn_indices

[293, 169, 235]

In [31]:
found_neighbors = df.iloc[knn_indices]

In [32]:
random_catalyst_df = pd.concat([random_catalyst_df, found_neighbors], ignore_index=False, sort=True)

In [33]:
random_catalyst_df

Unnamed: 0_level_0,ddG (% ee),x,y,z
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8_1_2,0.660195,19.436941,-26.275921,-13.75235
21_2_4,0.563795,-16.355725,-1.058726,8.262735
3_2_1,0.033176,15.386325,-1.180619,3.423324
8_1_2,0.660195,19.436941,-26.275921,-13.75235
21_2_4,0.563795,-16.355725,-1.058726,8.262735
3_2_1,0.033176,15.386325,-1.180619,3.423324


In [34]:
random_catalyst_df = random_catalyst_df[~random_catalyst_df.index.duplicated(keep='first')]

In [35]:
random_catalyst_df

Unnamed: 0_level_0,ddG (% ee),x,y,z
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8_1_2,0.660195,19.436941,-26.275921,-13.75235
21_2_4,0.563795,-16.355725,-1.058726,8.262735
3_2_1,0.033176,15.386325,-1.180619,3.423324


In [36]:
def optimize_find_catalysts(catalysts_X):
    new_optimized_catalysts = optimize_catalysts(catalysts_X)
    knn_indices = get_neighbors(knn, new_optimized_catalysts)
    found_neighbors = df.iloc[knn_indices]
    return new_optimized_catalysts, found_neighbors

In [43]:
# We will now do all this in a single loop
random_catalyst_df = df.sample(START_SAMPLE_SIZE)
prev_loop_len = len(random_catalyst_df)
while(len(random_catalyst_df) < len(df)):
    new_optimized_catalysts, found_neighbors = optimize_find_catalysts(random_catalyst_df[['x', 'y', 'z']])
    random_catalyst_df = pd.concat([random_catalyst_df, found_neighbors], ignore_index=False, sort=True)
    random_catalyst_df = random_catalyst_df[~random_catalyst_df.index.duplicated(keep='first')]
    if(len(random_catalyst_df) == prev_loop_len):
        print("The processing has blocked and no new catalysts are being added. Exiting.")
        break
    prev_loop_len = len(random_catalyst_df)
    print(prev_loop_len)

4
The processing has blocked and no new catalysts are being added. Exiting.


In [44]:
random_catalyst_df

Unnamed: 0_level_0,ddG (% ee),x,y,z
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9_11_1,0.826636,33.390888,-7.294826,-1.650258
5_1_2,1.097592,-4.069721,-28.597117,-27.74654
2_4_3,0.734203,11.051511,-9.860034,-0.614863
3_1_2,1.067805,6.33648,-21.55471,-22.309178
