In [1]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the reduced-dimension catalyst table, drop rows whose recorded
# 'ddG (% ee)' is exactly 0, and index the remaining rows by catalyst label.
# Built as one chain so that re-running the cell always starts from the file.
df = (
    pd.read_csv('data/reduced_dim_space_ddG.csv')
    .loc[lambda frame: frame['ddG (% ee)'] != 0]
    .set_index('Catalyst')
)
# Start with a few randomly selected catalysts. The fixed seed makes the draw
# repeatable under "Restart Kernel -> Run All".
START_SAMPLE_SIZE = 3
RANDOM_SEED = 42
random_catalyst_df = df.sample(START_SAMPLE_SIZE, random_state=RANDOM_SEED)

In [3]:
# Display the randomly drawn starting catalysts.
random_catalyst_df

Unnamed: 0_level_0,x,y,z,ddG (% ee)
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10_1_2,41.007284,-0.974831,-87.973747,0.638112
12_7_1,-43.760461,-27.829366,23.015997,0.44644
28_3_2,2.142702,22.092128,-44.495,0.003554


In [4]:
# Display the filtered catalyst table (rows with zero 'ddG (% ee)' removed, indexed by 'Catalyst').
df

Unnamed: 0_level_0,x,y,z,ddG (% ee)
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1_1_1,5.100125,-27.742489,-17.922393,1.226289
1_1_2,5.153813,-27.571266,-17.950507,0.719002
1_11_1,24.913739,-0.494436,-8.842801,0.631877
1_2_1,12.039033,-10.467777,-6.094131,0.216792
1_2_2,11.859191,-10.604692,-6.049362,0.111677
...,...,...,...,...
9_3_4,36.077375,-7.875347,-14.939623,0.128432
9_4_1,25.574518,-17.376362,4.663637,0.912247
9_7_1,22.777582,-19.792283,2.750864,0.056902
9_7_4,22.721846,-19.736566,2.713267,0.408758


In [5]:
# Function that fits a PLS regression model to the provided catalysts
from sklearn.cross_decomposition import PLSRegression
def make_pls_model(df_catalysts, n_components=2):
    """Fit a PLS regression of 'ddG (% ee)' on the 'x', 'y', 'z' columns.

    Parameters
    ----------
    df_catalysts : pandas.DataFrame
        Must contain the columns 'x', 'y', 'z' and 'ddG (% ee)'.
    n_components : int, optional
        Number of PLS components passed to ``PLSRegression``. Defaults to 2,
        the value that was previously hard-coded.

    Returns
    -------
    PLSRegression
        The fitted model.
    """
    X = df_catalysts[['x', 'y', 'z']]
    y = df_catalysts['ddG (% ee)']
    model = PLSRegression(n_components=n_components)
    model.fit(X, y)
    return model

In [22]:
def pls_predict_ee(pls_model, properties):
    """Ask ``pls_model`` for a prediction at a single point.

    ``properties`` is reshaped into a single-row 2-D array before being handed
    to ``pls_model.predict``; whatever ``predict`` returns is passed back as is.
    """
    single_row = properties.reshape((1, -1))
    prediction = pls_model.predict(single_row)
    return prediction

# Function to optimize catalyst properties using coordinate descent
def optimize_catalysts(catalysts, pls_model=None, iterations=100, cd_iterations=10, step_size=0.01):
    """Step each catalyst coordinate while the model prediction stays at or above
    the catalyst's recorded 'ddG (% ee)'.

    Parameters
    ----------
    catalysts : pandas.DataFrame
        Coordinate columns only (the notebook passes 'x', 'y', 'z'). The index
        labels must be present in the module-level ``df``, which supplies the
        recorded 'ddG (% ee)' baseline for every row.
    pls_model : optional
        Fitted model accepted by ``pls_predict_ee``. When omitted, a model is
        fitted with ``make_pls_model`` on the rows of ``df`` that correspond
        to ``catalysts``.
    iterations : int
        Number of full sweeps over all catalysts.
    cd_iterations : int
        Maximum number of consecutive steps per coordinate within one sweep.
    step_size : float
        Magnitude of each trial step.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``catalysts`` holding the adjusted
        coordinates. ``catalysts`` itself is not modified.
    """
    if pls_model is None:
        print("Creating a new model. # data items: " + str(len(catalysts)))
        # Fit on exactly the catalysts being optimised (looked up in df to get
        # the response column) instead of relying on whatever the global
        # random_catalyst_df happens to contain when this is called.
        pls_model = make_pls_model(df.loc[catalysts.index])

    # Work on a float copy so that adding a fractional step is never truncated
    # and the caller's frame is left untouched.
    optimized_catalysts = np.array(catalysts, dtype=float)

    # The recorded baseline does not change between sweeps, so look it up once.
    baseline_ee = [df.loc[label, 'ddG (% ee)'] for label in catalysts.index]

    def _predicted_ee(point):
        # The model returns an array; reduce it to a plain scalar for comparison.
        return float(np.ravel(pls_predict_ee(pls_model, point))[0])

    n_catalysts, n_coords = optimized_catalysts.shape
    for _ in range(iterations):
        for i in range(n_catalysts):
            original_ee = baseline_ee[i]
            for x in range(n_coords):
                for _step in range(cd_iterations):
                    old_value = optimized_catalysts[i, x]
                    # Try a step in the positive direction first.
                    optimized_catalysts[i, x] = old_value + step_size
                    new_ee = _predicted_ee(optimized_catalysts[i])
                    if new_ee < original_ee:
                        # Positive step fell below the baseline: try the negative one.
                        optimized_catalysts[i, x] = old_value - step_size
                        new_ee = _predicted_ee(optimized_catalysts[i])
                        if new_ee < original_ee:
                            # Neither direction is acceptable: restore and move on
                            # to the next coordinate.
                            optimized_catalysts[i, x] = old_value
                            break
                    # NOTE(review): a step is accepted whenever the prediction is not
                    # below the *recorded* baseline, not when it improves on the
                    # previous prediction, so a step that lowers the prediction can
                    # still be kept. Confirm this is the intended acceptance rule.
    return optimized_catalysts

# Run the optimizer on the coordinate columns of the current sample. No model is
# passed, so optimize_catalysts creates one itself (see the printed message).
new_optimized_catalysts = optimize_catalysts(random_catalyst_df[['x', 'y', 'z']])

Creating a new model. # data items: 6


In [7]:
# Display the current sample of catalysts.
random_catalyst_df

Unnamed: 0_level_0,x,y,z,ddG (% ee)
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10_1_2,41.007284,-0.974831,-87.973747,0.638112
12_7_1,-43.760461,-27.829366,23.015997,0.44644
28_3_2,2.142702,22.092128,-44.495,0.003554


In [8]:
# Display the coordinates returned by optimize_catalysts (a NumPy array, one row per input catalyst).
new_optimized_catalysts

array([[ 51.0072841 ,   1.02516941, -83.59374707],
       [-33.76046092, -25.82936563,  27.39599713],
       [ 12.14270188,  24.09212846, -40.11499953]])

In [9]:
# Build a nearest-neighbour index over the (x, y, z) coordinates of every
# catalyst in df, so arbitrary points can be matched to rows of df.
num_neighbors = 1  # neighbours returned per query point
knn = NearestNeighbors(n_neighbors=num_neighbors)
knn.fit(df.loc[:, ['x', 'y', 'z']])  # fitted on all catalysts

In [10]:
# Recorded 'ddG (% ee)' value of every catalyst in df.
# NOTE(review): no later cell visible here reads this module-level name
# (optimize_catalysts assigns its own local ``original_ee``) - confirm whether
# this line is still needed.
original_ee = df['ddG (% ee)']
def get_neighbors(knn, for_points):
    """Return, for every query point, the first neighbour index reported by ``knn``.

    Parameters
    ----------
    knn : object with a ``kneighbors(points)`` method
        Must return a ``(distances, indices)`` pair with one row per query point.
    for_points : sequence of points
        Query points, one per row.

    Returns
    -------
    list
        One index per query point, in input order; empty when there are no
        query points.
    """
    # An empty query is answered directly; it is never passed to kneighbors.
    if len(for_points) == 0:
        return []
    # One batched query instead of one call per point; distances are not needed.
    _, indices = knn.kneighbors(for_points)
    return [row[0] for row in indices]

# Look up, for each optimized point, the index of its nearest catalyst in the fitted knn model.
knn_indices = get_neighbors(knn, new_optimized_catalysts)
knn_indices

[22, 55, 211]

In [11]:
# Select the rows of df at the positions returned by the neighbour search.
found_neighbors = df.iloc[knn_indices]

In [12]:
# Append the neighbouring catalysts to the sample, keeping their catalyst labels as the index.
random_catalyst_df = pd.concat([random_catalyst_df, found_neighbors], ignore_index=False, sort=True)

In [13]:
# Display the sample after the neighbours have been appended.
random_catalyst_df

Unnamed: 0_level_0,ddG (% ee),x,y,z
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10_1_2,0.638112,41.007284,-0.974831,-87.973747
12_7_1,0.44644,-43.760461,-27.829366,23.015997
28_3_2,0.003554,2.142702,22.092128,-44.495
10_3_4,0.061652,60.06765,1.412118,-82.584672
12_7_2,1.241325,-43.602601,-27.703566,23.293155
26_3_2,0.001185,9.529914,18.422417,-31.093718


In [14]:
# Keep only the first row for each catalyst label, dropping any repeated labels.
random_catalyst_df = random_catalyst_df[~random_catalyst_df.index.duplicated(keep='first')]

In [15]:
# Display the sample after removing repeated catalyst labels.
random_catalyst_df

Unnamed: 0_level_0,ddG (% ee),x,y,z
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10_1_2,0.638112,41.007284,-0.974831,-87.973747
12_7_1,0.44644,-43.760461,-27.829366,23.015997
28_3_2,0.003554,2.142702,22.092128,-44.495
10_3_4,0.061652,60.06765,1.412118,-82.584672
12_7_2,1.241325,-43.602601,-27.703566,23.293155
26_3_2,0.001185,9.529914,18.422417,-31.093718


In [16]:
def optimize_find_catalysts(catalysts_X):
    """Optimize the given coordinates and fetch the nearest catalysts from ``df``.

    Runs ``optimize_catalysts`` on ``catalysts_X``, queries the module-level
    ``knn`` model for the nearest neighbour of each optimized point, and looks
    the resulting positions up in the module-level ``df``.

    Returns a tuple ``(optimized coordinates, DataFrame of neighbouring rows)``.
    """
    optimized_points = optimize_catalysts(catalysts_X)
    nearest_positions = get_neighbors(knn, optimized_points)
    return optimized_points, df.iloc[nearest_positions]

In [24]:
# We will now do all this in a single loop
# Restart from a fresh sample; the fixed seed makes the starting point (and
# therefore the whole run) repeatable under "Restart Kernel -> Run All".
LOOP_RANDOM_SEED = 42
random_catalyst_df = df.sample(START_SAMPLE_SIZE, random_state=LOOP_RANDOM_SEED)
prev_loop_len = len(random_catalyst_df)
# Grow the sample until it covers every catalyst in df or stops growing.
while len(random_catalyst_df) < len(df):
    new_optimized_catalysts, found_neighbors = optimize_find_catalysts(random_catalyst_df[['x', 'y', 'z']])
    random_catalyst_df = pd.concat([random_catalyst_df, found_neighbors], ignore_index=False, sort=True)
    # A neighbour may already be in the sample; keep one row per catalyst label.
    random_catalyst_df = random_catalyst_df[~random_catalyst_df.index.duplicated(keep='first')]
    if len(random_catalyst_df) == prev_loop_len:
        # No new catalyst was added in this pass, so further passes would repeat it.
        print("The processing has blocked and no new catalysts are being added. Exiting.")
        break
    prev_loop_len = len(random_catalyst_df)
    print(prev_loop_len)

Creating a new model. # data items: 3
6
Creating a new model. # data items: 6
8
Creating a new model. # data items: 8
9
Creating a new model. # data items: 9
The processing has blocked and no new catalysts are being added. Exiting.


In [25]:
# Display the catalysts collected by the loop.
random_catalyst_df

Unnamed: 0_level_0,ddG (% ee),x,y,z
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3_3_4,0.264932,26.224531,7.308104,-19.972957
22_3_4,0.054528,2.863381,11.112635,-41.215495
1_3_4,0.222921,23.530906,-7.777752,-30.037466
3_3_1,0.221694,26.302667,7.368519,-19.941451
26_3_2,0.001185,9.529914,18.422417,-31.093718
6_3_1,0.210674,29.790008,-5.312876,-19.88605
24_3_4,0.007107,9.935389,29.086411,-27.455809
2_11_1,0.732504,29.115667,1.301971,-3.7679
17_3_4,0.003554,12.312513,41.899813,-26.265284
