In [51]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Function to optimize catalyst properties using coordinate descent
def optimize_catalysts(catalysts, iterations=100, cd_iterations=10, step_size=0.01, predict_fn=None):
    """Maximize the predicted EE of each catalyst by fixed-step coordinate ascent.

    Every catalyst (row) is optimized independently. For each coordinate the
    search tries ``+step_size`` and then ``-step_size`` and keeps a move only
    if it strictly improves on the best prediction found so far for that
    catalyst.

    Parameters
    ----------
    catalysts : array-like of shape (n_catalysts, n_features)
        Starting property vectors. The input is copied, never modified.
    iterations : int, default 100
        Maximum number of full sweeps over all coordinates per catalyst.
    cd_iterations : int, default 10
        Maximum number of consecutive steps taken along one coordinate
        within a sweep.
    step_size : float, default 0.01
        Amount added to / subtracted from a coordinate per trial move.
    predict_fn : callable, optional
        Maps one 1-D property vector to a prediction (a scalar or a
        one-element array). Defaults to the module-level ``pls_predict_ee``.
        It must be a deterministic function of the vector it is given; the
        early-stopping below relies on that.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``catalysts`` holding the
        optimized property vectors.
    """
    if predict_fn is None:
        predict_fn = pls_predict_ee

    def _score(vector):
        # Predictors may return arrays such as [[v]]; compare on a plain float.
        return float(np.asarray(predict_fn(vector)).ravel()[0])

    # Force a float copy: with an integer input the "+ step_size" updates
    # would be truncated back to the old value on assignment.
    optimized_catalysts = np.array(catalysts, dtype=float)

    for row in optimized_catalysts:  # each row is a view, so writes land in the array
        best_ee = _score(row)
        for _ in range(iterations):
            moved = False
            for x in range(len(row)):
                for _step in range(cd_iterations):
                    old_value = row[x]
                    accepted = False
                    for candidate in (old_value + step_size, old_value - step_size):
                        row[x] = candidate
                        new_ee = _score(row)
                        if new_ee > best_ee:
                            # Compare against the running best, not the value at
                            # the start of the sweep, so a step is never accepted
                            # when it makes the current prediction worse.
                            best_ee = new_ee
                            accepted = True
                            break
                    if not accepted:
                        # Neither direction helps: restore the coordinate and stop
                        # stepping along it (further tries would repeat exactly).
                        row[x] = old_value
                        break
                    moved = True
            if not moved:
                # A whole sweep changed nothing, so later sweeps cannot either.
                break
    return optimized_catalysts


# Function to predict enantiomeric excess using regression model
def predict_ee(properties):
    """Evaluate a fixed linear model ``intercept + properties . weights``.

    ``properties`` holds three values per catalyst (either one vector of
    length 3 or an array whose last axis has length 3). The weights and the
    intercept are hard-coded below; presumably they come from a previously
    fitted linear regression -- confirm before relying on them.
    """
    intercept = 0.4323468984662589
    weights = np.array([-0.00122995, -0.00340784, -0.00199706])
    linear_part = np.dot(properties, weights)
    return intercept + linear_part

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load 'pls.joblib' when it comes from a trusted source.
# `dump` is imported but not used in the cells visible here.
from joblib import dump, load
# Module-level model object that pls_predict_ee calls .predict() on
# (presumably a fitted PLS regression, going by the name -- confirm).
pls = load('pls.joblib') 
def pls_predict_ee(properties):
    """Predict for a single catalyst using the module-level ``pls`` model.

    Parameters
    ----------
    properties : array-like
        Feature values of one catalyst. Any array-like is accepted (NumPy
        array, list, pandas Series); it is reshaped into a single-row 2-D
        array before being handed to the model.

    Returns
    -------
    The value returned by ``pls.predict`` for that one-row input, unchanged.
    """
    # np.asarray makes lists and pandas rows work too; they have no .reshape.
    sample = np.asarray(properties).reshape(1, -1)
    return pls.predict(sample)
# Further analysis and interpretation can be added here

In [52]:
# Load the table holding the x / y / z coordinates and the 'ddG (% ee)' column.
df = pd.read_csv('reduced_dim_space_ddG.csv')

# Many rows have an enantiomeric excess of exactly 0; those are excluded.
df_cleaned = df.loc[df['ddG (% ee)'] != 0]

# Features are the three coordinates, the target is the ddG (% ee) value.
X = df_cleaned[['x', 'y', 'z']]
y = df_cleaned['ddG (% ee)']

# 70 / 30 train-test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

# The training split is what the optimization and KNN cells operate on.
catalyst_data = X_train
original_ee = y_train


In [None]:
#original_ee[original_ee.index[2]]

In [None]:
# Optimize every catalyst in the training split.
# NOTE: this is slow -- each catalyst costs on the order of
# iterations * n_features * cd_iterations * 2 model predictions.
optimized_catalysts = optimize_catalysts(catalyst_data)

# Perform k-nearest neighbors analysis
num_neighbors = 1  # Number of neighbors to consider
knn = NearestNeighbors(n_neighbors=num_neighbors)
# The index is built from the ORIGINAL training catalysts (the same rows that
# were optimized). Fitting on the raw array keeps fit and query inputs the same
# type, since optimized_catalysts is a plain ndarray.
knn.fit(catalyst_data.to_numpy())

# Validate optimized catalysts using KNN: one batched query for all of them.
# Row i of `indices` holds the positions (within catalyst_data / original_ee)
# of the neighbors of optimized catalyst i.
distances, indices = knn.kneighbors(optimized_catalysts)

for i, neighbor_positions in enumerate(indices):
    # Positional lookup: kneighbors returns row positions, not index labels.
    neighbors_ee = original_ee.iloc[neighbor_positions].to_numpy()
    average_neighbors_ee = np.mean(neighbors_ee)
    # The model returns an array; reduce it to a float so it can be formatted.
    optimized_ee = float(np.ravel(pls_predict_ee(optimized_catalysts[i]))[0])
    print(
        f"Original EE: {original_ee.iloc[i]:.4f} "
        f"vs. Predicted Optimized EE: {optimized_ee:.4f} "
        f"vs. Average Neighbors' EE: {average_neighbors_ee:.4f}"
    )


In [None]:
# Preview the first row of the training features (a one-row slice of catalyst_data).
catalyst_data[:1]

In [53]:
# Run the optimizer on just the first training catalyst and display the result.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, i.e. it
# was stopped by hand before finishing, so no result was captured here.
optimized_catalysts = optimize_catalysts(catalyst_data[:1])
optimized_catalysts



KeyboardInterrupt: 