<a href="https://colab.research.google.com/github/nirb28/ee-predict/blob/main/cd_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def resolve_path_gdrive(relativePath):
    if os.path.exists('/content/drive'):
        return '/content/drive/MyDrive/work/gdrive-workspaces/git/ee-predict/' + relativePath
    else:
        return relativePath
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

ModuleNotFoundError: No module named 'google'

In [2]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np, os
import warnings
warnings.filterwarnings('ignore')

In [3]:
#DATAFILE_NAME = "reduced_dim_space_ddG.csv"
DATASET = "LARGE" # "LARGE" or "REDUCED"
if DATASET == "REDUCED":
  DATAFILE_NAME = "reduced_dim_space_ddG.csv"
  X_COLS = ['x', 'y', 'z']
else:
  DATAFILE_NAME = "large_cat_desc_col_names.csv"
  X_COLS = ['3954', '3955', '3957', '3958', '3959', '3960', '3961', '3962', '3963',
       '3964', '3965', '3966', '3967', '3968', '3969', '3970', '3971', '3972',
       '3973']

df = pd.read_csv(resolve_path_gdrive('data/' + DATAFILE_NAME))

In [4]:
df = df[df['ddG'] != 0]
df.set_index('Catalyst', inplace=True)
# start with a few randomly selected catalyst
START_SAMPLE_SIZE = 3
random_catalyst_df = df.sample(START_SAMPLE_SIZE)
random_catalyst_df_delete = random_catalyst_df.copy()

In [6]:
# method to make a regression model from the provided catalysts
from sklearn.cross_decomposition import PLSRegression
def make_pls_model(df_catalysts):
    X = df_catalysts[X_COLS]
    y = df_catalysts['ddG']
    model = PLSRegression(n_components=2)
    # fitting the model
    model.fit(X, y)
    return model

from joblib import dump, load
def load_model(path):
  return load(path)

pls_large_saved_model = load_model(resolve_path_gdrive('models/pls_large.joblib'))
pls_large_saved_model

In [None]:
original_ee = df['ddG']
def pls_predict_ee(pls_model, properties):
    return pls_model.predict(properties.reshape(1,-1))

# Function to optimize catalyst properties using coordinate descent
def optimize_catalysts(catalysts, pls_model=None, iterations=100, cd_iterations=10, step_size=0.000001):
    if pls_model == None:
        print("Creating a new model. # data items: " + str(len(catalysts)))
        pls_model = make_pls_model(random_catalyst_df)
    optimized_catalysts = np.copy(catalysts)
    for _ in range(iterations):
        for i in range(len(optimized_catalysts)):
            original_ee = df.iloc[df.index.get_loc(catalysts[i:i+1].index[0])]['ddG']
            for x in range(len(optimized_catalysts[i])):
                for cd in range(cd_iterations):
                    old_value = optimized_catalysts[i, x]
                    optimized_catalysts[i, x] = old_value + step_size
                    new_ee = pls_predict_ee(pls_model, optimized_catalysts[i])
                    if new_ee < original_ee:
                        optimized_catalysts[i, x] = old_value - step_size
                        new_ee = pls_predict_ee(pls_model, optimized_catalysts[i])
                    if new_ee < original_ee:
                        optimized_catalysts[i, x] = old_value
                        break
                    print('Found a direction for higher ddG')
    return optimized_catalysts

new_optimized_catalysts = optimize_catalysts(random_catalyst_df[X_COLS], pls_large_saved_model)

In [13]:
random_catalyst_df[X_COLS]

InvalidIndexError: (['3954', '3955', '3957', '3958', '3959', '3960', '3961', '3962', '3963', '3964', '3965', '3966', '3967', '3968', '3969', '3970', '3971', '3972', '3973'], 'ddG')

In [9]:
new_optimized_catalysts

array([[ 0.75812941,  0.67727654,  0.68692902,  0.57403813, -0.39901283,
        -0.83317389, -0.69366434, -0.76423736, -0.18775295, -0.89697628,
        -0.93897206, -0.64303136, -0.50508299, -0.82567658, -0.29773217,
        -0.23605932, -0.26722481,  0.02879822, -0.69189841],
       [ 0.75812941,  0.67727654,  0.68692902,  0.57403813, -0.39901283,
        -0.83317389, -0.69366434, -0.76423736, -0.18775295, -0.89697628,
        -0.93897206, -0.64303136, -0.50508299, -0.82567658, -0.29773217,
        -0.23605932, -0.26722481,  0.02879822, -0.69189841],
       [-0.9633824 ,  0.74276791, -0.31738391,  0.28121777,  2.02887051,
         1.65192225,  1.57538633,  1.616959  , -0.02832588,  1.05448598,
         1.11273202,  0.60103942,  0.6915266 ,  1.36068392,  0.61459764,
         0.52478116,  0.57033987, -1.51845156, -0.2306328 ]])

In [10]:
# Perform k-nearest neighbors analysis
knn_df = df[X_COLS].copy()
num_neighbors = 1  # Number of neighbors to consider
knn = NearestNeighbors(n_neighbors=num_neighbors)
knn.fit(knn_df)  # Using all the catalysts

In [13]:
def findUniqueNeighborsAndAdd(knn, point, addToDf, neighbor_number=1):
    distances, indices = knn.kneighbors([point], neighbor_number)
    neighbor_catalyst = df.iloc[indices[neighbor_number-1]]
    if neighbor_catalyst.index[0] in addToDf.index:
        #get next neighbor
        neighbor_catalyst = findUniqueNeighborsAndAdd(knn, point, addToDf, neighbor_number+1)
    return neighbor_catalyst

def get_neighbors(knn, for_points):
    knn_indices = []
    
    for i in range(len(for_points)):
        neighbor_catalyst = findUniqueNeighborsAndAdd(knn, for_points[i], random_catalyst_df)
        knn_indices.append(neighbor_catalyst)
    return knn_indices

knn_indices = get_neighbors(knn, new_optimized_catalysts)
knn_indices

[            1    2    3    4    5    6    7    8    9   10  ...      3965  \
 Catalyst                                                    ...             
 1_1_5     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  1.112732   
 
               3966      3967      3968      3969      3970     3971      3972  \
 Catalyst                                                                        
 1_1_5     0.601039  0.691527  1.360684  0.614598  0.524781  0.57034 -1.518452   
 
               3973       ddG  
 Catalyst                      
 1_1_5    -0.230633  1.463011  
 
 [1 rows x 3974 columns],
             1    2    3    4    5    6    7    8    9   10  ...      3965  \
 Catalyst                                                    ...             
 1_1_5     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  1.112732   
 
               3966      3967      3968      3969      3970     3971      3972  \
 Catalyst                                                                 

In [22]:
found_neighbors = df.iloc[knn_indices]

In [23]:
random_catalyst_df = pd.concat([random_catalyst_df, found_neighbors], ignore_index=False, sort=True)

In [24]:
random_catalyst_df

Unnamed: 0_level_0,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,991,992,993,994,995,996,997,998,999,ddG
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11_7_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.133333,0.133333,0.133333,0.2,0.0,-0.672194
27_7_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.463011
25_5_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,1.463011
1_1_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.672194
1_1_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.463011
1_1_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.463011


In [None]:
random_catalyst_df = random_catalyst_df[~random_catalyst_df.index.duplicated(keep='first')]

In [None]:
random_catalyst_df

Unnamed: 0_level_0,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,991,992,993,994,995,996,997,998,999,ddG
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22_11_5,0.0,0.0,0.0,0.090909,0.090909,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.090909,0.181818,0.0,0.181818,0.090909,0.090909,1.463011
16_5_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.672194
7_11_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.672194
10_2_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.028169,...,0.0,0.028169,0.042254,0.084507,0.225352,0.183099,0.169014,0.056338,0.014085,1.463011


In [None]:
def optimize_find_catalysts(catalysts_X):
    new_optimized_catalysts = optimize_catalysts(catalysts_X)
    knn_indices = get_neighbors(knn, new_optimized_catalysts)
    found_neighbors = df.iloc[knn_indices]
    return new_optimized_catalysts, found_neighbors

In [None]:
# We will now do all this in a single loop
random_catalyst_df = df.sample(START_SAMPLE_SIZE)
prev_loop_len = len(random_catalyst_df)
while(len(random_catalyst_df) < len(df)):
    new_optimized_catalysts, found_neighbors = optimize_find_catalysts(random_catalyst_df[X_COLS])
    random_catalyst_df = pd.concat([random_catalyst_df, found_neighbors], ignore_index=False, sort=True)
    random_catalyst_df = random_catalyst_df[~random_catalyst_df.index.duplicated(keep='first')]
    if(len(random_catalyst_df) == prev_loop_len):
        print("The processing has blocked and no new catalysts are being added. Exiting.")
        break
    prev_loop_len = len(random_catalyst_df)
    print(prev_loop_len)

Creating a new model. # data items: 3
4
Creating a new model. # data items: 4
The processing has blocked and no new catalysts are being added. Exiting.


In [None]:
random_catalyst_df

Unnamed: 0_level_0,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,991,992,993,994,995,996,997,998,999,ddG
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10_4_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049383,0.049383,...,0.0,0.0,0.08642,0.160494,0.246914,0.234568,0.185185,0.024691,0.012346,0.276786
7_1_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.751276
24_8_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276786
10_2_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.028169,...,0.0,0.028169,0.042254,0.084507,0.225352,0.183099,0.169014,0.056338,0.014085,1.463011


