<a href="https://colab.research.google.com/github/nirb28/ee-predict/blob/main/cd_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
def resolve_path_gdrive(relativePath):
    if os.path.exists('/content/drive'):
        return '/content/drive/MyDrive/work/gdrive-workspaces/git/ee-predict/' + relativePath
    else:
        return relativePath
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [3]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np, os, csv
import warnings
warnings.filterwarnings('ignore')

In [40]:
#DATAFILE_NAME = "reduced_dim_space_ddG.csv"
DATASET = "LARGE" # "LARGE" or "REDUCED"
PCA_DIMENSION = 14
if DATASET == "REDUCED":
  DATAFILE_NAME = "reduced_dim_space_ddG.csv"
  X_COLS = ['x', 'y', 'z']
else:
  DATAFILE_NAME = "merged_large_catalyst.csv"
  with open(resolve_path_gdrive("models/high_corr_cols.txt")) as f:
    reader = csv.reader(f)
    X_COLS = list(reader)[0][:-1]

df = pd.read_csv(resolve_path_gdrive('data/' + DATAFILE_NAME))
df = df[df['ddG'] != 0]
df.set_index('Catalyst', inplace=True)

In [41]:
# method to make a regression model from the provided catalysts
from sklearn.cross_decomposition import PLSRegression
def make_pls_model(df_catalysts):
    X = df_catalysts[X_COLS]
    y = df_catalysts['ddG']
    model = PLSRegression(n_components=PCA_DIMENSION)
    # fitting the model
    model.fit(X, y)
    return model

from joblib import dump, load
def load_model(path):
  return load(path)

pls_large_saved_model = load_model(resolve_path_gdrive('models/pls_large.joblib'))
print(pls_large_saved_model)

def pls_predict_ee(pls_model, properties):
    return pls_model.predict(properties.reshape(1,-1))

## Add a predicted ddG column to the df
df['pred_ddG'] = df.apply(lambda x: pls_predict_ee(pls_large_saved_model, np.array(x[X_COLS]))[0][0], axis=1)

PLSRegression(n_components=14)


In [43]:
# start with a few randomly selected catalyst
START_SAMPLE_SIZE = 3
random_catalyst_df = df.sample(START_SAMPLE_SIZE)
random_catalyst_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,3967,3968,3969,3970,3971,3972,3973,3974,ddG,pred_ddG
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11_6_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.471616,-0.798242,-0.277709,-0.280445,-0.279338,-0.246094,-1.153164,-1.146684,0.611829,0.392452
12_1_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.471616,-0.798242,-0.277709,-0.280445,-0.279338,-0.246094,-1.153164,-1.146684,1.105212,1.302128
8_7_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.4717,-0.76848,-0.178536,-0.29859,-0.23864,-0.27751,-0.230633,-0.672194,0.041476,0.370958


In [44]:
original_ee = df['ddG']
# Function to optimize catalyst properties using coordinate descent
def optimize_catalysts(catalysts, pls_model=None, iterations=100, cd_iterations=10, step_size=0.001):
    if pls_model == None:
        print("Creating a new model. # data items: " + str(len(catalysts)))
        pls_model = make_pls_model(random_catalyst_df)
    optimized_catalysts = np.copy(catalysts)
    for _ in range(iterations):
        for i in range(len(optimized_catalysts)):
            ## we have few options for the original_ee. We can use from the dataset or get the prediction
            ## from the model. The prediction makes sense as we want to traverse in the direction where
            ## ee improves according to the model

            ## when using ddG from dataset
            #original_ee = df.iloc[df.index.get_loc(catalysts[i:i+1].index[0])]['ddG']
            ## when using the model
            original_ee = pls_predict_ee(pls_model, optimized_catalysts[i])
            for x in range(len(optimized_catalysts[i])):
                for cd in range(cd_iterations):
                    old_value = optimized_catalysts[i, x]
                    optimized_catalysts[i, x] = old_value + step_size
                    new_ee = pls_predict_ee(pls_model, optimized_catalysts[i])
                    if new_ee < original_ee:
                        optimized_catalysts[i, x] = old_value - step_size
                        new_ee = pls_predict_ee(pls_model, optimized_catalysts[i])
                    if new_ee < original_ee:
                        optimized_catalysts[i, x] = old_value
                        break
                    #print('Found a direction for higher ddG')
    return optimized_catalysts

new_optimized_catalysts = optimize_catalysts(random_catalyst_df[X_COLS], pls_large_saved_model)

In [45]:
random_catalyst_df[X_COLS]

Unnamed: 0_level_0,1,4,5,12,28,29,30,35,38,42,...,3929,3932,3935,3936,3937,3939,3940,3945,3948,3951
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11_6_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12_1_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.081818,0.0,-0.081818,-0.027273,-0.068182,0.0,0.0,0.027273
8_7_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
new_optimized_catalysts

array([[1.        , 0.6       , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.6       , 1.        , ..., 1.        , 1.        ,
        1.02727273],
       [1.        , 0.6       , 1.        , ..., 1.        , 1.        ,
        1.        ]])

In [47]:
# Perform k-nearest neighbors analysis
num_neighbors = 1  # Number of neighbors to consider
knn = NearestNeighbors(n_neighbors=num_neighbors)
knn.fit(df[X_COLS])  # Using all the catalysts

In [48]:
## Recursive Loop
def find_unique_neighbors_add(knn, point, addToDf, neighbor_number=1):
    distances, indices = knn.kneighbors([point], neighbor_number)
    neighbor_catalyst = df.iloc[[indices[0][neighbor_number-1]]]
    if neighbor_catalyst.index[0] in addToDf.index:
        #get next neighbor
        neighbor_catalyst = find_unique_neighbors_add(knn, point, addToDf, neighbor_number+1)
    return neighbor_catalyst

def process_neighbors(knn, for_points, addToDf):
    for i in range(len(for_points)):
        neighbor_catalyst = find_unique_neighbors_add(knn, for_points[i], addToDf)
        if len(neighbor_catalyst) > 0:
            addToDf = pd.concat([addToDf, neighbor_catalyst], ignore_index=False, sort=True)
    return addToDf

random_catalyst_df = process_neighbors(knn, new_optimized_catalysts, random_catalyst_df)
random_catalyst_df[X_COLS]

Unnamed: 0_level_0,1,4,5,12,28,29,30,35,38,42,...,3929,3932,3935,3936,3937,3939,3940,3945,3948,3951
Catalyst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11_6_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12_1_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.081818,0.0,-0.081818,-0.027273,-0.068182,0.0,0.0,0.027273
8_7_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23_3_1,0.0,0.0,0.0,0.038462,0.0,0.038462,0.0,0.0,0.038462,0.0,...,-0.005769,-0.005769,0.040385,0.011538,-0.005769,-0.046154,0.0,0.011538,0.0,0.0
23_3_2,0.0,0.0,0.0,0.038462,0.0,0.038462,0.0,0.0,0.038462,0.0,...,-0.005769,-0.005769,0.040385,0.011538,-0.005769,-0.046154,0.0,0.011538,0.0,0.0
10_3_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.058065,-0.014516,-0.03871,-0.058065,0.033871,0.019355,-0.004839,0.0


In [None]:
# sample prediction with model
properties1 = (np.array([ 0.75813041,  0.67727654,  0.68692902,  0.57403813, -0.39901283,
       -0.83317389, -0.69366434, -0.76423736, -0.18775295, -0.89697628,
       -0.93897206, -0.64303136, -0.50508299, -0.82567658, -0.29773217,
       -0.23605932, -0.26722481,  0.02879822, -0.69189841]))
pls_predict_ee(pls_large_saved_model, properties1)
# random_catalyst_df = random_catalyst_df[~random_catalyst_df.index.duplicated(keep='first')]

array([[-0.78896142]])

In [None]:
# We will now do all this in a single loop
highest_ddg_index = df.index[df['ddG'] == df['ddG'].max()]
print("The max value (target) of the ddG is: " + str(df['ddG'].max()))

not_highest_index=[i for i in df.index if i not in highest_ddg_index]
random_catalyst_df = df.loc[not_highest_index].sample(START_SAMPLE_SIZE)
while(len(random_catalyst_df) < len(df)):
    print("The current maximum ddg is at: " + str(random_catalyst_df['pred_ddG'].max()))
    new_optimized_catalysts = optimize_catalysts(random_catalyst_df[X_COLS], pls_large_saved_model)
    random_catalyst_df = process_neighbors(knn, new_optimized_catalysts, random_catalyst_df)
    if len(pd.concat([highest_ddg_index.to_series(), random_catalyst_df.index.to_series()], axis=1, join='inner')) > 0:
        print("Found the highest ddG. Exiting.")
        break

The max value (target) of the ddG is: 1.46301068
The current maximum ddg is at: 0.33990871537413136
The current maximum ddg is at: 0.33990871537413136
The current maximum ddg is at: 0.33990871537413136
The current maximum ddg is at: 0.33990871537413136


KeyboardInterrupt: 

In [None]:
random_catalyst_df['pred_ddG'].max()

1.3886042742878204

In [None]:
highest_ddg_index

Index(['1_1_5', '1_10_5', '1_11_5', '1_2_5', '1_3_5', '1_4_5', '1_5_5',
       '1_6_5', '1_7_5', '1_8_5',
       ...
       '9_10_5', '9_11_5', '9_2_5', '9_3_5', '9_4_5', '9_5_5', '9_6_5',
       '9_7_5', '9_8_5', '9_9_5'],
      dtype='object', name='Catalyst', length=319)

In [None]:
len(pd.concat([highest_ddg_index.to_series(), random_catalyst_df.index.to_series()], axis=1, join='inner'))

0

In [None]:
not_highest_index=[i for i in df.index if i not in highest_ddg_index]
df.loc[not_highest_index].index

Index(['1_1_1', '1_1_2', '1_1_3', '1_1_4', '1_1_6', '1_10_1', '1_10_2',
       '1_10_3', '1_10_4', '1_10_6',
       ...
       '9_8_1', '9_8_2', '9_8_3', '9_8_4', '9_8_6', '9_9_1', '9_9_2', '9_9_3',
       '9_9_4', '9_9_6'],
      dtype='object', name='Catalyst', length=1584)