In [38]:
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

from factor_analyzer import FactorAnalyzer

from joblib import dump, load

In [30]:
# Read the full descriptor table, then keep only the rows whose ddG is non-zero.
# Original large_cat_desc_col_names.csv: 1903 rows; columns 1..3973 + ddG
lg_df = pd.read_csv('data/large_cat_desc_col_names.csv')
lg_df_cleaned = lg_df.loc[lg_df['ddG'].ne(0)]
lg_df_cleaned # 1903 x 3975. 

Unnamed: 0,Identifier,1,2,3,4,5,6,7,8,9,...,3965,3966,3967,3968,3969,3970,3971,3972,3973,ddG
0,1_1_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.833714,-0.605952,-0.471700,-0.768480,-0.178536,-0.298590,-0.238640,-0.277510,-0.230633,-0.672194
1,1_1_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.880495,-0.602624,-0.471616,-0.798242,-0.277709,-0.280445,-0.279338,-0.246094,-1.153164,-1.146684
2,1_1_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.104378,1.789966,1.707972,-0.078629,-1.416062,-1.347984,-1.383415,0.492188,0.691898,0.276786
3,1_1_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.938972,-0.643031,-0.505083,-0.825677,-0.297732,-0.236059,-0.267225,0.028798,-0.691898,-0.672194
4,1_1_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.112732,0.601039,0.691527,1.360684,0.614598,0.524781,0.570340,-1.518452,-0.230633,1.463011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1898,9_9_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.880495,-0.602624,-0.471616,-0.798242,-0.277709,-0.280445,-0.279338,-0.246094,-1.153164,-1.146684
1899,9_9_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.104378,1.789966,1.707972,-0.078629,-1.416062,-1.347984,-1.383415,0.492188,0.691898,0.276786
1900,9_9_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.938972,-0.643031,-0.505083,-0.825677,-0.297732,-0.236059,-0.267225,0.028798,-0.691898,-0.672194
1901,9_9_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.112732,0.601039,0.691527,1.360684,0.614598,0.524781,0.570340,-1.518452,-0.230633,1.463011


In [31]:
# Spot-check a single row of the filtered frame, selected by integer position (not by index label).
lg_df_cleaned.iloc[1897]

Identifier       9_9_1
1                  0.0
2                  0.0
3                  0.0
4                  0.0
                ...   
3970          -0.29859
3971          -0.23864
3972          -0.27751
3973         -0.230633
ddG          -0.672194
Name: 1897, Length: 3975, dtype: object

In [27]:
# NOTE(review): in the visible notebook order `df` is only loaded by a later cell, so on a
# fresh kernel (Restart & Run All) this cell fails with NameError — consider moving it below
# the cell that reads 'reduced_dim_space_ddG.csv'.
# NOTE(review): the commented-out line refers to `large_df`, which is not defined in the
# visible cells (the large frame is named `lg_df`) — confirm before re-enabling it.
# np.min(np.abs(np.array(large_df['ddG']))) # the min magnitude is 0.276, not zero (?)
# Mean absolute value of the reduced-space target column, computed over all rows of `df`.
np.mean(np.abs(np.array(df['ddG (% ee)']))) # many zeroes in the reduced ddG, though?

0.07730585978258517

In [32]:
# df is 1849 rows originally -- what features constructed this for the full Ru dataset?
# Later on, would be nice to find a way to use these in some sort of imbalanced training setup. 
# Rows whose target is exactly zero are dropped before modelling.
df = pd.read_csv('reduced_dim_space_ddG.csv') 
df_cleaned = df.loc[df['ddG (% ee)'].ne(0)]
df_cleaned # 318 x 5.

Unnamed: 0,Catalyst,x,y,z,ddG (% ee)
0,1_1_1,5.100125,-27.742489,-17.922393,1.226289
1,1_1_2,5.153813,-27.571266,-17.950507,0.719002
10,1_11_1,24.913739,-0.494436,-8.842801,0.631877
16,1_2_1,12.039033,-10.467777,-6.094131,0.216792
17,1_2_2,11.859191,-10.604692,-6.049362,0.111677
...,...,...,...,...,...
1810,9_3_4,36.077375,-7.875347,-14.939623,0.128432
1813,9_4_1,25.574518,-17.376362,4.663637,0.912247
1831,9_7_1,22.777582,-19.792283,2.750864,0.056902
1834,9_7_4,22.721846,-19.736566,2.713267,0.408758


In [3]:
# Features are the three reduced-space coordinates; the target is the ddG column.
X = df_cleaned.loc[:, ['x', 'y', 'z']]
y = df_cleaned.loc[:, 'ddG (% ee)']
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

In [36]:
# Ordinary least squares baseline: fit on the training split, then predict the held-out rows.
ols = LinearRegression().fit(X_train, y_train)
ols_predictions = ols.predict(X_test)

def predict_ee(properties):
    """Evaluate the fitted OLS model by hand.

    Computes the dot product of ``properties`` with the module-level ``ols``
    coefficients and adds the ``ols`` intercept.
    """
    weighted_sum = np.dot(properties, ols.coef_)
    return weighted_sum + ols.intercept_

# Held-out error of the OLS fit; the printed labels are kept verbatim.
print('OLS mean squared error:', mean_squared_error(y_true=y_test, y_pred=ols_predictions))
print('OLS mean absolute error: ', mean_absolute_error(y_true=y_test, y_pred=ols_predictions))

OLS mean squared error: 0.10686574075223942
OLS mean absolute error:  0.275571383948475


In [37]:
# Partial least squares with two components: fit on the training split, predict the test split.
pls = PLSRegression(n_components=2).fit(X_train, y_train)
pls_predictions = pls.predict(X_test)  # held-out predictions used for the error metrics

def pls_predict_ee(properties):
    """Predict ee for a single catalyst with the module-level fitted ``pls`` model.

    Parameters
    ----------
    properties : array-like
        Feature values of one catalyst. Any array-like is accepted (ndarray,
        list, pandas row); it is flattened into a single-row 2-D array.

    Returns
    -------
    Whatever ``pls.predict`` returns for that one-row input (an array, not a
    bare float).
    """
    # np.asarray lets lists and pandas rows through; .reshape alone only exists on ndarrays.
    return pls.predict(np.asarray(properties).reshape(1, -1))

# Held-out error of the PLS fit; the printed labels are kept verbatim.
print('PLS mean squared error: ', mean_squared_error(y_true=y_test, y_pred=pls_predictions))
print('PLS mean absolute error: ', mean_absolute_error(y_true=y_test, y_pred=pls_predictions))

PLS mean squared error:  0.10634806983628022
PLS mean absolute error:  0.27497913508936606


In [39]:
# Greedy coordinate-wise search that nudges catalyst properties to raise the predicted ee.
def optimize_catalysts(catalysts, iterations=100, cd_iterations=10, step_size=0.01, predict_fn=None):
    """Return a copy of ``catalysts`` adjusted to increase the model's predicted ee.

    Each coordinate of each catalyst is moved by ``+step_size`` or ``-step_size``
    whenever that strictly raises the prediction above the best value reached so
    far for that catalyst; otherwise the coordinate is left where it was.

    Parameters
    ----------
    catalysts : array-like, 2-D
        One row per catalyst, one column per property. Not modified.
    iterations : int
        Number of full sweeps over all catalysts.
    cd_iterations : int
        Maximum consecutive steps taken on one coordinate per sweep.
    step_size : float
        Size of each trial step.
    predict_fn : callable, optional
        Maps one catalyst row to a predicted ee (scalar or single-element
        array). Defaults to the notebook's ``pls_predict_ee``.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``catalysts``.
    """
    if predict_fn is None:
        predict_fn = pls_predict_ee  # regression model fitted earlier in the notebook

    # Work on a float copy: the caller's data stays untouched and integer inputs
    # do not truncate the fractional steps.
    optimized_catalysts = np.array(catalysts, dtype=float)

    for _ in range(iterations):
        for i in range(len(optimized_catalysts)):
            # Running best for this catalyst. It must be refreshed after every accepted
            # step; comparing against a stale baseline accepts moves that lower the ee.
            best_ee = float(np.squeeze(predict_fn(optimized_catalysts[i])))
            for x in range(len(optimized_catalysts[i])):
                for _cd in range(cd_iterations):
                    old_value = optimized_catalysts[i, x]
                    improved = False
                    for candidate in (old_value + step_size, old_value - step_size):
                        optimized_catalysts[i, x] = candidate
                        new_ee = float(np.squeeze(predict_fn(optimized_catalysts[i])))
                        # Strict improvement only, so coordinates the model ignores do not drift.
                        if new_ee > best_ee:
                            best_ee = new_ee
                            improved = True
                            break
                    if not improved:
                        # Neither direction helps: restore the value; further steps on
                        # this coordinate would repeat the same failed trials.
                        optimized_catalysts[i, x] = old_value
                        break
    return optimized_catalysts

In [42]:
# Number of rows in the training split.
len(X_train)

222

In [43]:
### Factor analysis on the already-reduced space (not expected to be quite as informative). 
# First pass: fit only to obtain the eigenvalues used for choosing the factor count.
# The number of extractable factors is bounded by the number of features (columns),
# not by the number of training samples (rows), so size the model by X_train.shape[1].
factor_model = FactorAnalyzer(n_factors=X_train.shape[1], rotation=None)
factor_model.fit(X_train)
eigenvalues, _common_factor_eigenvalues = factor_model.get_eigenvalues()
# Keep the factors whose eigenvalue exceeds 1; int() gives a plain Python integer.
number_of_factors = int(np.sum(eigenvalues > 1))
# Second pass: refit with the retained factor count and a varimax rotation.
factor_model = FactorAnalyzer(n_factors=number_of_factors, rotation="varimax")
factor_model.fit(X_train)
print('Number of factors selected = ', number_of_factors)

Number of factors selected =  2
