Notebook purpose: evaluate how efficiently we could search for catalysts using the ML model under various constraints.

The most conspicuous constraint is to find a set number of active catalysts without running any unnecessary DFT calculations.
What counts as unnecessary? --> Ideally, 100% of the O2 binding calculations we run are on sites that actually bind.
So we can accept a model with lower overall accuracy as long as it has no false positives --> false negatives carry only a small penalty.

Let's say we're only willing to run 5 DFT O2 binding calculations, and we want basically all of them to show that we found active sites. We'd probably want each of these to be on a different catalyst, to show that we've found 5 unique active catalysts. Assuming we're working with 10% of the data as a "test" set, that's about 27 catalysts, so we want to pick the ones that the model is most confident have at least 1 site that binds O2.

Really, this is a question of whether the active sites for a set of catalysts are most likely to actually be binding
Can order by log-loss and take that as an estimate of uncertainty (is that a fair expectation?)


In [1]:
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import GroupShuffleSplit

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

In [2]:
from ngcc_ml import data_tools
from ngcc_ml import skl_tools

In [3]:
# Load the site-level dataset. The path is an absolute location on the author's machine.
df = pd.read_csv("/home/nricke/work/ngcc_ml/DidItBindv5.csv")
# Cast the binding label to int; later cells compare it against 0 and 1.
df["Doesitbind"] = df["Doesitbind"].astype("int")

In [4]:
df.columns

Index(['Unnamed: 0', 'Atom Number', 'Catalyst Name', 'CatalystO2File',
       'Element', 'SpinDensity', 'ChElPGPositiveCharge', 'ChElPGNeutralCharge',
       'ChargeDifference', 'Doesitbind', 'BondLength', 'IonizedFreeEnergy',
       'IonizationEnergy', 'BindingEnergy', 'NeutralFreeEnergy', 'OrthoOrPara',
       'Meta', 'FartherThanPara', 'DistanceToN', 'AverageBondLength',
       'BondLengthRange', 'NumberOfHydrogens', 'AromaticSize', 'IsInRingSize6',
       'IsInRingSize5', 'NeighborSpinDensity', 'NeighborChElPGCharge',
       'NeighborChargeDifference', 'AromaticExtent', 'RingEdge',
       'NumNitrogens', 'NumHeteroatoms', 'ring_nitrogens',
       'atom_plane_deviation', 'ring_plane_deviation', 'charge'],
      dtype='object')

In [5]:
df.shape

(4141, 36)

In [6]:
# Number of distinct catalyst names in the dataset.
print(len(df["Catalyst Name"].unique()))
# Number of distinct catalyst names with at least one row labelled Doesitbind == 1.
print(len(df[df["Doesitbind"] == 1]["Catalyst Name"].unique()))

267
250


In [7]:
# Columns used as model inputs. Kept as a set so the scaled subset can be derived by set difference.
feature_cols = {"SpinDensity", "ChElPGNeutralCharge", "ChargeDifference", "IonizationEnergy", "OrthoOrPara", "Meta", "FartherThanPara", "DistanceToN", "AverageBondLength",  "NumberOfHydrogens", "IsInRingSize6", "IsInRingSize5", "NeighborSpinDensity", 'NeighborChElPGCharge', 'NeighborChargeDifference', "AromaticExtent", "RingEdge", "NumNitrogens", "NumHeteroatoms", "charge", "atom_plane_deviation", "ring_plane_deviation", "ring_nitrogens"}
# Feature columns excluded from the scaledCols argument below.
not_scaled_cols = {"OrthoOrPara", "Meta", "FartherThanPara", "NumberOfHydrogens", "IsInRingSize6", "IsInRingSize5", "RingEdge", "NumNitrogens", "NumHeteroatoms", "ring_nitrogens", "charge"}
# NOTE(review): df_scale is computed here but never used in this cell; the split
# below is taken from the unprocessed df. Presumably process_data scales the
# listed columns -- confirm whether the model was meant to train on df_scale.
df_scale = data_tools.process_data(df, scaledCols=list(feature_cols - not_scaled_cols))
# Hold out 10% of the catalysts (grouped by name, so all sites of one catalyst
# stay on the same side of the split). Only the first generated split is used.
train_inds, test_inds = next(GroupShuffleSplit(test_size=0.10, n_splits=2, random_state = 6).split(df, groups=df['Catalyst Name']))
train = df.iloc[train_inds]
test = df.iloc[test_inds]
# Index with a sorted list rather than the set itself: pandas does not accept a
# set as a column indexer (deprecated in 1.5, TypeError from 2.0), and a set of
# strings has no stable iteration order between interpreter sessions, which
# would change the feature column order from run to run.
feature_list = sorted(feature_cols)
X_train_group = train[feature_list]
y_train_group = train["Doesitbind"]
X_test_group = test[feature_list]
y_test_group = test["Doesitbind"]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [None]:
# Fit one random forest on the fixed train/test split, weighting the binding
# class (1) ten times more heavily than the non-binding class (0).
rfc = RandomForestClassifier(
    class_weight={0: 1, 1: 10},
    max_depth=100,
    n_estimators=1000,
)
rfc.fit(X_train_group, y_train_group)

# Report held-out accuracy first, then training accuracy.
test_accuracy = rfc.score(X_test_group, y_test_group)
print('Accuracy of RFC on test set: {:.3f}'.format(test_accuracy))
train_accuracy = rfc.score(X_train_group, y_train_group)
print('Accuracy of RFC on training set: {:.3f}'.format(train_accuracy))

# Hard class predictions for the held-out sites; reused by the next cell.
y_pred_group = rfc.predict(X_test_group)
# True labels are passed first, predictions second.
print(confusion_matrix(y_test_group, y_pred_group))

In [None]:
# Per-class probabilities for every held-out site. Column 1 is taken as the
# binding-class probability (assumes both labels 0 and 1 occur in training).
p = rfc.predict_proba(X_test_group)
test = test.assign(predict_proba=p[:,1], prediction=y_pred_group)
# Rank sites from most to least confident binder.
test_sort = test.sort_values(by="predict_proba", ascending=False)[["Catalyst Name", "Doesitbind", "prediction", "predict_proba"]]
# Position of the first truly non-binding site in the ranking. If the held-out
# set contains no non-binding site at all, every ranked site counts as "before
# the first false" instead of raising ValueError from list.index.
is_nonbinding = (test_sort["Doesitbind"] == 0).to_numpy()
first_false = int(is_nonbinding.argmax()) if is_nonbinding.any() else len(test_sort)
# Number of distinct catalysts among the sites ranked above the first non-binder.
print(len(test_sort.head(first_false)["Catalyst Name"].unique()))

In [None]:
test

In [None]:
# Repeat the grouped hold-out experiment over 10 random catalyst-level splits.
# Per split we record:
#   all_scores       - accuracy on the held-out sites
#   best_catalysts   - distinct catalysts among sites ranked above the first non-binder
#   total_sites      - number of sites in the held-out set
#   first_false_list - rank position (0-indexed) of the first non-binding site
#   false_proba_list - predicted binding probability of that first non-binding site
all_scores, best_catalysts, total_sites, first_false_list, false_proba_list = [], [], [], [], []
test_dfs = []
group_col = "Catalyst Name"
target_col = "Doesitbind"
# Sorted list of feature names: works whether feature_cols is a set or a list,
# gives a stable column order, and avoids indexing a DataFrame with a set
# (deprecated in pandas 1.5, TypeError from 2.0).
feature_list = sorted(feature_cols)
model = RandomForestClassifier(n_estimators=1000, max_depth=100, class_weight={0:1, 1:10})
# No random_state is given, so the splits differ on every run of this cell.
split_groups = GroupShuffleSplit(test_size=0.10, n_splits=10).split(df, groups=df[group_col])
for train_inds, test_inds in split_groups:
    train = df.iloc[train_inds]
    test = df.iloc[test_inds]
    X_train_group = train[feature_list]
    X_test_group = test[feature_list]
    y_test_group = test[target_col]
    y_train_group = train[target_col]
    # The same estimator object is refit from scratch on each split.
    model.fit(X_train_group, y_train_group)
    score = model.score(X_test_group, y_test_group)
    all_scores.append(score)
    print('Accuracy of RFC on test set: {:.2f}'.format(score))
    print('Accuracy of RFC on training set: {:.2f}'.format(model.score(X_train_group, y_train_group)))
    y_pred_group = model.predict(X_test_group)
    print(confusion_matrix(y_test_group, y_pred_group))
    p = model.predict_proba(X_test_group)
    # Column 1 is taken as the binding-class probability (assumes both labels occur in training).
    test = test.assign(predict_proba=p[:,1], prediction=y_pred_group)
    test_dfs.append(test)
    test_sort = test.sort_values(by="predict_proba", ascending=False)[[group_col, target_col, "prediction", "predict_proba"]]
    # first_false is 0-indexed: if the first non-binder is in 10th place the
    # value is 9, which equals the number of sites ranked before it.
    is_nonbinding = (test_sort[target_col] == 0).to_numpy()
    if is_nonbinding.any():
        first_false = int(is_nonbinding.argmax())
        first_false_proba = test_sort.iloc[first_false]["predict_proba"]
    else:
        # A held-out set without any non-binding site: every site precedes the
        # (non-existent) first false, and there is no probability to report.
        first_false = len(test_sort)
        first_false_proba = np.nan
    first_false_list.append(first_false)
    best_catalysts.append(len(test_sort.head(first_false)[group_col].unique()))
    total_sites.append(test_sort.shape[0])
    false_proba_list.append(first_false_proba)
# The three lines below print, in order: mean held-out accuracy, mean number of
# catalysts found before the first non-binder, mean held-out set size.
print("mean:", np.mean(all_scores))
print("mean:", np.mean(best_catalysts))
print("mean:", np.mean(total_sites))

In [None]:
# Rerun of the repeated grouped hold-out experiment on 10 fresh random
# catalyst-level splits; all result lists are reset first.
# Per split we record:
#   all_scores       - accuracy on the held-out sites
#   best_catalysts   - distinct catalysts among sites ranked above the first non-binder
#   total_sites      - number of sites in the held-out set
#   first_false_list - rank position (0-indexed) of the first non-binding site
#   false_proba_list - predicted binding probability of that first non-binding site
all_scores, best_catalysts, total_sites, first_false_list, false_proba_list = [], [], [], [], []
test_dfs = []
group_col = "Catalyst Name"
target_col = "Doesitbind"
# Sorted list of feature names: works whether feature_cols is a set or a list,
# gives a stable column order, and avoids indexing a DataFrame with a set
# (deprecated in pandas 1.5, TypeError from 2.0).
feature_list = sorted(feature_cols)
model = RandomForestClassifier(n_estimators=1000, max_depth=100, class_weight={0:1, 1:10})
# No random_state is given, so the splits differ on every run of this cell.
split_groups = GroupShuffleSplit(test_size=0.10, n_splits=10).split(df, groups=df[group_col])
for train_inds, test_inds in split_groups:
    train = df.iloc[train_inds]
    test = df.iloc[test_inds]
    X_train_group = train[feature_list]
    X_test_group = test[feature_list]
    y_test_group = test[target_col]
    y_train_group = train[target_col]
    # The same estimator object is refit from scratch on each split.
    model.fit(X_train_group, y_train_group)
    score = model.score(X_test_group, y_test_group)
    all_scores.append(score)
    print('Accuracy of RFC on test set: {:.2f}'.format(score))
    print('Accuracy of RFC on training set: {:.2f}'.format(model.score(X_train_group, y_train_group)))
    y_pred_group = model.predict(X_test_group)
    print(confusion_matrix(y_test_group, y_pred_group))
    p = model.predict_proba(X_test_group)
    # Column 1 is taken as the binding-class probability (assumes both labels occur in training).
    test = test.assign(predict_proba=p[:,1], prediction=y_pred_group)
    test_dfs.append(test)
    test_sort = test.sort_values(by="predict_proba", ascending=False)[[group_col, target_col, "prediction", "predict_proba"]]
    # first_false is 0-indexed: if the first non-binder is in 10th place the
    # value is 9, which equals the number of sites ranked before it.
    is_nonbinding = (test_sort[target_col] == 0).to_numpy()
    if is_nonbinding.any():
        first_false = int(is_nonbinding.argmax())
        first_false_proba = test_sort.iloc[first_false]["predict_proba"]
    else:
        # A held-out set without any non-binding site: every site precedes the
        # (non-existent) first false, and there is no probability to report.
        first_false = len(test_sort)
        first_false_proba = np.nan
    first_false_list.append(first_false)
    best_catalysts.append(len(test_sort.head(first_false)[group_col].unique()))
    total_sites.append(test_sort.shape[0])
    false_proba_list.append(first_false_proba)
# The three lines below print, in order: mean held-out accuracy, mean number of
# catalysts found before the first non-binder, mean held-out set size.
print("mean:", np.mean(all_scores))
print("mean:", np.mean(best_catalysts))
print("mean:", np.mean(total_sites))

In [None]:
# Stack the per-split held-out frames into one table. The random splits are drawn
# independently, so the same site can appear more than once, each time carrying
# the prediction columns from the split it was held out in.
df_test_all = pd.concat(test_dfs)

In [None]:
# One output line per measure (catalysts found, first-false probability,
# first-false rank), each showing mean, minimum and maximum across splits.
for measure_list in (best_catalysts, false_proba_list, first_false_list):
    summary = (np.mean(measure_list), np.min(measure_list), np.max(measure_list))
    print(*summary)

Now that we've seen this is relatively successful in this framework, the next step is to do a head-to-head search comparison.
For a set of C catalysts, search until a subset of A active catalysts has been found, with the goal of checking O2 binding for as few sites as possible.
This is really quite similar to the above, but we want to keep track of slightly different metrics. For each group, we now want to ask instead how many O2 binding checks are needed to find the target number of active catalysts.

In [None]:
def search_for_active_catalysts(df_catalysts, order_col, feature_cols, target_col="Doesitbind", find_num=10):
    """Search sites in descending ``order_col`` order for distinct active catalysts.

    Sites are visited from the highest to the lowest value of ``order_col``.
    Sites belonging to a catalyst that has already been found active are
    skipped for free; every other visited site counts as one check. The search
    stops as soon as ``find_num`` distinct active catalysts have been found, or
    when the sites run out.

    Args:
        df_catalysts (pandas dataframe): sites to search; must contain the
            columns "Catalyst Name", ``order_col`` and ``target_col``.
        order_col (str): column name to sort sites by. Expected to be
            predict_proba or random values.
        feature_cols: unused; kept so existing callers keep working.
        target_col (str): column whose value is 1 for an active (binding) site.
        find_num (int): number of distinct active catalysts to find.

    Returns:
        tuple: ``(found_list, count)`` where ``found_list`` holds the names of
        the active catalysts in the order they were found and ``count`` is the
        number of sites that were checked.
    """
    # Nothing to look for: report an empty search instead of checking a site
    # (the previous version either tripped an assert or returned after
    # counting one site).
    if find_num <= 0:
        return [], 0

    df_sort = df_catalysts.sort_values(by=order_col, ascending=False)
    found_list = []
    found_names = set()  # O(1) membership test; found_list preserves discovery order
    count = 0
    # Iterate only the two columns that are needed instead of building a
    # Series per row with iterrows.
    for name, is_active in zip(df_sort["Catalyst Name"], df_sort[target_col] == 1):
        if name in found_names:
            continue
        count += 1
        if is_active:
            found_list.append(name)
            found_names.add(name)
            if len(found_list) == find_num:
                break
    return found_list, count

In [None]:
# Work on a copy of the full dataset so columns added to df_ts do not touch df.
df_ts = df.copy()
df_ts

In [None]:
# Attach one uniform random number per row; sorting by this column yields a
# random search order. The generator is unseeded, so the order changes per run.
df_ts = df_ts.assign(random_ordering=np.random.rand(df_ts.shape[0]))

In [None]:
df_ts.iloc[0].random_ordering

In [None]:
# Random-order search over the whole dataset for up to 100 active catalysts.
l, c = search_for_active_catalysts(df_ts, order_col="random_ordering", feature_cols=feature_cols, find_num=100)
# Number of active catalysts found.
print(len(l))
# Number of sites that had to be checked to find them.
print(c)

In [None]:
# Remove rows that are identical in every column.
# NOTE(review): the comparison includes the predict_proba and prediction columns,
# so a site held out in several splits with different predictions is kept once
# per distinct prediction -- confirm this is the intended de-duplication.
df_test_all = df_test_all.drop_duplicates()

In [None]:
# Attach one uniform random number per held-out row to serve as the random
# search order. The generator is unseeded, so the order changes per run.
df_test_all = df_test_all.assign(random_ordering=np.random.rand(df_test_all.shape[0]))

In [None]:
# Head-to-head comparison on the pooled held-out sites, each searching for up
# to 100 active catalysts. First: random order.
l_O2, c_O2 = search_for_active_catalysts(df_test_all, order_col="random_ordering", feature_cols=feature_cols, find_num=100)
# Prints (catalysts found, sites checked) for the random order.
print(len(l_O2), c_O2)
# Second: order by the model's predicted binding probability.
l_t, c_t = search_for_active_catalysts(df_test_all, order_col="predict_proba", feature_cols=feature_cols, find_num=100)
# Prints (catalysts found, sites checked) for the model-guided order.
print(len(l_t), c_t)