# Use model checkpoints for prediction

In [16]:
from transformers import BertModel, BertTokenizer, RobertaTokenizer,AutoTokenizer, AutoModelForMaskedLM,AutoModel 
import torch

# Identifier of the pretrained ChemBERTa checkpoint used to embed SMILES strings.
embed_model_name = 'DeepChem/ChemBERTa-77M-MLM'

# Tokenizer and encoder are loaded once at module level; get_bert_embeddings()
# reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(embed_model_name)
# Loading emits a "weights ... newly initialized" warning for the pooler layer.
# get_bert_embeddings() reads last_hidden_state rather than the pooler output,
# so the untrained pooler weights are not part of the embeddings it returns.
# output_hidden_states=True also requests the per-layer hidden states; the
# visible code only uses last_hidden_state.
embed_model = AutoModel.from_pretrained(embed_model_name, output_hidden_states=True)

def get_bert_embeddings(smiles_strings):
    """Embed SMILES strings with the module-level ChemBERTa encoder.

    The input is tokenized as a single padded, truncated batch and run through
    ``embed_model`` with gradient tracking switched off.

    Args:
        smiles_strings: SMILES text in a form accepted by ``tokenizer``
            (the callers in this notebook pass a list of strings).

    Returns:
        torch.Tensor: for every input, the last-layer hidden state at sequence
        position 0.
    """
    batch = tokenizer(
        smiles_strings,
        return_tensors='pt',
        padding=True,
        truncation=True,
    )
    with torch.no_grad():
        last_layer = embed_model(**batch).last_hidden_state
        # Position 0 holds the leading special ([CLS]-style) token, which is
        # used here as a summary of the whole sequence.
        return last_layer[:, 0, :]


Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Prediction without features

In [4]:
# Inline CSV without a header row: each line is "<SMILES>,<label>" with a 0/1
# label (five rows of each). It is parsed into a DataFrame in a later cell.
sample_data = """
C=CC(=O)N1CN(CC=2C=CC=CC2)C(=O)C1,0
C=CC(=O)NCC(=O)N1CCC=2C=CC=CC2C1,0
CN(C)S(=O)(=O)C=1C=CC=2CN(CC2C1)C(=O)C=C,0
CN1N=CC=2C=CC(CNC(=O)C=C)=CC12,0
CC(NC(=O)C=C)C(=O)N1CCC=2C=CC=CC12,0
C=CC(=O)N1CCC(CC1)C2=NC=3C=CC=CC3O2,1
CCC(=O)N(C)C=1C=CC(NC(=O)C=C)=CC1,1
OCC1(CO)CC(C1)NC(=O)C=C,1
C=CC(=O)N1CCN(CC1)C(=O)CN2CCCC2,1
CN(CC=1C=CC=C2OCOC21)C(=O)C=C,1"""

In [None]:
# Names of the ten molecular descriptor columns, listed in the same order as the
# descriptor keys of the "with features" records further down.
# No other cell visible in this notebook reads this list.
feature_columns = [
    "Molecular Weight",
    "LogP",
    "Number of Atoms",
    "Number of Bonds",
    "Number of Rings",
    "Rotatable Bonds Count",
    "Hydrogen Bond Donors",
    "Hydrogen Bond Acceptors",
    "Number of Stereocenters",
    "Topological Polar Surface Area (TPSA)",
]

In [14]:
import pandas as pd
from io import StringIO
# pandas reads from file-like objects, so the inline CSV text is wrapped in an
# in-memory buffer. The text has no header row, hence header=None and the
# explicit column names.
data = StringIO(sample_data)  # Treat the string as file-like for pandas
df = pd.read_csv(data, header=None, names=["SMILES", "result"])
# Bare expression: renders the parsed frame as the cell output.
df

Unnamed: 0,SMILES,result
0,C=CC(=O)N1CN(CC=2C=CC=CC2)C(=O)C1,0
1,C=CC(=O)NCC(=O)N1CCC=2C=CC=CC2C1,0
2,CN(C)S(=O)(=O)C=1C=CC=2CN(CC2C1)C(=O)C=C,0
3,CN1N=CC=2C=CC(CNC(=O)C=C)=CC12,0
4,CC(NC(=O)C=C)C(=O)N1CCC=2C=CC=CC12,0
5,C=CC(=O)N1CCC(CC1)C2=NC=3C=CC=CC3O2,1
6,CCC(=O)N(C)C=1C=CC(NC(=O)C=C)=CC1,1
7,OCC1(CO)CC(C1)NC(=O)C=C,1
8,C=CC(=O)N1CCN(CC1)C(=O)CN2CCCC2,1
9,CN(CC=1C=CC=C2OCOC21)C(=O)C=C,1


In [42]:
import joblib
import numpy as np
# Classifiers for which a checkpoint is expected under ml-checkpoints/wo_features/.
model_names = [
    "KNeighborsClassifier",
    "GradientBoostingClassifier",
    "SVC",
    "GaussianNB",
    "LogisticRegression",
    "RandomForestClassifier",
    "DecisionTreeClassifier",
]

# Embed every SMILES string once; all classifiers share the same input.
embeddings = get_bert_embeddings(df["SMILES"].tolist())

# Run each saved classifier and store its labels in a column named after it.
for model_name in model_names:
    checkpoint_path = f"ml-checkpoints/wo_features/Chembert_embedding_best_{model_name}.pkl"
    try:
        # joblib.load unpickles the file, so only load checkpoints you trust.
        model = joblib.load(checkpoint_path)
        predicted_labels = model.predict(embeddings)
        df[model_name] = predicted_labels
        print(f"Predictions added for {model_name}.")
    except FileNotFoundError:
        # A missing checkpoint is reported and leaves no column behind.
        print(f"Checkpoint for {model_name} not found. Skipping.")

Predictions added for KNeighborsClassifier.
Predictions added for GradientBoostingClassifier.
Predictions added for SVC.
Predictions added for GaussianNB.
Predictions added for LogisticRegression.
Predictions added for RandomForestClassifier.
Predictions added for DecisionTreeClassifier.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [43]:
# Display the ground-truth `result` column next to each model's predicted labels.
df

Unnamed: 0,SMILES,result,KNeighborsClassifier,GradientBoostingClassifier,SVC,GaussianNB,LogisticRegression,RandomForestClassifier,DecisionTreeClassifier
0,C=CC(=O)N1CN(CC=2C=CC=CC2)C(=O)C1,0,0,1,1,1,1,0,0
1,C=CC(=O)NCC(=O)N1CCC=2C=CC=CC2C1,0,0,0,0,1,0,0,0
2,CN(C)S(=O)(=O)C=1C=CC=2CN(CC2C1)C(=O)C=C,0,0,1,1,1,0,0,0
3,CN1N=CC=2C=CC(CNC(=O)C=C)=CC12,0,0,0,0,0,0,0,1
4,CC(NC(=O)C=C)C(=O)N1CCC=2C=CC=CC12,0,1,0,0,1,1,1,0
5,C=CC(=O)N1CCC(CC1)C2=NC=3C=CC=CC3O2,1,1,0,0,1,1,1,0
6,CCC(=O)N(C)C=1C=CC(NC(=O)C=C)=CC1,1,0,0,0,0,0,0,0
7,OCC1(CO)CC(C1)NC(=O)C=C,1,0,0,1,0,0,0,0
8,C=CC(=O)N1CCN(CC1)C(=O)CN2CCCC2,1,1,1,1,1,1,1,1
9,CN(CC=1C=CC=C2OCOC21)C(=O)C=C,1,1,1,0,1,1,1,1


In [45]:
# Assuming you have the ground truth
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Collect one row of scores per model. The ground truth is the "result" column
# of `df`; each model's predictions live in the column named after the model.
y_true = df["result"]
metrics = []

# The loop variable is `model_name` (not `model`) so it does not overwrite the
# estimator object loaded by the prediction cell.
for model_name in model_names:
    # The prediction cell skips models whose checkpoint file is missing, so no
    # column is created for them. Skip them here too instead of raising KeyError.
    if model_name not in df.columns:
        print(f"No predictions for {model_name}. Skipping.")
        continue

    y_pred = df[model_name]
    metrics.append({
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
    })

# Passing the column list keeps the table's layout stable even if every model
# was skipped and `metrics` is empty.
metrics_df = pd.DataFrame(
    metrics,
    columns=["Model", "Accuracy", "F1 Score", "Precision", "Recall"],
)
metrics_df

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,KNeighborsClassifier,0.7,0.666667,0.75,0.6
1,GradientBoostingClassifier,0.5,0.444444,0.5,0.4
2,SVC,0.5,0.444444,0.5,0.4
3,GaussianNB,0.4,0.5,0.428571,0.6
4,LogisticRegression,0.6,0.6,0.6,0.6
5,RandomForestClassifier,0.7,0.666667,0.75,0.6
6,DecisionTreeClassifier,0.6,0.5,0.666667,0.4


## Prediction with features

In [18]:
# Hand-written evaluation records for the "with features" models. Each record
# holds a SMILES string, ten numeric descriptors (same names and order as
# `feature_columns`) and a binary "Results" label; the first five records are
# labelled 1 and the last two 0.
# NOTE: this rebinds `data`, which an earlier cell used for the StringIO buffer.
data = [
  {
    "SMILES": "NC1=C2C(C=CC=C2)=C(S([O-])(=O)=O)C=C1/N=N/C(C=CC(C3=CC(C)=C(/N=N/C(C=C(S([O-])(=O)=O)C4=C5C=CC=C4)=C5N)C=C3)=C6)=C6C",
    "Molecular Weight": 678.752,
    "LogP": 8.08024,
    "Number of Atoms": 48,
    "Number of Bonds": 53,
    "Number of Rings": 6,
    "Rotatable Bonds Count": 7,
    "Hydrogen Bond Donors": 2,
    "Hydrogen Bond Acceptors": 12,
    "Number of Stereocenters": 0,
    "Topological Polar Surface Area (TPSA)": 215.88,
    "Results": 1
  },
  {
    "SMILES": "O=C(C1=C2C=CC=C1)C(SCCO)=C(SCCO)C2=O",
    "Molecular Weight": 310.396,
    "LogP": 1.7282,
    "Number of Atoms": 20,
    "Number of Bonds": 21,
    "Number of Rings": 2,
    "Rotatable Bonds Count": 6,
    "Hydrogen Bond Donors": 2,
    "Hydrogen Bond Acceptors": 6,
    "Number of Stereocenters": 0,
    "Topological Polar Surface Area (TPSA)": 74.6,
    "Results": 1
  },
  {
    "SMILES": "NCCCNC(C=CC1=C2C(C3=C(C=CC(O)=C34)O)=NN1CCNCCO)=C2C4=O",
    "Molecular Weight": 411.462,
    "LogP": 1.0015,
    "Number of Atoms": 30,
    "Number of Bonds": 33,
    "Number of Rings": 4,
    "Rotatable Bonds Count": 9,
    "Hydrogen Bond Donors": 6,
    "Hydrogen Bond Acceptors": 9,
    "Number of Stereocenters": 0,
    "Topological Polar Surface Area (TPSA)": 145.66,
    "Results": 1
  },
  {
    "SMILES": "O=C(C1=CC(O)=C(O)C(O)=C1)O[C@H]2[C@@H](C3=CC(O)=C(O)C(O)=C3)OC4=C(C(O)=CC(O)=C4)C2",
    "Molecular Weight": 458.375,
    "LogP": 2.2332,
    "Number of Atoms": 33,
    "Number of Bonds": 36,
    "Number of Rings": 4,
    "Rotatable Bonds Count": 3,
    "Hydrogen Bond Donors": 8,
    "Hydrogen Bond Acceptors": 11,
    "Number of Stereocenters": 2,
    "Topological Polar Surface Area (TPSA)": 197.37,
    "Results": 1
  },
  {
    "SMILES": "O=C1C=C(C2=CC=CC=C2)OC3=C1C(O)=C(C(O)=C3)O",
    "Molecular Weight": 270.24,
    "LogP": 2.5768,
    "Number of Atoms": 20,
    "Number of Bonds": 22,
    "Number of Rings": 3,
    "Rotatable Bonds Count": 1,
    "Hydrogen Bond Donors": 3,
    "Hydrogen Bond Acceptors": 5,
    "Number of Stereocenters": 0,
    "Topological Polar Surface Area (TPSA)": 90.9,
    "Results": 1
  },
  {
    "SMILES": "CC(C)N1N=CC=2C=C(NC(=O)C=C)C=CC12",
    "Molecular Weight": 229.283,
    "LogP": 2.7417,
    "Number of Atoms": 17,
    "Number of Bonds": 18,
    "Number of Rings": 2,
    "Rotatable Bonds Count": 3,
    "Hydrogen Bond Donors": 1,
    "Hydrogen Bond Acceptors": 3,
    "Number of Stereocenters": 0,
    "Topological Polar Surface Area (TPSA)": 46.92,
    "Results": 0
  },
  {
    "SMILES": "CCN1C=C(C=N1)C2CCN(CC2)C(=O)C=C",
    "Molecular Weight": 233.315,
    "LogP": 1.795,
    "Number of Atoms": 17,
    "Number of Bonds": 18,
    "Number of Rings": 2,
    "Rotatable Bonds Count": 3,
    "Hydrogen Bond Donors": 0,
    "Hydrogen Bond Acceptors": 3,
    "Number of Stereocenters": 0,
    "Topological Polar Surface Area (TPSA)": 38.13,
    "Results": 0
  }
]


In [28]:
from sklearn.preprocessing import StandardScaler
# Function to process the data (smiles and additional features)
def process_X_data(smiles, features, scaler=None):
    """
    Combine embeddings from SMILES and additional features, then scale the data.

    The embedding of each SMILES string (from ``get_bert_embeddings``) is
    concatenated column-wise with the matching row of ``features`` and the
    combined matrix is standardized.

    Parameters:
    - smiles: List of SMILES strings.
    - features: 2-D array-like of additional features, one row per SMILES string.
    - scaler: Pre-fitted scaler (e.g. the StandardScaler fitted on the training
      data) used to transform the combined matrix (optional). Supply it when
      predicting so inputs are scaled with the same statistics the models were
      trained on. When omitted, a new StandardScaler is fitted on the data
      passed in, which is the original behavior of this function.

    Returns:
    - Transformed feature matrix ready for prediction.

    Raises:
    - ValueError: if ``features`` is not 2-D or its row count differs from the
      number of SMILES embeddings.
    """
    embeddings = get_bert_embeddings(smiles).numpy()
    features = np.asarray(features)

    # Fail with an explicit message instead of the generic concatenate error.
    if features.ndim != 2 or features.shape[0] != embeddings.shape[0]:
        raise ValueError(
            "features must be a 2-D array with one row per SMILES string; "
            f"got shape {features.shape} for {embeddings.shape[0]} SMILES strings"
        )

    combined_features = np.concatenate((embeddings, features), axis=1)

    if scaler is not None:
        # Reuse the caller's fitted statistics; never refit on prediction data.
        return scaler.transform(combined_features)

    # Backward-compatible fallback: statistics come from `combined_features`
    # itself, so the output depends on which rows are in this batch.
    return StandardScaler().fit_transform(combined_features)

In [23]:
# Build the evaluation frame from the list of records in `data`.
test_data = pd.DataFrame(data)
test_smiles = test_data["SMILES"].tolist()
# Drop the SMILES text and the "Results" label; the remaining ten descriptor
# columns become the numeric feature matrix.
test_features = test_data.drop(columns=["SMILES","Results"]).to_numpy()

In [29]:
# Embed the SMILES strings, append the descriptor columns and standardize.
# NOTE(review): as called here, process_X_data fits a StandardScaler on these
# seven rows themselves rather than reusing statistics from training. Confirm
# this matches how the checkpoints were trained.
X_test = process_X_data(test_smiles, test_features)

In [37]:
import joblib
import numpy as np

# Classifiers for which a checkpoint is expected under ml-checkpoints/w_features/.
model_names = [
    "KNeighborsClassifier",
    "GradientBoostingClassifier",
    "SVC",
    "GaussianNB",
    "LogisticRegression",
    "RandomForestClassifier",
    "DecisionTreeClassifier",
]

# Run each saved classifier on X_test and store its labels in a column of
# `test_data` named after the model.
for model_name in model_names:
    checkpoint_path = f"ml-checkpoints/w_features/Chembert_embedding_best_{model_name}.pkl"
    try:
        # joblib.load unpickles the file, so only load checkpoints you trust.
        model = joblib.load(checkpoint_path)
        predicted_labels = model.predict(X_test)
        test_data[model_name] = predicted_labels
        print(f"Predictions added for {model_name}.")
    except FileNotFoundError:
        # A missing checkpoint is reported and leaves no column behind.
        print(f"Checkpoint for {model_name} not found. Skipping.")

Predictions added for KNeighborsClassifier.
Predictions added for GradientBoostingClassifier.
Predictions added for SVC.
Predictions added for GaussianNB.
Predictions added for LogisticRegression.
Predictions added for RandomForestClassifier.
Predictions added for DecisionTreeClassifier.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [38]:
# Display descriptors, the ground-truth "Results" label and each model's predicted labels.
test_data

Unnamed: 0,SMILES,Molecular Weight,LogP,Number of Atoms,Number of Bonds,Number of Rings,Rotatable Bonds Count,Hydrogen Bond Donors,Hydrogen Bond Acceptors,Number of Stereocenters,Topological Polar Surface Area (TPSA),Results,KNeighborsClassifier,GradientBoostingClassifier,SVC,GaussianNB,LogisticRegression,RandomForestClassifier,DecisionTreeClassifier
0,NC1=C2C(C=CC=C2)=C(S([O-])(=O)=O)C=C1/N=N/C(C=...,678.752,8.08024,48,53,6,7,2,12,0,215.88,1,0,1,0,0,0,0,1
1,O=C(C1=C2C=CC=C1)C(SCCO)=C(SCCO)C2=O,310.396,1.7282,20,21,2,6,2,6,0,74.6,1,1,1,0,1,1,0,0
2,NCCCNC(C=CC1=C2C(C3=C(C=CC(O)=C34)O)=NN1CCNCCO...,411.462,1.0015,30,33,4,9,6,9,0,145.66,1,1,1,1,1,1,1,1
3,O=C(C1=CC(O)=C(O)C(O)=C1)O[C@H]2[C@@H](C3=CC(O...,458.375,2.2332,33,36,4,3,8,11,2,197.37,1,1,0,0,1,1,0,1
4,O=C1C=C(C2=CC=CC=C2)OC3=C1C(O)=C(C(O)=C3)O,270.24,2.5768,20,22,3,1,3,5,0,90.9,1,0,0,1,0,0,0,0
5,CC(C)N1N=CC=2C=C(NC(=O)C=C)C=CC12,229.283,2.7417,17,18,2,3,1,3,0,46.92,0,0,1,0,1,1,0,0
6,CCN1C=C(C=N1)C2CCN(CC2)C(=O)C=C,233.315,1.795,17,18,2,3,0,3,0,38.13,0,1,0,0,0,0,1,0


In [41]:
# Assuming you have the ground truth
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Collect one row of scores per model. The ground truth is the "Results" column
# of `test_data`; each model's predictions live in the column named after it.
y_true = test_data["Results"]
metrics = []

# The loop variable is `model_name` (not `model`) so it does not overwrite the
# estimator object loaded by the prediction cell.
for model_name in model_names:
    # The prediction cell skips models whose checkpoint file is missing, so no
    # column is created for them. Skip them here too instead of raising KeyError.
    if model_name not in test_data.columns:
        print(f"No predictions for {model_name}. Skipping.")
        continue

    y_pred = test_data[model_name]
    metrics.append({
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
    })

# Passing the column list keeps the table's layout stable even if every model
# was skipped and `metrics` is empty.
metrics_df = pd.DataFrame(
    metrics,
    columns=["Model", "Accuracy", "F1 Score", "Precision", "Recall"],
)
metrics_df

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,KNeighborsClassifier,0.571429,0.666667,0.75,0.6
1,GradientBoostingClassifier,0.571429,0.666667,0.75,0.6
2,SVC,0.571429,0.571429,1.0,0.4
3,GaussianNB,0.571429,0.666667,0.75,0.6
4,LogisticRegression,0.571429,0.666667,0.75,0.6
5,RandomForestClassifier,0.285714,0.285714,0.5,0.2
6,DecisionTreeClassifier,0.714286,0.75,1.0,0.6
