In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

# Load the ChemBERTa tokenizer and encoder from one shared checkpoint id,
# so the two cannot accidentally point at different checkpoints.
CHEMBERTA_CHECKPOINT = "DeepChem/ChemBERTa-77M-MLM"
tokenizer = AutoTokenizer.from_pretrained(CHEMBERTA_CHECKPOINT)
model = AutoModel.from_pretrained(CHEMBERTA_CHECKPOINT)

# Function to extract a ChemBERTa embedding for a single SMILES string
def get_chemberta_embedding(smiles):
    """Return a mean-pooled ChemBERTa embedding for one SMILES string.

    Parameters
    ----------
    smiles : str or any missing/non-string value
        The SMILES string to embed. Anything that is not a ``str``
        (NaN, ``None``, numbers, array-likes, ...) is treated as missing.

    Returns
    -------
    numpy.ndarray
        1-D array with ``model.config.hidden_size`` entries: the mean of the
        last hidden state over the token axis, or all zeros when ``smiles``
        is missing.
    """
    # A missing value is never a str (NaN is a float, None is NoneType), so the
    # isinstance check alone covers every fallback case. It also avoids calling
    # pd.isna on array-likes, whose result has no single truth value.
    if not isinstance(smiles, str):
        # Size the placeholder from the loaded model so that it always has the
        # same length as a real embedding (a hard-coded 768 did not match the
        # 384 feature columns the rest of the notebook builds).
        # float32 is assumed to match the model output dtype - TODO confirm if
        # the model is ever loaded in a different precision.
        return np.zeros(model.config.hidden_size, dtype=np.float32)

    inputs = tokenizer(smiles, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)

    # Mean pooling: average over the token axis (dim=1), then drop the
    # leading batch axis of size 1.
    return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()

# Load dataset
file_path = "split_smiles.csv"  # Ensure this file is in the working directory
df = pd.read_csv(file_path)


def _embedding_frame(smiles_series, prefix):
    """Embed each entry of `smiles_series` and spread the vectors over columns.

    Returns a DataFrame that shares `smiles_series`'s index and has one column
    per embedding dimension, named ``{prefix}_0`` ... ``{prefix}_{dim-1}``.
    """
    # Take the width from the loaded model instead of hard-coding it, so the
    # column names always match the vectors get_chemberta_embedding returns.
    embedding_dim = model.config.hidden_size
    vectors = smiles_series.apply(get_chemberta_embedding).to_list()
    return pd.DataFrame(
        vectors,
        index=smiles_series.index,  # keep rows aligned with the source frame
        columns=[f"{prefix}_{i}" for i in range(embedding_dim)],
    )


# Ensure the required columns exist
if "SMILES_part1" in df.columns and "SMILES_part2" in df.columns:
    # Embed each SMILES column and expand it into separate feature columns
    embedding_1_df = _embedding_frame(df["SMILES_part1"], "feat_1")
    embedding_2_df = _embedding_frame(df["SMILES_part2"], "feat_2")

    # Merge embeddings with the original data (index-aligned join)
    df = df.join([embedding_1_df, embedding_2_df])

    # Save to CSV
    output_path = "chemberta_features.csv"
    df.to_csv(output_path, index=False)
else:
    print("Error: Required columns 'SMILES_part1' and 'SMILES_part2' not found in the CSV file.")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
