In [4]:
import pandas as pd

# === 1. Load ===
pairs_df = pd.read_csv("pairs_df.csv")
compounds_df = pd.read_csv("compounds.csv")

# === 2. Normalize column names (trim whitespace, lower-case) ===
pairs_df.columns = pairs_df.columns.str.strip().str.lower()
compounds_df.columns = compounds_df.columns.str.strip().str.lower()

# === 3. Identify the ID and SMILES columns ===
# Fall back to the first / second column when the expected names are absent.
mol_id_col = "id" if "id" in compounds_df.columns else compounds_df.columns[0]
smiles_col = "smiles" if "smiles" in compounds_df.columns else compounds_df.columns[1]

# === 4. Add SMILES for both molecules of each pair ===
# The lookup columns are renamed explicitly for each side, so the output
# columns are always "smiles_mol1" / "smiles_mol2" even when the SMILES column
# was picked by position in step 3 (relying on merge suffixes only produced
# those names when the column was literally called "smiles").
# Both merges use the default inner join, so a pair whose molecule is missing
# from compounds.csv is dropped.
compound_lookup = compounds_df[[mol_id_col, smiles_col]]
merged = pairs_df
for side in ("mol1", "mol2"):
    key_col = f"_id_{side}"
    side_lookup = compound_lookup.rename(
        columns={mol_id_col: key_col, smiles_col: f"smiles_{side}"}
    )
    merged = merged.merge(side_lookup, left_on=side, right_on=key_col).drop(columns=key_col)

# Keep only the pairs whose separation is not the -1 sentinel
merged = merged[merged["separation"] != -1]

# === 5. Keep the relevant columns and sort by separation ===
merged = merged[["mol1", "smiles_mol1", "mol2", "smiles_mol2", "separation"]]
merged = merged.sort_values(by="separation").reset_index(drop=True)

# === 6. Export ===
merged.to_csv("merged_pairs_with_smiles.csv", index=False)

print("✅ Dataset criado: 'merged_pairs_with_smiles.csv'")



✅ Dataset criado: 'merged_pairs_with_smiles.csv'


In [8]:
from rdkit.Chem import MolFromSmiles
from deepmol.compound_featurization import NPClassifierFP

# Featurize a single example molecule (ethanol) to inspect the fingerprint size.
mol = MolFromSmiles("CCO")
fp = NPClassifierFP()
vector = fp._featurize(mol)

# Number of entries in the fingerprint; the original note labels this value
# as the max_distance.
print(len(vector))


6144


In [13]:
import numpy as np
# Read the stored value (a maximum distance, judging by the file name) and
# display it as a plain Python float.
float(np.load(file="NPClassifierFP_distance_max_value.txt.npy"))

1800.0

Ciclo for para ir buscar os 2 smiles ao dataset:

In [17]:
import numpy as np  # used below; previously only available if an earlier cell had imported it
import pandas as pd
from rdkit.Chem import MolFromSmiles
from deepmol.compound_featurization import NPClassifierFP

# Pairs of molecules with their SMILES strings
df = pd.read_csv("merged_pairs_with_smiles.csv")

fp = NPClassifierFP()

# Constant used to scale the Manhattan distance into a similarity.
# NOTE(review): NPClassifierFP_distance_max_value.txt.npy holds 1800.0 while
# this cell uses 500 — confirm which normalization constant is intended.
max_distance = 500

# One similarity value per row of df, in row order
sim = []

for idx, row in df.iterrows():
    smile1 = row["smiles_mol1"]
    smile2 = row["smiles_mol2"]
    mol1 = MolFromSmiles(smile1)
    mol2 = MolFromSmiles(smile2)

    if mol1 and mol2:
        fp1 = fp._featurize(mol1)
        fp2 = fp._featurize(mol2)
        # Manhattan (L1) distance between the two fingerprints
        d = np.sum(np.abs(fp1 - fp2))
        print(d)
        # Scale the distance by max_distance and turn it into a similarity
        similarity = 1 - d / max_distance
    else:
        # Without this branch the value from the previous row was appended
        # again (or a NameError was raised when the very first pair failed).
        similarity = np.nan

    sim.append(similarity)


df["NPClassifierFP"] = sim


df.to_csv("merged_pairs_with_npclassifierfp_manhattan.csv", index=False)
    
        



22.0
26.0
25.0
24.0
26.0
20.0
45.0
144.0
20.0
149.0
183.0
26.0
133.0
41.0
146.0
22.0
61.0
15.0
0.0
30.0
45.0
17.0
20.0
51.0
26.0
13.0
52.0
20.0
48.0
17.0
88.0
11.0
132.0
88.0
62.0
29.0
34.0
24.0
2.0
2.0
2.0
2.0
26.0
22.0
10.0
66.0
89.0
16.0
25.0
2.0
2.0
2.0
2.0
2.0
47.0
20.0
30.0
73.0
20.0
30.0
16.0
24.0
40.0
128.0
122.0
18.0
18.0
35.0
22.0
28.0
20.0
32.0
52.0
180.0
81.0
26.0
34.0
30.0
30.0
30.0
30.0
17.0
73.0
42.0
47.0
24.0
155.0
41.0
42.0
30.0
43.0
26.0
24.0
7.0
62.0
20.0
35.0
48.0
15.0
20.0
16.0
71.0
49.0
15.0
25.0
19.0
28.0
27.0
133.0
30.0
16.0
24.0
82.0
22.0
17.0
136.0
26.0
24.0
18.0
29.0
0.0
75.0
77.0
12.0
12.0
12.0
26.0
39.0
46.0
66.0
29.0
34.0
80.0
51.0
40.0
12.0
20.0
15.0
37.0
47.0
73.0
73.0
73.0
73.0
20.0
78.0
66.0
26.0
22.0
47.0
15.0
30.0
80.0
24.0
30.0
22.0
42.0
20.0
22.0
37.0
12.0
36.0
76.0
37.0
118.0
50.0
16.0
16.0
16.0
16.0
73.0
29.0
58.0
34.0
30.0
120.0
22.0
22.0
22.0
26.0
149.0
73.0
24.0
15.0
27.0
62.0
64.0
2.0
40.0
100.0
16.0
18.0
36.0
18.0
29.0
35.0
178.0
43.0
85.0
6

In [None]:
import pandas as pd
import numpy as np
from rdkit.Chem import MolFromSmiles
from deepmol.compound_featurization import NPClassifierFP

# Featurizer applied to both molecules of every pair
fp = NPClassifierFP()

# Molecule pairs with their SMILES strings
df = pd.read_csv("merged_pairs_with_smiles.csv")

# Collects one similarity value per DataFrame row
sim = list()

# Cosine similarity helper
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two feature vectors.

    Both inputs are converted to flat float64 arrays before any arithmetic, so
    integer-typed fingerprints (e.g. uint8 bit vectors) cannot overflow in the
    dot product and (1, n)-shaped vectors are accepted.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors holding the same number of elements.

    Returns
    -------
    float
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``np.nan`` when either
        vector has zero norm (the similarity is undefined in that case).

    Raises
    ------
    ValueError
        If the flattened vectors differ in length.
    """
    a = np.asarray(v1, dtype=np.float64).ravel()
    b = np.asarray(v2, dtype=np.float64).ravel()
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return np.nan
    return np.dot(a, b) / norm_product

# Walk through the SMILES pairs column by column, one pair per DataFrame row
for smile1, smile2 in zip(df["smiles_mol1"], df["smiles_mol2"]):
    mol1 = MolFromSmiles(smile1)
    mol2 = MolFromSmiles(smile2)

    if not mol1 or not mol2:
        # At least one molecule is falsy (presumably an unparseable SMILES),
        # so no similarity is computed for this pair.
        similarity = np.nan
    else:
        fp1 = fp._featurize(mol1)
        fp2 = fp._featurize(mol2)
        similarity = cosine_similarity(fp1, fp2)

    sim.append(similarity)

# Store the results in a column named after the featurizer class
column_name = fp.__class__.__name__
df[column_name] = sim

# Save (optional)
df.to_csv("merged_pairs_with_npclassifierfp_cosine.csv", index=False)

print("✅ Similaridade do cosseno calculada com sucesso.")


✅ Similaridade do cosseno calculada com sucesso.


In [None]:
import pandas as pd
import numpy as np
from rdkit.Chem import MolFromSmiles
from deepmol.compound_featurization import NPClassifierFP, BiosynfoniKeys, NeuralNPFP, MHFP, MorganFingerprint
from deepmol.compound_featurization import LLM
from transformers import BertConfig, BertModel 
from deepmol.tokenizers import NPBERTTokenizer
import os



# === 1. Load the dataset ===
df = pd.read_csv("merged_pairs_with_smiles.csv")

# === 2. Featurizers, keyed by the name of the output column ===
featurizers = dict(
    NPClassifierFP=NPClassifierFP(),
    Biosynfoni=BiosynfoniKeys(),
    NeuralNPFP=NeuralNPFP(),
    MHFP=MHFP(),
    MorganFingerprint=MorganFingerprint(),
)

# NPBERT is added last so the dictionary keeps the same order as before.
# NOTE(review): the device is hard-coded to "cuda:0" — presumably this needs a
# CUDA-capable GPU; confirm before running elsewhere.
npbert_tokenizer = NPBERTTokenizer(vocab_file=os.path.join("NPBERT", "vocab.txt"))
featurizers["NPBERT"] = LLM(
    model_path="NPBERT",
    model=BertModel,
    config_class=BertConfig,
    tokenizer=npbert_tokenizer,
    device="cuda:0",
)

# === 3. Cosine similarity function ===
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two feature vectors.

    Both inputs are converted to flat float64 arrays before any arithmetic, so
    integer-typed fingerprints (e.g. uint8 bit vectors) cannot overflow in the
    dot product and (1, n)-shaped vectors are accepted.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors holding the same number of elements.

    Returns
    -------
    float
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``np.nan`` when either
        vector has zero norm (the similarity is undefined in that case).

    Raises
    ------
    ValueError
        If the flattened vectors differ in length.
    """
    a = np.asarray(v1, dtype=np.float64).ravel()
    b = np.asarray(v2, dtype=np.float64).ravel()
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return np.nan
    return np.dot(a, b) / norm_product

# === 4. Apply each featurizer ===
import warnings

for name, featurizer in featurizers.items():
    print(f"🔄 A calcular similaridade com: {name}")
    similarities = []
    # (row label, exception) for every pair whose featurization or similarity
    # computation raised; reported once per featurizer below.
    failures = []

    for idx, row in df.iterrows():
        smile1 = row["smiles_mol1"]
        smile2 = row["smiles_mol2"]
        mol1 = MolFromSmiles(smile1)
        mol2 = MolFromSmiles(smile2)

        # NaN is kept whenever a molecule is falsy or the featurizer fails
        similarity = np.nan
        if mol1 and mol2:
            try:
                fp1 = featurizer._featurize(mol1)
                fp2 = featurizer._featurize(mol2)
                similarity = cosine_similarity(fp1, fp2)
            except Exception as exc:
                # Still best-effort (the row keeps NaN), but the error is
                # recorded instead of being silently discarded.
                failures.append((idx, exc))

        similarities.append(similarity)

    if failures:
        first_idx, first_error = failures[0]
        warnings.warn(
            f"{name}: {len(failures)} of {len(df)} pairs raised an error and were set to NaN "
            f"(first failure at row {first_idx}: {first_error!r})",
            RuntimeWarning,
        )

    df[name] = similarities

# === 5. Save the final result ===
df.to_csv("merged_pairs_with_all_fps_cosine.csv", index=False)
print("✅ Todas as similaridades foram calculadas com sucesso.")


Some weights of BertModel were not initialized from the model checkpoint at NPBERT/model.pt and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔄 A calcular similaridade com: NPClassifierFP
🔄 A calcular similaridade com: Biosynfoni
🔄 A calcular similaridade com: NeuralNPFP


  fp = torch.tensor([fp], dtype=torch.float)
