In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from molfeat_hype.trans.llm_embeddings import LLMEmbeddingsTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import datamol as dm
# Take the first five entries of the "smiles" column returned by `dm.freesolv()`
# as a small demo input reused by the cells below.
smiles = dm.freesolv()["smiles"].values[:5]
smiles

array(['CN(C)C(=O)c1ccc(cc1)OC', 'CS(=O)(=O)Cl', 'CC(C)C=C', 'CCc1cnccn1',
       'CCCCCCCO'], dtype=object)

In these examples, we will explore various embeddings provided by the `molfeat-hype` plugin of `molfeat`.

## Classical Embeddings

### Using the OpenAI API for embeddings

In [9]:
# Build a transformer for the "openai/text-embedding-ada-002" embedding kind
# and apply it to the SMILES strings loaded earlier.
embedder = LLMEmbeddingsTransformer(kind="openai/text-embedding-ada-002")
out = embedder(smiles)
# Display the shape of the result: one row per input SMILES.
out.shape

(5, 1536)

In [10]:
# Length reported by the transformer itself
# (presumably the embedding dimension -- compare with `out.shape[1]`).
len(embedder)



1536

In [8]:
# Once the embedder has been called on `smiles`, its pre-computation cache
# should hold an entry for this molecule; display the length of that entry.
len(embedder.precompute_cache.get("CCCCCCCO"))

1536

### Using the Sentence-Transformers models

In [11]:
# Rebind `embedder` to the "sentence-transformers/all-mpnet-base-v2" kind
# and embed the same SMILES strings.
embedder = LLMEmbeddingsTransformer(kind="sentence-transformers/all-mpnet-base-v2")
out = embedder(smiles)
# Display the shape of the result: one row per input SMILES.
out.shape

Downloading (…)a8e1d/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 399kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 74.1kB/s]
Downloading (…)b20bca8e1d/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 3.72MB/s]
Downloading (…)0bca8e1d/config.json: 100%|██████████| 571/571 [00:00<00:00, 247kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 34.8kB/s]
Downloading (…)e1d/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 8.73MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:52<00:00, 8.32MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 13.4kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 79.9kB/s]
Downloading (…)a8e1d/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 5.11MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 88.3kB/s]
Downloading (…)8e1d/train_script.py: 100%|█

(5, 768)

### Using the Llama weights

To use the Llama weights, you need to obtain them first, then follow the instructions provided in the [llama.cpp](https://github.com/ggerganov/llama.cpp) repo to produce a 4-bit quantization of the model weights.

In [None]:
# Location of the quantized Llama model file (the `q4_0` ggml file under llama.cpp/models).
# NOTE(review): the name is spelled `lama_...`; probably meant `llama_...` -- check for users before renaming.
# NOTE(review): the path starts with "~"; confirm the consumer expands it, otherwise
# wrap it with `os.path.expanduser` before use.
lama_quantized_model_path = "~/Code/llama.cpp/models/7B/ggml-model-q4_0.bin"

### Case 1: Importing the calculator from molfeat_padel

In [3]:
from molfeat.trans import MoleculeTransformer
from molfeat_padel.calc import PadelDescriptors

# Pass an explicit PadelDescriptors instance as the featurizer and request float output.
mol_transf = MoleculeTransformer(featurizer=PadelDescriptors(), dtype=float)
out = mol_transf(smiles)
# Display the shape of the result: one row per input SMILES.
out.shape

(5, 2756)

### Case 2: auto registration of calculators

In [4]:
# `pprint` is imported here so this cell also works on a fresh kernel
# (it was previously used without any visible import).
from pprint import pprint

import molfeat_padel

# Importing molfeat_padel makes PadelDescriptors one of the available calculators in molfeat
from molfeat.calc import _CALCULATORS

# Pretty-print the calculator registry; it should contain a 'PadelDescriptors' entry.
pprint(_CALCULATORS)

{'AtomCalculator': <class 'molfeat.calc.atom.AtomCalculator'>,
 'AtomMaterialCalculator': <class 'molfeat.calc.atom.AtomMaterialCalculator'>,
 'BondCalculator': <class 'molfeat.calc.bond.BondCalculator'>,
 'CATS': <class 'molfeat.calc.cats.CATS'>,
 'DGLCanonicalAtomCalculator': <class 'molfeat.calc.atom.DGLCanonicalAtomCalculator'>,
 'DGLCanonicalBondCalculator': <class 'molfeat.calc.bond.DGLCanonicalBondCalculator'>,
 'DGLWeaveAtomCalculator': <class 'molfeat.calc.atom.DGLWeaveAtomCalculator'>,
 'DGLWeaveEdgeCalculator': <class 'molfeat.calc.bond.DGLWeaveEdgeCalculator'>,
 'EdgeMatCalculator': <class 'molfeat.calc.bond.EdgeMatCalculator'>,
 'ElectroShapeDescriptors': <class 'molfeat.calc.shape.ElectroShapeDescriptors'>,
 'FPCalculator': <class 'molfeat.calc.fingerprints.FPCalculator'>,
 'MordredDescriptors': <class 'molfeat.calc.descriptors.MordredDescriptors'>,
 'PadelDescriptors': <class 'molfeat_padel.calc.padel.PadelDescriptors'>,
 'Pharmacophore2D': <class 'molfeat.calc.pharmacop

In [5]:
# this is now possible
mol_transf = MoleculeTransformer(featurizer="PadelDescriptors")
out = mol_transf(smiles)
out.shape

(5, 2756)

### Case 3: Use (auto-)discovery

In [6]:
from molfeat.trans import MoleculeTransformer
from molfeat.plugins import load_registered_plugins
# In this example we specify the plugins we want to load,
# but you can also load all plugins by keeping the default value of `plugins`
load_registered_plugins(add_submodules=True, plugins=["molfeat_padel"])

In [7]:
# PadelDescriptors is also now a part of molfeat
from molfeat.calc import PadelDescriptors
mol_transf = MoleculeTransformer(featurizer=PadelDescriptors())
out = mol_transf(smiles)
out.shape

(5, 2756)

In [8]:
# it's also in the list of calculators, similar to case 2
from molfeat.calc import _CALCULATORS
pprint(_CALCULATORS)

{'AtomCalculator': <class 'molfeat.calc.atom.AtomCalculator'>,
 'AtomMaterialCalculator': <class 'molfeat.calc.atom.AtomMaterialCalculator'>,
 'BondCalculator': <class 'molfeat.calc.bond.BondCalculator'>,
 'CATS': <class 'molfeat.calc.cats.CATS'>,
 'DGLCanonicalAtomCalculator': <class 'molfeat.calc.atom.DGLCanonicalAtomCalculator'>,
 'DGLCanonicalBondCalculator': <class 'molfeat.calc.bond.DGLCanonicalBondCalculator'>,
 'DGLWeaveAtomCalculator': <class 'molfeat.calc.atom.DGLWeaveAtomCalculator'>,
 'DGLWeaveEdgeCalculator': <class 'molfeat.calc.bond.DGLWeaveEdgeCalculator'>,
 'EdgeMatCalculator': <class 'molfeat.calc.bond.EdgeMatCalculator'>,
 'ElectroShapeDescriptors': <class 'molfeat.calc.shape.ElectroShapeDescriptors'>,
 'FPCalculator': <class 'molfeat.calc.fingerprints.FPCalculator'>,
 'MordredDescriptors': <class 'molfeat.calc.descriptors.MordredDescriptors'>,
 'PadelDescriptors': <class 'molfeat_padel.calc.padel.PadelDescriptors'>,
 'Pharmacophore2D': <class 'molfeat.calc.pharmacop