In [1]:
# This notebook adapts code for IBM's SMI-TED (SMI-TED 289M) model from the fm4m repository.
# Original code: https://github.com/IBM/materials
# It has been modified to load our metabolite list and generate embeddings for our project.

In [15]:
import sys
# Add the sibling `models` directory and the parent directory to the import
# search path. Both entries are relative, so they are resolved against the
# working directory the kernel is running in.
# The membership check keeps this cell idempotent: re-running it does not
# stack duplicate entries onto sys.path.
for repo_path in ("../models", "../"):
    if repo_path not in sys.path:
        sys.path.append(repo_path)

In [16]:
import torch
# Record which PyTorch build produced the outputs in this notebook.
installed_torch_version = torch.__version__
print(installed_torch_version)

2.3.1


In [17]:
import models.fm4m as fm4m
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [None]:
# Load the metabolite table; later cells read its "SMILES" and
# "Exact Match to Standard (* = isomer family)" columns.
# The path is relative to the notebook's working directory.
metabolites_csv_path = "metabolites_to_SMILES.csv"
df = pd.read_csv(metabolites_csv_path)

In [42]:
# Sanity-check the loaded table: report its (rows, columns) shape.
print(f"shape of df: {df.shape}")
# Bare last expression so the notebook renders the first rows as a table.
df.head()

shape of df: (301, 3)


Unnamed: 0.1,Unnamed: 0,Exact Match to Standard (* = isomer family),SMILES
0,HILIC-neg_Cluster_0622,"1,2,3,4-tetrahydro-1-methyl-beta-carboline-3-c...",CC1NC(Cc2c1[nH]c3ccccc23)C(O)=O
1,C18-neg_Cluster_0183,"1,2,3,4-tetrahydro-b-carboline-1,3-dicarboxyli...",OC(=O)C1Cc2c([nH]c3ccccc23)C(N1)C(O)=O
2,C18-neg_Cluster_0393,12.13-diHOME,CCCCCC(O)C(O)C\C=C/CCCCCCCC(O)=O
3,HILIC-neg_Cluster_0480,1-3-7-trimethylurate,CN1C(=O)N(C)C2=C(N(C)C(=O)N2)C1=O
4,C18-neg_Cluster_0530,13-docosenoate,CCCCCCCCC=CCCCCCCCCCCCC([O-])=O


In [None]:
# Name of the `df` column that holds the SMILES strings. Deliberately not
# called `input`: that name would shadow Python's built-in input() for the
# rest of the kernel session.
smiles_column = "SMILES"
# Plain Python list of the SMILES values, in the same row order as `df`;
# this is what gets handed to the embedding model.
x = df[smiles_column].tolist()

In [44]:
# Show the models fm4m reports as available (rendered as a table of model
# names and descriptions); 'SMI-TED' is the one used in this notebook.
fm4m.avail_models()

Unnamed: 0,Model Name,Description
0,SMI-TED,SMILES based encoder decoder model
1,SELFIES-TED,BART model for string based SELFIES modality
2,MolFormer,MolFormer model for string based SMILES modality
3,MHG-GED,Molecular hypergraph model
4,POS-EGNN,3D atom position model
5,Mordred,Baseline: A descriptor-calculation software ap...
6,MorganFingerprint,Baseline: Circular atom environments based des...


In [None]:
# Embed every SMILES string in `x` with the SMI-TED model.
# NOTE(review): the second positional argument looks like a second input set
# that the API requires; a one-atom SMILES is passed as a placeholder and its
# result is discarded via `_` — confirm against fm4m.get_representation.
# NOTE(review): return_tensor=False presumably yields a pandas DataFrame
# (x_batch is treated as one in the following cells) — confirm.
placeholder_smiles = ["C"]
x_batch, _ = fm4m.get_representation(
    x,
    placeholder_smiles,
    model_type="SMI-TED",
    return_tensor=False,
)

Random Seed: 12345
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding


  return bound(*args, **kwds)


Vocab size: 2393
[INFERENCE MODE - smi-ted-Light]


100%|██████████| 3/3 [00:03<00:00,  1.22s/it]
100%|██████████| 3/3 [00:03<00:00,  1.22s/it]
100%|██████████| 1/1 [00:00<00:00, 38.17it/s]



In [46]:
# Label each embedding row with the SMILES string it was computed from.
# This relies on x_batch keeping the same row order as df.
smiles_labels = df["SMILES"].values
x_batch.index = smiles_labels

In [None]:
# Confirm the embedding table's dimensions (rows x embedding width).
print(f"x_batch shape: {x_batch.shape}")
# Bare last expression so the notebook renders the frame.
x_batch

x_batch shape: (301, 768)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
CC1NC(Cc2c1[nH]c3ccccc23)C(O)=O,0.396364,-0.479849,0.029331,0.412555,0.483709,-0.672527,1.218773,0.049687,0.792117,0.237337,...,-1.286233,-0.159936,0.113961,0.324219,-0.737211,0.671974,-0.658567,0.843720,0.621036,-0.203670
OC(=O)C1Cc2c([nH]c3ccccc23)C(N1)C(O)=O,0.373362,-0.525701,0.114096,0.410386,0.536962,-0.554213,1.127852,0.100569,0.782503,0.229514,...,-1.301916,-0.072891,0.189045,0.329801,-0.704215,0.701316,-0.683768,0.820262,0.615056,-0.244778
CCCCCC(O)C(O)C\C=C/CCCCCCCC(O)=O,0.429987,-0.495715,0.084854,0.393842,0.474923,-0.659842,1.207936,0.099024,0.822189,0.184729,...,-1.327696,-0.238864,0.233488,0.289160,-0.485545,0.575598,-0.649370,0.821234,0.543072,-0.091636
CN1C(=O)N(C)C2=C(N(C)C(=O)N2)C1=O,0.412181,-0.488206,0.068995,0.409006,0.525677,-0.660098,1.085802,0.127317,0.819204,0.168899,...,-1.156412,-0.219103,0.082331,0.297099,-0.628251,0.528107,-0.730679,0.861326,0.635826,-0.120889
CCCCCCCCC=CCCCCCCCCCCCC([O-])=O,0.306383,-0.431526,0.088750,0.408827,0.541020,-0.638241,1.247241,0.124539,0.833546,0.227461,...,-1.219691,-0.258938,0.234831,0.278468,-0.497933,0.537282,-0.646706,0.856944,0.507465,0.069915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OC(=O)/C=C/c1[nH]cnc1,0.414063,-0.602291,0.077173,0.399463,0.376349,-0.576629,1.148321,0.076537,0.827023,0.267655,...,-1.282969,-0.164353,0.199121,0.200877,-0.717913,0.522378,-0.709920,0.920468,0.585280,-0.261188
CC(C)[C@H](N)C(O)=O,0.269893,-0.493490,0.101624,0.418190,0.451939,-0.629112,1.181610,0.026472,0.871132,0.206574,...,-1.242208,-0.096259,0.173340,0.307831,-0.590213,0.562042,-0.686225,0.929747,0.671628,-0.223394
CC(C)[C@H](N)C(O)=O,0.269893,-0.493490,0.101624,0.418190,0.451939,-0.629112,1.181610,0.026472,0.871132,0.206574,...,-1.242208,-0.096259,0.173340,0.307831,-0.590213,0.562042,-0.686225,0.929747,0.671628,-0.223394
O=C1NC(=O)c2[nH]cnc2N1,0.460566,-0.499067,0.104201,0.371852,0.328459,-0.640671,1.095554,0.034319,0.824624,0.223048,...,-1.138119,-0.156273,0.123117,0.298206,-0.741723,0.483435,-0.636165,0.944731,0.637915,-0.206417


In [53]:
# Give the embedding dimensions readable names (emb_0, emb_1, ...) and move the
# SMILES row labels out of the index into a regular "SMILES" column.
#
# The guard makes this cell safe to re-run. Without it, a second execution
# would count the already-added "SMILES" column as an embedding dimension,
# rename it to emb_0, and then create a new "SMILES" column holding only
# integer row numbers.
if "SMILES" not in x_batch.columns:
    x_batch.columns = [f"emb_{i}" for i in range(x_batch.shape[1])]
    x_batch = x_batch.reset_index().rename(columns={"index": "SMILES"})

# Names of the embedding columns only (excludes SMILES and any other
# non-embedding columns present on a re-run).
embedding_columns = [col for col in x_batch.columns if str(col).startswith("emb_")]

x_batch

Unnamed: 0,SMILES,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,CC1NC(Cc2c1[nH]c3ccccc23)C(O)=O,0.396364,-0.479849,0.029331,0.412555,0.483709,-0.672527,1.218773,0.049687,0.792117,...,-1.286233,-0.159936,0.113961,0.324219,-0.737211,0.671974,-0.658567,0.843720,0.621036,-0.203670
1,OC(=O)C1Cc2c([nH]c3ccccc23)C(N1)C(O)=O,0.373362,-0.525701,0.114096,0.410386,0.536962,-0.554213,1.127852,0.100569,0.782503,...,-1.301916,-0.072891,0.189045,0.329801,-0.704215,0.701316,-0.683768,0.820262,0.615056,-0.244778
2,CCCCCC(O)C(O)C\C=C/CCCCCCCC(O)=O,0.429987,-0.495715,0.084854,0.393842,0.474923,-0.659842,1.207936,0.099024,0.822189,...,-1.327696,-0.238864,0.233488,0.289160,-0.485545,0.575598,-0.649370,0.821234,0.543072,-0.091636
3,CN1C(=O)N(C)C2=C(N(C)C(=O)N2)C1=O,0.412181,-0.488206,0.068995,0.409006,0.525677,-0.660098,1.085802,0.127317,0.819204,...,-1.156412,-0.219103,0.082331,0.297099,-0.628251,0.528107,-0.730679,0.861326,0.635826,-0.120889
4,CCCCCCCCC=CCCCCCCCCCCCC([O-])=O,0.306383,-0.431526,0.088750,0.408827,0.541020,-0.638241,1.247241,0.124539,0.833546,...,-1.219691,-0.258938,0.234831,0.278468,-0.497933,0.537282,-0.646706,0.856944,0.507465,0.069915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,OC(=O)/C=C/c1[nH]cnc1,0.414063,-0.602291,0.077173,0.399463,0.376349,-0.576629,1.148321,0.076537,0.827023,...,-1.282969,-0.164353,0.199121,0.200877,-0.717913,0.522378,-0.709920,0.920468,0.585280,-0.261188
297,CC(C)[C@H](N)C(O)=O,0.269893,-0.493490,0.101624,0.418190,0.451939,-0.629112,1.181610,0.026472,0.871132,...,-1.242208,-0.096259,0.173340,0.307831,-0.590213,0.562042,-0.686225,0.929747,0.671628,-0.223394
298,CC(C)[C@H](N)C(O)=O,0.269893,-0.493490,0.101624,0.418190,0.451939,-0.629112,1.181610,0.026472,0.871132,...,-1.242208,-0.096259,0.173340,0.307831,-0.590213,0.562042,-0.686225,0.929747,0.671628,-0.223394
299,O=C1NC(=O)c2[nH]cnc2N1,0.460566,-0.499067,0.104201,0.371852,0.328459,-0.640671,1.095554,0.034319,0.824624,...,-1.138119,-0.156273,0.123117,0.298206,-0.741723,0.483435,-0.636165,0.944731,0.637915,-0.206417


In [None]:
# Prepend the metabolite name for each row. Values are taken positionally from
# df, so this relies on x_batch still being in the same row order as df.
#
# DataFrame.insert raises ValueError if the column already exists, so the
# guard lets this cell be re-run without failing.
if "Metabolite" not in x_batch.columns:
    x_batch.insert(
        0,
        "Metabolite",
        df["Exact Match to Standard (* = isomer family)"].values,
    )

x_batch

Unnamed: 0,Metabolite,SMILES,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,"1,2,3,4-tetrahydro-1-methyl-beta-carboline-3-c...",CC1NC(Cc2c1[nH]c3ccccc23)C(O)=O,0.396364,-0.479849,0.029331,0.412555,0.483709,-0.672527,1.218773,0.049687,...,-1.286233,-0.159936,0.113961,0.324219,-0.737211,0.671974,-0.658567,0.843720,0.621036,-0.203670
1,"1,2,3,4-tetrahydro-b-carboline-1,3-dicarboxyli...",OC(=O)C1Cc2c([nH]c3ccccc23)C(N1)C(O)=O,0.373362,-0.525701,0.114096,0.410386,0.536962,-0.554213,1.127852,0.100569,...,-1.301916,-0.072891,0.189045,0.329801,-0.704215,0.701316,-0.683768,0.820262,0.615056,-0.244778
2,12.13-diHOME,CCCCCC(O)C(O)C\C=C/CCCCCCCC(O)=O,0.429987,-0.495715,0.084854,0.393842,0.474923,-0.659842,1.207936,0.099024,...,-1.327696,-0.238864,0.233488,0.289160,-0.485545,0.575598,-0.649370,0.821234,0.543072,-0.091636
3,1-3-7-trimethylurate,CN1C(=O)N(C)C2=C(N(C)C(=O)N2)C1=O,0.412181,-0.488206,0.068995,0.409006,0.525677,-0.660098,1.085802,0.127317,...,-1.156412,-0.219103,0.082331,0.297099,-0.628251,0.528107,-0.730679,0.861326,0.635826,-0.120889
4,13-docosenoate,CCCCCCCCC=CCCCCCCCCCCCC([O-])=O,0.306383,-0.431526,0.088750,0.408827,0.541020,-0.638241,1.247241,0.124539,...,-1.219691,-0.258938,0.234831,0.278468,-0.497933,0.537282,-0.646706,0.856944,0.507465,0.069915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,urocanic acid,OC(=O)/C=C/c1[nH]cnc1,0.414063,-0.602291,0.077173,0.399463,0.376349,-0.576629,1.148321,0.076537,...,-1.282969,-0.164353,0.199121,0.200877,-0.717913,0.522378,-0.709920,0.920468,0.585280,-0.261188
297,valine,CC(C)[C@H](N)C(O)=O,0.269893,-0.493490,0.101624,0.418190,0.451939,-0.629112,1.181610,0.026472,...,-1.242208,-0.096259,0.173340,0.307831,-0.590213,0.562042,-0.686225,0.929747,0.671628,-0.223394
298,valine,CC(C)[C@H](N)C(O)=O,0.269893,-0.493490,0.101624,0.418190,0.451939,-0.629112,1.181610,0.026472,...,-1.242208,-0.096259,0.173340,0.307831,-0.590213,0.562042,-0.686225,0.929747,0.671628,-0.223394
299,xanthine,O=C1NC(=O)c2[nH]cnc2N1,0.460566,-0.499067,0.104201,0.371852,0.328459,-0.640671,1.095554,0.034319,...,-1.138119,-0.156273,0.123117,0.298206,-0.741723,0.483435,-0.636165,0.944731,0.637915,-0.206417


In [None]:
# Persist the final table (Metabolite, SMILES, emb_* columns) to disk.
# index=False leaves the row index out of the file.
embeddings_csv_path = "SMI-TED289M_embeddings.csv"
x_batch.to_csv(embeddings_csv_path, index=False)