# LIPID MAPS Structure Database (LMSD)
Source data: the LMSD structure file (`structures.sdf`), available from https://www.lipidmaps.org/databases/lmsd/download

In [1]:
from rdkit import Chem
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Collect the category label of every record in the SDF file by scanning the
# raw text: the value of an SD data item sits on the line right after its
# '<CATEGORY>' tag line.
category = []
# Stream the file inside a context manager so the handle is always closed and
# the whole SDF never has to be held in memory at once.
with open('structures.sdf', 'r') as sdf_file:
    previous_was_tag = False
    for line in sdf_file:
        if previous_was_tag:
            # Keep only the text before the ' [' suffix; strip() removes the
            # trailing newline that survives when no ' [' is present.
            category.append(line.split(' [')[0].strip())
        # A tag on the very last line simply has no value to read, so nothing
        # is appended for it (no out-of-range access).
        previous_was_tag = '<CATEGORY>' in line

In [3]:
# Parse every record of the SDF with RDKit and build two parallel lists: the
# SMILES string RDKit writes for each molecule and that molecule's category.
sppl = Chem.SDMolSupplier('structures.sdf')

category_list = []
smiles_list = []
for mol in sppl:
    # The supplier yields None for records RDKit cannot parse; skip those.
    if mol is None:
        continue
    # Read the category from the molecule's own SD properties rather than
    # pairing by position with a separate text scan: a single record without
    # a CATEGORY field would otherwise shift every subsequent label.
    if not mol.HasProp('CATEGORY'):
        continue
    smiles_list.append(Chem.MolToSmiles(mol))
    # Keep only the text before the ' [' suffix of the property value.
    category_list.append(mol.GetProp('CATEGORY').split(' [')[0].strip())



In [4]:
# Assemble the two parallel lists into a two-column table, one row per
# molecule, and display it as the cell output.
df = pd.DataFrame(
    data={
        'SMILES': smiles_list,
        'Category': category_list,
    }
)
df

Unnamed: 0,SMILES,Category
0,C=CCCCCC(C)CCCCCC#CCCC(OC)C(=O)OC(=O)C(CCC#CCC...,Fatty Acyls
1,CCCCCCC[C@H](O)CC(=O)N[C@@H](CO)C(=O)O,Fatty Acyls
2,CCCCCCCCCCCCCCCC(=O)OC(CCCCCCCCCCCCCC)CC(=O)N[...,Fatty Acyls
3,CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)N[C@@H](CCC(N)=...,Fatty Acyls
4,CC(C)CCCCCCCCCCCCCC(=O)OC(CCCCCCCCCCCC(C)C)CC(...,Fatty Acyls
...,...,...
47334,CCCCCCCCCCC[C@H](CC1=CC(=O)C=C(OC)C1=O)OC(C)=O,Polyketides
47335,CCCCCCCCCCCC1=C(O)C(=O)C=C(O)C1=O,Polyketides
47336,CCCCCCCCCCCC1=C(O)C(=O)C=C(OC)C1=O,Polyketides
47337,CCCCCCCCCCCCCCCC1=C(O)C(=O)C=C(O)C1=O,Polyketides


In [5]:
def count_sp3_nitrogens(smiles):
    """Return the number of SP3-hybridized nitrogen atoms in a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string to parse with RDKit.

    Returns
    -------
    int
        Count of atoms whose symbol is 'N' and whose RDKit hybridization is
        ``Chem.HybridizationType.SP3``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` (``Chem.MolFromSmiles`` returns None).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending input instead of an AttributeError on None.
        raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')
    return sum(
        1
        for atom in mol.GetAtoms()
        if atom.GetSymbol() == 'N'
        # Compare against the enum member directly rather than its string form.
        and atom.GetHybridization() == Chem.HybridizationType.SP3
    )


# Build the whole column in one pass and assign it once; this avoids one
# DataFrame write per row and any mix-up between positional and label indexing.
df['SP3_N'] = [count_sp3_nitrogens(smi) for smi in tqdm(df['SMILES'])]
df

  0%|          | 0/47339 [00:00<?, ?it/s]

Unnamed: 0,SMILES,Category,SP3_N
0,C=CCCCCC(C)CCCCCC#CCCC(OC)C(=O)OC(=O)C(CCC#CCC...,Fatty Acyls,0
1,CCCCCCC[C@H](O)CC(=O)N[C@@H](CO)C(=O)O,Fatty Acyls,0
2,CCCCCCCCCCCCCCCC(=O)OC(CCCCCCCCCCCCCC)CC(=O)N[...,Fatty Acyls,1
3,CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)N[C@@H](CCC(N)=...,Fatty Acyls,0
4,CC(C)CCCCCCCCCCCCCC(=O)OC(CCCCCCCCCCCC(C)C)CC(...,Fatty Acyls,0
...,...,...,...
47334,CCCCCCCCCCC[C@H](CC1=CC(=O)C=C(OC)C1=O)OC(C)=O,Polyketides,0
47335,CCCCCCCCCCCC1=C(O)C(=O)C=C(O)C1=O,Polyketides,0
47336,CCCCCCCCCCCC1=C(O)C(=O)C=C(OC)C1=O,Polyketides,0
47337,CCCCCCCCCCCCCCCC1=C(O)C(=O)C=C(O)C1=O,Polyketides,0


In [6]:
# Keep only the first row for each distinct SMILES string, then renumber the
# index from zero so it is contiguous again.
unique_rows = df.drop_duplicates(subset=['SMILES'], keep='first')
df = unique_rows.reset_index(drop=True)
df

Unnamed: 0,SMILES,Category,SP3_N
0,C=CCCCCC(C)CCCCCC#CCCC(OC)C(=O)OC(=O)C(CCC#CCC...,Fatty Acyls,0
1,CCCCCCC[C@H](O)CC(=O)N[C@@H](CO)C(=O)O,Fatty Acyls,0
2,CCCCCCCCCCCCCCCC(=O)OC(CCCCCCCCCCCCCC)CC(=O)N[...,Fatty Acyls,1
3,CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)N[C@@H](CCC(N)=...,Fatty Acyls,0
4,CC(C)CCCCCCCCCCCCCC(=O)OC(CCCCCCCCCCCC(C)C)CC(...,Fatty Acyls,0
...,...,...,...
47319,CCCCCCCCCCC[C@H](CC1=CC(=O)C=C(OC)C1=O)OC(C)=O,Polyketides,0
47320,CCCCCCCCCCCC1=C(O)C(=O)C=C(O)C1=O,Polyketides,0
47321,CCCCCCCCCCCC1=C(O)C(=O)C=C(OC)C1=O,Polyketides,0
47322,CCCCCCCCCCCCCCCC1=C(O)C(=O)C=C(O)C1=O,Polyketides,0


In [7]:
# Write the final table to a CSV file in the working directory; the row index
# is not written, so the file contains only the data columns.
df.to_csv(
    path_or_buf='./LMSD_dataset.csv',
    index=False,
)