In [1]:
import pandas as pd

# Load the XR0 parameter table from disk and preview the first rows
df = pd.read_csv(filepath_or_buffer='20240723_XR0.csv')
df.head(n=5)

Unnamed: 0,name/name,isomeric_smiles,Model,BetaT,GammaT,BetaV,GammaV
0,Methane/Krypton ...,C.[Kr],XR0,1.0,0.993085,1.0,1.000383
1,Methane/Nitrous oxide ...,C.[N-]=[N+]=O,XR0,0.98876,1.02354,1.0,1.0
2,Methane/Carbonyl sulfide ...,C.C(=O)=S,XR0,1.0,1.0391,1.0,1.1443
3,Methane/Propylene ...,C.CC=C,XR0,0.998,1.117,1.0,1.0
4,Methane/R22 ...,C.C(F)(F)Cl,XR0,1.0,1.111477,1.0,1.021993


In [2]:
# Each 'isomeric_smiles' entry holds two components joined by '.';
# split on the first '.' only so each component gets its own column.
smiles_parts = df['isomeric_smiles'].str.split(pat='.', n=1, expand=True)
df[['SMILES_part1', 'SMILES_part2']] = smiles_parts

# The combined SMILES, the pair name and the model label are dropped here.
df.drop(
    columns=['isomeric_smiles', 'name/name', 'Model'],
    inplace=True,
)
df.head()

Unnamed: 0,BetaT,GammaT,BetaV,GammaV,SMILES_part1,SMILES_part2
0,1.0,0.993085,1.0,1.000383,C,[Kr]
1,0.98876,1.02354,1.0,1.0,C,[N-]=[N+]=O
2,1.0,1.0391,1.0,1.1443,C,C(=O)=S
3,0.998,1.117,1.0,1.0,C,CC=C
4,1.0,1.111477,1.0,1.021993,C,C(F)(F)Cl


In [3]:
from rdkit import Chem
from rdkit.Chem import RemoveHs

# convert isomeric SMILES to canonical SMILES
def convert_to_canonical(smiles):
    """Return a canonical, non-isomeric SMILES string for ``smiles``.

    The input is parsed with RDKit, kekulized with its aromatic flags
    cleared, passed through ``RemoveHs`` and serialised again with
    ``isomericSmiles=False``.

    Args:
        smiles: SMILES string of a single component. Non-string values
            (e.g. the None/NaN left behind when a row had no '.' to split
            on) are treated as missing.

    Returns:
        The canonical SMILES string, or None when the input is missing or
        RDKit cannot parse it.
    """
    # Missing entries are not valid input for MolFromSmiles; pass them
    # through as None instead of letting the whole .apply() fail.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals an unparsable string by returning None; this has
    # to be checked *before* Kekulize, which cannot accept None.
    if mol is None:
        return None

    # Convert molecule to Kekulé form to ensure stability before removing Hs
    Chem.Kekulize(mol, clearAromaticFlags=True)
    mol = RemoveHs(mol)  # Remove explicit hydrogens
    return Chem.MolToSmiles(mol, isomericSmiles=False)

# Canonicalise each component column independently, one column per pass
for smiles_col in ('SMILES_part1', 'SMILES_part2'):
    df[smiles_col] = df[smiles_col].apply(convert_to_canonical)

df.head()



Unnamed: 0,BetaT,GammaT,BetaV,GammaV,SMILES_part1,SMILES_part2
0,1.0,0.993085,1.0,1.000383,C,[Kr]
1,0.98876,1.02354,1.0,1.0,C,[N-]=[N+]=O
2,1.0,1.0391,1.0,1.1443,C,O=C=S
3,0.998,1.117,1.0,1.0,C,C=CC
4,1.0,1.111477,1.0,1.021993,C,FC(F)Cl


In [4]:
# Write the processed table to disk without the DataFrame's row index
df.to_csv(path_or_buf='split_smiles.csv', index=False)