In [1]:
import pandas as pd

# Load the input CSV into a DataFrame and preview the first rows
source_csv = '20250408_XR0.csv'
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,name/name,isomeric_smiles,Model,BetaT,GammaT,BetaV,GammaV
0,Methane/R22 ...,C.C(F)(F)Cl,XR0,1.0,1.111477,1,1.021993
1,Nitrogen/R14 ...,N#N.C(F)(F)(F)F,XR0,1.0,1.0674,1,1.1693
2,Nitrogen/R13 ...,N#N.C(F)(F)(F)Cl,XR0,1.0,1.181,1,1.1727
3,Nitrogen/R22 ...,N#N.C(F)(F)Cl,XR0,1.0,1.236055,1,1.031103
4,Nitrogen/R12 ...,N#N.C(F)(F)(Cl)Cl,XR0,1.0,1.298156,1,1.062585


In [2]:
# Break each combined SMILES string into two parts at the first '.' only
# (n=1), producing one column per part.
smiles_parts = df['isomeric_smiles'].str.split(pat='.', n = 1, expand = True)
df[['SMILES_part1', 'SMILES_part2']] = smiles_parts

# Discard the combined SMILES together with the name and model columns;
# only the numeric parameters and the two SMILES parts are kept.
df.drop(columns=['isomeric_smiles', 'name/name', 'Model'], inplace = True)
df.head()

Unnamed: 0,BetaT,GammaT,BetaV,GammaV,SMILES_part1,SMILES_part2
0,1.0,1.111477,1,1.021993,C,C(F)(F)Cl
1,1.0,1.0674,1,1.1693,N#N,C(F)(F)(F)F
2,1.0,1.181,1,1.1727,N#N,C(F)(F)(F)Cl
3,1.0,1.236055,1,1.031103,N#N,C(F)(F)Cl
4,1.0,1.298156,1,1.062585,N#N,C(F)(F)(Cl)Cl


In [3]:
from rdkit import Chem
from rdkit.Chem import RemoveHs

# convert isomeric SMILES to canonical SMILES
def convert_to_canonical(smiles):
    """Return a canonical, non-isomeric SMILES string for one molecule.

    Parameters
    ----------
    smiles : str
        SMILES string of a single molecule.

    Returns
    -------
    str or None
        The SMILES written by RDKit with ``isomericSmiles=False``, or
        ``None`` when ``smiles`` is not a string (for example the
        ``None``/NaN left behind when a split found no '.') or when RDKit
        cannot parse it.
    """
    # Missing values have to be rejected before RDKit sees them:
    # MolFromSmiles raises on None or float NaN instead of returning None.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None. Check it here,
    # because Kekulize and RemoveHs below cannot operate on None.
    if mol is None:
        return None

    # Clear the aromatic flags so the written SMILES is in Kekulé form
    Chem.Kekulize(mol, clearAromaticFlags=True)
    mol = RemoveHs(mol)  # Remove explicit hydrogens
    return Chem.MolToSmiles(mol, isomericSmiles=False)

# Canonicalize both SMILES columns, one column at a time
for smiles_column in ('SMILES_part1', 'SMILES_part2'):
    df[smiles_column] = df[smiles_column].apply(convert_to_canonical)

df.head()

Unnamed: 0,BetaT,GammaT,BetaV,GammaV,SMILES_part1,SMILES_part2
0,1.0,1.111477,1,1.021993,C,FC(F)Cl
1,1.0,1.0674,1,1.1693,N#N,FC(F)(F)F
2,1.0,1.181,1,1.1727,N#N,FC(F)(F)Cl
3,1.0,1.236055,1,1.031103,N#N,FC(F)Cl
4,1.0,1.298156,1,1.062585,N#N,FC(F)(Cl)Cl


In [4]:
# Write the processed table to a CSV file, leaving out the DataFrame index
output_csv = 'split_smiles.csv'
df.to_csv(output_csv, index=False)