In [1]:
import pandas as pd

# Load the table from 'refprop_kw0.csv' and preview its first five rows
df = pd.read_csv(filepath_or_buffer='refprop_kw0.csv')
df.head(n=5)

Unnamed: 0,Name,Isomeric SMILES,Model,BetaT,GammaT,BetaV,GammaV,Fij,Tred_0,Tred_0_1,...,Dred_0_1,Dred_0_2,Dred_0_3,Dred_0_4,Dred_0_5,Dred_0_6,Dred_0_7,Dred_0_8,Dred_0_9,Dred_1
0,Methane/Helium ...,C.[He],KW0,1.0,3.159777,1.0,0.881406,0.0,5.1953,24.009779,...,16.756787,16.060489,15.316977,14.546597,13.767032,12.992897,12.235694,11.504011,10.803868,10.139128
1,Methane/Carbon monoxide ...,C.[C-]#[O+],KW0,0.987412,0.987473,0.997341,1.006103,0.0,132.86,137.518594,...,10.767333,10.685615,10.605515,10.527665,10.45266,10.381062,10.313407,10.250207,10.191954,10.139128
2,Methane/Argon ...,C.[Ar],KW0,0.990954,0.989843,1.03463,1.014679,0.0,150.687,153.935357,...,12.895413,12.471141,12.109699,11.792017,11.503058,11.230729,10.965225,10.698652,10.424819,10.139128
3,Methane/Oxygen ...,C.O=O,KW0,1.0,0.95,1.0,1.0,0.0,154.599,156.481626,...,12.94991,12.576617,12.221008,11.881902,11.558215,11.248957,10.953221,10.670173,10.399044,10.139128
4,Methane/Ethylene ...,C.C=C,KW0,1.004,1.021,1.0,0.99,0.106,282.35,273.375107,...,7.851064,8.072773,8.30212,8.539327,8.784611,9.038182,9.300241,9.570974,9.850553,10.139128


In [3]:
# Break each 'Isomeric SMILES' entry at its first '.' (at most one split),
# giving one column per side of the dot.
smiles_parts = df['Isomeric SMILES'].str.split(pat='.', n=1, expand=True)
df[['SMILES_part1', 'SMILES_part2']] = smiles_parts

# Remove the original SMILES column together with the other listed columns.
dropped_columns = [
    'Isomeric SMILES',
    'Name',
    'Model',
    'BetaT',
    'GammaT',
    'BetaV',
    'GammaV',
    'Fij',
]
df.drop(columns=dropped_columns, inplace=True)
df.head()

Unnamed: 0,Tred_0,Tred_0_1,Tred_0_2,Tred_0_3,Tred_0_4,Tred_0_5,Tred_0_6,Tred_0_7,Tred_0_8,Tred_0_9,...,Dred_0_3,Dred_0_4,Dred_0_5,Dred_0_6,Dred_0_7,Dred_0_8,Dred_0_9,Dred_1,SMILES_part1,SMILES_part2
0,5.1953,24.009779,42.762567,61.453665,80.083071,98.650787,117.156811,135.601145,153.983787,172.304739,...,15.316977,14.546597,13.767032,12.992897,12.235694,11.504011,10.803868,10.139128,C,[He]
1,132.86,137.518594,142.549399,147.906317,153.542779,159.411746,165.465701,171.656644,177.936084,184.255032,...,10.605515,10.527665,10.45266,10.381062,10.313407,10.250207,10.191954,10.139128,C,[C-]#[O+]
2,150.687,153.935357,157.444324,161.178243,165.101196,169.177003,173.369222,177.641143,181.955791,186.275917,...,12.109699,11.792017,11.503058,11.230729,10.965225,10.698652,10.424819,10.139128,C,[Ar]
3,154.599,156.481626,158.745113,161.38946,164.414669,167.820738,171.607669,175.77546,180.324113,185.253626,...,12.221008,11.881902,11.558215,11.248957,10.953221,10.670173,10.399044,10.139128,C,O=O
4,282.35,273.375107,264.294099,255.129921,245.905445,236.643471,227.366724,218.097857,208.859453,199.674022,...,8.30212,8.539327,8.784611,9.038182,9.300241,9.570974,9.850553,10.139128,C,C=C


In [4]:
from rdkit import Chem
from rdkit.Chem import RemoveHs

# convert isomeric SMILES to canonical SMILES
def convert_to_canonical(smiles):
    """Return the canonical, non-isomeric SMILES for ``smiles``.

    The input is parsed with RDKit, kekulized with its aromatic flags
    cleared, stripped of explicit hydrogens, and written back out with
    ``isomericSmiles=False``.

    Parameters
    ----------
    smiles : str
        SMILES string to convert.

    Returns
    -------
    str or None
        The converted SMILES, or ``None`` when ``smiles`` is not a string
        (for example the ``None``/NaN left in a column for a missing part)
        or when RDKit cannot parse it.
    """
    # Missing values are not strings; handing them to MolFromSmiles raises,
    # so map them to None up front.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles reports a parse failure by returning None. This must be
    # checked before Kekulize, which raises when given None.
    if mol is None:
        return None

    # Convert molecule to Kekulé form to ensure stability before removing Hs
    Chem.Kekulize(mol, clearAromaticFlags=True)
    mol = RemoveHs(mol)  # Remove explicit hydrogens
    return Chem.MolToSmiles(mol, isomericSmiles=False)

# Run the converter over each of the two SMILES columns in turn
for smiles_col in ('SMILES_part1', 'SMILES_part2'):
    df[smiles_col] = df[smiles_col].apply(convert_to_canonical)

df.head()



Unnamed: 0,Tred_0,Tred_0_1,Tred_0_2,Tred_0_3,Tred_0_4,Tred_0_5,Tred_0_6,Tred_0_7,Tred_0_8,Tred_0_9,...,Dred_0_3,Dred_0_4,Dred_0_5,Dred_0_6,Dred_0_7,Dred_0_8,Dred_0_9,Dred_1,SMILES_part1,SMILES_part2
0,5.1953,24.009779,42.762567,61.453665,80.083071,98.650787,117.156811,135.601145,153.983787,172.304739,...,15.316977,14.546597,13.767032,12.992897,12.235694,11.504011,10.803868,10.139128,C,[He]
1,132.86,137.518594,142.549399,147.906317,153.542779,159.411746,165.465701,171.656644,177.936084,184.255032,...,10.605515,10.527665,10.45266,10.381062,10.313407,10.250207,10.191954,10.139128,C,[C-]#[O+]
2,150.687,153.935357,157.444324,161.178243,165.101196,169.177003,173.369222,177.641143,181.955791,186.275917,...,12.109699,11.792017,11.503058,11.230729,10.965225,10.698652,10.424819,10.139128,C,[Ar]
3,154.599,156.481626,158.745113,161.38946,164.414669,167.820738,171.607669,175.77546,180.324113,185.253626,...,12.221008,11.881902,11.558215,11.248957,10.953221,10.670173,10.399044,10.139128,C,O=O
4,282.35,273.375107,264.294099,255.129921,245.905445,236.643471,227.366724,218.097857,208.859453,199.674022,...,8.30212,8.539327,8.784611,9.038182,9.300241,9.570974,9.850553,10.139128,C,C=C


In [5]:
# Write the processed table to disk without the DataFrame index
df.to_csv(path_or_buf='split_smiles.csv', index=False)