In [None]:
import pandas as pd
from rdkit import Chem
import requests

In [None]:
def getSmiles(compound):
    smilesUrl = "https://opsin.ch.cam.ac.uk/opsin/"+compound+".smi"
    requestSmiles = requests.get(smilesUrl)
    if requestSmiles.status_code == 400 or requestSmiles.status_code == 404:
        return None
    else:
        return requestSmiles.text
def CtoK(num):
    return num + 273.15

## Carroll

In [None]:
data = pd.read_csv('../data/Carroll11.csv', encoding = "ISO-8859-1")

In [None]:
carroll = data[['Compound', 'TFP(K)Ref.']]
carroll['source'] = 'carroll11'
carroll['pure substance'] = 1

In [None]:
carroll['smiles'] = carroll['Compound'].apply(getSmiles)

In [None]:
carroll.dropna(subset=['smiles'], inplace=True)

In [None]:
carroll.dropna(subset=['Compound'], inplace=True)

In [None]:
carroll.head()

In [None]:
carroll.rename(index=str, columns={"Compound": "compound", "TFP(K)Ref.": "flashpoint"}, inplace=True)

In [None]:
carroll = carroll[['smiles', 'compound', 'flashpoint', 'source', 'pure substance']]
carroll.head()
carroll.to_csv('Carroll11.csv', index=False)

In [None]:
print(carroll['flashpoint'].mean())
print(carroll['flashpoint'].std())

## Saldana

In [None]:
data = pd.read_csv('../data/Saldana11(fuels).csv', encoding = "ISO-8859-1")

In [None]:
fuels = data[['Name', 'Family', 'SMILES', 'FP Exp.']]
fuels['source'] = 'Saldana11'
fuels['pure substance'] = 1
fuels.rename(index=str, columns={"Name": "compound", "FP Exp.": "flashpoint", "SMILES": "smiles"}, inplace=True)
fuels = fuels[['smiles', 'compound', 'flashpoint', 'source', 'pure substance', 'Family']]
fuels.head()

In [None]:
fuels.to_csv('fuels.csv', index=False)

In [None]:
print(fuels['flashpoint'].mean())
print(fuels['flashpoint'].std())

## Pan12

In [None]:
data = pd.read_csv('../data/pan12.csv', encoding = "ISO-8859-1")

In [None]:
pan12 = data[['Compounds Name','smiles', 'Observed']]
pan12.rename(index=str,columns={"Compounds Name": "compound", 'Observed': 'flashpoint'}, inplace=True)
pan12['flashpoint'] = pan12['flashpoint'].apply(CtoK)
pan12['source'] = 'pan12'
pan12['pure substance'] = 1
pan12 = pan12[['smiles', 'compound', 'flashpoint', 'source', 'pure substance']]
pan12.head()

In [None]:
pan12.to_csv('pan12.csv', index=False)

## chen14

In [None]:
data = pd.read_csv('../data/chen14.csv', encoding = "ISO-8859-1")

In [None]:
data.head()

In [None]:
chen14 = data[['Compond Name','smiles', 'Observed']]
chen14.rename(index=str,columns={"Compond Name": "compound", 'Observed': 'flashpoint'}, inplace=True)
chen14['flashpoint'] = chen14['flashpoint'].apply(CtoK)
chen14['source'] = 'chen14'
chen14['pure substance'] = 1
chen14 = chen14[['smiles', 'compound', 'flashpoint', 'source', 'pure substance']]
chen14.head()

In [None]:
chen14.to_csv('chen14.csv', index=False)

In [None]:
carroll = pd.read_csv('Carroll11.csv', encoding = "ISO-8859-1")
fuels = pd.read_csv('fuels.csv', encoding = "ISO-8859-1")
pan12 = pd.read_csv('pan12.csv', encoding = "ISO-8859-1")
chen14 = pd.read_csv('chen14.csv', encoding = "ISO-8859-1")

In [None]:
# canonicalize carroll smiles
for i in range(0, len(carroll)):
    carroll.iloc[i]['smiles'] = Chem.MolToSmiles(Chem.MolFromSmiles(carroll.iloc[i]['smiles']))


In [None]:
fuels.drop(['Family'], axis=1, inplace=True)
print(carroll.columns)
print(fuels.columns)
print(pan12.columns)
print(chen14.columns)

In [None]:
carroll['smiles']

In [None]:
frames = [fuels, carroll, pan12, chen14]
result = pd.concat(frames)

In [None]:
result.shape

In [None]:
#canonicalize smiles with rdkit
result['canonical smiles'] = None
for idx, row in result.iterrows():
    result.loc[idx,'canonical smiles'] = Chem.MolToSmiles(Chem.MolFromSmiles(result.iloc[idx]['smiles']))

In [None]:
result.dropna(inplace=True)

In [None]:
result.shape

In [None]:
result.to_csv('integrated_dataset.csv', index=False)

In [None]:
result = pd.read_csv('../data/integrated_dataset.csv')

In [None]:
result['canonical smiles'] = None
result.head(5)

In [None]:
for idx, row in result.iterrows():
    result.loc[idx,'canonical smiles'] = Chem.MolToSmiles(Chem.MolFromSmiles(result.loc[idx]['smiles']))

In [None]:
result.to_csv('../data/integrated_dataset.csv', index=False)

## Add Gelest to dataset

In [None]:
#result = pd.read_csv('../data/integrated_dataset.csv')
result.drop(columns=['smiles'], inplace=True)
result.rename(index=str, columns={"canonical smiles" : "smiles"}, inplace=True)

In [None]:
silanes = pd.read_csv('../data/silanes_all.csv')
metal_organic = pd.read_csv('../data/metal-organics_all.csv')
tin = pd.read_csv('../data/tin_all.csv')
germanium  = pd.read_csv('../data/germanium_all.csv')

In [None]:
result = result[['compound', 'smiles', 'flashpoint', 'pure substance', 'source']]
result.head()

In [None]:
frames = [silanes, metal_organic, tin, germanium, result]
resultv2 = pd.concat(frames)

In [None]:
resultv2.shape
resultv2.to_csv('../data/integrated_dataset.csv')

## pubchem integration

In [2]:
pubchem = pd.read_csv('../data/pubchem_data.csv')
carroll = pd.read_csv('../data/Carroll11.csv')
chen14 = pd.read_csv('../data/chen14.csv')
fuels = pd.read_csv('../data/fuels.csv')
germanium = pd.read_csv('../data/germanium_all.csv')
metal_organics = pd.read_csv('../data/metal-organics_all.csv')
pan12 = pd.read_csv('../data/pan12.csv')
silanes = pd.read_csv('../data/silanes_all.csv')
tin = pd.read_csv('../data/tin_all.csv')

In [14]:
print(list(pubchem))
print(list(carroll))
print(list(chen14))
print(list(fuels))
print(list(germanium))
print(list(metal_organics))
print(list(pan12))
print(list(silanes))
print(list(tin))

['compound', 'smiles', 'flashpoint', 'source', 'pure substance']
['compound', 'smiles', 'flashpoint', 'pure substance', 'source']
['compound', 'smiles', 'flashpoint', 'pure substance', 'source']
['compound', 'smiles', 'flashpoint', 'pure substance', 'source']
['compound', 'smiles', 'flashpoint', 'pure substance', 'source']
['compound', 'smiles', 'flashpoint', 'pure substance', 'source']
['compound', 'smiles', 'flashpoint', 'pure substance', 'source']
['compound', 'smiles', 'flashpoint', 'pure substance', 'source']
['compound', 'smiles', 'flashpoint', 'pure substance', 'source']


In [51]:
def canoicalize_smiles(data):
    for idx, row in data.iterrows():
        m = Chem.MolFromSmiles(data.iloc[idx]['smiles'])
        if m != None:
            data.iloc[idx]['smiles'] = Chem.MolToSmiles(m)
        else:
            data.iloc[idx]['smiles'] = None
    return data

In [16]:
frames = [pubchem, carroll, chen14, fuels, germanium, metal_organics, pan12, silanes, tin]
result = pd.concat(frames)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [52]:
canonical_smiles_data = canoicalize_smiles(result)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [59]:
canonical_smiles_data[canonical_smiles_data.duplicated()].shape

(12, 5)

In [60]:
canonical_smiles_data.to_csv('../data/integrated_dataset.csv', index=False)

In [63]:
data = canonical_smiles_data

In [66]:
data.drop_duplicates().shape

(3971, 5)

In [105]:
#data.drop_duplicates('smiles').
rows = data.loc[data['smiles'] == 'C[Sn](C)(C)Cl']
rows.iloc[0]#['flashpoint'].std()

compound          TRIMETHYLCHLOROTIN
flashpoint                    370.15
pure substance                     1
smiles                 C[Sn](C)(C)Cl
source                    gelest_tin
Name: 41, dtype: object

In [129]:
def remove_duplicates(data):
    duplicates = data[data.duplicated(subset='smiles')]
    result = data.drop_duplicates(subset='smiles', keep=False)#[~duplicates]
    #for each unique smiles that has duplicates
    for smiles in data[data.duplicated(subset='smiles')]['smiles'].unique():
        dup_rows = data.loc[data['smiles'] == smiles]
        if dup_rows['flashpoint'].unique().shape[0] == 1:
            # remove all but one
            result = result.append(dup_rows.iloc[0], sort=False)
        else:
            if dup_rows['flashpoint'].std() < 5:
                # add 1 back
                result = result.append(dup_rows.iloc[0], sort=False)
    return result  
            

In [96]:
pure_compounds = data.loc[data['pure substance'] == 1]
pure_compounds.shape

(3914, 5)

In [95]:
data[data.duplicated(subset='smiles')].shape

(550, 5)

In [130]:
data = data.loc[data['pure substance'] == 1]
print(data.shape)
no_dups = remove_duplicates(data)
print(no_dups.shape)

(3914, 5)
(3316, 5)


In [133]:
no_dups[no_dups.duplicated(subset='smiles')]
#no_dups.loc[no_dups['smiles'] == 'CC1=CCCC(=C)C2CC(C2CC1)(C)C']
no_dups.to_csv('../data/integrated_dataset.csv', index=False)

In [116]:
result = data.drop_duplicates(subset='smiles', keep=False)
result.shape

(2948, 5)

In [119]:
data[data.duplicated(subset='smiles')]['smiles'].unique()

array(['CC1(C2CCC1(C(C2)O)C)C', 'CC(CC1=CC=CC=C1)NC',
       'CC1CCC(C(C1)O)C(C)C', 'CC1=CCCC(=C)C2CC(C2CC1)(C)C',
       'CCCCCCCCCCCCCC(=O)OC1C(C2(C(C=C(CC3(C2C=C(C3=O)C)O)CO)C4C1(C4(C)C)OC(=O)C)O)C',
       'CC(=O)OC1CC2CCC1(C2(C)C)C', 'CC(C(=O)O)O', 'CC=CC=C',
       'CC(CC1=CC=CC=C1)NC.Cl', 'CN1CCCC1C2=CN=CC=C2',
       'C1C=CCC2C1C(=O)OC2=O', 'CC1(C2CCC1(C(=O)C2)C)C', 'C(C=CCO)O',
       'CC1(C2CCC1(C(C2)OC(=O)CSC#N)C)C', 'CC1=CCC(CC1)C(=C)C',
       'CC1(C2CCC(=C)C1C2)C', 'CC1=CCC2CC1C2(C)C', 'CC=CC(=O)O',
       'C1=CC=C(C=C1)C=CC=O', 'CC(=CCCl)Cl', 'CC(=CCCC(=CC=O)C)C',
       'C1CC=CCCC=CCCC=C1', 'C(=CCl)Cl', 'C(C=CCCl)Cl', 'CC=CC=CC(=O)O',
       'CC(=CCCC(=CCO)C)C', 'CC1CC(=O)C=C2C1(CC(CC2)C(=C)C)C',
       'CCC=CCC1=C(CCC1=O)C', 'C(=O)C(=C(C(=O)O)Cl)Cl',
       'CC1C(SC(=O)N1C(=O)NC2CCCCC2)C3=CC=C(C=C3)Cl',
       'C1=CC=C(C=C1)C=CCO', 'CCC=CCCO',
       'CC1=C(C=CC=C1COC(=O)C2C(C2(C)C)C=C(C(F)(F)F)Cl)C3=CC=CC=C3',
       'CC1=CCCC(C1C=CC(=O)C)(C)C', 'CC(C)N(C(C)C)C(=O)SCC