In [14]:
import pandas as pd
from rdkit import Chem
import requests
#from helpers.integration_helpers import canoicalize_smiles

## Carroll

In [None]:
data = pd.read_csv('../data/Carroll11.csv', encoding = "ISO-8859-1")

In [None]:
carroll = data[['Compound', 'TFP(K)Ref.']]
carroll['source'] = 'carroll11'
carroll['pure substance'] = 1

In [None]:
carroll['smiles'] = carroll['Compound'].apply(getSmiles)

In [None]:
carroll.dropna(subset=['smiles'], inplace=True)

In [None]:
carroll.dropna(subset=['Compound'], inplace=True)

In [None]:
carroll.head()

In [None]:
carroll.rename(index=str, columns={"Compound": "compound", "TFP(K)Ref.": "flashpoint"}, inplace=True)

In [None]:
carroll = carroll[['smiles', 'compound', 'flashpoint', 'source', 'pure substance']]
carroll.head()
carroll.to_csv('Carroll11.csv', index=False)

In [None]:
print(carroll['flashpoint'].mean())
print(carroll['flashpoint'].std())

## Saldana

In [None]:
data = pd.read_csv('../data/Saldana11(fuels).csv', encoding = "ISO-8859-1")

In [None]:
fuels = data[['Name', 'Family', 'SMILES', 'FP Exp.']]
fuels['source'] = 'Saldana11'
fuels['pure substance'] = 1
fuels.rename(index=str, columns={"Name": "compound", "FP Exp.": "flashpoint", "SMILES": "smiles"}, inplace=True)
fuels = fuels[['smiles', 'compound', 'flashpoint', 'source', 'pure substance', 'Family']]
fuels.head()

In [None]:
fuels.to_csv('fuels.csv', index=False)

In [None]:
print(fuels['flashpoint'].mean())
print(fuels['flashpoint'].std())

## Pan12

In [None]:
data = pd.read_csv('../data/pan12.csv', encoding = "ISO-8859-1")

In [None]:
pan12 = data[['Compounds Name','smiles', 'Observed']]
pan12.rename(index=str,columns={"Compounds Name": "compound", 'Observed': 'flashpoint'}, inplace=True)
pan12['flashpoint'] = pan12['flashpoint'].apply(CtoK)
pan12['source'] = 'pan12'
pan12['pure substance'] = 1
pan12 = pan12[['smiles', 'compound', 'flashpoint', 'source', 'pure substance']]
pan12.head()

In [None]:
pan12.to_csv('pan12.csv', index=False)

## chen14

In [None]:
data = pd.read_csv('../data/chen14.csv', encoding = "ISO-8859-1")

In [None]:
data.head()

In [None]:
chen14 = data[['Compond Name','smiles', 'Observed']]
chen14.rename(index=str,columns={"Compond Name": "compound", 'Observed': 'flashpoint'}, inplace=True)
chen14['flashpoint'] = chen14['flashpoint'].apply(CtoK)
chen14['source'] = 'chen14'
chen14['pure substance'] = 1
chen14 = chen14[['smiles', 'compound', 'flashpoint', 'source', 'pure substance']]
chen14.head()

In [None]:
chen14.to_csv('chen14.csv', index=False)

In [None]:
carroll = pd.read_csv('Carroll11.csv', encoding = "ISO-8859-1")
fuels = pd.read_csv('fuels.csv', encoding = "ISO-8859-1")
pan12 = pd.read_csv('pan12.csv', encoding = "ISO-8859-1")
chen14 = pd.read_csv('chen14.csv', encoding = "ISO-8859-1")

In [None]:
# canonicalize carroll smiles
for i in range(0, len(carroll)):
    carroll.iloc[i]['smiles'] = Chem.MolToSmiles(Chem.MolFromSmiles(carroll.iloc[i]['smiles']))


In [None]:
fuels.drop(['Family'], axis=1, inplace=True)
print(carroll.columns)
print(fuels.columns)
print(pan12.columns)
print(chen14.columns)

In [None]:
carroll['smiles']

In [None]:
frames = [fuels, carroll, pan12, chen14]
result = pd.concat(frames)

In [None]:
result.shape

In [None]:
#canonicalize smiles with rdkit
result['canonical smiles'] = None
for idx, row in result.iterrows():
    result.loc[idx,'canonical smiles'] = Chem.MolToSmiles(Chem.MolFromSmiles(result.iloc[idx]['smiles']))

In [None]:
result.dropna(inplace=True)

In [None]:
result.shape

In [None]:
result.to_csv('integrated_dataset.csv', index=False)

In [None]:
result = pd.read_csv('../data/integrated_dataset.csv')

In [None]:
result['canonical smiles'] = None
result.head(5)

In [None]:
for idx, row in result.iterrows():
    result.loc[idx,'canonical smiles'] = Chem.MolToSmiles(Chem.MolFromSmiles(result.loc[idx]['smiles']))

In [None]:
result.to_csv('../data/integrated_dataset.csv', index=False)

## Add Gelest to dataset

In [None]:
#result = pd.read_csv('../data/integrated_dataset.csv')
result.drop(columns=['smiles'], inplace=True)
result.rename(index=str, columns={"canonical smiles" : "smiles"}, inplace=True)

In [None]:
silanes = pd.read_csv('../data/silanes_all.csv')
metal_organic = pd.read_csv('../data/metal-organics_all.csv')
tin = pd.read_csv('../data/tin_all.csv')
germanium  = pd.read_csv('../data/germanium_all.csv')

In [None]:
result = result[['compound', 'smiles', 'flashpoint', 'pure substance', 'source']]
result.head()

In [None]:
frames = [silanes, metal_organic, tin, germanium, result]
resultv2 = pd.concat(frames)

In [None]:
resultv2.shape
resultv2.to_csv('../data/integrated_dataset.csv')

## pubchem integration

In [None]:
pubchem = pd.read_csv('../data/pubchem_data.csv')
carroll = pd.read_csv('../data/Carroll11.csv')
chen14 = pd.read_csv('../data/chen14.csv')
fuels = pd.read_csv('../data/fuels.csv')
germanium = pd.read_csv('../data/germanium_all.csv')
metal_organics = pd.read_csv('../data/metal-organics_all.csv')
pan12 = pd.read_csv('../data/pan12.csv')
silanes = pd.read_csv('../data/silanes_all.csv')
tin = pd.read_csv('../data/tin_all.csv')

In [None]:
print(list(pubchem))
print(list(carroll))
print(list(chen14))
print(list(fuels))
print(list(germanium))
print(list(metal_organics))
print(list(pan12))
print(list(silanes))
print(list(tin))

In [None]:
def canoicalize_smiles(data):
    for idx, row in data.iterrows():
        m = Chem.MolFromSmiles(data.iloc[idx]['smiles'])
        if m != None:
            data.iloc[idx]['smiles'] = Chem.MolToSmiles(m)
        else:
            data.iloc[idx]['smiles'] = None
    return data

In [None]:
frames = [pubchem, carroll, chen14, fuels, germanium, metal_organics, pan12, silanes, tin]
result = pd.concat(frames)

In [None]:
canonical_smiles_data = canoicalize_smiles(result)

In [None]:
canonical_smiles_data[canonical_smiles_data.duplicated()].shape

In [None]:
canonical_smiles_data.to_csv('../data/integrated_dataset.csv', index=False)

In [None]:
data = canonical_smiles_data

In [None]:
data.drop_duplicates().shape

In [None]:
#data.drop_duplicates('smiles').
rows = data.loc[data['smiles'] == 'C[Sn](C)(C)Cl']
rows.iloc[0]#['flashpoint'].std()

In [59]:
def remove_duplicates(data):
    duplicates = data[data.duplicated(subset='smiles')]
    result = data.drop_duplicates(subset='smiles', keep=False)#[~duplicates]
    #for each unique smiles that has duplicates
    for smiles in data[data.duplicated(subset='smiles')]['smiles'].unique():
        dup_rows = data.loc[data['smiles'] == smiles]
        if dup_rows['flashpoint'].unique().shape[0] == 1:
            # remove all but one
            result = result.append(dup_rows.iloc[0], sort=False)
        else:
            if dup_rows['flashpoint'].std() < 5:
                # add 1 back
                result = result.append(dup_rows.iloc[0], sort=False)
    return result  
            

In [None]:
pure_compounds = data.loc[data['pure substance'] == 1]
pure_compounds.shape

In [None]:
data[data.duplicated(subset='smiles')].shape

In [None]:
data = data.loc[data['pure substance'] == 1]
print(data.shape)
no_dups = remove_duplicates(data)
print(no_dups.shape)

In [None]:
no_dups[no_dups.duplicated(subset='smiles')]
#no_dups.loc[no_dups['smiles'] == 'CC1=CCCC(=C)C2CC(C2CC1)(C)C']
no_dups.to_csv('../data/integrated_dataset.csv', index=False)

In [None]:
result = data.drop_duplicates(subset='smiles', keep=False)
result.shape

In [None]:
data[data.duplicated(subset='smiles')]['smiles'].unique()

### Hazardous Materials

In [40]:
data = pd.read_csv('../data/hazardous_materials.csv')
haz = data[['Common name', 'Data type', 'Measurement name', 'Measurement value', 'Measurement units']]

In [41]:
flashpoints = haz.loc[haz['Measurement name'] == 'Flash point']

In [42]:
print(list(flashpoints))

['Common name', 'Data type', 'Measurement name', 'Measurement value', 'Measurement units']


In [43]:
flashpoints['smiles'] = flashpoints['Common name'].apply(get_smiles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [11]:
def get_compound_with_element(data, element):
    return data[data['smiles'].str.contains(element)]

def get_smiles(compound):
    smilesUrl = "https://opsin.ch.cam.ac.uk/opsin/"+compound+".smi"
    requestSmiles = requests.get(smilesUrl)
    if requestSmiles.status_code == 400 or requestSmiles.status_code == 404:
        return None
    else:
        return Chem.MolToSmiles(Chem.MolFromSmiles(requestSmiles.text))  
    
def celsiusToKelvin(temp):
    return temp + 273.15

def get_compound_with_element(data, element):
    return data[data['smiles'].str.contains(element)]

In [18]:
flashpoints = flashpoints[~flashpoints['Measurement value'].str.contains('<')]
flashpoints = flashpoints[~flashpoints['Measurement value'].str.contains('>')]
flashpoints = flashpoints[~flashpoints['Measurement value'].str.contains('~')]
pattern = '\\w+-\\w'
flashpoints['Measurement value'] = flashpoints['Measurement value'].str.replace(pattern, 'none', regex=True)
#flashpoints = flashpoints[~flashpoints['Measurement value'].str.contains('none')]
flashpoints['Measurement value'] = pd.to_numeric(flashpoints['Measurement value'], errors='coerce')

flashpoints.head()

NameError: name 'flashpoints' is not defined

In [46]:
flashpoints['flashpoint'] = flashpoints['Measurement value'].apply(celsiusToKelvin)
flashpoints.drop(columns=['Measurement name','Data type', 'Measurement units', 'Measurement value'], inplace=True)
#flashpoints.rename(columns={'Common name' : 'compound', 'Measurement value' : 'flashpoint'}, inplace=True)

In [62]:
flashpoints.dropna(inplace=True)
flashpoints.rename(columns={'Common name' : 'compound'}, inplace=True)
#flashpoints.drop(df.columns[i], axis=1)
flashpoints['pure substance'] = 1
flashpoints['source'] = 'hazardous materials'
hazardous_mat = flashpoints[['compound', 'flashpoint', 'pure substance', 'smiles', 'source']]
hazardous_mat.shape

(368, 5)

In [61]:
#data = pd.read_csv('../data/integrated_dataset.csv')
#frames = [data, hazardous_mat]
#result = pd.concat(frames)
no_dups = remove_duplicates(result)
print(result.shape)
print(no_dups.shape)
no_dups.to_csv('../data/integrated_dataset.csv', index=False)

(3684, 5)
(3471, 5)


In [13]:
dataset = pd.read_csv('../data/integrated_dataset.csv')
Si_data = get_compound_with_element(dataset, 'Si')
Si_data = Si_data[['smiles', 'flashpoint']]
print(Si_data.shape)
Si_data.to_csv('../data/sidata.csv', index=False)
data = dataset[~dataset['smiles'].str.contains('Si')]
data = data[['smiles', 'flashpoint']]
data.to_csv('../data/notSidata.csv')
print(data.shape)


(298, 2)
(3173, 2)


In [None]:
data = dataset[['smiles', 'flashpoint']]
data.to_csv('../data/flashpoints.csv')
dataset['source'].unique()

In [None]:
data = dataset[dataset['source'] != 'gelest_metal-organics']
data = data[['smiles', 'flashpoint']]
data.to_csv('../data/flashpointsNoGelestMetallics.csv', index=False)

In [3]:
data = pd.read_csv('../data/integrated_dataset.csv')
print(data['flashpoint'].mean())
print(data['flashpoint'].std())

341.2367156439067
65.30367258221965


In [7]:
carroll = data[data['source'] == 'carroll11']
carroll = carroll[['smiles', 'flashpoint']]
carroll.head()
carroll.to_csv('../data/carroll.csv', index=False)

In [8]:
carroll = data[data['source'] == 'pubchem']
carroll = carroll[['smiles', 'flashpoint']]
carroll.head()
carroll.to_csv('../data/pubchem.csv', index=False)

In [55]:
pubchem = pd.read_csv('../data/carroll.csv')
integrated_dataset = pd.read_csv('../data/integrated_dataset.csv')

pubchem.head()
#print(pubchem['flashpoint'].mean())
#print(pubchem['flashpoint'].std())

Unnamed: 0,smiles,flashpoint
0,CC(C)CC,222.0
1,CC(C)CCC,241.0
2,CC(C)CCCCC,277.0
3,CC(C)CCCCCC,297.0
4,CC(C)CCCCCCC,311.0


In [91]:
def LOG(dataset, test_group):  # leave out group
        """
        split dataset by leaving out a specific source as test set
        dataset: data frame
        test_group: string
        """
        # remove duplicates in train group.
        test_df = dataset[dataset['source'] == test_group]
        train_df = dataset[dataset['source'] != test_group]

        # remove data points in  train dataframe that match smiles strings in
        # test dataframe
        for index, row in test_df.iterrows():
            smi = row['smiles']
            train_df = train_df[train_df['smiles'] != smi]

        frames = [train_df, test_df]
        dataset = pd.concat(frames)
        dataset.reset_index(drop=True, inplace=True)
        test_indices = []
        train_indices = list(range(len(dataset.index)))
        print("||||||||||||||||||| "+test_group+ " will be used as test set|||||||||||||||||||")
        for i in range(0,len(dataset.index)):
            if dataset.iloc[i]['source'] == test_group:
                test_indices.append(i)
                train_indices.remove(i)
        return (train_indices, test_indices, dataset)

In [92]:
data = pd.read_csv('../data/integrated_dataset.csv')
x, y, new_df = LOG(data, 'pan12')

||||||||||||||||||| pan12 will be used as test set|||||||||||||||||||


In [93]:
y

[3459, 3460, 3461, 3462, 3463, 3464, 3465, 3466, 3467, 3468, 3469, 3470]

In [99]:
new_df.iloc[x]['source'].unique()

array(['pubchem', 'carroll11', 'chen14', 'Saldana11', 'gelest_germanium',
       'gelest_metal-organics', 'gelest_silanes', 'gelest_tin',
       'hazardous materials'], dtype=object)

In [73]:
data.head()

Unnamed: 0,compound,flashpoint,pure substance,smiles,source
0,1-aminopropan-2-ol,350.15,1,CC(CN)O,pubchem
1,"1-chloro-2,4-dinitrobenzene",467.15,1,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl,pubchem
2,"1,2-dichloroethane",286.15,1,C(CCl)Cl,pubchem
3,"1,2,4-trichlorobenzene",383.15,1,C1=CC(=C(C=C1Cl)Cl)Cl,pubchem
4,2-chloroacetaldehyde,361.15,1,C(C=O)Cl,pubchem


In [96]:
data.iloc[y[10]]

compound            1,3-xylene
flashpoint                 298
pure substance               1
smiles            Cc1cccc(C)c1
source               Saldana11
Name: 3469, dtype: object