# Build Training Datasets Notebook

## Purpose and Context

This notebook combines PhotochemCAD 3, Dyomics and Deep4Chem data into a development and validation dataset with known epsilons to be used to train and validate models to predict epsilon.

Because these datasets provide each compound in SMILES format, the chemical compound information and the molecular weight are derived from the SMILES string rather than taken from the original source, even when the source provides them.


## Setup

Import the libraries used throughout this notebook.

In [None]:
import pandas as pd
import numpy as np

# `display` and `HTML` belong to the public `IPython.display` API. Importing
# `display` from `IPython.core.display` is deprecated and no longer works on
# newer IPython releases.
from IPython.display import display, HTML
# Inject CSS so the notebook's `.container` element spans the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import utils
import features

from rdkit import Chem

import tqdm.notebook
tqdm.notebook.tqdm_notebook.pandas()

## Load Data

### Deep4Chem

In [None]:
# Load the Deep4Chem extraction output (presumably a DataFrame saved by an
# earlier extraction step — confirm against utils.LoadDataFromOutput) and
# tag every row with the name of its source.
temp = utils.LoadDataFromOutput('extraction-deep4Chem')
temp['Source'] = 'Deep4Chem'
# Show the row count and preview the first record.
print(len(temp))
temp.head(1)

In [None]:
# Deep4Chem stores a base-10 logarithm; raise 10 to that value to recover Epsilon.
# The 'Chromophore' column is copied into 'Smiles' and then becomes the 'Source Key'.
temp = temp.assign(
    Epsilon = temp['Log(Epsilon)'].apply(lambda log_epsilon: 10**log_epsilon),
    Smiles = temp['Chromophore'],
).rename(columns = {'Chromophore': 'Source Key'})

# Keep only the four columns that the combined dataset is built from.
temp = temp[['Source', 'Source Key', 'Epsilon', 'Smiles']]

# Deep4Chem is the first source processed, so it seeds the combined dataset.
data = temp.copy()

### PhotoChemCad 3

In [None]:
# Load the PhotochemCAD 3 extraction output (presumably a DataFrame — confirm
# against utils.LoadDataFromOutput) and tag every row with its source.
temp = utils.LoadDataFromOutput('extraction-photochemCad3')
temp['Source'] = 'PhotoChemCAD3'
# Normalize column names: underscores become spaces and each word is title-cased.
temp.columns = temp.columns.str.replace('_', ' ').str.title()
# Show the row count and preview the first record.
print(len(temp))
temp.head(1)

In [None]:
# Use the compound name as this source's key so the columns match the combined dataset.
temp.rename(columns = {'Name': 'Source Key'}, inplace = True)

# Keep only the four columns that the combined dataset is built from.
temp = temp[['Source', 'Source Key', 'Epsilon', 'Smiles']]

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent. As with append's default behaviour,
# the original row indexes are kept here (they are reset after the last source).
data = pd.concat([data, temp])

### Dyomics

In [None]:
# Load the Dyomics extraction output (presumably a DataFrame — confirm
# against utils.LoadDataFromOutput) and tag every row with its source.
temp = utils.LoadDataFromOutput('extraction-dyomics')
temp['Source'] = 'Dyomics'
# Normalize column names: underscores become spaces and each word is title-cased.
temp.columns = temp.columns.str.replace('_', ' ').str.title()
# Show the row count and preview the first record.
print(len(temp))
temp.head(1)

In [None]:
# Map the Dyomics column names onto the combined dataset's names.
temp.rename(columns = {'Molar Absorbance': 'Epsilon', 'Name': 'Source Key'}, inplace = True)

# Keep only the four columns that the combined dataset is built from.
temp = temp[['Source', 'Source Key', 'Epsilon', 'Smiles']]

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent.
data = pd.concat([data, temp])
# All sources are now combined, so replace the per-source indexes with one continuous index.
data.reset_index(drop = True, inplace = True)
data

## Compute Features

In [None]:
# Compute features once per distinct SMILES string instead of once per row.
temp = data['Smiles'].drop_duplicates().to_frame()

# Each result of features.ComputeAllFeatures is expanded into a row of columns
# (presumably it returns a dict-like of feature name -> value — confirm against
# the features module); values still missing after the join are filled with 0.
temp = (
    temp
    .join(
        temp['Smiles']
        .progress_apply(features.ComputeAllFeatures)
        .apply(lambda computed: pd.Series(computed, dtype = 'object'))
    )
    .fillna(0)
)

# Attach the computed features to every row that shares the same SMILES.
data = data.merge(temp, on = 'Smiles')

Remove the entries for which not all of the features could be calculated.

In [None]:
# The 'Error' column only exists when the feature computation produced it.
if 'Error' in data.columns:
    # Keep the rows whose 'Error' value is not True, then discard the flag column
    # and renumber the remaining rows.
    data = (
        data.loc[data['Error'] != True]
        .drop(columns = ['Error'])
        .reset_index(drop = True)
    )

### Saving for later use

In [None]:
# Standardize column names: underscores become spaces and each word is title-cased,
# matching the naming already applied to the source columns.
data.columns = data.columns.str.replace('_', ' ').str.title()

# Compress the data. Both helpers are called for their effect on `data` (their
# return values are ignored); judging by their names they convert float columns
# to integers where no information is lost and then shrink integer column widths
# — confirm against the utils module.
utils.ConvertFloatColumnsToIntegerIfNoDataLoss(data)
utils.CompressIntegerColumns(data)

In [None]:
# Persist the full combined dataset (identifier columns are still present at this
# point), then load it back so the cell displays what was stored.
utils.SaveDataToOutput(data, 'dataset-allKnownEpsilon')
utils.LoadDataFromOutput('dataset-allKnownEpsilon')

### Cleaning up data

In [None]:
# Drop the identifier/provenance columns in place so that only Epsilon and the
# computed feature columns remain.
data.drop(columns = ['Source', 'Source Key', 'Smiles', 'Inchikey'], inplace = True)
# Preview a single row to confirm the remaining columns.
data.head(1)

Remove any entry with an Epsilon >= 800K, as such a value seems too good to be true, and any entry that contains an infinite value.

In [None]:
# Upper bound (exclusive) on Epsilon; entries at or above it are discarded.
limit = 800000
print('Number of entries >= 800K: ' + str(len(data[data['Epsilon'] >= limit])))
data = data[data['Epsilon'] < limit].copy()

# Report which columns and how many rows contain +/- infinity before removing them.
# `axis` must be passed by keyword: pandas 2.0 made it keyword-only for DataFrame.any,
# so the former positional form `.any(1)` raises a TypeError there.
print('Columns with infinate values: ' + str(data.columns[np.isinf(data).any()].values))
print('Number of entries with infinate values: ' + str(len(data.index[np.isinf(data).any(axis = 1)])))

# Turn infinities into NaN so a single dropna removes them; note that dropna also
# removes every row that already had a NaN in any column.
data.replace([np.inf, -np.inf], np.nan, inplace = True)
data.dropna(inplace = True)
data.reset_index(drop = True, inplace = True)

In [None]:
# Both helpers are called for their effect on `data` (return values are ignored).
# Judging by their names they drop columns with a single constant value and columns
# that duplicate another column — confirm against the utils module.
utils.RemoveStaticColumns(data)
# Visual separator between the output of the two helpers.
print('-----------------')
print('-----------------')
print('-----------------')
utils.RemoveDuplicateColumns(data)

## Basic Analysis

In [None]:
# Summarize the columns, dtypes, non-null counts and memory usage of the cleaned dataset.
data.info()

In [None]:
# Project helper; presumably reports the values found in each column — confirm against the utils module.
utils.InspectColumnValues(data)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Project helper; presumably plots a histogram for each column — confirm against the utils module.
utils.ShowHistogramCharts(data)

## Building Validation and Development Dataset and saving for use later

In [None]:
def SplitData(data, validation_fraction = .1, random_state = 82219):
    """Split a DataFrame into development and validation subsets.

    A random sample of the rows becomes the validation set and every remaining
    row becomes the development set. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to split.
    validation_fraction : float, default 0.1
        Fraction of the rows to sample into the validation set.
    random_state : int, default 82219
        Seed passed to ``DataFrame.sample`` so the split is reproducible.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(development, validation)``, each with a fresh 0-based index.
    """
    validation = data.sample(frac = validation_fraction, random_state = random_state)
    # Every row whose index label was not sampled for validation goes to development.
    development = data[~data.index.isin(validation.index)].copy()
    development.reset_index(drop = True, inplace = True)
    validation.reset_index(drop = True, inplace = True)

    return development, validation

In [None]:
# Split the cleaned dataset; SplitData samples 10% of the rows (fixed seed) for validation.
development, validation = SplitData(data)

In [None]:
# Persist the development split, then load it back so the cell displays what was stored.
utils.SaveDataToOutput(development, 'dataset-development')
utils.LoadDataFromOutput('dataset-development')

In [None]:
# Persist the validation split, then load it back so the cell displays what was stored.
utils.SaveDataToOutput(validation, 'dataset-validation')
utils.LoadDataFromOutput('dataset-validation')

In [None]:
# Report how many rows ended up in each split.
print(f'Number of entries in development dataset: {len(development)}')
print(f'Number of entries in validation dataset: {len(validation)}')