# Build Unknown Dataset Notebook

## Purpose and Context

This notebook converts the experimental data we already have into a usable format so that we can run predictions on it.

## Setup

Import the required libraries.

In [None]:
import pandas as pd
import numpy as np

# Stretch the notebook container to the full browser width so wide frames stay readable.
# `display` and `HTML` come from the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated (since IPython 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import utils
import features

from rdkit import Chem

# Register tqdm's notebook progress bar with pandas; this provides the
# `progress_apply` method that later cells call on Series.
import tqdm.notebook
tqdm.notebook.tqdm_notebook.pandas()

## Load Data

In [None]:
# Load the 'extraction-pubChem' output and tag every row with its origin
data = utils.LoadDataFromOutput('extraction-pubChem')
data['Source'] = 'PubChem'

# Normalise the headers: underscores become spaces, then Title Case
readableColumns = data.columns.str.replace('_', ' ')
data.columns = readableColumns.str.title()

# Row count, followed by a one-row preview as the cell output
print(len(data))
data.head(1)

In [None]:
# The compound id, stored as text, becomes the key into the source
data['Source Key'] = data['Cid'].astype(str)

# Use the common 'Smiles' column name and keep only the three columns needed downstream
data = data.rename(columns = {'Isosmiles': 'Smiles'})
data = data.loc[:, ['Source', 'Source Key', 'Smiles']]

## Compute Features

In [None]:
# Compute features once per distinct SMILES string, then attach them to every source row
temp = data['Smiles'].drop_duplicates().to_frame()

# Expand each ComputeAllFeatures result into one column per feature
# (presumably a dict-like of feature name -> value; verify against features.py)
computed = temp['Smiles'].progress_apply(features.ComputeAllFeatures).apply(lambda x: pd.Series(x, dtype = 'object'))
temp = temp.join(computed)

# Removing any entry that failed to compute all features.
# This must run BEFORE fillna(0): once missing values are zero-filled, the NaN
# check can no longer match anything and the filter silently does nothing.
temp = temp[temp['Total Atom Count'].notna()].drop_duplicates()

# Any remaining missing feature values are filled with 0
temp = temp.fillna(0)

data = data.merge(temp, on = 'Smiles')

len(data)

Remove entries for which not all features could be calculated.

In [None]:
# Keep only rows whose 'Error' flag is not True, renumber the index,
# then discard the flag column
noError = data['Error'] != True
data = data[noError].reset_index(drop = True)
data = data.drop(columns = ['Error'])

Remove compounds whose epsilon values we already know because they appear in one of the other datasets.

In [None]:
# InChIKey of every SMILES in the 'dataset-allKnownEpsilon' output
knownSmiles = utils.LoadDataFromOutput('dataset-allKnownEpsilon')['Smiles']
knownEpsilons = knownSmiles.progress_apply(
    lambda smiles: Chem.inchi.MolToInchiKey(Chem.MolFromSmiles(smiles))
).to_list()

# Keep only the rows whose InChIKey is absent from that list
alreadyKnown = data['InchiKey'].isin(knownEpsilons)
data = data[~alreadyKnown].reset_index(drop = True)
len(data)

### Cleaning up data

In [None]:
# The InChIKey column is not kept in the final dataset
data = data.drop(columns = ['InchiKey'])

# Standardizing column names: underscores to spaces, then Title Case
spacedNames = data.columns.str.replace('_', ' ')
data.columns = spacedNames.str.title()

# Compressing data (both helpers are handed the frame itself)
utils.ConvertFloatColumnsToIntegerIfNoDataLoss(data)
utils.CompressIntegerColumns(data)

In [None]:
# Project helper; presumably drops columns whose value never changes (see utils to confirm)
utils.RemoveStaticColumns(data)

# Three-line divider separating the output of the two helpers
print('\n'.join(['-----------------'] * 3))

# Project helper; presumably drops columns that duplicate another column (see utils to confirm)
utils.RemoveDuplicateColumns(data)

## Basic Analysis

In [None]:
# Print the frame summary: column names, dtypes, non-null counts and memory usage
data.info()

In [None]:
# Project helper; presumably summarizes the values found in each column (see utils to confirm)
utils.InspectColumnValues(data)

In [None]:
# Descriptive statistics for the frame (pandas' default column selection)
data.describe()

In [None]:
# Project helper; presumably plots a histogram per column (see utils to confirm)
utils.ShowHistogramCharts(data)

### Saving for later use

In [None]:
# Persist the finished dataset, then load it straight back so the cell
# output shows what was actually stored
outputName = 'dataset-unknownEpsilon'
utils.SaveDataToOutput(data, outputName)
utils.LoadDataFromOutput(outputName)