# Build Experimental Dataset Notebook

## Purpose and Context

This notebook converts the experimental data we already have into a usable format, so that we can run predictions on it and compare them with the values actually measured in our experiments.

## Setup

Import libraries.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Stretch the notebook container to the full browser width so wide dataframes
# are easier to read. `display` and `HTML` come from the public
# `IPython.display` module: importing `display` from `IPython.core.display`
# has been deprecated since IPython 7.14 and is no longer available in newer
# IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import utils
import features

import tqdm.notebook
tqdm.notebook.tqdm_notebook.pandas()

## Load Data

In [None]:
# Read the raw experimental table and rename its headers to the column names
# used throughout the rest of this notebook.
data = pd.read_csv('../rawData/Experimental_SMILES_Predictions.csv').rename(
    columns={
        'SMILES': 'Smiles',
        'Min ε': 'Min Epsilon',
        'Max ε': 'Max Epsilon',
        'Dye': 'Source Key',
        'TD-DFT μ (D)': 'TD-DFT (Debye)',
    }
)

# Keep only the columns needed downstream, in a fixed order.
data = data[['Source Key', 'TD-DFT (Debye)', 'Min Epsilon', 'Max Epsilon', 'Smiles']]

# Last expression of the cell: renders the loaded table in the notebook.
data

## Compute Features

In [None]:
# Compute features for every SMILES string (with a progress bar), expand each
# result into one column per feature, join those columns onto the experimental
# data by index, and replace missing values with 0.
# NOTE(review): fillna(0) is applied to the whole joined frame, so missing
# values in the experimental columns are zero-filled too — confirm intended.
data = (
    data['Smiles']
    .progress_apply(features.ComputeAllFeatures)
    .apply(lambda computed: pd.Series(computed, dtype='object'))
    .pipe(data.join)
    .fillna(0)
)

### Standardizing column names and compressing data

In [None]:
# Standardize column names: underscores become spaces and every word is
# title-cased (e.g. 'some_feature' -> 'Some Feature'). `regex=False` states
# explicitly that the underscore is a literal match instead of relying on the
# pandas default for `regex`, which changed between major versions.
data.columns = data.columns.str.replace('_', ' ', regex=False).str.title()

# Compress the data.
# NOTE(review): the helpers' results are not assigned, so they presumably
# modify `data` in place — confirm against utils.
utils.ConvertFloatColumnsToIntegerIfNoDataLoss(data)
utils.CompressIntegerColumns(data)

### Cleaning up data

In [None]:
# Drop the 'Smiles' and 'Inchikey' columns in place, then preview the first
# remaining row.
data.drop(columns=['Smiles', 'Inchikey'], inplace=True)
data.head(1)

In [None]:
# NOTE(review): presumably removes columns whose value never varies — confirm
# against utils.
utils.RemoveStaticColumns(data)

# Three separator lines between the output of the two helpers.
print(*['-----------------'] * 3, sep='\n')

# NOTE(review): presumably removes columns that duplicate another column —
# confirm against utils.
utils.RemoveDuplicateColumns(data)

## Basic Analysis

In [None]:
# Print a concise summary of the frame: column dtypes, non-null counts and
# memory usage.
data.info()

In [None]:
# NOTE(review): project helper; presumably reports the values found in each
# column — confirm against utils.
utils.InspectColumnValues(data)

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
data.describe()

In [None]:
# NOTE(review): project helper; presumably plots a histogram per column —
# confirm against utils.
utils.ShowHistogramCharts(data)

## Saving for use later

In [None]:
# Persist the prepared dataset under the name 'dataset-experimental'.
utils.SaveDataToOutput(data, 'dataset-experimental')
# Load it back under the same name; as the last expression of the cell the
# result is shown in the notebook.
# NOTE(review): presumably serves as a round-trip check of the save — confirm.
utils.LoadDataFromOutput('dataset-experimental')