# Exploring multiple approaches to machine learning across multiple small EHD experimental datasets
 - The task is to predict (regress) the size of printed features as a function of the waveform inputs to the EHD (electrohydrodynamic) printer.
 - A second task is to predict (classify) whether a given waveform input will produce any printed pattern at all. (This is equivalent to understanding the print onset voltage threshold.)
 - Multiple hidden confounding variables are likely, such as ink, tip, substrate, and atmospheric conditions; ink and tip clogging; and the height of the print tip above the substrate. Some of these will be relatively constant within each run of experiments; others will vary from feature to feature.
 - Ink dynamics at the printing tip are complex and nonlinear, with electrical and fluid/acoustic phenomena that interact with each other.

## Goals
 - In regression, aim for a mean absolute error below 3% (measured relative to the predicted outcome)
 - In classification, aim for >0.9 ROC AUC
 - For a new/test set, achieve these in <=100 experiments

In [1]:
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from scipy.stats import pearsonr

sys.path.append('..')
from ehd_dataset import EHD_Loader
from ehd_models import EHD_Model


%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading extenrnal modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Path to the spreadsheet that indexes all experimental datasets.
# Raw string: the Windows path contains backslash sequences (\D, \S, \d) that
# are invalid escapes in a normal string literal -- they only work by accident
# and raise a SyntaxWarning on recent Python versions.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable so the notebook runs on other machines.
INDEX = r"C:\Dropbox\SPEED\Self Driving EHD\Datasets\dataset_index.xlsx"
loader = EHD_Loader(INDEX)
print("Datasets Loaded!\n>> Quick correlation validation check -- [auac; vec L2] <<")

# Sanity check per dataset: Pearson correlation of the printed area against two
# simple scalar summaries of the input.
for i, df in enumerate(loader.datasets):
    # AUAC: sum of absolute values of each `wave` entry
    # (presumably "area under the absolute curve" -- TODO confirm naming).
    AUAC, _ = pearsonr(df.area,
                       df.wave.apply(lambda x: np.sum(np.abs(x))))
    # VL2: L2 norm of each `vector` entry.
    VL2, _ = pearsonr(df.area,
                      df.vector.apply(lambda x: np.sqrt(np.sum(x**2))))
    # All datasets are printed on one line; `end` closes the "<<...>>" bracket.
    print(f"<<{loader.names[i]} -- [{AUAC:.3f}; {VL2:.3f}]", end='>> ')

Failed to load 10-Mar-2022 large nozzle mosaic: 'DataFrame' object has no attribute 'note'
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\29-Mar-2022 lg 1cm 300 points	offset 0	corr 0.4960008364961841
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\2-May-2022__run 1	offset 7	corr 0.20797462285908108
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\2-May-2022__run 2	offset 4	corr 0.5656140913387249
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\23-May-2022_squares	offset 8	corr 0.5137128998522692
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\24-May-2022 large harmonics	offset 0	corr 0.5869081626562916
Datasets Loaded!
>> Quick correlation validation check -- [auac; vec L2] <<
<<29-Mar-2022 lg 1cm 300 points -- [0.496; 0.650]>> <<2-May-2022__run 1 -- [0.208; 0.239]>> <<2-May-2022__run 2 -- [0.566; 0.748]>> <<23-May-2022_squares -- [0.514; 0.630]>> <<24-May-2022 large harmonics -- [0.587; 0.764]>> 

In [46]:
# Evaluate each model architecture on every fold provided by `loader`, once for
# print-area regression and once for jetting classification.
# `EHD_Model` is imported in the first cell of the notebook.

# Toggle for the print-area regression sweep. It was previously disabled by an
# unconditional `break` as the first statement of its loop, which left the rest
# of the loop body unreachable. The flag keeps the same default (regression
# skipped, `reg_results` stays empty) but makes the intent explicit.
RUN_REGRESSION = False

# Dataset filters passed to `loader.folded_dataset`; none are applied.
# Alternative left by the author: [('vector', lambda x: len(x), 6)]
FILTERS = []

# Per-fold evaluation outputs, kept as plain lists. The author's commented-out
# code converted these with pd.DataFrame.from_dict(...) for tabulation.
reg_results = []    # print area regression
class_results = []  # jetting classification

# (architectures to run, target column name, list collecting the outputs)
sweeps = [
    (['MLE'] if RUN_REGRESSION else [], 'area', reg_results),
    (['MLE_class'], 'jetted', class_results),
]

for architectures, ytype, results in sweeps:
    for architecture in architectures:
        # NOTE(review): a single model instance is reused for all folds. If
        # `pretrain` does not reset previously learned state, information from
        # earlier folds' training data could leak into later evaluations --
        # confirm against EHD_Model.
        model = EHD_Model(architecture=architecture)

        for fold in range(loader.num_folds):
            pretrain_set, eval_set, eval_name = loader.folded_dataset(
                fold=fold, xtype='wave', ytype=ytype, filters=list(FILTERS))
            model.pretrain(pretrain_set)
            output = model.evaluate(eval_set)

            # Tag the output so results from different sweeps can be told apart.
            output['architecture'] = architecture
            output['eval_dataset'] = eval_name
            results.append(output)

# TODO visualize results