# Exploring multiple approaches to machine learning across multiple small EHD experimental datasets
 - The task is to predict (regress) the size of printed features, as a function of waveform inputs to the EHD printer
 - A second task is to predict (classify) whether a given waveform input will produce any printed pattern at all. (This is equivalent to understanding the print onset voltage threshold.)
 - Multiple hidden confounding variables are likely, such as ink/tip/substrate/atmospheric condition; ink and tip clogging; print tip height from the substrate. Some of these will be relatively constant for each run of experiments, some will vary from feature to feature.
 - Ink dynamics at the printing tip are complex and nonlinear, with electrical and fluid/acoustic phenomena that interact with each other.

## Goals
 - In regression, aim for <3% mean absolute error (from the predicted outcome)
 - In classification, aim for >0.9 ROC AUC
 - For a new/test set, achieve these in <=100 experiments

In [2]:
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from scipy.stats import pearsonr

sys.path.append('..')
from ehd_dataset import EHD_Loader
from ehd_models import EHD_Model


%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading extenrnal modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [5]:
INDEX = "C:\Dropbox\SPEED\Self Driving EHD\Datasets\dataset_index.xlsx"
loader = EHD_Loader(INDEX)
print("Datasets Loaded!\n\nQuick correlation validation check:")

for i, df in enumerate(loader.datasets):
    print(loader.names[i])
    corr, _ = pearsonr(df.area,
                       df.wave.apply(lambda x: np.sum(np.abs(x))))
    print(f"AUAC correlation: {corr}")
    # Area has >0.7 corr vs L2-norm of the wavevector
    corr, _ = pearsonr(df.area,
                       df.vector.apply(lambda x: np.sqrt(np.sum(x**2))))
    print(f"Vec L2 correlation: {corr}")

Failed to load 10-Mar-2022 large nozzle mosaic: 'DataFrame' object has no attribute 'note'
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\29-Mar-2022 lg 1cm 300 points	offset 0	corr 0.4960008364961841
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\2-May-2022__run 1	offset 7	corr 0.20797462285908108
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\2-May-2022__run 2	offset 4	corr 0.5656140913387249
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\23-May-2022_squares	offset 8	corr 0.5137128998522692
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\24-May-2022 large harmonics	offset 0	corr 0.5869081626562916
Datasets Loaded!

Quick correlation validation check:
29-Mar-2022 lg 1cm 300 points
AUAC correlation: 0.49600083649618404
Vec L2 correlation: 0.64987002472606
2-May-2022__run 1
AUAC correlation: 0.20797462285908108
Vec L2 correlation: 0.23905650694665798
2-May-2022__run 2
AUAC correlation: 0.5656140913387248
Vec L2 correlation: 0.7481951849730066
23-May-2022_squares
AUAC 

In [12]:
from ehd_models import EHD_Model

reg_results = pd.DataFrame(columns=('architecture', 'fold', 'train_size', 'MAE', 'PAE'))
for architecture in ('MLE'):  # Print Area Regression Models <<<<<<<<<<<<<<<<<<<<
    model = EHD_Model(architecture=architecture)

    for fold in range(loader.num_folds):
        pretrain_set, eval_set, fold_name = loader.folded_dataset(fold=fold, xtype='vector', ytype='area')
        model.pretrain(pretrain_set)
        output = model.evaluate(eval_set)

        output['architecture'] = architecture
        output['fold'] = fold_name
        reg_results.append(output, ignore_index=True)


class_results = pd.DataFrame(columns=('architecture', 'fold', 'train_size', 'AUC'))
for architecture in ('MLE_class'):  # Jetting Classification Models <<<<<<<<<<<<<<<<<<<<
    model = EHD_Model(architecture=architecture)

    for fold in range(loader.num_folds):
        pretrain_set, eval_set = loader.folded_dataset(fold=fold, xtype='vector', ytype='area')
        model.pretrain(pretrain_set)
        output = model.evaluate(eval_set)

        output['architecture'] = architecture
        output['fold'] = fold
        class_results.append(output, ignore_index=True)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Anaconda3\envs\ehd\lib\site-packages\IPython\core\interactiveshell.py", line 3397, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Oliver\AppData\Local\Temp\ipykernel_3924\330442334.py", line 8, in <cell line: 4>
    pretrain_set, eval_set, fold_name = loader.folded_dataset(fold=fold, xtype='vector', ytype='area')
  File "C:\Dropbox\Python\ehd_exsitu\notebooks\..\ehd_dataset\dataset.py", line 130, in folded_dataset
    eval_set['X'] = np.concatenate((eval_set['X'],
  File "<__array_function__ internals>", line 5, in concatenate
ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Anaconda3\envs\ehd\lib\site-packages\IPython\core\interactiveshell.py", line 1992, in showtraceback
    stb = s