# Exploring multiple approaches to machine learning across multiple small EHD experimental datasets
 - The task is to predict (regress) the size of printed features, as a function of waveform inputs to the EHD printer
 - A second task is to predict (classify) whether a given waveform input will produce any printed pattern at all. (This is equivalent to understanding the print onset voltage threshold.)
 - Multiple hidden confounding variables are likely, such as ink, tip, substrate, and atmospheric conditions; ink and tip clogging; and the height of the print tip above the substrate. Some of these will be relatively constant for each run of experiments; others will vary from feature to feature.
 - Ink dynamics at the printing tip are complex and nonlinear, with electrical and fluid/acoustic phenomena that interact with each other.

## Goals
 - In regression, aim for <3% mean absolute percentage error (MAPE), measured from the predicted outcome
 - In classification, aim for >0.9 ROC AUC
 - For a new (test) dataset, achieve these targets within <=100 experiments

In [1]:
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from scipy.stats import pearsonr

sys.path.append('..')
from ehd_dataset import EHD_Loader
from ehd_models import EHD_Model


%matplotlib inline
# Global matplotlib defaults; they apply to figures created after this cell runs.
plt.rcParams['figure.figsize'] = (10.0, 8.0) # default figure size (width, height) in inches
plt.rcParams['image.interpolation'] = 'nearest'  # draw image pixels without smoothing
plt.rcParams['image.cmap'] = 'gray'  # default colormap for image plots

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Spreadsheet that indexes the experimental datasets handed to EHD_Loader.
# Written as a raw string: the Windows path contains backslash sequences
# (\D, \S, \d) that are invalid escapes in a normal string literal. The
# runtime value is unchanged, but the literal no longer triggers
# DeprecationWarning/SyntaxWarning (a future SyntaxError).
# NOTE(review): this is an absolute, machine-specific path -- consider making
# it configurable so the notebook runs on other machines.
INDEX = r"C:\Dropbox\SPEED\Self Driving EHD\Datasets\dataset_index.xlsx"
loader = EHD_Loader(INDEX)
print("Datasets Loaded!\n>> Quick correlation validation check -- [auac; vec L2] <<")

# Sanity check per dataset: Pearson correlation of the printed area against
# two scalar summaries of the input.
for i, df in enumerate(loader.datasets):
    # AUAC: sum of the absolute values of each row's `wave` entry.
    AUAC, _ = pearsonr(df.area,
                       df.wave.apply(lambda x: np.sum(np.abs(x))))
    # VL2: Euclidean (L2) norm of each row's `vector` entry.
    VL2, _ = pearsonr(df.area,
                       df.vector.apply(lambda x: np.sqrt(np.sum(x**2))))
    # loader.names is indexed in step with loader.datasets.
    print(f"<<{loader.names[i]} -- [{AUAC:.3f}; {VL2:.3f}]", end='>> ')

Failed to load 10-Mar-2022 large nozzle mosaic: 'DataFrame' object has no attribute 'note'
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\29-Mar-2022 lg 1cm 300 points	offset 0	corr 0.4960008364961841
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\2-May-2022__run 1	offset 7	corr 0.20797462285908108
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\2-May-2022__run 2	offset 4	corr 0.5656140913387249
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\23-May-2022_squares	offset 8	corr 0.5137128998522692
dataset C:\Dropbox\SPEED\Self Driving EHD\Datasets\24-May-2022 large harmonics	offset 0	corr 0.5869081626562916
Datasets Loaded!
>> Quick correlation validation check -- [auac; vec L2] <<
<<29-Mar-2022 lg 1cm 300 points -- [0.496; 0.650]>> <<2-May-2022__run 1 -- [0.208; 0.239]>> <<2-May-2022__run 2 -- [0.566; 0.748]>> <<23-May-2022_squares -- [0.514; 0.630]>> <<24-May-2022 large harmonics -- [0.587; 0.764]>> 

In [21]:
from ehd_models import EHD_Model


# Print Area Regression Models <<<<<<<<<<<<<<<<<<<<
# Each key is an architecture name understood by EHD_Model; the value is the
# params dict passed to its constructor.
model_architectures = {
    'MLE': {},
    'cold_RF': {},
    'cold_MLP': {'max_iter': 100_000},
}

# One evaluation result per (architecture, fold), in evaluation order.
reg_results = []
for architecture, params in model_architectures.items():
    print(f'\nEvaluating regression model type: {architecture}')
    model = EHD_Model(architecture=architecture, params=params)

    # TODO loader be smart about folds based on filters
    n_folds = loader.num_folds(model.filters)
    for dataset_fold in range(n_folds):
        print(f"fold {dataset_fold}", end=" ")

        # The loader returns the pretraining data, the evaluation data and
        # the name of the dataset being evaluated for this fold.
        pretrain_set, eval_set, eval_name = loader.folded_dataset(
            fold=dataset_fold,
            xtype=model.xtype,
            ytype=model.ytype,
            pretrain=model.pretrainer,
            filters=model.filters,
        )

        # Only models that declare a pretrainer are pretrained.
        if model.pretrainer:
            model.pretrain(pretrain_set)

        # Tag the result with its provenance before collecting it.
        output = model.evaluate(eval_set)
        output['architecture'] = architecture
        output['eval_dataset'] = eval_name
        reg_results.append(output)


Evaluating regression model type: MLE
fold 0 fold 1 fold 2 fold 3 fold 4 
Evaluating regression model type: cold_RF
fold 0 fold 1 fold 2 fold 3 fold 4 
Evaluating regression model type: cold_MLP
fold 0 



fold 1 



fold 2 



fold 3 



fold 4 




Evaluating classification model type: MLE_class
fold 0 fold 1 fold 2 fold 3 fold 4 
Evaluating classification model type: cold_RF_class
fold 0 fold 1 fold 2 fold 3 fold 4 
Evaluating classification model type: cold_MLP_class


KeyError: 'max_iter'

In [24]:
# Jetting Classification Models <<<<<<<<<<<<<<<<<<<<
# Each key is an architecture name understood by EHD_Model; the value is the
# params dict passed to its constructor.
model_architectures = {
    'MLE_class': {},
    'cold_RF_class': {},
    'cold_MLP_class': {'max_iter': 100_000},
}

# One evaluation result per (architecture, fold), in evaluation order.
class_results = []
for architecture, params in model_architectures.items():
    print(f'\nEvaluating classification model type: {architecture}')
    model = EHD_Model(architecture=architecture, params=params)

    n_folds = loader.num_folds(model.filters)
    for dataset_fold in range(n_folds):
        print(f"fold {dataset_fold}", end=" ")

        # The loader returns the pretraining data, the evaluation data and
        # the name of the dataset being evaluated for this fold.
        pretrain_set, eval_set, eval_name = loader.folded_dataset(
            fold=dataset_fold,
            xtype=model.xtype,
            ytype=model.ytype,
            pretrain=model.pretrainer,
            filters=model.filters,
        )

        # NOTE(review): pretraining is unconditional here, whereas the
        # regression loop only pretrains when model.pretrainer is truthy --
        # confirm whether this difference is intentional.
        model.pretrain(pretrain_set)

        # Tag the result with its provenance before collecting it.
        output = model.evaluate(eval_set)
        output['architecture'] = architecture
        output['eval_dataset'] = eval_name
        class_results.append(output)


Evaluating classification model type: MLE_class
fold 0 fold 1 fold 2 fold 3 fold 4 
Evaluating classification model type: cold_RF_class
fold 0 fold 1 fold 2 fold 3 fold 4 
Evaluating classification model type: cold_MLP_class
fold 0 fold 1 fold 2 fold 3 fold 4 

In [34]:
import seaborn as sns

# Workbook that receives the evaluation summaries (path is relative to the
# notebook's working directory).
OUT_XLSX = 'eval_summary.xlsx'
# Shared writer: the statements further down this cell add one sheet for the
# regression results and one for the classification results, then finalize it.
xlsx_writer = pd.ExcelWriter(OUT_XLSX)

# Fixed y-axis ranges per metric; metrics not listed here keep matplotlib's
# automatic limits.
_Y_LIMITS = {
    'MSE': (0, 6e9),
    'MAE': (0, 100_000),
    'MAPE': (0, 100),
}


def n_walk_result_vis(df, x, y, trends, outfile=None, ci=95):
    """Plot a metric against dataset size, one line per trend group.

    Rows flagged in the boolean ``df['singleton']`` column are excluded from
    the plot. The x column is expected to hold log10 of the dataset size: the
    tick labels are recovered as ``round(10**x)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table; must contain ``x``, ``y``, ``trends`` and a boolean
        ``'singleton'`` column.
    x : str
        Column with the log10 dataset size (x axis).
    y : str
        Metric column (y axis). 'MSE', 'MAE' and 'MAPE' get fixed y limits.
    trends : str
        Column used for both hue and line style (one line per value).
    outfile : str or None
        If None the figure is shown at 10x8 in; otherwise it is saved to this
        path at 7x5 in and 500 dpi.
    ci : int or 'sd'
        Error-bar size forwarded to seaborn: a confidence level in percent
        (0-100), or 'sd' for standard deviation.

    Notes
    -----
    * The previous hard-coded ``ci=0.95`` was interpreted by seaborn as a
      0.95 % interval; the default is now a 95 % interval.
    * The figure is created with an explicit size and closed afterwards, so
      the output size no longer depends on ``plt.rcParams`` state left behind
      by earlier calls, and ``plt.rcParams`` is no longer modified.
    """
    shown = df.loc[~df['singleton']]

    # Size the figure up front: changing rcParams after plotting would only
    # affect figures created later, not this one.
    figsize = (10.0, 8.0) if outfile is None else (7.0, 5.0)
    fig, ax = plt.subplots(figsize=figsize)

    # `ci` is kept (rather than seaborn>=0.12's `errorbar=`) so the call works
    # on both older and newer seaborn releases.
    sns.lineplot(
        data=shown,
        x=x, y=y, hue=trends, style=trends,
        markers=True,
        err_style="bars", ci=ci,
        linewidth=0.3,
        ax=ax,
    )

    # Label the log-scaled positions with the dataset sizes they represent.
    ticks = shown[x].unique()
    ax.set_xticks(ticks)
    ax.set_xticklabels(np.round(10**ticks).astype(int))
    ax.set_xlabel('dataset size')

    if y in _Y_LIMITS:
        ax.set_ylim(*_Y_LIMITS[y])

    if outfile is None:
        plt.show()
    else:
        fig.savefig(outfile, bbox_inches='tight', pad_inches=0.1, transparent=False, dpi=500)
    # Release the figure so repeated calls neither reuse it nor accumulate memory.
    plt.close(fig)

def _summarise_results(results):
    """Concatenate per-fold results and add the columns used for plotting.

    Adds ``log_size`` (log10 of ``train_size``, rounded to 3 decimals) and
    ``singleton`` (True when a ``train_size`` value occurs in no more rows
    than there are distinct architectures).
    """
    summary = pd.concat(results, ignore_index=True)
    summary['log_size'] = np.log10(summary['train_size']).round(3)
    n_architectures = len(summary['architecture'].unique())
    # Count the rows sharing each train_size once, instead of rescanning the
    # whole column for every row.
    rows_per_size = summary['train_size'].map(summary['train_size'].value_counts())
    summary['singleton'] = rows_per_size <= n_architectures
    return summary


# Expand, log, and vis regression results
df = _summarise_results(reg_results)
df['MAPE'] = df['MAPE'] * 100  # scale by 100 (presumably fraction -> percent; plotted on a 0-100 axis)

for metric in ['MAE', 'MAPE', 'MSE', 'r']:  # MSLE left out for now
    n_walk_result_vis(df, x='log_size', y=metric, trends='architecture', outfile=f"{metric}.png")

df.to_excel(xlsx_writer, index=False, sheet_name='Regression')

# Expand, log, and vis classification results
df = _summarise_results(class_results)

df.to_excel(xlsx_writer, index=False, sheet_name='Classification')

for metric in ['F1', 'AUC']:
    n_walk_result_vis(df, x='log_size', y=metric, trends='architecture', outfile=f"{metric}.png")

# close() writes the workbook and releases the file handle; ExcelWriter.save()
# was deprecated in pandas 1.5 and removed in pandas 2.0.
xlsx_writer.close()

<Figure size 504x360 with 0 Axes>