# Exploring multiple approaches to machine learning across multiple small EHD experimental datasets
 - The task is to predict (regress) the size of printed features, as a function of waveform inputs to the EHD printer
 - A second task is to predict (classify) whether a given waveform input will produce any printed pattern at all. (This is equivalent to understanding the print onset voltage threshold.)
 - Multiple hidden confounding variables are likely, such as ink, tip, substrate, and atmospheric conditions; ink and tip clogging; and the height of the print tip above the substrate. Some of these will be relatively constant within each run of experiments, while others will vary from feature to feature.
 - Ink dynamics at the printing tip are complex and nonlinear, with electrical and fluid/acoustic phenomena that interact with each other.

## Goals
 - In regression, aim for a mean absolute error below 3% of the predicted outcome
 - In classification, aim for >0.9 ROC AUC
 - For a new/test dataset, achieve both of these targets in <=100 experiments

In [1]:
import sys
import pickle

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

sys.path.append('..')
from ehd_dataset import EHD_Loader
from ehd_models import EHD_Model


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# Machine-specific absolute path to the compiled dataset pickle; edit it when
# running this notebook on another machine.
Dataset_Pkl = "C:/Dropbox/SPEED/Self Driving EHD/Datasets/compiled_data.pickle"

# Load up the dataset
loader = EHD_Loader(Dataset_Pkl)

In [9]:
# REGRESSION MODELS
# Configuration for the regression benchmark that the loop further down in
# this cell evaluates.
from ehd_models import EHD_Model

# Print Area Regression Models <<<<<<<<<<<<<<<<<<<<
# Maps an architecture name to the hyperparameter dict handed to EHD_Model.
# Only uncommented entries are evaluated on this run; results of the other
# architectures are whatever the cache file already holds.
REGRESSION_model_architectures = {
                                  # 'MLE': {},
                                  # 'cold_RF': {},
                                  # # 'only_pretrained_RF': {},  # RF_Regressor_Allpre
                                  # 'normed_RF': {
                                  #     'bootstrap': True,
                                  #     'max_depth': 49,
                                  #     'max_leaf_nodes': 42,
                                  #     'max_samples': 13,
                                  #     'min_samples_leaf': 1,
                                  #     'min_samples_split': 4,
                                  #     'n_estimators': 434,
                                  # },  # RF_Regressor_Allpre; xtype=normed_squares
                                  # 'normed_MLP': {
                                  #     'activation': 'relu',
                                  #     'alpha': 	4.105E-05,
                                  #     'batch_size': 184,
                                  #     'beta_1': 0.070320529,
                                  #     'beta_2': 0.000569048,
                                  #     'layer1_size': 36,
                                  #     'layer2_size': 36,
                                  #     'lr': 1.96866E-05,
                                  # },  # MLP_Regressor_Allpre; xtype=normed_squares
                                  # 'warm_MLP': {
                                  #     'activation': 'tanh',
                                  #     'alpha': 8.35434E-06,
                                  #     'batch_size': 143,
                                  #     'beta_1': 0.199856952,
                                  #     'beta_2': 0.000731856,
                                  #     'layer1_size': 152,
                                  #     'layer2_size': 51,
                                  #     'lr': 1.54152E-05,
                                  #     'retrain_alpha': 8.85273E-05,
                                  #     'retrain_lr': 0.006031221,
                                  # },
                                  # 'warm_RF': {
                                  #     'bootstrap': False,
                                  #     'max_depth': 87,
                                  #     'max_leaf_nodes': 43,
                                  #     'max_samples': 32,
                                  #     'min_samples_leaf': 1,
                                  #     'min_samples_split': 4,
                                  #     'n_estimators': 25,
                                  #     'new_estimators': 125,
                                  # },
                                  # 'reweight_RF_wide': {
                                  #     'bootstrap': True,
                                  #     'max_depth': 1,
                                  #     'max_leaf_nodes': 7,
                                  #     'max_samples': 23,
                                  #     'min_samples_leaf': 1,
                                  #     'min_samples_split': 7,
                                  #     'n_estimators': 616,
                                  #     'alpha': 0.048167146,
                                  # },
                                  # 'reweight_RF_tall': {
                                  #     'bootstrap': True,
                                  #     'max_depth': 68,
                                  #     'max_leaf_nodes': 78,
                                  #     'max_samples': 14,
                                  #     'min_samples_leaf': 1,
                                  #     'min_samples_split': 5,
                                  #     'n_estimators': 49,
                                  #     'alpha': 0.00290017,
                                  # },
                                  'scaling_MLP': {
                                      'activation': 'relu',
                                      'alpha': 1.55161E-05,
                                      'batch_size': 24,
                                      'beta_1': 0.059863626,
                                      'beta_2': 0.000977064,
                                      'final_alpha': 8.545432376,
                                      'layer1_size': 44,
                                      'layer2_size': 253,
                                      'lr': 2.07268E-05,
                                  },
                                  # 'v_normed_RF': {},  # RF_Regressor_Allpre; xtype=v_normed_squares
                                  # 'v_normed_Ridge': {},  # Ridge_Regressor_Allpre; xtype=v_normed_squares
                                  # 'v_normed_MLP': {},  # MLP_Regressor_Allpre; xtype=v_normed_squares
                                  # 'cold_MLP': {'max_iter': 100_000},
                                  # 'lastXY_MLP': {'max_iter': 200_000},
                                  # 'lastXY_RF': {}
                                 }
# True: start from empty result lists instead of loading CACHE_NAME.
NEW_CACHE = False
# True: re-evaluate architectures that already have cached results and replace them.
OVERWRITE_RESULTS = True

# Passed to loader.folded_dataset as xtype / ytype.
XTYPE = "normed_squares"  # vector, normed_squares, v_normed_squares
YTYPE = "max_width"
# Each entry is a (column name, function, value) triple handed to the loader.
# NOTE(review): presumably a row is kept when function(row[column]) == value;
# confirm against EHD_Loader.
FILTERS = [
           ('vector',  lambda x: len(x), 2),
           ('Wavegen', lambda x: x, 'square'),
           ('V Thresh [V] @ .5s', np.isnan, False),
           ('SIJ Tip', lambda x: x, 'Std'),
           ('jetted',  lambda x: x, True),
           ('clogging', lambda x: x, False)
          ]

# Pickle file holding the (reg_results, reg_names) tuple between runs.
CACHE_NAME = 'regression.cache'

# Load previously cached per-fold results unless a fresh cache was requested.
# reg_results and reg_names are parallel lists: reg_names[i] is the
# architecture that produced reg_results[i] (one entry per evaluated fold).
if NEW_CACHE:
    reg_results = []
    reg_names = []
else:
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # were written by this notebook.
    with open(CACHE_NAME, 'rb') as f:
        (reg_results, reg_names) = pickle.load(f)

for architecture, params in REGRESSION_model_architectures.items():
    already_cached = architecture in reg_names
    if already_cached and not OVERWRITE_RESULTS:
        print(f'{architecture} was already evaluated - loaded from cache')
        continue

    print('')
    if already_cached:
        print(f"Overwriting previous results from {architecture}")
        # Drop every cached fold of this architecture from both lists so they
        # stay aligned before the fresh results are appended.
        reg_results = [result for result, name in zip(reg_results, reg_names)
                       if name != architecture]
        reg_names = [name for name in reg_names if name != architecture]
    print(f'Evaluating regression model type: {architecture}')
    model = EHD_Model(architecture=architecture, params=params)

    for dataset_fold in range(loader.num_folds(FILTERS)):
        print(f"fold {dataset_fold}", end=" ")
        pretrain_set, eval_set, eval_name = loader.folded_dataset(fold=dataset_fold, xtype=XTYPE, ytype=YTYPE,
                                                                 pretrain=model.pretrainer, filters=FILTERS)
        # Only architectures that declare a pretrainer are pretrained.
        if model.pretrainer:
            print('pretrain...', end=' ')
            model.pretrain(pretrain_set)
            print('done', end=' ')

        output = model.evaluate(eval_set)

        # Tag the fold's results so they can be grouped after concatenation.
        output['architecture'] = architecture
        output['eval_dataset'] = eval_name
        reg_names.append(architecture)
        reg_results.append(output)

# Persist the merged (old + new) results for later runs and for the figure cell.
with open(CACHE_NAME, 'wb') as f:
    pickle.dump((reg_results, reg_names), f)


Evaluating regression model type: scaling_MLP
fold 0 pretrain... done fold 1 pretrain... done fold 2 pretrain... done fold 3 pretrain... done 

In [7]:
# JETTING CLASSIFICATION MODELS
# Configuration for the jetting classification benchmark that the loop further
# down in this cell evaluates.
from ehd_models import EHD_Model

# Jetting Classification Models <<<<<<<<<<<<<<<<<<<<
# Maps an architecture name to the hyperparameter dict handed to EHD_Model.
# Commented-out entries are not evaluated on this run.
CLASS_model_architectures = {
                             'MLE_class': {},
                             'cold_RF_class': {},
                             # 'only_pretrained_RF_class': {},  # RF_Classifier_Allpre
                             'normed_RF_class': {},  # RF_Classifier_Allpre; xtype=normed_squares
                             'normed_MLP_class': {
                                 'activation': 'tanh',
                                 'alpha': 5.98012E-05,
                                 'batch_size': 145,
                                 'beta_1': 0.075248788,
                                 'beta_2': 0.001263418,
                                 'layer1_size': 78,
                                 'layer2_size': 298,
                                 'lr': 4.97715E-05,
                             },  # RF_Classifier_Allpre; xtype=normed_squares -- NOTE(review): says RF on the MLP entry; looks copy-pasted, confirm
                             # 'v_normed_RF_class': {},  # RF_Classifier_Allpre; xtype=v_normed_squares
                             # 'cold_MLP_class': {'max_iter': 100_000},
                             # 'lastXY_MLP_class': {'max_iter': 200_000},
                             # 'lastXY_RF_class': {}
                            }
# True: start from empty result lists instead of loading CACHE_NAME.
NEW_CACHE = False
# True: re-evaluate architectures that already have cached results and replace them.
OVERWRITE_RESULTS = False

# Passed to loader.folded_dataset as xtype / ytype.
XTYPE = "normed_squares"  # vector, normed_squares, v_normed_squares
YTYPE = "jetted"
# Each entry is a (column name, function, value) triple handed to the loader.
# Unlike the regression cell, there is no 'jetted' filter here: 'jetted' is the
# prediction target (YTYPE).
# NOTE(review): presumably a row is kept when function(row[column]) == value;
# confirm against EHD_Loader.
FILTERS = [
           ('vector', lambda x: len(x), 2),
           ('Wavegen', lambda x: x, 'square'),
           ('V Thresh [V] @ .5s', np.isnan, False),
           ('SIJ Tip', lambda x: x, 'Std'),
           ('clogging', lambda x: x, False)
          ]

# Pickle file holding the (class_results, class_names) tuple between runs.
CACHE_NAME = 'classification.cache'

# Load previously cached per-fold results unless a fresh cache was requested.
# class_results and class_names are parallel lists: class_names[i] is the
# architecture that produced class_results[i] (one entry per evaluated fold).
if NEW_CACHE:
    class_results = []
    class_names = []
else:
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # were written by this notebook.
    with open(CACHE_NAME, 'rb') as f:
        (class_results, class_names) = pickle.load(f)

for architecture, params in CLASS_model_architectures.items():
    already_cached = architecture in class_names
    if already_cached and not OVERWRITE_RESULTS:
        print(f'{architecture} was already evaluated - loaded from cache')
        continue

    if already_cached:
        print(f"Overwriting previous results from {architecture}")
        # Drop every cached fold of this architecture from both lists so they
        # stay aligned before the fresh results are appended.
        class_results = [result for result, name in zip(class_results, class_names)
                         if name != architecture]
        class_names = [name for name in class_names if name != architecture]
    print(f'\nEvaluating classification model type: {architecture}')
    model = EHD_Model(architecture=architecture, params=params)

    for dataset_fold in range(loader.num_folds(FILTERS)):
        print(f"fold {dataset_fold}", end=" ")
        pretrain_set, eval_set, eval_name = loader.folded_dataset(fold=dataset_fold, xtype=XTYPE, ytype=YTYPE,
                                                                 pretrain=model.pretrainer, filters=FILTERS)
        # Pretrain only when the architecture declares a pretrainer, matching
        # the guard used by the regression evaluation loop.
        if model.pretrainer:
            model.pretrain(pretrain_set)
        output = model.evaluate(eval_set)

        # Tag the fold's results so they can be grouped after concatenation.
        output['architecture'] = architecture
        output['eval_dataset'] = eval_name
        class_names.append(architecture)
        class_results.append(output)

# Persist the merged (old + new) results for later runs and for the figure cell.
with open(CACHE_NAME, 'wb') as f:
    pickle.dump((class_results, class_names), f)

Overwriting previous results from MLE_class

Evaluating classification model type: MLE_class
fold 0 fold 1 fold 2 fold 3 Overwriting previous results from cold_RF_class

Evaluating classification model type: cold_RF_class
fold 0 fold 1 fold 2 fold 3 Overwriting previous results from normed_RF_class

Evaluating classification model type: normed_RF_class
fold 0 fold 1 fold 2 fold 3 Overwriting previous results from normed_MLP_class

Evaluating classification model type: normed_MLP_class
fold 0 fold 1 fold 2 fold 3 

In [10]:
# CREATE FIG and SUMMARY FILES
import seaborn as sns

# Workbook that receives the 'Regression' and 'Classification' sheets written
# further down in this cell.
OUT_XLSX = 'eval_summary.xlsx'
xlsx_writer = pd.ExcelWriter(OUT_XLSX)
plt.rcParams['figure.figsize'] = (11.0, 10.0) # set default size of plots
# Clear the current figure before the plots below are drawn.
plt.clf()

def n_walk_result_vis(df, x, y, trends, ax=None, outfile=None, legend='auto', labels=None):
    """Plot metric `y` against `x` with one line per `trends` group.

    Rows flagged True in ``df['unique N']`` are excluded from both the plot and
    the x-tick positions. Ticks sit at the remaining unique `x` values and are
    labelled ``round(10**x)``, i.e. `x` is treated as a log10 value.

    Args:
        df: DataFrame with columns `x`, `y`, `trends` and a boolean 'unique N'.
        x: name of the column plotted on the x axis.
        y: name of the metric column; 'MSE', 'MAE', 'r', 'F1', 'AUC' and
            'MAPE' get a fixed y-range, any other name keeps autoscaling.
        trends: column used for both hue and line/marker style.
        ax: matplotlib Axes to draw on; defaults to the current Axes.
        outfile: if given, the current figure is saved to this path and cleared.
        legend: forwarded to ``sns.lineplot``; a falsy value disables the legend.
        labels: optional replacement legend labels, applied only when the
            legend is enabled.
    """
    # seaborn's `ci` is a percentage (or 'sd' for standard deviation), so the
    # intended 95% interval is 95; 0.95 would request a 0.95% interval.
    bar_ci = 95

    # sns.lineplot falls back to the current Axes when ax is None; resolve it
    # here so the ax.* calls below work with the default argument too.
    if ax is None:
        ax = plt.gca()

    shown = df.loc[~df['unique N']]
    sns.lineplot(
        data=shown,
        x=x, y=y, hue=trends, style=trends,
        markers=True,
        err_style="bars", ci=bar_ci,
        markersize=7,
        linewidth=1.8,
        ax=ax,
        legend=legend
    )
    ticks = shown[x].unique()
    ax.set_xticks(ticks=ticks,
                  labels=np.round(10**ticks).astype(int))
    ax.set_xlabel('dataset size')

    # Relabel only when a legend was requested (legend=False / None disable it).
    if legend and labels is not None:
        ax.legend(labels)

    # Fixed y-ranges keep panels of the same metric comparable across figures.
    y_limits = {
        'MSE': (0, 16e3),
        'MAE': (0, 40),
        'r': (0, 1),
        'F1': (0.49, 1),
        'AUC': (0.49, 1),
        'MAPE': (0, 100),
    }
    if y in y_limits:
        ax.set_ylim(*y_limits[y])

    if outfile is not None:
        plt.savefig(outfile, bbox_inches='tight', pad_inches=0.1, transparent=False, dpi=500)
        plt.clf()

# REGRESSION: Expand, log, and visualize results
df = pd.concat(reg_results, ignore_index=True)
# labels = ('MLE', 'Cold Random Forest', 'Pretrained Random Forest', 'normed MLE', 'normed RF', 'voltage-norm RF')
# Display names for the legend, listed in the same order as the architectures
# printed below.
labels = ('MLE', 'cold-start Random Forest', 'zero-shot Random Forest', 'zero-shot MLP', 'warm-start MLP', 'warm-start Random Forest', 'reweighed RF (wide)', 'reweighed RF (tall)', 'post-scaling MLP')
print(f"Regression architectures: {df.architecture.unique()}")
df['log_size'] = df['train_size'].apply(lambda x: np.log10(x).round(3))
# Convert MAPE from a fraction to a percentage.
df['MAPE'] = df['MAPE'].apply(lambda x: 100 * x)
# Flag training sizes that occur in at most one row per architecture; the
# plotting helper leaves those rows out.
n_architectures = len(df['architecture'].unique())
df['unique N'] = df['train_size'].apply(lambda x: (df['train_size'] == x).sum() <= n_architectures)

# 2x2 grid: regression metrics on the top row, classification on the bottom.
fig, ax = plt.subplots(2,2)

# Only the first panel of each row gets a (relabelled) legend.
legend = 'auto'
for i, metric in enumerate(['MAE', 'r']):
    n_walk_result_vis(df, x='log_size', y=metric, trends='architecture', ax=ax[0][i], legend=legend, labels=labels)
    labels = None
    legend = False

df.to_excel(xlsx_writer, index=False, sheet_name='Regression')


# CLASSIFICATION: Expand, log, and visualize results
# labels = ('MLE', 'Cold Random Forest', 'Pretrained Random Forest', 'normed MLE', 'normed RF', 'voltage-norm RF')
labels = ('MLE', 'cold start RF', 'zero-shot RF', 'zero-shot MLP')
df = pd.concat(class_results, ignore_index=True)
print(f"Classification architectures: {df.architecture.unique()}")
df['log_size'] = df['train_size'].apply(lambda x: np.log10(x).round(3))
n_architectures = len(df['architecture'].unique())
df['unique N'] = df['train_size'].apply(lambda x: (df['train_size'] == x).sum() <= n_architectures)

df.to_excel(xlsx_writer, index=False, sheet_name='Classification')

legend = 'auto'
for i, metric in enumerate(['AUC', 'F1']):
    n_walk_result_vis(df, x='log_size', y=metric, trends='architecture', ax=ax[1][i], legend=legend, labels=labels)
    labels = None
    legend = False


plt.savefig('4-up.png', bbox_inches='tight', pad_inches=0.1, transparent=False, dpi=300)
plt.clf()
# close() writes the workbook to disk. The former extra save() call was
# redundant, and ExcelWriter.save() no longer exists in pandas >= 2.0.
xlsx_writer.close()

Regression architectures: ['MLE' 'cold_RF' 'normed_RF' 'normed_MLP' 'warm_MLP' 'warm_RF'
 'reweight_RF_wide' 'reweight_RF_tall' 'scaling_MLP']
Classification architectures: ['MLE_class' 'cold_RF_class' 'normed_RF_class' 'normed_MLP_class']


<Figure size 1100x1000 with 0 Axes>

<Figure size 1100x1000 with 0 Axes>