<img align="right" width="400" src="https://www.fhnw.ch/de/++theme++web16theme/assets/media/img/fachhochschule-nordwestschweiz-fhnw-logo.svg" alt="FHNW Logo">


# Keras Hyperparameter Tuning

by Fabian Märki

## Summary
The aim of this notebook is to tune the hyperparameters of a Keras model. Because the model is built from a parameter dictionary, the same setup could also be used to search for a good model design (number of layers etc.).


## Alternatives
- [KerasTuner](https://keras.io/keras_tuner/)
- [AutoKeras](https://autokeras.com/) (allows for AutoML)

<a href="https://colab.research.google.com/github/markif/2021_HS_DAS_NLP_Notebooks/blob/master/04_b_Keras_Hyperparameter_Tuning.ipynb">
  <img align="left" src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
!pip install 'fhnw-nlp-utils>=0.1.6,<0.2'

from fhnw.nlp.utils.storage import load_dataframe
from fhnw.nlp.utils.storage import download
from fhnw.nlp.utils.colab import runs_on_colab

import numpy as np
import pandas as pd

import tensorflow as tf

print("Tensorflow version:", tf.__version__)

# Query the visible GPUs once and enable memory growth on each of them
# (see the TensorFlow docs for `set_memory_growth`); the same list is reused
# for the availability message below instead of querying the devices again.
gpu_devices = tf.config.list_physical_devices("GPU")
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Tensorflow version: 2.5.1
GPU is available


In [2]:
%%time
# Download the tokenized German doctor reviews and load them as `data`.
download("https://drive.google.com/uc?id=19AFeVnOfX8WXU4_3rM7OFoNTWWog_sb_", "data/german_doctor_reviews_tokenized.parq")
data = load_dataframe("data/german_doctor_reviews_tokenized.parq")

# Same for the augmented reviews, kept separately as `data_aug`.
download("https://drive.google.com/uc?id=1tT2dj70GLi2bJYg4j3g1MIglGXTDAugI", "data/german_doctor_reviews_augmented_tokenized.parq")
data_aug = load_dataframe("data/german_doctor_reviews_augmented_tokenized.parq")

CPU times: user 14.9 s, sys: 2.63 s, total: 17.5 s
Wall time: 9.13 s


In [3]:
# remove all neutral sentiments (keep only rows whose label is not "neutral")
data = data.loc[(data["label"] != "neutral")]
data.shape

(331187, 10)

In [4]:
data.head(3)

Unnamed: 0,text_original,rating,text,label,sentiment,token_clean,text_clean,token_lemma,token_stem,token_clean_stopwords
0,Ich bin franzose und bin seit ein paar Wochen ...,2.0,Ich bin franzose und bin seit ein paar Wochen ...,positive,1,"[ich, bin, franzose, und, bin, seit, ein, paar...",ich bin franzose und bin seit ein paar wochen ...,"[franzose, seit, paar, wochen, muenchen, zahn,...","[franzos, seit, paar, woch, muench, ., zahn, s...","[franzose, seit, paar, wochen, muenchen, ., za..."
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,Dieser Arzt ist das unmöglichste was mir in me...,negative,-1,"[dieser, arzt, ist, das, unmöglichste, was, mi...",dieser arzt ist das unmöglichste was mir in me...,"[arzt, unmöglichste, leben, je, begegnen, unfr...","[arzt, unmog, leb, je, begegnet, unfreund, ,, ...","[arzt, unmöglichste, leben, je, begegnet, unfr..."
2,Hatte akute Beschwerden am Rücken. Herr Magura...,1.0,Hatte akute Beschwerden am Rücken. Herr Magura...,positive,1,"[hatte, akute, beschwerden, am, rücken, ., her...",hatte akute beschwerden am rücken . herr magur...,"[akut, beschwerden, rücken, magura, erste, arz...","[akut, beschwerd, ruck, ., magura, erst, arzt,...","[akute, beschwerden, rücken, ., magura, erste,..."


In [5]:
data_aug.head(3)

Unnamed: 0,text_original,rating,sentiment,text,label,token_clean,text_clean,token_lemma,token_stem,token_clean_stopwords
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,-1,Dieser Arzt ist das unmöglichste was mir in me...,negative,"[dieser, arzt, ist, das, unmöglichste, was, mi...",dieser arzt ist das unmöglichste was mir in me...,"[arzt, unmöglichste, leben, je, begegnen, unfr...","[arzt, unmog, leb, je, begegnet, unfreund, ,, ...","[arzt, unmöglichste, leben, je, begegnet, unfr..."
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,-1,Dieser Arzt ist das unmöglichste was mir in me...,negative,"[dieser, arzt, ist, das, unmöglichste, was, mi...",dieser arzt ist das unmöglichste was mir in me...,"[arzt, unmöglichste, leben, je, begegnen, unfr...","[arzt, unmog, leb, je, begegnet, unfreund, ,, ...","[arzt, unmöglichste, leben, je, begegnet, unfr..."
1,Dieser Arzt ist das unmöglichste was mir in me...,6.0,-1,Dieser Arzt ist das unmöglichste was mir in me...,negative,"[dieser, arzt, ist, das, unmöglichste, was, mi...",dieser arzt ist das unmöglichste was mir in me...,"[arzt, unmöglichste, leben, je, begegnen, unfr...","[arzt, unmog, leb, je, begegnet, unfreund, ,, ...","[arzt, unmöglichste, leben, je, begegnet, unfr..."


In [6]:
from fhnw.nlp.utils.ploting import plot_history

from fhnw.nlp.utils.params import build_model_cnn
from fhnw.nlp.utils.params import build_model_rnn
from fhnw.nlp.utils.params import compile_model
from fhnw.nlp.utils.params import create_label_binarizer_and_set
from fhnw.nlp.utils.params import dataframe_to_dataset
from fhnw.nlp.utils.params import extract_embedding_layer_and_set
from fhnw.nlp.utils.params import extract_text_vectorization_and_set
from fhnw.nlp.utils.params import extract_vocabulary_and_set
from fhnw.nlp.utils.params import get_train_test_split
from fhnw.nlp.utils.params import re_compile_model
from fhnw.nlp.utils.params import save_model
from fhnw.nlp.utils.params import train_model

In [7]:
# Central configuration dictionary that is passed to the fhnw.nlp.utils helpers
# and to the model building functions below.
params = {
    "verbose": True,
    "shuffle": True,
    "batch_size": 128,
    "X_column_name": "text_clean",
    "y_column_name": "label",
    "embedding_type": "fasttext",
    #"embedding_type": "bytepair",
    "embedding_dim": 300,
    "embedding_mask_zero": True,
    "embedding_trainable": False,
    #"embedding_input_sequence_length": output_sequence_length if 'output_sequence_length' in locals() or 'output_sequence_length' in globals() else None,
    "embedding_fasttext_model": "cc.de.300.bin",
    "embedding_word2vec_model_url": "https://cloud.devmount.de/d2bc5672c523b086/german.model",
    "embedding_spacy_model": "de_core_news_md",
    "embedding_tensorflow_hub_url": "https://tfhub.dev/google/nnlm-de-dim128-with-normalization/2",
    "model_type": "cnn",
}

if runs_on_colab() and params["embedding_type"] == "fasttext":
    # Colab has problems handling such large files, so switch to the
    # 50-dimensional fastText model and adjust the embedding dimension to match
    model_name = "cc.de.50.bin"
    download("https://drive.google.com/uc?id=1iqw8UPEEVmzQQGmI5FkRJH8B5SkZCgXG", model_name)
    params["embedding_dim"] = 50
    params["embedding_fasttext_model"] = model_name
    

In [8]:
all_data = pd.concat([data, data_aug])
#all_data = data

In [9]:
create_label_binarizer_and_set(params, all_data)
extract_vocabulary_and_set(params, all_data)

Inferred classification type: binary
Median sequence length: 58
Percentil (0.98) cutoff sequence length: 301
Max sequence length: 408
Used embedding sequence length: 301


In [10]:
extract_text_vectorization_and_set(params)

Vocabulary length: 205493


In [11]:
%%time

extract_embedding_layer_and_set(params)



Number of null word embeddings:  14
Words not found in total:  14
Words without embedding (14/14): ['öä', 'jé', '``', 'öz', "'t", "''", 'èn', 'ìn', 'äk', 'úm', 'üw', "'s", '', 'wã']
CPU times: user 6.81 s, sys: 4.58 s, total: 11.4 s
Wall time: 14.7 s


Prepare the training/test/validation data

In [12]:
# Split the original reviews into a training and a test part; the augmented rows
# are appended to the training part only, giving the frame used for tuning.
# NOTE(review): data_aug is appended in full — if it contains augmented variants
# of reviews that ended up in data_test, test information leaks into the
# tuning/training data. Confirm how data_aug was generated.
data_train, data_test = get_train_test_split(params, data)
data_cross_validate = pd.concat([data_train, data_aug])


264949 train examples
66238 test examples


In [13]:
!pip install install git+https://github.com/scikit-optimize/scikit-optimize.git

Collecting git+https://github.com/scikit-optimize/scikit-optimize.git
  Cloning https://github.com/scikit-optimize/scikit-optimize.git to /tmp/pip-req-build-0nb68njf
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25hCollecting install
  Downloading install-1.3.4-py3-none-any.whl (3.1 kB)
Collecting pyaml>=16.9
  Using cached pyaml-21.8.3-py2.py3-none-any.whl (17 kB)
Building wheels for collected packages: scikit-optimize
  Building wheel for scikit-optimize (PEP 517) ... [?25ldone
[?25h  Created wheel for scikit-optimize: filename=scikit_optimize-0.9.dev0-py2.py3-none-any.whl size=100309 sha256=c1e41913a24d3773b2fbb8e743eea236851220ce75677a8d454f3ec65455e429
  Stored in directory: /tmp/pip-ephem-wheel-cache-itq340kq/wheels/a6/18/3a/f5a8100b6f43b1c60878a02393d97b4fe52e987aec42c1001c
Successfully built scikit-optimize
Installing collected packages: install, pyaml, scikit-opti

In [14]:
from skopt.space import Real, Categorical, Integer

# Search space for the Bayesian optimisation: both values are sampled
# log-uniformly from [0.0001, 0.01]. The keys match the keyword arguments of
# build_model_parameter defined below.
parameter = {
    "learning_rate": Real(0.0001, 0.01, prior="log-uniform"),
    "learning_rate_decay": Real(0.0001, 0.01, prior="log-uniform"),
}

In [15]:
def build_model_parameter(learning_rate=0.01, learning_rate_decay= 0.01): 
    """Build and compile a model for one hyperparameter candidate.

    The keyword arguments are the values proposed by the search. They are
    merged with the notebook-wide ``params`` dictionary and the result is
    handed to ``build_model``.

    The candidate values take precedence over entries of the same name in
    ``params``. Previously ``params`` was merged last, so a ``learning_rate``
    already stored in ``params`` (e.g. after ``params.update(opt.best_params_)``)
    silently replaced every value proposed by the search.

    Args:
        learning_rate: learning rate candidate.
        learning_rate_decay: learning rate decay candidate.

    Returns:
        Whatever ``build_model`` returns for the merged parameters.
    """
    # capture the function arguments before any other local variable exists
    hyper_params = dict(vars())

    # start from a copy of the global configuration and let the candidates win
    loc_params = dict(params)
    loc_params.update(hyper_params)

    return build_model(loc_params)


def build_model(params):
    """Create the model selected by ``params["model_type"]`` and compile it.

    Args:
        params: configuration dictionary; ``"model_type"`` may be ``"cnn"``
            (the default when the key is missing) or ``"rnn"``.

    Returns:
        The model returned by the matching builder, after ``compile_model``
        has been called on it.

    Raises:
        TypeError: if ``model_type`` is neither ``"cnn"`` nor ``"rnn"``.
    """
    from fhnw.nlp.utils.params import build_model_cnn
    from fhnw.nlp.utils.params import build_model_rnn
    from fhnw.nlp.utils.params import compile_model

    requested_type = params.get("model_type", "cnn")

    # reject unsupported types up front
    if requested_type not in ("cnn", "rnn"):
        raise TypeError("Unknown model_type "+ requested_type)

    builder = build_model_cnn if requested_type == "cnn" else build_model_rnn
    network = builder(params)

    compile_model(params, network)
    return network

In [None]:
from sklearn.metrics import f1_score, make_scorer

# Scorer used by the hyperparameter search: f1_score with average='weighted'.
scoring_function = make_scorer(f1_score, average='weighted')

In [None]:
import numpy as np
import tensorflow.keras
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV


# fix random seeds for reproducibility: NumPy, TensorFlow (weight
# initialisation, shuffling) and the sampler of the Bayesian search itself
seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)
training_epochs = params.get("training_epochs", 5)
num_folds = params.get("cross_validation_num_folds", 3)
batch_size = params.get("batch_size", 128)

# scikit-learn compatible wrapper; build_model_parameter receives the sampled
# hyperparameters as keyword arguments
model = KerasClassifier(build_fn=build_model_parameter, epochs=training_epochs, batch_size=batch_size, verbose=2)

opt = BayesSearchCV(
    model,
    # (parameter space, # of evaluations)
    [(parameter, 15)],
    scoring=scoring_function,
    #n_jobs=-1, 
    cv=StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42),
    random_state=seed,
    verbose=2
)

In [None]:
%%time

# Names of the input (text) and target columns as configured in params.
X_column_name = params.get("X_column_name", "text_clean")
y_column_name = params.get("y_column_name", "sentiment")

# Search on the training split plus the augmented rows (data_cross_validate).
X_train = data_cross_validate[X_column_name]
y_train = data_cross_validate[y_column_name]
# Alternative: use the un-augmented `data` frame instead.
#X_train = data[X_column_name]
#y_train = data[y_column_name]

# Run the hyperparameter search; results are read from `opt` afterwards,
# the return value of fit() is discarded.
_ = opt.fit(X_train, y_train)

In [None]:
print(opt.best_score_)
print(opt.best_params_)

In [None]:
import matplotlib.pyplot as plt
from skopt.plots import plot_objective

_ = plot_objective(opt.optimizer_results_[0])
plt.show()

In [None]:
# Merge the best hyperparameters found by the search into the shared params
# dictionary (in place), then build a fresh model from them.
params.update(opt.best_params_)

model = build_model(params)
model.summary()

# NOTE(review): build_model already calls compile_model on the model it
# returns, so this second compile looks redundant — confirm before removing.
compile_model(params, model)

In [None]:
# Split the tuning frame (training split + augmented rows) into a training and
# a validation part. Note that this rebinds `data_train` from the earlier split.
params["batch_size"] = 128
data_train, data_val = get_train_test_split(params, data_cross_validate)
#data_train, data_val = get_train_test_split(params, data)

dataset_train = dataframe_to_dataset(params, data_train)
dataset_val = dataframe_to_dataset(params, data_val)

# Train the tuned model; the epoch count is set in params before calling
# train_model (presumably read there — verify against fhnw.nlp.utils.params).
params["training_epochs"] = 10
history = train_model(params, model, dataset_train, dataset_val)

Let's see if it is possible to further improve the model by unfreezing the embedding layer.

Also see https://www.tensorflow.org/guide/keras/transfer_learning

In [None]:
# Unfreeze all layers (i.e. make embeddings trainable)
model.trainable = True
model.summary()

In [None]:
re_compile_model(params, model)

In [None]:
params["training_epochs"] = 2
history = train_model(params, model, dataset_train, dataset_val)

Check performance on all data.

In [None]:
%%time

from fhnw.nlp.utils.ploting import report_classification_results

report_classification_results(params, data, model)

Check performance on test data.

In [None]:
%%time

from fhnw.nlp.utils.ploting import report_classification_results

report_classification_results(params, data_test, model)

In [None]:
!pip install ax-platform

In [None]:
from ax.service.ax_client import AxClient
from ax.utils.notebook.plotting import render, init_notebook_plotting

init_notebook_plotting()

In [None]:
def extend_dict(to_extend, other):
    """Add the entries of ``other`` to ``to_extend`` without overwriting.

    Keys that already exist in ``to_extend`` keep their current value; only
    missing keys are copied over. ``to_extend`` is modified in place.

    Args:
        to_extend: dictionary that receives the missing entries.
        other: dictionary providing the fallback entries.

    Returns:
        The same ``to_extend`` object that was passed in.
    """
    for name, entry in other.items():
        # setdefault only inserts when the key is not present yet
        to_extend.setdefault(name, entry)

    return to_extend

def get_model_checkpoint_path(params):
    """Create a time-stamped checkpoint directory and remember it in ``params``.

    The directory is ``./checkpoints/<model_type>/<YYYY-mm-dd_HH-MM-SS>``, where
    ``model_type`` comes from ``params`` (``"unknown"`` if the key is missing).
    The directory is created if necessary and its path is stored under
    ``params["model_path"]``.

    Args:
        params: configuration dictionary; modified in place.

    Returns:
        The path of the created directory.
    """
    import os
    from datetime import datetime

    kind = params.get("model_type", "unknown")
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    checkpoint_dir = os.path.join(".", "checkpoints", kind, timestamp)
    os.makedirs(checkpoint_dir, exist_ok=True)

    params["model_path"] = checkpoint_dir
    return checkpoint_dir

def get_storable_params(params):
    """Return the subset of ``params`` that can be stored as compact JSON.

    An entry is kept when ``{key: value}`` can be serialised with
    ``json.dumps`` and the resulting string is shorter than 1000 characters.
    Entries that are not JSON serialisable (e.g. model or layer objects) or
    that are too large are skipped.

    Args:
        params: configuration dictionary (not modified).

    Returns:
        A new dictionary holding only the storable entries.
    """
    # json was previously not imported in this scope: the resulting NameError
    # was swallowed by a bare except and the function always returned {}
    import json

    store_params = {}

    for key, value in params.items():
        try:
            json_str = json.dumps({key: value})
        except (TypeError, ValueError, RecursionError):
            # not serialisable (unsupported type, circular reference, ...)
            continue

        if len(json_str) < 1000:
            store_params[key] = value

    return store_params

def store_params(params):
    """Write the storable part of ``params`` to ``params.json``.

    The file is placed in the directory given by ``params["model_path"]``
    (created if it does not exist) and contains the result of
    ``get_storable_params(params)`` as JSON with an indentation of 2.

    Args:
        params: configuration dictionary; must contain ``"model_path"``.
    """
    import json
    import os

    target_dir = params["model_path"]
    os.makedirs(target_dir, exist_ok=True)

    params_file = os.path.join(target_dir, "params.json")
    with open(params_file, "w") as handle:
        json.dump(get_storable_params(params), handle, indent=2)

def get_fold_splits(params, data):
    """Compute stratified k-fold splits and remember them in ``params``.

    The splits are produced by ``StratifiedKFold`` (shuffled, fixed
    ``random_state=42``), stratified on the target column.

    Args:
        params: configuration dictionary. Reads
            ``"cross_validation_num_folds"`` (default 3), ``"X_column_name"``
            (default ``"text_clean"``), ``"y_column_name"`` (default
            ``"sentiment"``) and ``"computed_objects_column_name"`` (default
            ``"computed_objects"``). The folds are stored under
            ``params[<computed objects key>]["cross_validation_fold_indices"]``.
        data: frame providing the input and target columns.

    Returns:
        A list with one dictionary per fold, each holding the index arrays
        ``"train_index"`` and ``"val_index"`` as returned by
        ``StratifiedKFold.split``.
    """
    from sklearn.model_selection import StratifiedKFold

    num_folds = params.get("cross_validation_num_folds", 3)
    X_column_name = params.get("X_column_name", "text_clean")
    y_column_name = params.get("y_column_name", "sentiment")
    computed_objects_column_name = params.get("computed_objects_column_name", "computed_objects")

    kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    folds = [
        {"train_index": train_index, "val_index": val_index}
        for train_index, val_index in kfold.split(data[X_column_name], data[y_column_name])
    ]

    # create the container for computed objects on demand instead of failing
    # with a KeyError when no earlier step has set it up
    params.setdefault(computed_objects_column_name, {})["cross_validation_fold_indices"] = folds

    return folds

def evaluate_score(histories, eval_function_name):
    """Summarise the final epochs of several training runs.

    For every history the last ``n`` values of the metric
    ``eval_function_name`` are collected, where ``n`` is 30% of the number of
    trained epochs, but at least 3 and at most the number of epochs. The mean
    and standard deviation over all collected values are returned.

    The number of epochs is taken from the length of the metric series of the
    first history. (It used to be derived from the number of recorded metrics,
    which only matched by coincidence.) Histories of different length are
    supported.

    Args:
        histories: non-empty list of objects with a ``history`` attribute
            mapping metric names to per-epoch value lists (as returned by
            ``model.fit``).
        eval_function_name: name of the metric to evaluate, e.g. ``"val_loss"``.

    Returns:
        Tuple ``(mean, std)`` as floats. If the mean is NaN (the model did not
        converge), ``(9999.0, 0.0)`` is returned; note that this sentinel is
        only a "bad" score for metrics that are minimised.
    """
    # number of trained epochs == length of the metric series
    epochs = len(histories[0].history[eval_function_name])
    eval_last_n = min(max(int(0.3 * epochs), 3), epochs)

    # look at the last n epochs and get the mean and standard deviation of the validation score
    # (flattened, so runs with a different number of epochs do not break the array creation)
    last_n_scores = np.concatenate(
        [np.asarray(history.history[eval_function_name][-eval_last_n:], dtype=float) for history in histories]
    )
    mean = float(last_n_scores.mean())
    std = float(last_n_scores.std())

    # if the model did not converge then set a high loss.
    if np.isnan(mean):
        mean = 9999.0
        std = 0.0

    return mean, std

def build_model(params):    
    """Create the model selected by ``params["model_type"]`` and compile it.

    NOTE(review): a function with the same name and the same body is already
    defined earlier in this notebook; this definition shadows it. Consider
    keeping only one of them.

    Args:
        params: configuration dictionary; ``"model_type"`` may be ``"cnn"``
            (the default when the key is missing) or ``"rnn"``.

    Returns:
        The built model after ``compile_model`` has been called on it.

    Raises:
        TypeError: if ``model_type`` is neither ``"cnn"`` nor ``"rnn"``.
    """
    from fhnw.nlp.utils.params import compile_model
    from fhnw.nlp.utils.params import build_model_cnn
    from fhnw.nlp.utils.params import build_model_rnn
        
    model_type = params.get("model_type", "cnn")
    
    # dispatch to the builder that matches the requested architecture
    if model_type == "cnn":
        model = build_model_cnn(params)
    elif model_type == "rnn":
        model = build_model_rnn(params)
    else:
        raise TypeError("Unknown model_type "+ model_type)
        
    compile_model(params, model)
    
    return model

def cross_validate(params, data, fold_no, fold):
    """Train and validate a freshly built model on a single fold.

    A new checkpoint directory is created (and recorded in
    ``params["model_path"]``), the storable parameters are written next to it,
    and a model built by ``build_model`` is fitted on the training rows of the
    fold while being validated on its validation rows. A checkpoint is written
    to ``<checkpoint dir>/<fold_no>/model.<epoch>`` with ``save_best_only=True``
    monitoring ``val_loss``.

    Args:
        params: configuration dictionary; reads ``"verbose"`` (default False)
            and ``"training_epochs"`` (default 5).
        data: frame holding all rows; the fold indices are applied with ``iloc``.
        fold_no: number of the fold, used as checkpoint sub-directory name.
        fold: dictionary with the keys ``"train_index"`` and ``"val_index"``.

    Returns:
        The history object returned by ``model.fit``.
    """
    import os

    is_verbose = params.get("verbose", False)
    n_epochs = params.get("training_epochs", 5)
    ckpt_dir = get_model_checkpoint_path(params)
    store_params(params)

    # select the rows of this fold by position and convert them to datasets
    train_idx, val_idx = fold["train_index"], fold["val_index"]
    train_ds = dataframe_to_dataset(params, data.iloc[train_idx])
    val_ds = dataframe_to_dataset(params, data.iloc[val_idx])

    fold_model = build_model(params)
    if is_verbose:
        fold_model.summary()

    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(ckpt_dir, str(fold_no), "model.{epoch:02d}"),
        save_weights_only=False,
        monitor="val_loss",
        mode="min",
        save_best_only=True,
    )
    # optional extras that are currently disabled:
    #   tf.keras.callbacks.EarlyStopping(patience=2)
    #   tf.keras.callbacks.TensorBoard(log_dir=os.path.join(".", "logs"))
    fit_callbacks = [checkpoint_cb]

    fold_history = fold_model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=n_epochs,
        callbacks=fit_callbacks,
    )

    if is_verbose:
        plot_history(fold_history)

    return fold_history

def do_cross_validation(params, data):
    """Run ``cross_validate`` once per fold and collect the histories.

    The folds come from ``get_fold_splits`` and are processed one after the
    other in fold order. (Distributing the folds over a process pool with
    ``Pool.map`` would be the parallel alternative; it is not used here.)

    Args:
        params: configuration dictionary passed on to the helpers.
        data: frame holding all rows used for cross-validation.

    Returns:
        List with the history of every fold, in fold order.
    """
    fold_splits = get_fold_splits(params, data)

    return [
        cross_validate(params, data, fold_no, fold)
        for fold_no, fold in enumerate(fold_splits)
    ]
    

def evaluate(params, data, eval_function_name):
    """Cross-validate one parameter set and report its score.

    Args:
        params: configuration dictionary for this evaluation.
        data: frame holding all rows used for cross-validation.
        eval_function_name: name of the metric to evaluate, e.g. ``"val_loss"``.

    Returns:
        Dictionary ``{eval_function_name: (mean, std)}`` with the values
        computed by ``evaluate_score`` over the histories of all folds.
    """
    fold_histories = do_cross_validation(params, data)
    score_mean, score_std = evaluate_score(fold_histories, eval_function_name)

    return {eval_function_name: (score_mean, score_std)}

def hyperparameter_tuning(params, data):
    """Tune learning rate and learning rate decay with Ax.

    Both hyperparameters are searched on a log scale within [1e-5, 1e-1].
    Every trial proposed by the Ax client is completed with the values of
    ``params`` (proposed values take precedence, see ``extend_dict``) and
    scored with ``evaluate``.

    Args:
        params: configuration dictionary; reads ``"eval_function_name"``
            (default ``"val_loss"``), ``"eval_function_is_minimize"`` (default
            True) and ``"hyperparameter_search_trials"`` (default: the larger
            of 15 and five times the number of tuned hyperparameters).
        data: frame holding all rows used for cross-validation.

    Returns:
        The ``AxClient`` holding the finished experiment.
    """
    objective = params.get("eval_function_name", "val_loss")
    minimize_objective = params.get("eval_function_is_minimize", True)

    # both hyperparameters share the same range definition
    search_space = [
        {
            "name": hp_name,
            "type": "range",
            "bounds": [1e-5, 1e-1],
            "value_type": "float",
            "log_scale": True,
        }
        for hp_name in ("learning_rate", "learning_rate_decay")
    ]
    n_trials = params.get("hyperparameter_search_trials", max(15, 5 * len(search_space)))

    ax_client = AxClient(enforce_sequential_optimization=False)
    ax_client.create_experiment(
        name="hyperparameter_tuning",
        parameters=search_space,
        objective_name=objective,
        minimize=minimize_objective,
    )

    for _ in range(n_trials):
        trial_params, trial_index = ax_client.get_next_trial()
        # fill in everything that is not tuned from the shared configuration
        trial_params = extend_dict(trial_params, params)

        trial_result = evaluate(trial_params, data, objective)
        ax_client.complete_trial(trial_index=trial_index, raw_data=trial_result)

    return ax_client
    

In [None]:
%%time

ax_client = hyperparameter_tuning(params, data_cross_validate)

In [None]:
import os

# Persist the finished Ax experiment as JSON (the directory is created if needed).
path = "hp_tuning/german_doctor_reviews/"
os.makedirs(path, exist_ok=True)
ax_client.save_to_json_file(path+"keras_run_01.json")

In [None]:
ax_client.get_trials_data_frame()

In [None]:
best_parameters, values = ax_client.get_best_parameters()
best_parameters

In [None]:
means, covariances = values
means

In [None]:
from ax.utils.notebook.plotting import render
from ax.plot.contour import interact_contour

# Use the same metric name that hyperparameter_tuning used as objective.
eval_function_name = params.get("eval_function_name", "val_loss")

# Interactive contour plot of the tuned metric, based on the model of the
# Ax generation strategy.
render(interact_contour(model=ax_client.generation_strategy.model, metric_name=eval_function_name))