# Hyperparameter Optimization In Ludwig



Demonstrates hyper-parameter tuning capabilities of Ludwig. The following steps occur in this notebook:
* Training data is prepared for use
* Programmatically create Ludwig model definition dictionary from the training data dataframe
* Setup parameter space for hyperparameter optimization
* Perform two hyperparameter runs
  * Parallel workers using random search strategy
  * Serial processing using random search strategy
  * Parallel workers using grid search strategy
* Convert results returned from hyperparameter optimization to a dataframe

## Import required libraries

In [1]:
import warnings
warnings.simplefilter('ignore')

import logging
import shutil
import tempfile
import datetime

import pandas as pd
import numpy as np

from ludwig.api import LudwigModel
from ludwig.utils.data_utils import load_json
from ludwig.utils.defaults import merge_with_defaults, ACCURACY
from ludwig.utils.tf_utils import get_available_gpus_cuda_string
from ludwig.visualize import learning_curves
from ludwig.hyperopt.execution import get_build_hyperopt_executor
from ludwig.hyperopt.sampling import (get_build_hyperopt_sampler)
from ludwig.hyperopt.utils import update_hyperopt_params_with_defaults

from sklearn.model_selection import train_test_split

## Retrieve data for training

In [2]:
train_df = pd.read_csv('./data/winequalityN.csv')
train_df.shape

(6497, 13)

## Standardize column names to replace spaces(" ") with underscore("_")

In [3]:
new_col = []
for i in range(len(train_df.columns)):
    new_col.append(train_df.columns[i].replace(' ', '_'))
    
train_df.columns = new_col


## Data Set Overview

In [4]:
train_df.dtypes

type                     object
fixed_acidity           float64
volatile_acidity        float64
citric_acid             float64
residual_sugar          float64
chlorides               float64
free_sulfur_dioxide     float64
total_sulfur_dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

## Create training and test data sets

In [5]:
train_df['quality'].value_counts().sort_index()

3      30
4     216
5    2138
6    2836
7    1079
8     193
9       5
Name: quality, dtype: int64

In [6]:
# isolate the predictor variables only
predictor_vars = list(set(train_df.columns) - set(['quality']))

#extract categorical variables
categorical_vars = []
for p in predictor_vars:
    if train_df[p].dtype == 'object':
        categorical_vars.append(p)
        
print("categorical variables:", categorical_vars,'\n')

# get numerical variables
numerical_vars = list(set(predictor_vars) - set(categorical_vars))

print("numerical variables:", numerical_vars,"\n")

categorical variables: ['type'] 

numerical variables: ['citric_acid', 'total_sulfur_dioxide', 'pH', 'residual_sugar', 'sulphates', 'alcohol', 'chlorides', 'free_sulfur_dioxide', 'density', 'fixed_acidity', 'volatile_acidity'] 



In [7]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed_acidity,6487.0,7.216579,1.29675,3.8,6.4,7.0,7.7,15.9
volatile_acidity,6489.0,0.339691,0.164649,0.08,0.23,0.29,0.4,1.58
citric_acid,6494.0,0.318722,0.145265,0.0,0.25,0.31,0.39,1.66
residual_sugar,6495.0,5.444326,4.758125,0.6,1.8,3.0,8.1,65.8
chlorides,6495.0,0.056042,0.035036,0.009,0.038,0.047,0.065,0.611
free_sulfur_dioxide,6497.0,30.525319,17.7494,1.0,17.0,29.0,41.0,289.0
total_sulfur_dioxide,6497.0,115.744574,56.521855,6.0,77.0,118.0,156.0,440.0
density,6497.0,0.994697,0.002999,0.98711,0.99234,0.99489,0.99699,1.03898
pH,6488.0,3.218395,0.160748,2.72,3.11,3.21,3.32,4.01
sulphates,6493.0,0.531215,0.148814,0.22,0.43,0.51,0.6,2.0


In [8]:
for p in categorical_vars:
    print("unique values for",p,"is",train_df[p].nunique())

unique values for type is 2


## Create model definition

In [9]:
# template for model definition
model_definition = {'input_features':[], 'output_features': [], 'training':{}}

# setup input features for categorical variables
for p in categorical_vars:
    a_feature = {'name': p.replace(' ','_'), 'type': 'category', 'representation': 'sparse'}
    model_definition['input_features'].append(a_feature)


# setup input features for numerical variables
for p in numerical_vars:
    a_feature = {'name': p.replace(' ','_'), 'type': 'numerical', 
                'preprocessing': {'missing_value_strategy': 'fill_with_mean', 'normalization': 'zscore'}}
    model_definition['input_features'].append(a_feature)

# set up output variable
model_definition['output_features'].append({'name': 'quality', 'type':'category'})

# set up training
model_definition['training'] = {'epochs': 20}

In [10]:
# View the model defintion
print("model definition:")
model_definition

model definition:


{'input_features': [{'name': 'type',
   'type': 'category',
   'representation': 'sparse'},
  {'name': 'citric_acid',
   'type': 'numerical',
   'preprocessing': {'missing_value_strategy': 'fill_with_mean',
    'normalization': 'zscore'}},
  {'name': 'total_sulfur_dioxide',
   'type': 'numerical',
   'preprocessing': {'missing_value_strategy': 'fill_with_mean',
    'normalization': 'zscore'}},
  {'name': 'pH',
   'type': 'numerical',
   'preprocessing': {'missing_value_strategy': 'fill_with_mean',
    'normalization': 'zscore'}},
  {'name': 'residual_sugar',
   'type': 'numerical',
   'preprocessing': {'missing_value_strategy': 'fill_with_mean',
    'normalization': 'zscore'}},
  {'name': 'sulphates',
   'type': 'numerical',
   'preprocessing': {'missing_value_strategy': 'fill_with_mean',
    'normalization': 'zscore'}},
  {'name': 'alcohol',
   'type': 'numerical',
   'preprocessing': {'missing_value_strategy': 'fill_with_mean',
    'normalization': 'zscore'}},
  {'name': 'chlorides',

## Define hyperparameter search space

In [11]:
SEED=13

HYPEROPT_CONFIG = {
    "parameters": {
        "training.learning_rate": {
            "type": "float",
            "low": 0.0001,
            "high": 0.01,
            "space": "log",
            "steps": 3,
        },
        "training.batch_size": {
            "type": "int",
            "low": 32,
            "high": 256,
            "space": "log",
            "steps": 5,
            "base" : 2
        },
        "quality.fc_size": {
            "type": "int",
            "low": 32,
            "high": 256,
            "steps": 5
        },
        "quality.num_fc_layers": {
            'type': 'int',
            'low': 1,
            'high': 5,
            'space': 'linear',
            'steps': 4
        }
    },
    "goal": "minimize",
    'output_feature': "quality",
    'validation_metrics': 'loss'
}

## Helper Function to run Hyperopt 

In [12]:
# function to run hyperparameter optimization run
def run_hyperopt_executor(sampler, executor, model_definition,
                      dataset):

    # update model definition with remaining defaults
    model_definition = merge_with_defaults(model_definition)

    # get copy of hyperparameter configuration parameters to optimize
    hyperopt_config = HYPEROPT_CONFIG.copy()

    # update with remaining defaults
    update_hyperopt_params_with_defaults(hyperopt_config)

    # Extract relevant parameters
    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    # setup sampler
    hyperopt_sampler = get_build_hyperopt_sampler(
        sampler["type"])(goal, parameters, **sampler)

    # setup executor
    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    # run hyperparameter executor
    hyperopt_results = hyperopt_executor.execute(model_definition,
                              dataset=dataset,
                              gpus=get_available_gpus_cuda_string())

    return hyperopt_results

# function to convert results from hyperopt run into a dataframe
def extract_row_data(hyperopt_result):
    row = hyperopt_result['parameters']
    row['metric_score'] = hyperopt_result['metric_score']
    return row

## Train with optimal hyperparameters on the whole data set

In [13]:
# clean out old results
shutil.rmtree('./results', ignore_errors=True)
shutil.rmtree('./visualizations', ignore_errors=True)

#### Random Search with 4 parallel executors

In [14]:
%%time
print("starting:", datetime.datetime.now())
random_parallel_results = run_hyperopt_executor(
    {'type': 'random', 'num_samples': 10},  # sampler
   {'type': 'parallel', 'num_workers': 4}, # executor
    model_definition,
    train_df.sample(4000, random_state=42)  # limit number records for demonstration purposes
)

starting: 2020-09-21 02:23:20.635976
CPU times: user 242 ms, sys: 104 ms, total: 346 ms
Wall time: 1min 50s


#### Random Search with serial executor

In [15]:
%%time
print("starting:", datetime.datetime.now())
random_serial_results = run_hyperopt_executor(
    {'type': 'random', 'num_samples': 10},  # sampler
    {'type': 'serial'},  #executor
    model_definition,
    train_df.sample(4000, random_state=42)  # limit number records for demonstration purposes
)

starting: 2020-09-21 02:25:11.252120
CPU times: user 1min 25s, sys: 17.4 s, total: 1min 43s
Wall time: 1min 32s


#### Grid Search with 4 parallel executors (takes about 35 minutes)

In [20]:
%%time
print("starting:", datetime.datetime.now())
grid_parallel_results = run_hyperopt_executor(
    {'type': 'grid'},  # sampler
    {'type': 'parallel', 'num_workers': 4}, # executor
    model_definition,
    train_df.sample(4000, random_state=42)  # limit number records for demonstration purposes
)

starting: 2020-09-21 02:37:59.433166
CPU times: user 2.16 s, sys: 697 ms, total: 2.86 s
Wall time: 37min 19s


### Note:
`random_parallel_results`, `random_serial_results` and `grid_parallel_results` are lists.  The first element in each list contains the best performing metric with the associated parameters.

## Convert hyperparameter optimization results to dataframe

#### Results For Random Search with 4 parallel executors

In [21]:
df = pd.DataFrame.from_dict([extract_row_data(result) for result in random_parallel_results])
print(df)

   quality.fc_size  quality.num_fc_layers  training.batch_size  \
0              224                      4                   44   
1              176                      5                   55   
2               70                      2                   38   
3              190                      4                   82   
4              182                      4                   51   
5              256                      2                  155   
6               92                      3                   34   
7               93                      1                  207   
8              102                      1                  208   
9              102                      3                  186   

   training.learning_rate  metric_score  
0                0.001996      1.038846  
1                0.000200      1.053650  
2                0.000170      1.053995  
3                0.000503      1.055694  
4                0.007774      1.058455  
5                0.00

#### Results for Random Search with serial executor

In [22]:
df2 = pd.DataFrame.from_dict([extract_row_data(result) for result in random_serial_results])
print(df2)

   quality.fc_size  quality.num_fc_layers  training.batch_size  \
0               80                      4                   38   
1              216                      3                   38   
2              139                      3                   39   
3               45                      3                   34   
4              168                      5                   72   
5              132                      4                   41   
6              249                      3                  177   
7              219                      1                   84   
8              194                      1                  122   
9               80                      2                  189   

   training.learning_rate  metric_score  
0                0.000193      1.028224  
1                0.004513      1.040447  
2                0.004429      1.070734  
3                0.009849      1.071095  
4                0.000172      1.076602  
5                0.00

#### Results for Grid Search with 4 parallel executors (takes about 35 minutes)

In [23]:
df3 = pd.DataFrame.from_dict([extract_row_data(result) for result in grid_parallel_results])
print(df3)

     quality.fc_size  quality.num_fc_layers  training.batch_size  \
0                 32                      5                  144   
1                144                      3                  144   
2                 32                      3                  144   
3                200                      3                   88   
4                 88                      5                   88   
..               ...                    ...                  ...   
295               32                      1                  200   
296               32                      5                  256   
297               32                      1                  256   
298               32                      2                  200   
299               32                      2                  256   

     training.learning_rate  metric_score  
0                    0.0100      1.029655  
1                    0.0100      1.041182  
2                    0.0100      1.041571  
3      