# Selecting the model architecture

In this notebook, we select the model to be further studied.

## Imports

In [1]:
import pandas as pd

import itertools

import talos as ta

from gensim.models import KeyedVectors

from paths import input_folder, output_folder, word_embeddings_folder

from helpers import create_embedding_matrix, tokens2index, texts2index_padded

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import gc

from keras.models import Model
from keras.optimizers import Adam, Nadam
from keras.layers import TimeDistributed, GlobalMaxPooling1D, GlobalAveragePooling1D, Activation, Input, LSTM, GRU, Dense, Dropout, Flatten, Embedding, SpatialDropout1D, Bidirectional, CuDNNGRU
from keras.layers.convolutional import Conv1D, MaxPooling1D, AveragePooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers import Flatten, concatenate
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, activations
from keras.layers.recurrent import Recurrent
from keras.engine import InputSpec
from keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau, CSVLogger


import tensorflow as tf
from tensorflow import set_random_seed
from keras import backend as K

# GPU-only setup: allocate GPU memory on demand with the BFC allocator and
# register the resulting session as the one Keras uses.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(allow_growth=True, allocator_type='BFC')
)
sess = tf.Session(config=config)
K.set_session(sess)

# Seed passed to every talos scan in this notebook.
random_seed = 102329

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)
Using TensorFlow backend.


## Load data

In [2]:
# Read the use-case dataset; the first CSV column becomes the index.
use_cases_df = pd.read_csv(f"{input_folder}use-cases.csv", index_col=0)
use_cases_df.head(5)

Unnamed: 0,ProjectID,UC,TransTypes,UCType,Cfp,TitleTokens
0,P01,UC2-1-1,C|D|R|U,C|D|R|U,16,manage faculties crud
1,P01,UC2-1-10,DL|L|R,L,27,assign science olympiads major specialty edit ...
2,P01,UC2-1-11,CS|R,CS,7,manage ranking algorithms
3,P01,UC2-1-13,C|D|R|U,C|D|R|U,17,manage exams crud
4,P01,UC2-1-14,DL|L|R,L,27,manage assignments exams majors specialties


Load the pre-trained word2vec embeddings from `SO_vectors_200.bin` (200-dimensional vectors)

In [3]:
# Load the pre-trained vectors from the binary word2vec file.
wv = KeyedVectors.load_word2vec_format(f"{word_embeddings_folder}SO_vectors_200.bin", binary=True)

## Prepare input data and embeddings matrix

Use-case names have already been tokenized and stored in the `TitleTokens` column as space-separated strings. We need to split them into lists of tokens.

In [4]:
# One list of tokens per use-case title (titles are space-separated strings).
tokenized_names = [title.split(" ") for title in use_cases_df['TitleTokens'].tolist()]

In [5]:
# Preview the first five tokenized titles.
tokenized_names[:5]

[['manage', 'faculties', 'crud'],
 ['assign', 'science', 'olympiads', 'major', 'specialty', 'edit', 'delete'],
 ['manage', 'ranking', 'algorithms'],
 ['manage', 'exams', 'crud'],
 ['manage', 'assignments', 'exams', 'majors', 'specialties']]

We will prune the word embeddings model so that it only contains words that appear in the dataset, which reduces memory usage. Since we do not train the embeddings, this does not affect the results in any way.

In [6]:
# Dataset vocabulary: every distinct token that occurs in any title.
unique_tokens = list({token for tokens in tokenized_names for token in tokens})
len(unique_tokens)

449

Let's prune the wv model:

In [7]:
# Prune the loaded KeyedVectors in place so that only dataset tokens remain.
# The vocabulary dict, the vector matrix and the index->word list are all
# trimmed and then re-indexed so they stay consistent with each other.
words_to_keep = unique_tokens
# Set lookup: the full vocabulary is scanned once, so membership must be O(1).
keep_lookup = set(words_to_keep)
words_to_trim = [w for w in wv.vocab.keys() if w not in keep_lookup]
ids_to_trim = [wv.vocab[w].index for w in words_to_trim]

for w in words_to_trim:
    del wv.vocab[w]

# Drop the rows of the removed words; the surviving rows keep their order.
wv.vectors = np.delete(wv.vectors, ids_to_trim, axis=0)
wv.init_sims(replace=True)

# Filter the index->word list in one pass instead of deleting entries one by
# one (repeated `del` on a list is quadratic in the vocabulary size).
trimmed_ids = set(ids_to_trim)
wv.index2word[:] = [w for i, w in enumerate(wv.index2word) if i not in trimmed_ids]

# Renumber the surviving words so vocab indices match the new row positions.
for i, word in enumerate(wv.index2word):
    wv.vocab[word].index = i

In [8]:
# Fixed length of every encoded title (passed to texts2index_padded).
MAX_SEQUENCE_LENGTH = 16
# Embedding dimensionality, taken from the loaded word vectors.
VECTOR_LENGTH = wv.vector_size

Create embeddings matrix:

In [9]:
# Build the embedding weight matrix from the pruned vectors using the project
# helper; the model functions below load it into a frozen Embedding layer.
embedding_matrix = create_embedding_matrix(wv, VECTOR_LENGTH)
embedding_matrix.shape

(442, 200)

Convert the use-case names to fixed-length lists of token identifiers from the vocabulary

In [10]:
# Encode each tokenized title as a sequence of vocabulary ids of length
# MAX_SEQUENCE_LENGTH (project helper; it reports tokens missing from wv).
use_case_names_ids = texts2index_padded(tokenized_names, wv, seq_length=MAX_SEQUENCE_LENGTH)

''s' is not in the vocabulary - 'modify candidate 's payments'
''s' is not in the vocabulary - 'view candidate 's data'
''s' is not in the vocabulary - 'modify candidate 's data'
''' is not in the vocabulary - 'view candidates ' ranking major specialization classification'
''s' is not in the vocabulary - 'browse candidate 's events history'
''s' is not in the vocabulary - 'creating customer 's representative information contacts'
''s' is not in the vocabulary - 'modifying customer 's representative information contacts'
''s' is not in the vocabulary - 'deleting customer 's representative information contacts'
''' is not in the vocabulary - 'looking clients ' table'
''s' is not in the vocabulary - 'view candidate 's data'
'a' is not in the vocabulary - 'a quick search student archival grades'
''s' is not in the vocabulary - 'view protocol 's status'
''' is not in the vocabulary - 'inform assignment task ' outside system ''
''' is not in the vocabulary - 'inform assignment task ' outside

In [11]:
# Inspect the first encoded title.
use_case_names_ids[0]

array([168, 429, 295,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0])

In [12]:
# Model inputs are the encoded titles; the regression target is the Cfp column.
x = use_case_names_ids
y = use_cases_df['Cfp'].values

## Initial architecture selection

In the first step, we analyze the key architectural decision to be made: the choice of layer types.

In [44]:
def get_cnn_1l_model(x_train, y_train, x_val, y_val, params):
    """Build and train a one-layer CNN regressor for a single talos round.

    Architecture: frozen pre-trained embedding -> Conv1D -> AveragePooling1D
    -> Flatten -> Dropout -> Dense(1, linear).

    Args:
        x_train: 2-D integer array of token indices; ``x_train.shape[1]`` is
            used as the input sequence length.
        y_train: Training targets.
        x_val: Validation inputs, same layout as ``x_train``.
        y_val: Validation targets.
        params: Hyperparameters for this round. Keys read: ``filters_first``,
            ``kernel_size``, ``padding``, ``activation``, ``dropout``,
            ``loss``, ``batch_size``, ``epochs``.

    Returns:
        Tuple ``(history, model)``: the value returned by ``model.fit`` and
        the trained Keras model.

    Note:
        Reads the module-level ``embedding_matrix``.
    """
    seq_length = x_train.shape[1]

    # Embedding weights come from the pre-trained vectors and are not trained.
    embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                output_dim=embedding_matrix.shape[1],
                                mask_zero=False,
                                weights=[embedding_matrix],
                                input_length=seq_length,
                                trainable=False)

    line_input = Input(shape=(seq_length,), dtype='int32', name="input")
    embedded_sequences = embedding_layer(line_input)

    cnn = Conv1D(filters=params['filters_first'],
                 kernel_size=params['kernel_size'],
                 strides=1,
                 padding=params['padding'],
                 activation=params['activation'])(embedded_sequences)
    cnn = AveragePooling1D(pool_size=2)(cnn)
    cnn = Flatten()(cnn)
    cnn = Dropout(params['dropout'])(cnn)

    output = Dense(1, activation='linear')(cnn)
    model = Model([line_input], output)

    optimizer = Adam(lr=0.001, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=optimizer,
                  loss=params['loss'],
                  metrics=['mse', 'mae'])

    # NOTE: a ReduceLROnPlateau callback list used to be built here but was
    # never passed to fit(), so training has always used a fixed learning
    # rate. The dead code was removed; pass ``callbacks=[...]`` to fit() if
    # learning-rate scheduling is actually wanted.
    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=0,
                    validation_data=(x_val, y_val))
    return out, model


In [45]:
# Hyperparameter grid for the one-layer CNN: 4 filter counts x 3 batch sizes
# x 3 epoch counts x 4 dropout rates = 144 combinations.
p = {'filters_first':[4, 8, 16, 32],
     'kernel_size':[3],
     'batch_size': [64, 128, 256],
     'epochs': [100, 300, 500],
     'dropout': [0, 0.2, 0.5, 0.8],
     'padding' : ['same',],
     'loss': ['mae',],
     'activation':['relu',]}

In [46]:
# Run the talos scan over grid `p` with the one-layer CNN (long-running).
# NOTE(review): every scan in this notebook uses experiment_name='1';
# consider a distinct name per architecture.
h_cnn1 = ta.Scan(x, y, params=p,
            model=get_cnn_1l_model,
            experiment_name='1',
            seed = random_seed,
            val_split=0.33)

100%|████████████████████████████████████████████████████████████████████████████████| 144/144 [18:11<00:00,  6.69s/it]


In [47]:
# Wrap the scan in a talos Reporting object; its `.data` frame is used below.
r_cnn1 = ta.Reporting(h_cnn1)

In [77]:
# Export all rounds, sorted by validation MAE (ascending), to an Excel file.
r_cnn1.data.sort_values('val_mean_absolute_error').to_excel(f'{output_folder}cnn1.xlsx')

In [48]:
# Show the 20 rounds with the lowest validation MAE.
r_cnn1.data.sort_values('val_mean_absolute_error').head(20)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,kernel_size,loss.1,padding
68,500,3.726943,32.047591,3.726943,3.415188,31.368683,3.415188,relu,128,0.2,500,4,3,mae,same
45,500,3.7296,32.526435,3.7296,3.613402,33.08052,3.613402,relu,64,0.8,500,8,3,mae,same
32,500,3.73684,31.898237,3.73684,3.079919,25.045749,3.079919,relu,64,0.5,500,4,3,mae,same
142,500,3.74611,32.187851,3.74611,3.481493,31.267456,3.481493,relu,256,0.8,500,16,3,mae,same
90,300,3.748397,32.422323,3.748397,3.680381,32.974671,3.680381,relu,128,0.8,300,16,3,mae,same
77,300,3.750573,31.803752,3.750573,3.473694,31.426172,3.473694,relu,128,0.5,300,8,3,mae,same
41,300,3.75203,32.847059,3.75203,3.706009,34.045101,3.706009,relu,64,0.8,300,8,3,mae,same
91,300,3.753161,31.28172,3.753161,3.212223,25.569225,3.212223,relu,128,0.8,300,32,3,mae,same
116,500,3.753482,32.528194,3.753482,3.53989,31.543397,3.53989,relu,256,0.2,500,4,3,mae,same
28,300,3.753921,32.777654,3.753921,3.632639,31.353646,3.632639,relu,64,0.5,300,4,3,mae,same


In [13]:
def get_cnn_2l_model(x_train, y_train, x_val, y_val, params):
    """Build and train a two-layer CNN regressor for a single talos round.

    Architecture: frozen pre-trained embedding -> [Conv1D ->
    AveragePooling1D] x 2 -> Flatten -> Dropout -> Dense(1, linear).

    Args:
        x_train: 2-D integer array of token indices; ``x_train.shape[1]`` is
            used as the input sequence length.
        y_train: Training targets.
        x_val: Validation inputs, same layout as ``x_train``.
        y_val: Validation targets.
        params: Hyperparameters for this round. Keys read: ``filters_first``,
            ``filters_second``, ``kernel_size``, ``padding``, ``activation``,
            ``dropout``, ``loss``, ``batch_size``, ``epochs``.

    Returns:
        Tuple ``(history, model)``: the value returned by ``model.fit`` and
        the trained Keras model.

    Note:
        Reads the module-level ``embedding_matrix``.
    """
    seq_length = x_train.shape[1]

    # Embedding weights come from the pre-trained vectors and are not trained.
    embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                output_dim=embedding_matrix.shape[1],
                                mask_zero=False,
                                weights=[embedding_matrix],
                                input_length=seq_length,
                                trainable=False)

    line_input = Input(shape=(seq_length,), dtype='int32', name="input")
    cnn = embedding_layer(line_input)

    # Both convolution stages share kernel size, padding and activation and
    # differ only in the number of filters.
    for filters_key in ('filters_first', 'filters_second'):
        cnn = Conv1D(filters=params[filters_key],
                     kernel_size=params['kernel_size'],
                     strides=1,
                     padding=params['padding'],
                     activation=params['activation'])(cnn)
        cnn = AveragePooling1D(pool_size=2)(cnn)

    cnn = Flatten()(cnn)
    cnn = Dropout(params['dropout'])(cnn)

    output = Dense(1, activation='linear')(cnn)
    model = Model([line_input], output)

    optimizer = Adam(lr=0.001, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=optimizer,
                  loss=params['loss'],
                  metrics=['mse', 'mae'])

    # NOTE: a ReduceLROnPlateau callback list used to be built here but was
    # never passed to fit(), so training has always used a fixed learning
    # rate. The dead code was removed; pass ``callbacks=[...]`` to fit() if
    # learning-rate scheduling is actually wanted.
    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=0,
                    validation_data=(x_val, y_val))
    return out, model


In [14]:
# Hyperparameter grid for the two-layer CNN: 4 x 4 filter counts x 3 batch
# sizes x 3 epoch counts x 4 dropout rates = 576 combinations.
p = {'filters_first':[4, 8, 16, 32],
     'filters_second':[4, 8, 16, 32],
     'kernel_size':[3],
     'batch_size': [64, 128, 256],
     'epochs': [100, 300, 500],
     'dropout': [0, 0.2, 0.5, 0.8],
     'padding' : ['same',],
     'loss': ['mae',],
     'activation':['relu',]}

In [15]:
# Run the talos scan over grid `p` with the two-layer CNN (long-running).
# NOTE(review): experiment_name='1' is shared with the other scans.
h_cnn2 = ta.Scan(x, y, params=p,
            model=get_cnn_2l_model,
            experiment_name='1',
            seed = random_seed,
            val_split=0.33)

100%|██████████████████████████████████████████████████████████████████████████████| 576/576 [1:16:39<00:00,  8.16s/it]


In [16]:
# Wrap the scan in a talos Reporting object; its `.data` frame is used below.
r_cnn2 = ta.Reporting(h_cnn2)

In [78]:
# Export all rounds, sorted by validation MAE (ascending), to an Excel file.
r_cnn2.data.sort_values('val_mean_absolute_error').to_excel(f'{output_folder}cnn2.xlsx')

In [17]:
# Show the 20 rounds with the lowest validation MAE.
r_cnn2.data.sort_values('val_mean_absolute_error').head(20)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,filters_second,kernel_size,loss.1,padding
498,300,3.681885,32.034462,3.681885,3.573565,31.586531,3.573565,relu,256,0.5,300,4,16,3,mae,same
103,100,3.699152,32.437884,3.699152,3.404536,32.041352,3.404536,relu,64,0.5,100,8,32,3,mae,same
109,100,3.703135,32.599696,3.703135,3.618919,30.540457,3.618919,relu,64,0.5,100,32,8,3,mae,same
501,300,3.711105,33.078178,3.711105,3.48387,29.878112,3.48387,relu,256,0.5,300,8,8,3,mae,same
155,100,3.7169,32.692255,3.7169,3.655466,30.587623,3.655466,relu,64,0.8,100,16,32,3,mae,same
400,300,3.717098,32.182449,3.717098,3.099026,27.892264,3.099026,relu,256,0.0,300,4,4,3,mae,same
502,300,3.722425,31.837513,3.722425,3.640665,34.000961,3.640665,relu,256,0.5,300,8,16,3,mae,same
308,300,3.724686,33.788252,3.724686,3.821708,34.492095,3.821708,relu,128,0.5,300,8,4,3,mae,same
550,300,3.724966,33.05769,3.724966,3.735871,34.255339,3.735871,relu,256,0.8,300,8,16,3,mae,same
253,100,3.725416,32.329051,3.725416,3.313602,29.188136,3.313602,relu,128,0.2,100,32,8,3,mae,same


In [18]:
def get_rnn_1l_model(x_train, y_train, x_val, y_val, params):
    """Build and train a one-layer GRU regressor for a single talos round.

    Architecture: frozen pre-trained embedding -> CuDNNGRU (last state only)
    -> Dropout -> Dense(1, linear). CuDNNGRU requires a GPU.

    Args:
        x_train: 2-D integer array of token indices; ``x_train.shape[1]`` is
            used as the input sequence length.
        y_train: Training targets.
        x_val: Validation inputs, same layout as ``x_train``.
        y_val: Validation targets.
        params: Hyperparameters for this round. Keys read: ``units_first``,
            ``dropout``, ``loss``, ``batch_size``, ``epochs``. Any other key
            (e.g. ``activation``) is ignored.

    Returns:
        Tuple ``(history, model)``: the value returned by ``model.fit`` and
        the trained Keras model.

    Note:
        Reads the module-level ``embedding_matrix``.
    """
    seq_length = x_train.shape[1]

    # Embedding weights come from the pre-trained vectors and are not trained.
    embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                output_dim=embedding_matrix.shape[1],
                                mask_zero=False,
                                weights=[embedding_matrix],
                                input_length=seq_length,
                                trainable=False)

    line_input = Input(shape=(seq_length,), dtype='int32', name="input")
    embedded_sequences = embedding_layer(line_input)

    rnn = CuDNNGRU(params['units_first'],
                   return_sequences=False,
                   stateful=False)(embedded_sequences)
    rnn = Dropout(params['dropout'])(rnn)

    output = Dense(1, activation='linear')(rnn)
    model = Model([line_input], output)

    optimizer = Adam(lr=0.001, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=optimizer,
                  loss=params['loss'],
                  metrics=['mse', 'mae'])

    # NOTE: a ReduceLROnPlateau callback list used to be built here but was
    # never passed to fit(), so training has always used a fixed learning
    # rate. The dead code was removed; pass ``callbacks=[...]`` to fit() if
    # learning-rate scheduling is actually wanted.
    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=0,
                    validation_data=(x_val, y_val))
    return out, model



In [19]:
# Hyperparameter grid for the one-layer GRU: 4 unit counts x 3 batch sizes
# x 3 epoch counts x 4 dropout rates = 144 combinations.
# NOTE(review): 'activation' is not read by get_rnn_1l_model; it only adds a
# constant column to the results table.
p = {'units_first':[4, 8, 16, 32],
     'batch_size': [64, 128, 256],
     'epochs': [100, 300, 500],
     'dropout': [0, 0.2, 0.5, 0.8],
     'loss': ['mae',],
     'activation':['relu',]}

In [20]:
# Run the talos scan over grid `p` with the one-layer GRU (long-running).
# NOTE(review): experiment_name='1' is shared with the other scans.
h_rnn1 = ta.Scan(x, y, params=p,
            model=get_rnn_1l_model,
            experiment_name='1',
            seed = random_seed,
            val_split=0.33)

100%|████████████████████████████████████████████████████████████████████████████████| 144/144 [25:18<00:00,  8.70s/it]


In [21]:
# Wrap the scan in a talos Reporting object; its `.data` frame is used below.
r_rnn1 = ta.Reporting(h_rnn1)

In [79]:
# Export all rounds, sorted by validation MAE (ascending), to an Excel file.
r_rnn1.data.sort_values('val_mean_absolute_error').to_excel(f'{output_folder}rnn1.xlsx')

In [22]:
# Show the 20 rounds with the lowest validation MAE.
r_rnn1.data.sort_values('val_mean_absolute_error').head(20)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,loss.1,units_first
27,100,4.017101,34.435091,4.017101,3.807407,34.757183,3.807407,relu,64,0.5,100,mae,32
91,300,4.055992,36.333089,4.055992,3.591587,35.107207,3.591587,relu,128,0.8,300,mae,32
103,300,4.081528,34.772911,4.081528,2.889139,26.886685,2.889139,relu,256,0.0,300,mae,32
107,500,4.095769,36.172653,4.095769,1.869107,16.747403,1.869107,relu,256,0.0,500,mae,32
143,500,4.09581,35.786087,4.09581,3.194524,26.151485,3.194524,relu,256,0.8,500,mae,32
79,300,4.105294,34.54359,4.105294,2.710173,22.326963,2.710173,relu,128,0.5,300,mae,32
4,300,4.140781,42.310783,4.140781,3.865497,41.375933,3.865497,relu,64,0.0,300,mae,4
127,300,4.150539,38.53738,4.150539,4.125519,39.989468,4.125519,relu,256,0.5,300,mae,32
15,100,4.155402,35.595623,4.155402,3.674241,34.038256,3.674241,relu,64,0.2,100,mae,32
55,300,4.161576,35.398808,4.161576,2.095459,18.81213,2.095459,relu,128,0.0,300,mae,32


In [39]:
def get_rnn_2l_model(x_train, y_train, x_val, y_val, params):
    """Build and train a two-layer GRU regressor for a single talos round.

    Architecture: frozen pre-trained embedding -> CuDNNGRU (full sequence)
    -> CuDNNGRU (last state only) -> Dropout -> Dense(1, linear).
    CuDNNGRU requires a GPU.

    Args:
        x_train: 2-D integer array of token indices; ``x_train.shape[1]`` is
            used as the input sequence length.
        y_train: Training targets.
        x_val: Validation inputs, same layout as ``x_train``.
        y_val: Validation targets.
        params: Hyperparameters for this round. Keys read: ``units_first``,
            ``units_second``, ``dropout``, ``loss``, ``batch_size``,
            ``epochs``. Any other key (e.g. ``activation``) is ignored.

    Returns:
        Tuple ``(history, model)``: the value returned by ``model.fit`` and
        the trained Keras model.

    Note:
        Reads the module-level ``embedding_matrix``.
    """
    seq_length = x_train.shape[1]

    # Embedding weights come from the pre-trained vectors and are not trained.
    embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                output_dim=embedding_matrix.shape[1],
                                mask_zero=False,
                                weights=[embedding_matrix],
                                input_length=seq_length,
                                trainable=False)

    line_input = Input(shape=(seq_length,), dtype='int32', name="input")
    embedded_sequences = embedding_layer(line_input)

    # The first GRU emits its full output sequence so the second can consume it.
    rnn = CuDNNGRU(params['units_first'],
                   return_sequences=True,
                   stateful=False)(embedded_sequences)
    rnn = CuDNNGRU(params['units_second'],
                   return_sequences=False,
                   stateful=False)(rnn)
    rnn = Dropout(params['dropout'])(rnn)

    output = Dense(1, activation='linear')(rnn)
    model = Model([line_input], output)

    optimizer = Adam(lr=0.001, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=optimizer,
                  loss=params['loss'],
                  metrics=['mse', 'mae'])

    # NOTE: a ReduceLROnPlateau callback list used to be built here but was
    # never passed to fit(), so training has always used a fixed learning
    # rate. The dead code was removed; pass ``callbacks=[...]`` to fit() if
    # learning-rate scheduling is actually wanted.
    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=0,
                    validation_data=(x_val, y_val))
    return out, model



In [40]:
# Hyperparameter grid for the two-layer GRU: 4 x 4 unit counts x 3 batch
# sizes x 3 epoch counts x 4 dropout rates = 576 combinations.
# NOTE(review): 'activation' is not read by get_rnn_2l_model; it only adds a
# constant column to the results table.
p = {'units_first':[4, 8, 16, 32],
     'units_second':[4, 8, 16, 32],
     'batch_size': [64, 128, 256],
     'epochs': [100, 300, 500],
     'dropout': [0, 0.2, 0.5, 0.8],
     'loss': ['mae',],
     'activation':['relu',]}

In [41]:
# Run the talos scan over grid `p` with the two-layer GRU (long-running).
# NOTE(review): experiment_name='1' is shared with the other scans.
h_rnn2 = ta.Scan(x, y, params=p,
            model=get_rnn_2l_model,
            experiment_name='1',
            seed = random_seed,
            val_split=0.33)

100%|██████████████████████████████████████████████████████████████████████████████| 576/576 [2:34:29<00:00, 17.03s/it]


In [42]:
# Wrap the scan in a talos Reporting object; its `.data` frame is used below.
r_rnn2 = ta.Reporting(h_rnn2)

In [80]:
# Export all rounds, sorted by validation MAE (ascending), to an Excel file.
r_rnn2.data.sort_values('val_mean_absolute_error').to_excel(f'{output_folder}rnn2.xlsx')

In [43]:
# Show the 20 rounds with the lowest validation MAE.
r_rnn2.data.sort_values('val_mean_absolute_error').head(20)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,loss.1,units_first,units_second
315,300,3.93749,34.544248,3.93749,2.591791,20.36033,2.591791,relu,128,0.5,300,mae,16,32
467,500,3.940184,34.611366,3.940184,2.575337,20.541347,2.575337,relu,256,0.2,500,mae,4,32
82,500,3.98582,35.956879,3.98582,2.160543,16.590963,2.160543,relu,64,0.2,500,mae,4,16
22,300,3.993645,35.797341,3.993645,1.932441,16.772294,1.932441,relu,64,0.0,300,mae,8,16
459,300,4.005436,34.789242,4.005436,2.672401,23.130051,2.672401,relu,256,0.2,300,mae,16,32
411,300,4.008783,34.232544,4.008783,2.392893,20.078534,2.392893,relu,256,0.0,300,mae,16,32
11,100,4.024505,34.176532,4.024505,2.661223,23.353004,2.661223,relu,64,0.0,100,mae,16,32
571,500,4.024805,35.468475,4.024805,3.155714,25.000165,3.155714,relu,256,0.8,500,mae,16,32
18,300,4.026302,35.304062,4.026302,2.290792,20.456797,2.290792,relu,64,0.0,300,mae,4,16
519,500,4.028748,35.833904,4.028748,2.842785,24.800153,2.842785,relu,256,0.5,500,mae,8,32


In [23]:
def get_cnn_rnn_model(x_train, y_train, x_val, y_val, params):
    """Build and train a CNN+GRU regressor for a single talos round.

    Architecture: frozen pre-trained embedding -> Conv1D -> AveragePooling1D
    -> CuDNNGRU (last state only) -> Dropout -> Dense(1, linear).
    CuDNNGRU requires a GPU.

    Args:
        x_train: 2-D integer array of token indices; ``x_train.shape[1]`` is
            used as the input sequence length.
        y_train: Training targets.
        x_val: Validation inputs, same layout as ``x_train``.
        y_val: Validation targets.
        params: Hyperparameters for this round. Keys read: ``filters_first``,
            ``kernel_size``, ``padding``, ``activation``, ``units_first``,
            ``dropout``, ``loss``, ``batch_size``, ``epochs``.

    Returns:
        Tuple ``(history, model)``: the value returned by ``model.fit`` and
        the trained Keras model.

    Note:
        Reads the module-level ``embedding_matrix``.
    """
    seq_length = x_train.shape[1]

    # Embedding weights come from the pre-trained vectors and are not trained.
    embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                output_dim=embedding_matrix.shape[1],
                                mask_zero=False,
                                weights=[embedding_matrix],
                                input_length=seq_length,
                                trainable=False)

    line_input = Input(shape=(seq_length,), dtype='int32', name="input")
    embedded_sequences = embedding_layer(line_input)

    # Convolutional front end; its pooled output sequence feeds the GRU.
    cnn = Conv1D(filters=params['filters_first'],
                 kernel_size=params['kernel_size'],
                 strides=1,
                 padding=params['padding'],
                 activation=params['activation'])(embedded_sequences)
    cnn = AveragePooling1D(pool_size=2)(cnn)

    rnn = CuDNNGRU(params['units_first'],
                   return_sequences=False,
                   stateful=False)(cnn)
    rnn = Dropout(params['dropout'])(rnn)

    output = Dense(1, activation='linear')(rnn)
    model = Model([line_input], output)

    optimizer = Adam(lr=0.001, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=optimizer,
                  loss=params['loss'],
                  metrics=['mse', 'mae'])

    # NOTE: a ReduceLROnPlateau callback list used to be built here but was
    # never passed to fit(), so training has always used a fixed learning
    # rate. The dead code was removed; pass ``callbacks=[...]`` to fit() if
    # learning-rate scheduling is actually wanted.
    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=0,
                    validation_data=(x_val, y_val))
    return out, model

In [24]:
# Hyperparameter grid for the CNN+GRU model: 4 filter counts x 4 unit counts
# x 3 batch sizes x 3 epoch counts x 4 dropout rates = 576 combinations.
p = {'filters_first':[4, 8, 16, 32],
     'units_first':[4, 8, 16, 32],
     'kernel_size':[3],
     'batch_size': [64, 128, 256],
     'epochs': [100, 300, 500],
     'dropout': [0, 0.2, 0.5, 0.8],
     'padding' : ['same',],
     'loss': ['mae',],
     'activation':['relu',]}

In [25]:
# Run the talos scan over grid `p` with the CNN+GRU model (long-running).
# NOTE(review): experiment_name='1' is shared with the other scans.
h_cnn_rnn = ta.Scan(x, y, params=p,
            model=get_cnn_rnn_model,
            experiment_name='1',
            seed = random_seed,
            val_split=0.33)

100%|██████████████████████████████████████████████████████████████████████████████| 576/576 [1:49:53<00:00, 10.67s/it]


In [26]:
# Wrap the scan in a talos Reporting object; its `.data` frame is used below.
r_cnn_rnn = ta.Reporting(h_cnn_rnn)

In [81]:
# Export all rounds, sorted by validation MAE (ascending), to an Excel file.
r_cnn_rnn.data.sort_values('val_mean_absolute_error').to_excel(f'{output_folder}cnn_rnn.xlsx')

In [27]:
# Show the 20 rounds with the lowest validation MAE.
r_cnn_rnn.data.sort_values('val_mean_absolute_error').head(20)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,kernel_size,loss.1,padding,units_first
11,100,3.931706,33.633681,3.931706,2.357946,22.745678,2.357946,relu,64,0.0,100,16,3,mae,same,32
403,300,3.954406,34.150841,3.954406,2.471902,22.945651,2.471902,relu,256,0.0,300,4,3,mae,same,32
459,300,3.977251,33.501972,3.977251,2.466416,22.071932,2.466416,relu,256,0.2,300,16,3,mae,same,32
214,300,4.026696,34.355962,4.026696,2.500297,23.679653,2.500297,relu,128,0.0,300,8,3,mae,same,16
3,100,4.030043,34.276624,4.030043,2.95052,27.676142,2.95052,relu,64,0.0,100,4,3,mae,same,32
51,100,4.037873,35.13812,4.037873,3.407935,32.28097,3.407935,relu,64,0.2,100,4,3,mae,same,32
7,100,4.04998,36.534445,4.04998,3.625712,35.389202,3.625712,relu,64,0.0,100,8,3,mae,same,32
418,500,4.051798,35.326527,4.051798,2.495266,23.992723,2.495266,relu,256,0.0,500,4,3,mae,same,16
522,500,4.056374,34.480427,4.056374,3.054852,27.582959,3.054852,relu,256,0.5,500,16,3,mae,same,16
422,500,4.05685,35.321587,4.05685,2.040443,19.908373,2.040443,relu,256,0.0,500,8,3,mae,same,16


In [52]:
# Count the rounds run across the five initial architecture scans.
initial_reports = (r_cnn1, r_cnn2, r_rnn1, r_rnn2, r_cnn_rnn)
total_trials = sum(report.data.shape[0] for report in initial_reports)
f"Total number of trials was {total_trials}"

'Total number of trials was 2016'

## Tuning the selected model

1) Check whether adding more CNN layers improves the results:

In [54]:
def get_cnn_3l_model(x_train, y_train, x_val, y_val, params):
    """Build and train a three-layer CNN regressor for a single talos round.

    Architecture: frozen pre-trained embedding -> [Conv1D ->
    AveragePooling1D] x 3 -> Flatten -> Dropout -> Dense(1, linear).

    Args:
        x_train: 2-D integer array of token indices; ``x_train.shape[1]`` is
            used as the input sequence length.
        y_train: Training targets.
        x_val: Validation inputs, same layout as ``x_train``.
        y_val: Validation targets.
        params: Hyperparameters for this round. Keys read: ``filters_first``,
            ``filters_second``, ``filters_third``, ``kernel_size``,
            ``padding``, ``activation``, ``dropout``, ``loss``,
            ``batch_size``, ``epochs``.

    Returns:
        Tuple ``(history, model)``: the value returned by ``model.fit`` and
        the trained Keras model.

    Note:
        Reads the module-level ``embedding_matrix``.
    """
    seq_length = x_train.shape[1]

    # Embedding weights come from the pre-trained vectors and are not trained.
    embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                output_dim=embedding_matrix.shape[1],
                                mask_zero=False,
                                weights=[embedding_matrix],
                                input_length=seq_length,
                                trainable=False)

    line_input = Input(shape=(seq_length,), dtype='int32', name="input")
    cnn = embedding_layer(line_input)

    # The three convolution stages share kernel size, padding and activation
    # and differ only in the number of filters.
    for filters_key in ('filters_first', 'filters_second', 'filters_third'):
        cnn = Conv1D(filters=params[filters_key],
                     kernel_size=params['kernel_size'],
                     strides=1,
                     padding=params['padding'],
                     activation=params['activation'])(cnn)
        cnn = AveragePooling1D(pool_size=2)(cnn)

    cnn = Flatten()(cnn)
    cnn = Dropout(params['dropout'])(cnn)

    output = Dense(1, activation='linear')(cnn)
    model = Model([line_input], output)

    optimizer = Adam(lr=0.001, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=optimizer,
                  loss=params['loss'],
                  metrics=['mse', 'mae'])

    # NOTE: a ReduceLROnPlateau callback list used to be built here but was
    # never passed to fit(), so training has always used a fixed learning
    # rate. The dead code was removed; pass ``callbacks=[...]`` to fit() if
    # learning-rate scheduling is actually wanted.
    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=0,
                    validation_data=(x_val, y_val))
    return out, model


In [55]:
# Hyperparameter grid for the three-layer CNN: 4 x 4 x 4 filter counts x 3
# batch sizes x 3 epoch counts x 4 dropout rates = 2304 combinations.
p = {'filters_first':[4, 8, 16, 32],
     'filters_second':[4, 8, 16, 32],
     'filters_third':[4, 8, 16, 32],
     'kernel_size':[3],
     'batch_size': [64, 128, 256],
     'epochs': [100, 300, 500],
     'dropout': [0, 0.2, 0.5, 0.8],
     'padding' : ['same',],
     'loss': ['mae',],
     'activation':['relu',]}

In [56]:
# Run the talos scan over grid `p` with the three-layer CNN (long-running).
# NOTE(review): experiment_name='1' is shared with the other scans.
h_cnn3 = ta.Scan(x, y, params=p,
            model=get_cnn_3l_model,
            experiment_name='1',
            seed = random_seed,
            val_split=0.33)

100%|████████████████████████████████████████████████████████████████████████████| 2304/2304 [6:09:37<00:00,  9.57s/it]


In [57]:
# Wrap the scan in a talos Reporting object; its `.data` frame is used below.
r_cnn3 = ta.Reporting(h_cnn3)

In [82]:
# Export all rounds, sorted by validation MAE (ascending), to an Excel file.
r_cnn3.data.sort_values('val_mean_absolute_error').to_excel(f'{output_folder}cnn3.xlsx')

In [58]:
# Show the 20 rounds with the lowest validation MAE.
r_cnn3.data.sort_values('val_mean_absolute_error').head(20)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,filters_second,filters_third,kernel_size,loss.1,padding
418,100,3.707282,33.828422,3.707282,3.633263,33.118604,3.633263,relu,64,0.5,100,16,4,16,3,mae,same
611,100,3.711038,33.189314,3.711038,4.149989,39.267885,4.149989,relu,64,0.8,100,16,4,32,3,mae,same
1990,300,3.719381,33.000488,3.719381,3.527969,31.644216,3.527969,relu,256,0.5,300,4,8,16,3,mae,same
1012,100,3.719963,33.659943,3.719963,3.674645,35.808758,3.674645,relu,128,0.2,100,32,8,4,3,mae,same
1029,300,3.725108,32.606672,3.725108,3.623882,33.579163,3.623882,relu,128,0.2,300,4,8,8,3,mae,same
11,100,3.725483,31.951894,3.725483,2.860236,25.370698,2.860236,relu,64,0.0,100,4,16,32,3,mae,same
199,100,3.726642,32.659976,3.726642,3.349699,29.777566,3.349699,relu,64,0.2,100,4,8,32,3,mae,same
414,100,3.729649,33.569738,3.729649,3.638212,32.973961,3.638212,relu,64,0.5,100,8,32,16,3,mae,same
1198,100,3.730466,34.122732,3.730466,3.696781,34.811666,3.696781,relu,128,0.5,100,16,32,16,3,mae,same
406,100,3.730949,32.958525,3.730949,3.490194,30.616546,3.490194,relu,64,0.5,100,8,8,16,3,mae,same


**Observations:**
- The best 3-layer CNN reaches a validation MAE of 3.707, which is worse than the best 2-layer CNN (3.682), so adding a third convolutional layer does not help.

2) Optimize the 2-layer CNN model

In [17]:
def get_cnn_2l_optimize_model(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit the 2-layer CNN regressor for one talos round.

    Architecture: embedding initialised from the notebook-level
    ``embedding_matrix`` (not trainable) -> Conv1D -> pooling -> Conv1D ->
    average pooling -> Flatten or global pooling (with dropout) -> one
    linear output unit.

    Parameters
    ----------
    x_train, y_train
        Training inputs and targets. ``x_train.shape[1]`` is used as the
        input sequence length.
    x_val, y_val
        Validation inputs and targets passed to ``model.fit``.
    params : dict
        One hyperparameter combination. Keys read: ``filters_first``,
        ``filters_second``, ``kernel_size``, ``padding``, ``activation``,
        ``pooling`` (pooling layer class for the first stage),
        ``global_pool`` (global pooling layer class, or ``None``/``False``
        to flatten the feature maps instead), ``dropout``, ``loss``,
        ``batch_size`` and ``epochs``.

    Returns
    -------
    tuple
        ``(out, model)``: the value returned by ``model.fit`` and the
        fitted model.
    """
    max_sequence_length = x_train.shape[1]

    # Only read here; declared for clarity that it comes from the notebook scope.
    global embedding_matrix

    embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                output_dim=embedding_matrix.shape[1],
                                mask_zero=False,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)

    line_input = Input(shape=(max_sequence_length,), dtype='int32', name="input")
    embedded_sequences = embedding_layer(line_input)

    cnn = Conv1D(filters=params['filters_first'],
                 kernel_size=params['kernel_size'],
                 strides=1,
                 padding=params['padding'],
                 activation=params['activation'])(embedded_sequences)
    cnn = params['pooling'](pool_size=2)(cnn)
    cnn = Conv1D(filters=params['filters_second'],
                 kernel_size=params['kernel_size'],
                 strides=1,
                 padding=params['padding'],
                 activation=params['activation'])(cnn)
    # NOTE(review): the second pooling stage is always average pooling;
    # params['pooling'] only affects the first stage - confirm this is intended.
    cnn = AveragePooling1D(pool_size=2)(cnn)

    # Treat None and False identically: both mean "flatten, no global pooling".
    # (Previously False took the Flatten branch and was then called as a layer.)
    global_pool = params['global_pool']
    use_global_pool = global_pool is not None and global_pool is not False

    if not use_global_pool:
        cnn = Flatten()(cnn)

    cnn = Dropout(params['dropout'])(cnn)

    if use_global_pool:
        cnn = global_pool()(cnn)

    output = Dense(1, activation='linear')(cnn)

    model = Model([line_input], output)

    optimizer = Adam(lr=0.001, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)

    model.compile(optimizer=optimizer,
                  loss=params['loss'],
                  metrics=['mse', 'mae'])

    # No callbacks are used: training always runs for params['epochs'] epochs.
    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=0,
                    validation_data=(x_val, y_val))
    return out, model


**Round 1:**

Goals:
- try different number of filters
- a larger batch size seems to work better - try 200 (between 128 and 256)
- try more dropout rates


In [15]:
# Round 1 grid: wider range of filter counts, batch size 200 added, more dropout rates.
p = {
    'filters_first': [4, 8, 10, 12, 16, 32],
    'filters_second': [4, 8, 10, 12, 16, 32],
    'global_pool': [None],
    'pooling': [AveragePooling1D],
    'kernel_size': [3],
    'batch_size': [128, 200, 256],
    'epochs': [300, 400, 500],
    'dropout': [0.2, 0.3, 0.5, 0.6, 0.8],
    'padding': ['same'],
    'loss': ['mae'],
    'activation': ['relu'],
}

In [18]:
# Round 1: scan the 2-layer CNN over grid `p`, validating on 33% of the data.
h_cnn2_opt = ta.Scan(
    x,
    y,
    params=p,
    model=get_cnn_2l_optimize_model,
    experiment_name='1',
    seed=random_seed,
    val_split=0.33,
)

100%|████████████████████████████████████████████████████████████████████████████| 1620/1620 [3:37:14<00:00,  8.37s/it]


In [21]:
# Wrap the round-1 scan so its per-round results are available as `r_cnn2_opt1.data`.
r_cnn2_opt1 = ta.Reporting(h_cnn2_opt)

In [23]:
# Persist every round-1 result, lowest validation MAE first.
(
    r_cnn2_opt1.data
    .sort_values('val_mean_absolute_error')
    .to_excel(f'{output_folder}cnn2_opt1.xlsx')
)

In [24]:
# Show the round-1 configurations with the lowest validation MAE.
(
    r_cnn2_opt1.data
    .sort_values('val_mean_absolute_error')
    .head(20)
)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,filters_second,global_pool,kernel_size,loss.1,padding,pooling
1152,500,3.681899,31.878607,3.681899,3.237408,27.56996,3.237408,relu,256,0.2,500,4,4,False,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
1010,400,3.683891,32.60379,3.683891,3.973372,36.937891,3.973372,relu,200,0.8,400,4,10,False,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
1442,400,3.690797,31.755579,3.690797,3.547464,31.432706,3.547464,relu,256,0.6,400,4,10,False,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
331,300,3.693576,32.782413,3.693576,3.805458,33.693159,3.805458,relu,128,0.6,300,8,8,False,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
794,400,3.694512,32.586864,3.694512,3.72704,35.877683,3.72704,relu,200,0.5,400,4,10,False,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
763,300,3.697991,32.613914,3.697991,3.662649,30.834937,3.662649,relu,200,0.5,300,8,8,False,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
552,300,3.698331,32.667267,3.698331,3.531215,32.913167,3.531215,relu,200,0.2,300,10,4,False,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
1081,300,3.698541,31.538437,3.698541,3.203499,26.931889,3.203499,relu,256,0.2,300,4,8,False,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
720,500,3.700402,32.121761,3.700402,3.484164,31.16052,3.484164,relu,200,0.3,500,4,4,False,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
867,300,3.704948,33.280777,3.704948,3.762641,35.229409,3.762641,relu,200,0.6,300,4,12,False,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>


**Observations:**
- a smaller number of filters seems to perform better
- larger batch sizes (200 and 256) seem to perform better

**Round 2:**

Goals: 
- reduce the number of filters to max. 12
- remove 128 from batch sizes
- try max pooling 
- try valid padding 

In [25]:
# Round 2 grid: at most 12 filters, no batch size 128, max pooling, 'valid' padding added.
p = {
    'filters_first': [4, 8, 10, 12],
    'filters_second': [4, 8, 10, 12],
    'global_pool': [None],
    'pooling': [MaxPooling1D],
    'kernel_size': [3],
    'batch_size': [200, 256],
    'epochs': [300, 400, 500],
    'dropout': [0.2, 0.3, 0.5, 0.6, 0.8],
    'padding': ['same', 'valid'],
    'loss': ['mae'],
    'activation': ['relu'],
}

In [26]:
# Round 2: scan the 2-layer CNN over grid `p`, validating on 33% of the data.
h_cnn2_opt2 = ta.Scan(
    x,
    y,
    params=p,
    model=get_cnn_2l_optimize_model,
    experiment_name='1',
    seed=random_seed,
    val_split=0.33,
)

100%|██████████████████████████████████████████████████████████████████████████████| 960/960 [1:55:12<00:00,  8.41s/it]


In [27]:
# Wrap the round-2 scan so its per-round results are available as `r_cnn2_opt2.data`.
r_cnn2_opt2 = ta.Reporting(h_cnn2_opt2)

In [28]:
# Persist every round-2 result, lowest validation MAE first.
(
    r_cnn2_opt2.data
    .sort_values('val_mean_absolute_error')
    .to_excel(f'{output_folder}cnn2_opt2.xlsx')
)

In [31]:
# Show the round-2 configurations with the lowest validation MAE.
(
    r_cnn2_opt2.data
    .sort_values('val_mean_absolute_error')
    .head(20)
)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,filters_second,global_pool,kernel_size,loss.1,padding,pooling
104,300,3.68032,32.193874,3.68032,3.317353,27.581006,3.317353,relu,200,0.3,300,8,4,False,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
490,300,3.681606,32.817875,3.681606,3.442568,30.893156,3.442568,relu,256,0.2,300,8,8,False,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
552,500,3.682738,32.364494,3.682738,3.213886,26.902519,3.213886,relu,256,0.2,500,8,4,False,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
684,300,3.688679,32.42091,3.688679,3.481388,31.761607,3.481388,relu,256,0.5,300,8,10,False,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
694,300,3.688787,31.680845,3.688787,3.334398,31.625212,3.334398,relu,256,0.5,300,10,12,False,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
588,300,3.698598,32.061123,3.698598,3.249271,26.941509,3.249271,relu,256,0.3,300,8,10,False,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
720,400,3.699495,32.694237,3.699495,3.745995,31.725996,3.745995,relu,256,0.5,400,10,4,False,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
422,400,3.70145,32.939861,3.70145,3.640594,31.60164,3.640594,relu,200,0.8,400,4,12,False,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
736,500,3.701734,32.610508,3.701734,3.574619,30.611063,3.574619,relu,256,0.5,500,4,4,False,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
806,400,3.704964,31.84939,3.704964,3.36017,29.380457,3.36017,relu,256,0.6,400,4,12,False,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>


**Observations:**
- the same padding seems better than valid padding
- max and average over-time pooling seem to perform with very similar accuracy (max pooling seems to work a little bit better)

**Round 3:**

Goals:
- continue comparing max and average over-time pooling
- try to use global pooling instead of flattening the feature maps

In [33]:
# Round 3 grid: both pooling types, global pooling instead of flattening.
p = {
    'filters_first': [4, 8, 10, 12],
    'filters_second': [4, 8, 10, 12],
    'global_pool': [GlobalAveragePooling1D, GlobalMaxPooling1D],
    'pooling': [AveragePooling1D, MaxPooling1D],
    'kernel_size': [3],
    'batch_size': [200, 256],
    'epochs': [300, 400, 500],
    'dropout': [0.2, 0.3, 0.5, 0.6, 0.8],
    'padding': ['same'],
    'loss': ['mae'],
    'activation': ['relu'],
}

In [34]:
# Round 3: scan the 2-layer CNN over grid `p`, validating on 33% of the data.
h_cnn2_opt3 = ta.Scan(
    x,
    y,
    params=p,
    model=get_cnn_2l_optimize_model,
    experiment_name='1',
    seed=random_seed,
    val_split=0.33,
)

100%|████████████████████████████████████████████████████████████████████████████| 1920/1920 [3:51:53<00:00,  8.63s/it]


In [35]:
# Wrap the round-3 scan so its per-round results are available as `r_cnn2_opt3.data`.
r_cnn2_opt3 = ta.Reporting(h_cnn2_opt3)

In [36]:
# Persist every round-3 result, lowest validation MAE first.
(
    r_cnn2_opt3.data
    .sort_values('val_mean_absolute_error')
    .to_excel(f'{output_folder}cnn2_opt3.xlsx')
)

In [37]:
# Show the round-3 configurations with the lowest validation MAE.
(
    r_cnn2_opt3.data
    .sort_values('val_mean_absolute_error')
    .head(20)
)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,filters_second,global_pool,kernel_size,loss.1,padding,pooling
717,500,3.665162,30.928324,3.665162,3.351209,28.824132,3.351209,relu,200,0.6,500,4,12,<class 'keras.layers.pooling.GlobalAveragePool...,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
1631,400,3.688706,32.959915,3.688706,3.439334,33.444127,3.439334,relu,256,0.6,400,8,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
308,400,3.691783,32.888565,3.691783,3.589276,32.428992,3.589276,relu,200,0.3,400,12,8,<class 'keras.layers.pooling.GlobalAveragePool...,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
479,400,3.694067,32.333881,3.694067,3.217701,27.858132,3.217701,relu,200,0.5,400,8,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
623,300,3.695072,32.788326,3.695072,3.756492,34.210275,3.756492,relu,200,0.6,300,10,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
1646,400,3.695392,31.876768,3.695392,3.624677,31.619041,3.624677,relu,256,0.6,400,10,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
1654,400,3.696461,32.876373,3.696461,3.756115,34.317944,3.756115,relu,256,0.6,400,12,8,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
1647,400,3.696624,32.634186,3.696624,3.470826,31.502185,3.470826,relu,256,0.6,400,10,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,3,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
632,300,3.699656,32.943523,3.699656,3.563812,33.397288,3.563812,relu,200,0.6,300,12,10,<class 'keras.layers.pooling.GlobalAveragePool...,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>
1500,500,3.699997,33.148014,3.699997,3.706141,34.206569,3.706141,relu,256,0.5,500,8,12,<class 'keras.layers.pooling.GlobalAveragePool...,3,mae,same,<class 'keras.layers.pooling.AveragePooling1D'>


**Observations:**
- using global pooling seems to provide better results than flattening the feature maps
- when using global pooling we should try more epochs
- batch size of 200 seems to dominate the best results
- dropout rate should be higher than 0.2
- the number of filters in the second CNN layer was dominated by 8 and 12
- max over-time pooling seems to work a little better than the average over-time pooling

**Round 4:**

Goals:
- try limiting the number of filters in the second layer to 8 and 12.
- try a smaller filter kernel size => 2

In [38]:
# Round 4 grid: second-layer filters limited to 8 and 12, kernel size 2 added.
p = {
    'filters_first': [4, 8, 10, 12],
    'filters_second': [8, 12],
    'global_pool': [GlobalMaxPooling1D, GlobalAveragePooling1D],
    'pooling': [MaxPooling1D],
    'kernel_size': [2, 3],
    'batch_size': [200],
    'epochs': [400, 500, 600],
    'dropout': [0.3, 0.4, 0.5, 0.6, 0.8],
    'padding': ['same'],
    'loss': ['mae'],
    'activation': ['relu'],
}

In [39]:
# Round 4: scan the 2-layer CNN over grid `p`, validating on 33% of the data.
h_cnn2_opt4 = ta.Scan(
    x,
    y,
    params=p,
    model=get_cnn_2l_optimize_model,
    experiment_name='1',
    seed=random_seed,
    val_split=0.33,
)

100%|██████████████████████████████████████████████████████████████████████████████| 480/480 [1:11:08<00:00, 10.43s/it]


In [40]:
# Wrap the round-4 scan so its per-round results are available as `r_cnn2_opt4.data`.
r_cnn2_opt4 = ta.Reporting(h_cnn2_opt4)

In [41]:
# Persist every round-4 result, lowest validation MAE first.
(
    r_cnn2_opt4.data
    .sort_values('val_mean_absolute_error')
    .to_excel(f'{output_folder}cnn2_opt4.xlsx')
)

In [42]:
# Show the round-4 configurations with the lowest validation MAE.
(
    r_cnn2_opt4.data
    .sort_values('val_mean_absolute_error')
    .head(20)
)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,filters_second,global_pool,kernel_size,loss.1,padding,pooling
316,400,3.65704,32.612061,3.65704,3.666067,32.77119,3.666067,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
234,500,3.666194,32.019459,3.666194,3.648907,31.95322,3.648907,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
214,400,3.679745,32.434525,3.679745,3.650768,33.093451,3.650768,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
354,600,3.686366,32.982075,3.686366,3.858482,34.156106,3.858482,relu,200,0.6,600,4,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
366,600,3.686554,32.699501,3.686554,3.849001,33.781052,3.849001,relu,200,0.6,600,8,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
248,500,3.687132,31.864271,3.687132,3.519705,29.559919,3.519705,relu,200,0.5,500,12,8,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
220,400,3.688397,32.63448,3.688397,3.512792,30.066487,3.512792,relu,200,0.5,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
280,600,3.689655,32.604107,3.689655,3.44714,29.515491,3.44714,relu,200,0.5,600,12,8,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
158,500,3.690809,32.312889,3.690809,3.44594,29.571715,3.44594,relu,200,0.4,500,12,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
332,500,3.697461,32.308838,3.697461,3.376893,29.633159,3.376893,relu,200,0.6,500,8,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>


**Observations:**
- the kernel size 2 seems to work better

**Round 5:**

Goals:
- limit the number of combinations to the most promising ones (the hyperparameters in top 3 configurations)
- try adding a dense layer with different dropout rates

In [45]:
def get_cnn_2l_optimize_dense_model(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit the 2-layer CNN with an extra dense layer.

    Architecture: embedding initialised from the notebook-level
    ``embedding_matrix`` (not trainable) -> Conv1D -> pooling -> Conv1D ->
    average pooling -> Flatten or global pooling (with dropout) ->
    Dense(relu) -> dropout -> one linear output unit.

    Parameters
    ----------
    x_train, y_train
        Training inputs and targets. ``x_train.shape[1]`` is used as the
        input sequence length.
    x_val, y_val
        Validation inputs and targets passed to ``model.fit``.
    params : dict
        One hyperparameter combination. Keys read: ``filters_first``,
        ``filters_second``, ``kernel_size``, ``padding``, ``activation``,
        ``pooling`` (pooling layer class for the first stage),
        ``global_pool`` (global pooling layer class, or ``None``/``False``
        to flatten the feature maps instead), ``dropout``,
        ``dense_additional`` (units of the extra dense layer),
        ``dropout2`` (dropout rate after the dense layer), ``loss``,
        ``batch_size`` and ``epochs``.

    Returns
    -------
    tuple
        ``(out, model)``: the value returned by ``model.fit`` and the
        fitted model.
    """
    max_sequence_length = x_train.shape[1]

    # Only read here; declared for clarity that it comes from the notebook scope.
    global embedding_matrix

    embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                output_dim=embedding_matrix.shape[1],
                                mask_zero=False,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)

    line_input = Input(shape=(max_sequence_length,), dtype='int32', name="input")
    embedded_sequences = embedding_layer(line_input)

    cnn = Conv1D(filters=params['filters_first'],
                 kernel_size=params['kernel_size'],
                 strides=1,
                 padding=params['padding'],
                 activation=params['activation'])(embedded_sequences)
    cnn = params['pooling'](pool_size=2)(cnn)
    cnn = Conv1D(filters=params['filters_second'],
                 kernel_size=params['kernel_size'],
                 strides=1,
                 padding=params['padding'],
                 activation=params['activation'])(cnn)
    # NOTE(review): the second pooling stage is always average pooling;
    # params['pooling'] only affects the first stage - confirm this is intended.
    cnn = AveragePooling1D(pool_size=2)(cnn)

    # Treat None and False identically: both mean "flatten, no global pooling".
    # (Previously False took the Flatten branch and was then called as a layer.)
    global_pool = params['global_pool']
    use_global_pool = global_pool is not None and global_pool is not False

    if not use_global_pool:
        cnn = Flatten()(cnn)

    cnn = Dropout(params['dropout'])(cnn)

    if use_global_pool:
        cnn = global_pool()(cnn)

    dense = Dense(params['dense_additional'], activation='relu')(cnn)
    dense = Dropout(params['dropout2'])(dense)

    output = Dense(1, activation='linear')(dense)

    model = Model([line_input], output)

    optimizer = Adam(lr=0.001, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0)

    model.compile(optimizer=optimizer,
                  loss=params['loss'],
                  metrics=['mse', 'mae'])

    # No callbacks are used: training always runs for params['epochs'] epochs.
    out = model.fit(x_train, y_train,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=0,
                    validation_data=(x_val, y_val))
    return out, model


In [46]:
# Round 5 grid: narrowed CNN settings plus the extra dense layer size and its dropout rate.
p = {
    'filters_first': [8, 10, 12],
    'filters_second': [8, 12],
    'dense_additional': [2, 4, 6, 8, 12],
    'global_pool': [GlobalMaxPooling1D, GlobalAveragePooling1D],
    'pooling': [MaxPooling1D],
    'kernel_size': [2],
    'batch_size': [200],
    'epochs': [400, 500],
    'dropout': [0.6, 0.8],
    'dropout2': [0.0, 0.2, 0.4, 0.6, 0.8],
    'padding': ['same'],
    'loss': ['mae'],
    'activation': ['relu'],
}

In [47]:
# Round 5: scan the dense-layer variant over grid `p`, validating on 33% of the data.
h_cnn2_opt5 = ta.Scan(
    x,
    y,
    params=p,
    model=get_cnn_2l_optimize_dense_model,
    experiment_name='1',
    seed=random_seed,
    val_split=0.33,
)

100%|████████████████████████████████████████████████████████████████████████████| 1200/1200 [3:02:27<00:00, 11.45s/it]


In [48]:
# Wrap the round-5 scan so its per-round results are available as `r_cnn2_opt5.data`.
r_cnn2_opt5 = ta.Reporting(h_cnn2_opt5)

In [49]:
# Persist every round-5 result, lowest validation MAE first.
(
    r_cnn2_opt5.data
    .sort_values('val_mean_absolute_error')
    .to_excel(f'{output_folder}cnn2_opt5.xlsx')
)

In [50]:
# Show the round-5 configurations with the lowest validation MAE.
(
    r_cnn2_opt5.data
    .sort_values('val_mean_absolute_error')
    .head(20)
)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dense_additional,dropout,dropout2,epochs,filters_first,filters_second,global_pool,kernel_size,loss.1,padding,pooling
733,500,3.665074,32.045742,3.665074,3.689064,32.694996,3.689064,relu,200,8,0.6,0.0,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
961,400,3.665128,31.723003,3.665128,3.296635,30.00453,3.296635,relu,200,12,0.6,0.0,400,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
243,400,3.669019,32.616409,3.669019,3.531558,31.979408,3.531558,relu,200,4,0.6,0.0,400,8,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
1139,400,3.674914,33.637676,3.674914,4.104434,38.122835,4.104434,relu,200,12,0.8,0.4,400,12,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
275,400,3.679839,33.244526,3.679839,3.949038,33.710525,3.949038,relu,200,4,0.6,0.2,400,12,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
501,500,3.680055,30.912025,3.680055,3.13154,27.341768,3.13154,relu,200,6,0.6,0.0,500,12,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
749,400,3.681636,32.10643,3.681636,3.798454,32.427885,3.798454,relu,200,8,0.6,0.2,400,10,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
1011,400,3.684298,32.565975,3.684298,3.495869,33.353973,3.495869,relu,200,12,0.6,0.4,400,8,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
721,400,3.686246,33.235195,3.686246,3.672705,36.881931,3.672705,relu,200,8,0.6,0.0,400,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
263,500,3.688753,32.668148,3.688753,3.586621,32.060119,3.586621,relu,200,4,0.6,0.0,500,12,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>


**Observations:**
- adding a dense layer did not improve the results

3) Take the top 3 configurations from round 4 and study their stability

***Configuration 1:***

In [78]:
# Configuration 1: a single setup; batch_size is listed 10 times so the scan trains it 10 times.
p = {
    'filters_first': [12],
    'filters_second': [12],
    'global_pool': [GlobalMaxPooling1D],
    'pooling': [MaxPooling1D],
    'kernel_size': [2],
    'batch_size': [200] * 10,
    'epochs': [400],
    'dropout': [0.6],
    'padding': ['same'],
    'loss': ['mae'],
    'activation': ['relu'],
}

In [79]:
# Stability check for configuration 1: repeated training runs of the same setup.
h_cnn2_opt4_c1 = ta.Scan(
    x,
    y,
    params=p,
    model=get_cnn_2l_optimize_model,
    experiment_name='1',
    seed=random_seed,
    val_split=0.33,
)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:07<00:00,  6.68s/it]


In [80]:
# Wrap the configuration-1 runs so their results are available as `r_cnn2_opt4_c1.data`.
r_cnn2_opt4_c1 = ta.Reporting(h_cnn2_opt4_c1)

In [81]:
# Persist the configuration-1 runs, lowest validation MAE first.
(
    r_cnn2_opt4_c1.data
    .sort_values('val_mean_absolute_error')
    .to_excel(f'{output_folder}cnn2_opt4_c1.xlsx')
)

In [82]:
# Show the configuration-1 runs ordered by validation MAE.
(
    r_cnn2_opt4_c1.data
    .sort_values('val_mean_absolute_error')
    .head(20)
)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,filters_second,global_pool,kernel_size,loss.1,padding,pooling
4,400,3.690124,32.149601,3.690124,3.926525,32.113115,3.926525,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
9,400,3.70246,32.360439,3.70246,3.802073,34.589403,3.802073,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
6,400,3.709958,33.145634,3.709958,3.741108,33.907757,3.741108,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
0,400,3.710782,32.722908,3.710782,3.763648,35.922882,3.763648,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
7,400,3.744325,33.906372,3.744325,4.014873,34.123041,4.014873,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
8,400,3.746644,33.665012,3.746644,3.811208,35.135997,3.811208,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
1,400,3.749992,33.479336,3.749992,3.940854,35.305391,3.940854,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
5,400,3.767886,32.530029,3.767886,3.402078,29.189157,3.402078,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
3,400,3.80749,35.004692,3.80749,4.096201,39.446648,4.096201,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
2,400,3.827899,34.54509,3.827899,3.758857,33.807755,3.758857,relu,200,0.6,400,12,12,<class 'keras.layers.pooling.GlobalMaxPooling1D'>,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>


In [83]:
# Mean and standard deviation of the validation MAE over the configuration-1 runs.
val_mae_c1 = r_cnn2_opt4_c1.data['val_mean_absolute_error']
f"MAE = {val_mae_c1.mean()}, SD = {val_mae_c1.std()}"

'MAE = 3.7457560300827026, SD = 0.045391303236238946'

***Configuration 2:***

In [84]:
# Configuration 2: a single setup; batch_size is listed 10 times so the scan trains it 10 times.
p = {
    'filters_first': [8],
    'filters_second': [8],
    'global_pool': [GlobalAveragePooling1D],
    'pooling': [MaxPooling1D],
    'kernel_size': [2],
    'batch_size': [200] * 10,
    'epochs': [500],
    'dropout': [0.5],
    'padding': ['same'],
    'loss': ['mae'],
    'activation': ['relu'],
}

In [85]:
# Stability check for configuration 2: repeated training runs of the same setup.
h_cnn2_opt4_c2 = ta.Scan(
    x,
    y,
    params=p,
    model=get_cnn_2l_optimize_model,
    experiment_name='1',
    seed=random_seed,
    val_split=0.33,
)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:30<00:00,  9.91s/it]


In [86]:
# Wrap the configuration-2 runs so their results are available as `r_cnn2_opt4_c2.data`.
r_cnn2_opt4_c2 = ta.Reporting(h_cnn2_opt4_c2)

In [87]:
# Persist the configuration-2 runs, lowest validation MAE first.
(
    r_cnn2_opt4_c2.data
    .sort_values('val_mean_absolute_error')
    .to_excel(f'{output_folder}cnn2_opt4_c2.xlsx')
)

In [88]:
# Show the configuration-2 runs ordered by validation MAE.
(
    r_cnn2_opt4_c2.data
    .sort_values('val_mean_absolute_error')
    .head(20)
)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,filters_second,global_pool,kernel_size,loss.1,padding,pooling
3,500,3.719345,33.121586,3.719345,3.540832,30.583073,3.540832,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
5,500,3.719612,32.951797,3.719612,3.814876,35.331375,3.814876,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
6,500,3.740511,32.939259,3.740511,3.65352,35.264673,3.65352,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
0,500,3.769524,33.749458,3.769524,3.598128,32.265414,3.598128,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
4,500,3.782636,33.586391,3.782636,3.881434,35.740713,3.881434,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
9,500,3.791534,34.250134,3.791534,3.855863,35.348539,3.855863,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
2,500,3.794688,34.080273,3.794688,3.924745,35.28186,3.924745,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
1,500,3.795801,34.244808,3.795801,3.802575,34.668557,3.802575,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
8,500,3.797349,34.39835,3.797349,3.864193,34.87224,3.864193,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
7,500,3.89473,36.72253,3.89473,4.364989,42.750916,4.364989,relu,200,0.5,500,8,8,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>


In [89]:
# Mean and standard deviation of the validation MAE over the configuration-2 runs.
val_mae_c2 = r_cnn2_opt4_c2.data['val_mean_absolute_error']
f"MAE = {val_mae_c2.mean()}, SD = {val_mae_c2.std()}"

'MAE = 3.780572843551636, SD = 0.050619414154958145'

***Configuration 3:***

In [90]:
# Configuration 3: a single setup; batch_size is listed 10 times so the scan trains it 10 times.
p = {
    'filters_first': [10],
    'filters_second': [12],
    'global_pool': [GlobalAveragePooling1D],
    'pooling': [MaxPooling1D],
    'kernel_size': [2],
    'batch_size': [200] * 10,
    'epochs': [400],
    'dropout': [0.5],
    'padding': ['same'],
    'loss': ['mae'],
    'activation': ['relu'],
}

In [91]:
# Stability check for configuration 3: repeated training runs of the same setup.
h_cnn2_opt4_c3 = ta.Scan(
    x,
    y,
    params=p,
    model=get_cnn_2l_optimize_model,
    experiment_name='1',
    seed=random_seed,
    val_split=0.33,
)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:16<00:00,  7.40s/it]


In [92]:
# Wrap the configuration-3 runs so their results are available as `r_cnn2_opt4_c3.data`.
r_cnn2_opt4_c3 = ta.Reporting(h_cnn2_opt4_c3)

In [93]:
# Persist the configuration-3 runs, lowest validation MAE first.
(
    r_cnn2_opt4_c3.data
    .sort_values('val_mean_absolute_error')
    .to_excel(f'{output_folder}cnn2_opt4_c3.xlsx')
)

In [94]:
# Show the configuration-3 runs ordered by validation MAE.
(
    r_cnn2_opt4_c3.data
    .sort_values('val_mean_absolute_error')
    .head(20)
)

Unnamed: 0,round_epochs,val_loss,val_mean_squared_error,val_mean_absolute_error,loss,mean_squared_error,mean_absolute_error,activation,batch_size,dropout,epochs,filters_first,filters_second,global_pool,kernel_size,loss.1,padding,pooling
0,400,3.700188,32.668411,3.700188,3.649286,32.675564,3.649286,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
6,400,3.704947,31.993299,3.704947,3.545169,31.408194,3.545169,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
5,400,3.714174,31.933207,3.714174,3.324685,29.715143,3.324685,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
1,400,3.727131,33.173229,3.727131,3.438814,28.634687,3.438814,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
8,400,3.735837,33.47961,3.735837,3.802303,35.158085,3.802303,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
9,400,3.755699,32.398083,3.755699,3.521478,32.818443,3.521478,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
4,400,3.769074,33.469788,3.769074,3.986706,34.180003,3.986706,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
2,400,3.772059,34.081093,3.772059,3.741108,33.769911,3.741108,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
7,400,3.774521,33.920952,3.774521,3.942487,33.767837,3.942487,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>
3,400,3.780445,33.447956,3.780445,3.686458,33.487981,3.686458,relu,200,0.5,400,10,12,<class 'keras.layers.pooling.GlobalAveragePool...,2,mae,same,<class 'keras.layers.pooling.MaxPooling1D'>


In [95]:
# Mean and standard deviation of the validation MAE over the configuration-3 runs.
val_mae_c3 = r_cnn2_opt4_c3.data['val_mean_absolute_error']
f"MAE = {val_mae_c3.mean()}, SD = {val_mae_c3.std()}"

'MAE = 3.743407416343689, SD = 0.03072201819838168'

**Conclusion**

Configuration no. 3 provides the most promising results (the lowest average MAE and the lowest variance)