# Regularization hyperparameter tuning
Due to limited computing resources, let's be satisfied for course tuning.
### List of hyperparameters
- drop_out rate

# Load libaries

In [1]:
from freq_utils import *

import regex as re
import IPython

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

from tensorflow.keras.utils import to_categorical
from collections import Counter

import keras_tuner as kt

from joblib import dump, load

import time


pd.options.display.max_colwidth = 200

# Load dataset

In [2]:
df0 = pd.read_csv('data/TrueOrganized.csv')
df1 = pd.read_csv('data/FakeOrganized.csv')
df0['label'] = 0
df1['label'] = 1

# Make word encoding items
- Make dictionaries
- Get max_len

### Pretrained word embeddings
- Word to index
- Word to vector

In [3]:
word_to_index, index_to_word, word_to_vector = get_pretrained_embedding()

### PoS tag encodings
- PoS word to index

In [4]:
df = pd.concat([df0.cleaned_pos, df1.cleaned_pos])

pos_set = set()
for x in list(df.str.lower().str.split()):
    pos_set.update(x)

pos_list = list(pos_set)
pos_to_index = { pos_list[i]: i for i in range(len(pos_list)) }

print(len(pos_to_index),pos_to_index)

39 {'vbz': 0, 'vb': 1, 'nns': 2, "''": 3, 'wp': 4, 'ex': 5, 'md': 6, 'vbd': 7, 'uh': 8, 'nnps': 9, 'fw': 10, 'dt': 11, 'jjs': 12, 'jj': 13, 'cd': 14, 'nn': 15, 'sym': 16, 'to': 17, 'prp$': 18, 'wdt': 19, 'rbr': 20, 'rb': 21, 'wrb': 22, 'cc': 23, 'vbp': 24, '(': 25, ',': 26, 'prp': 27, 'vbg': 28, ')': 29, 'jjr': 30, 'in': 31, ':': 32, 'vbn': 33, 'pos': 34, '.': 35, 'wp$': 36, 'nnp': 37, 'rbs': 38}


### Get max_len

In [5]:
xcol_names = df0.columns[:-1].to_list()
input_dict = {}
for x in xcol_names:
    print(x,'\t', df0[x].str.split().str.len().max(), df1[x].str.split().str.len().max())
    input_dict[x]=df1[x].str.split().str.len().max()
print(input_dict)

org_title 	 20 42
lower_title 	 20 42
cleaned_words 	 24 49
cleaned_pos 	 24 49
minimal_words 	 15 35
{'org_title': 42, 'lower_title': 42, 'cleaned_words': 49, 'cleaned_pos': 49, 'minimal_words': 35}


# Model layer segment functions

In [6]:
def input_encoder(X, trainable = True):
    
    # X input can be either text or PoS vectors
    # dim=2 for text -> word index input (m, max_len)
    # dim=3 for PoS -> one-hot encoding input (m, max_len, num_cat)
    dim = len(X.get_shape().as_list())
    
    if dim==2:
        # Word embedding, indices to vector
        # Output: (m, max_len, emb_dim)
        X = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)(X)             
    elif dim==3:
        # Int to float for One-hot encoding
        # Output: (m, max_len, num_cat)
        X = tf.dtypes.cast(X, tf.float32)
    else:
        print('Wrong input shape:', X.get_shape())
        
    # Skip zero vector words
    X = tfl.Masking(mask_value=0.)(X)    
    
    return X

def ml_builder(X, hp, best_hps, ml_type):

    # Hyperparameters
    drop_out = hp.Choice('drop_out', values = [0.2, 0.4, 0.6]) 
    
    if ml_type=='FNN':
        
        # Take average of a sentence
        max_len = X.shape[1]
        X_avg = [ X[:,i,:] for i in range(max_len) ]
        X = tf.keras.layers.Average()(X_avg)    
        
        n_units = [best_hps.values['n_unit1'], best_hps.values['n_unit2'], 16]
        
        # Linear+ReLu layer
        for n_unit in n_units:
            X = tfl.Dense(units = n_unit, activation='relu', kernel_initializer='he_normal')(X)
            X = tfl.Dropout(rate = drop_out)(X)  
            X = tfl.BatchNormalization()(X)
    
        
    elif ml_type=='LSTM':
        
        n_unit = best_hps.values['n_unit']
        re_drop_out = hp.Choice('re_drop_out', values = [0.2, 0.4, 0.6]) 
        
        # Output: a[l] (m, max_len, # hidden unit), batch of sequences
        X = tfl.LSTM(units = n_unit, dropout = drop_out, recurrent_dropout=re_drop_out, return_sequences= True)(X)
        X = tfl.LSTM(units = n_unit, dropout = drop_out, recurrent_dropout=re_drop_out, return_sequences= True)(X)
        # Output: a[l] (m, # hidden unit)
        X = tfl.LSTM(units = n_unit, dropout = drop_out, recurrent_dropout=re_drop_out, return_sequences= False)(X)

    else:
        print('Wrong ml_type:',ml_type)
        
            
    # Linear+Softmax layer
    # Output: y (m, # classes=2), probability of each class
    X = tfl.Dense(units = 2, activation='softmax')(X)
    
    return X

def ml_optimizer(model, hp, best_hps):
    
    # Hyperparameter
    learning_rate = best_hps.values['learning_rate']

    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)                                
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    return model

# Hyperparameter Tuner

In [7]:
class MyHyperModel(kt.HyperModel):
    def __init__(self, input_shape, best_hps, ml_type='LSTM', trainable=True):
        
        self.input_shape = input_shape
        self.ml_type = ml_type
        self.trainable = trainable
        self.best_hps = best_hps
    
    def build(self, hp):

        # X_oh (m, max_len, num_cat)
        # X_indices (m, max_len)
        X_input = tfl.Input(shape=self.input_shape, dtype='int32')

        X = input_encoder(X_input, trainable = self.trainable)
        X = ml_builder(X, hp, best_hps=self.best_hps, ml_type=self.ml_type)

        model = tf.keras.models.Model(inputs=X_input, outputs=X)
        model = ml_optimizer(model, hp, best_hps=self.best_hps)

        return model
    
    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            batch_size=self.best_hps.values['batch_size'],
            shuffle=True,
            **kwargs,
        )

# Generate inputs of the model

In [8]:
train, dev, test = train_dev_test_split([df0, df1], m=2000, class_column='label', 
                                    class_balance=True, r_dev=0.2, r_test=0.2, rand_state=42)

def make_input(xcol_name):

    max_len = input_dict[xcol_name]
    use_embeddings = True
    if xcol_name == 'cleaned_pos':
        use_embeddings=False

    w2i = False
    w2v = False
    X_shape = False

    # Embedding or One-hot encoding
    if use_embeddings:
        w2i = word_to_index        
        w2v = word_to_vector
        X_shape = (max_len, )
    else:
        w2i = pos_to_index
        X_shape = (max_len, len(pos_to_index))

    _, _, X_train_indices, _, Y_train_oh = dataframe_to_arrays(train, w2i, max_len, Xname=xcol_name)
    _, _, X_dev_indices,   _, Y_dev_oh   = dataframe_to_arrays(dev, w2i, max_len, Xname=xcol_name)
    index, _, X_test_indices, Y_test, _  = dataframe_to_arrays(test, w2i, max_len, Xname=xcol_name)

    # X, Y (train, dev, test)
    X = False
    if use_embeddings:
        X = [X_train_indices, X_dev_indices, X_test_indices]
    else:
        X = [to_categorical(X_train_indices, num_classes=len(pos_to_index)), 
             to_categorical(X_dev_indices, num_classes=len(pos_to_index)), 
             to_categorical(X_test_indices, num_classes=len(pos_to_index))]            

    Y = [Y_train_oh, Y_dev_oh, Y_test]
    
    print('input column:',xcol_name)
    print('max_len:',max_len)
    print('use_embeddings?:',use_embeddings)
    print(np.shape(X[0]), X_shape)
    print(np.shape(Y[0]))
    
    return X, Y, X_shape



# Hyperparameter search

In [9]:
class ClearTrainingOutput(tf.keras.callbacks.Callback):
    def on_train_end(*args, **kwargs):
        IPython.display.clear_output(wait = True)

durations=[]
        
for ml_type in ('LSTM','FNN'):
    for xcol_name in xcol_names:
        
        #if not (ml_type=='FNN' and xcol_name=='cleaned_pos'):
        #    continue
        
        begin_time = time.time()
        
        X, Y, X_shape = make_input(xcol_name)
        
        dir_name = ml_type+'_'+xcol_name
        save_name = 'data/'+dir_name+'_best_hps'
        best_hps=load(save_name)
        
        print(best_hps.values)
        
        tuner = kt.Hyperband(MyHyperModel(X_shape, best_hps, ml_type=ml_type, trainable=True),
                             objective = 'val_accuracy', 
                             max_epochs = 100,
                             factor = 3,
                             overwrite = True)

        tuner.search(X[0], Y[0],
                     validation_data=(X[1], Y[1]),
                     epochs=100,
                     callbacks=[ClearTrainingOutput(), tf.keras.callbacks.EarlyStopping(patience=5)])

        
        best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]

        save_name = 'data/'+dir_name+'_best_hps_reg'
        dump(best_hps,save_name)
        
        end_time = time.time()
        
        durations.append([ml_type,xcol_name,(end_time-begin_time)/60,'min.'])

Trial 3 Complete [00h 00m 21s]
val_accuracy: 0.8974999785423279

Best val_accuracy So Far: 0.9125000238418579
Total elapsed time: 00h 01m 05s
INFO:tensorflow:Oracle triggered exit


In [10]:
print(durations)

[['LSTM', 'org_title', 7.839323854446411, 'min.'], ['LSTM', 'lower_title', 8.104822671413421, 'min.'], ['LSTM', 'cleaned_words', 7.997315100828806, 'min.'], ['LSTM', 'cleaned_pos', 11.177371033032735, 'min.'], ['LSTM', 'minimal_words', 6.915489494800568, 'min.'], ['FNN', 'org_title', 0.752882166703542, 'min.'], ['FNN', 'lower_title', 1.1186068177223205, 'min.'], ['FNN', 'cleaned_words', 1.1331665476163229, 'min.'], ['FNN', 'cleaned_pos', 0.11688451369603475, 'min.'], ['FNN', 'minimal_words', 1.1110594630241395, 'min.']]
