# TASK 1
Sentiment classification: recognize whether a review is positive or negative.

In [1]:
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras import Sequential

import tensorflow.keras as keras

from libraries.dataset import Dataset

import libraries.preprocessing_utils as prep_utils
import tensorflow as tf
import keras_tuner as kt

import constants as const

## Data retrieving

In [2]:
# Dataset wrapper for the 'review' / 'sentiment' data.
review_data = Dataset('review', 'sentiment')

# Build the train / validation / test splits with 'text' as the only input
# column and 'sentiment' as the target.
# n_samples=50_000 is presumably the number of samples drawn per class
# (the files are read from ./data_100_000/) — TODO confirm in libraries.dataset.
review_data.split(
    ['text'],
    'sentiment',
    n_samples=50_000,
)

Reading ./data_100_000/balanced_review_sentiment_train.csv...
File loaded in 0.03 minutes
Reading ./data_100_000/balanced_review_sentiment_val.csv...
File loaded in 0.0 minutes
Reading ./data_100_000/balanced_review_sentiment_test.csv...
File loaded in 0.0 minutes


## Models
### Naive bayes

In [3]:
# TODO: implement the Naive Bayes baseline classifier

## RNN - LSTM

In [4]:
# The tokenizer is built from the training text only; the same tokenizer is
# then applied to every split.
tokenizer = prep_utils.get_tokenizer(review_data.train_data[0]['text'])


def _tokenize_split(split_data, split_name):
    """Tokenize the 'text' column of one split with the shared tokenizer.

    `split_data` is one of review_data.{train,test,val}_data, whose first
    element holds the input columns. `split_name` is forwarded as the `set`
    argument of `get_set_tokens` (presumably used for caching the result per
    split — TODO confirm in libraries.preprocessing_utils).
    """
    return prep_utils.get_set_tokens(
        split_data[0]['text'], tokenizer, set=split_name, task='task1')


train_tokens = _tokenize_split(review_data.train_data, 'train')
test_tokens = _tokenize_split(review_data.test_data, 'test')
val_tokens = _tokenize_split(review_data.val_data, 'val')

In [5]:
# Number of rows requested for the embedding matrix: one per word known to
# the tokenizer plus one extra row (presumably index 0, reserved for
# padding — TODO confirm).
embedding_rows = len(tokenizer.index_word) + 1

# Embedding matrix for task1, built from (or loaded from a cached copy of)
# the word vectors at const.word_embedding_filepath.
e_matrix = prep_utils.get_embedding_matrix(
    const.word_embedding_filepath,
    'task1',
    tokenizer,
    embedding_rows,
)

Loading pickled embedding matrix from ./data_100_000/embedding/task1_embedding_matrix.npy...
...embedding matrix loaded


In [6]:
# Size of each word vector; it has to agree with the width of `e_matrix`
# (assumed to hold 100-dimensional vectors — TODO confirm).
word_vector_dim = 100

# One embedding row per word known to the tokenizer, plus one extra row.
vocab_size = 1 + len(tokenizer.word_index)
# Length of the longest tokenized training sequence.
max_length = max(map(len, train_tokens))

# Frozen embedding layer: weights are initialised from `e_matrix` and are
# not updated during training (trainable=False).
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=word_vector_dim,
    embeddings_initializer=Constant(e_matrix),
    trainable=False,
)

Let's define the function that builds the hypermodel, together with its hyperparameter search space.

Hyperparameters:
- number of LSTM units
- dropout rate (0.2 or 0.5), used to reduce overfitting
- learning rate

NOTE:
The cell state and the hidden state of the LSTM are vectors whose dimension is set by the `units` parameter.

In [7]:
def rnn_builder(hp):
    """Build and compile the LSTM sentiment classifier for one tuner trial.

    Args:
        hp: Keras Tuner hyperparameter container. The choices "dropout",
            "units" and "lr" are registered on it, in that order.

    Returns:
        A compiled `Sequential` model made of the module-level
        `embedding_layer`, one LSTM layer and a single sigmoid output unit.
        It is optimised with Adam on binary cross-entropy and reports
        accuracy.
    """
    # Search space.
    dropout_rate = hp.Choice("dropout", [0.2, 0.5])
    n_units = hp.Choice("units", [15, 20, 50, 80])
    learning_rate = hp.Choice("lr", [0.01, 0.001])

    # The same `embedding_layer` instance is reused by every model built
    # here; its weights are frozen, so trials do not affect each other.
    model = Sequential([
        embedding_layer,
        # `dropout` is the LSTM's own dropout on its inputs, not a separate
        # Dropout layer.
        LSTM(n_units, dropout=dropout_rate),
        Dense(1, activation='sigmoid'),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [8]:
# Random search over the space declared in `rnn_builder`.
# For each of at most 10 trials the tuner samples one hyperparameter
# combination, builds the model through `rnn_builder`, fits it and records
# the metrics. Trials are ranked by validation accuracy.
#
# NOTE(review): results are stored under const.tuner_path/task1_lstm. If that
# project already exists, Keras Tuner presumably resumes it instead of
# starting from scratch, which could explain a summary that lists an older
# search space — confirm, and pass overwrite=True if a fresh search is wanted.
tuner = kt.RandomSearch(
    rnn_builder,
    objective='val_accuracy',
    max_trials=10,
    directory=const.tuner_path,
    project_name="task1_lstm",
)

tuner.search_space_summary()

Search space summary
Default search space size: 3
dropout (Choice)
{'default': 0.2, 'conditions': [], 'values': [0.2, 0.5], 'ordered': True}
units (Choice)
{'default': 50, 'conditions': [], 'values': [50, 20, 15], 'ordered': True}
lr (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001], 'ordered': True}


In [9]:
# define custom callbacks
stop_early_cb = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)
tensorboard_cb = tf.keras.callbacks.TensorBoard(f"{const.logs_path}task1_lstm", update_freq='epoch')


In [10]:
# Run the hyperparameter search. `epochs=1000` is only an upper bound: each
# trial is expected to be cut short by the early-stopping callback.
train_labels = review_data.train_data[1]
validation_set = (val_tokens, review_data.val_data[1])

tuner.search(
    train_tokens, train_labels,
    batch_size=128,
    epochs=1000,
    validation_data=validation_set,
    callbacks=[stop_early_cb, tensorboard_cb],
    verbose=0,
)




KeyboardInterrupt: 

In [None]:
# Retrieve the best hyperparameter set found by the search.
best_hps = tuner.get_best_hyperparameters()[0]

# Build a fresh, untrained model from those hyperparameters.
h_model = tuner.hypermodel.build(best_hps)

# Retrain it under the same conditions as the search:
# - the early-stopping callback is `stop_early_cb` (the previous name,
#   `stop_early`, was never defined and raised a NameError);
# - batch_size=128 matches the value used in `tuner.search`, so the tuned
#   hyperparameters are applied in the regime they were selected for
#   (omitting it would silently fall back to Keras' default batch size).
h_model.fit(
    train_tokens, review_data.train_data[1],
    batch_size=128,
    epochs=1000,
    validation_data=(val_tokens, review_data.val_data[1]),
    callbacks=[stop_early_cb],
)

In [None]:
# Fetch the single top-ranked model produced by the search and show its
# architecture.
# (Call tuner.results_summary() instead to inspect the ranking of all trials.)
best_model = tuner.get_best_models(num_models=1)[0]

best_model.summary()