# TASK 1
Riconoscimento review positiva o negativa

In [5]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras import Sequential
from sklearn.naive_bayes import MultinomialNB
import tensorflow.keras as keras

from libraries.dataset import Dataset

import libraries.preprocessing_utils as prep_utils
import libraries.models_builders as models
import tensorflow as tf
import keras_tuner as kt

# import libraries.utils as utils
import constants as const
import os

## Data retrieving

In [3]:
# Build the project Dataset wrapper for the 'review' data labelled by
# 'sentiment', then split it using the 'text' column as input and
# 'sentiment' as target.
# NOTE(review): n_samples presumably caps the number of rows used — confirm
# against libraries.dataset.
FEATURE_COLUMNS = ['text']
TARGET_COLUMN = 'sentiment'
N_SAMPLES = 500_000

review_data = Dataset('review', TARGET_COLUMN)
review_data.split(FEATURE_COLUMNS, TARGET_COLUMN, n_samples=N_SAMPLES)

Reading ./data/balanced_review_sentiment_train.csv...
File loaded in 0.39 minutes
Reading ./data/balanced_review_sentiment_val.csv...
File loaded in 0.01 minutes
Reading ./data/balanced_review_sentiment_test.csv...
File loaded in 0.01 minutes


## Models
### Naive bayes

## RNN

In [4]:
# Fit the tokenizer on the training text only, then tokenize each split
# (train, test, validation) with that same tokenizer.
train_text = review_data.train_data[0]['text']
tokenizer = prep_utils.get_tokenizer(train_text)
train_tokens = prep_utils.get_set_tokens(
    train_text, tokenizer, set='train', task='task1')

test_text = review_data.test_data[0]['text']
test_tokens = prep_utils.get_set_tokens(
    test_text, tokenizer, set='test', task='task1')

val_text = review_data.val_data[0]['text']
val_tokens = prep_utils.get_set_tokens(
    val_text, tokenizer, set='val', task='task1')


In [6]:
# Build the embedding matrix from the pretrained word vectors.
# Only words seen by the tokenizer (i.e. in the training sentences) get a row.
# One extra row is requested on top of the tokenizer vocabulary
# (presumably for the reserved index 0 — confirm in preprocessing_utils).
n_embedding_rows = len(tokenizer.index_word) + 1

e_matrix = prep_utils.get_embedding_matrix(
    const.word_embedding_filepath,
    'task1',
    tokenizer,
    n_embedding_rows,
)

Loading pickled embedding matrix from ./data/embedding/task1_embedding_matrix.npy...
...embedding matrix loaded


In [7]:
# Sizes used by the embedding layer below.
embedding_size = 32
word_vector_dim = 100

# Tokenizer vocabulary size plus one extra index.
vocab_size = 1 + len(tokenizer.word_index)
# Length of the longest tokenized training sequence.
max_length = max(len(sequence) for sequence in train_tokens)

# Frozen embedding layer initialised from the pretrained matrix: its weights
# are not updated during training (trainable=False).
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=word_vector_dim,
    embeddings_initializer=Constant(e_matrix),
    trainable=False,
)

Let's define the function that builds the hypermodel, together with its hyperparameter search space.

Hyperparameters:
- number of units in the LSTM layer
- dropout rate of the LSTM layer (0.2 or 0.5), used to prevent overfitting
- learning rate of the Adam optimizer

NOTE:
The LSTM cell state and hidden state are vectors whose dimension is set by the `units` parameter.

In [12]:
import tensorflow.keras as keras

# Stop a training run once validation accuracy has gone 10 epochs
# without improving.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=10,
)

def build_rnn_model(hp):
    """Build and compile the LSTM classifier for one tuner trial.

    Hyperparameters registered on ``hp``:
      - ``dropout``: 0.2 or 0.5, passed as the LSTM layer's ``dropout``
      - ``units``: 15, 20 or 50, the number of LSTM units
      - ``lr``: 0.01 or 0.001, the Adam learning rate

    The notebook-level ``embedding_layer`` is used as the first layer, so
    every model built here shares that same layer instance.

    Args:
        hp: hyperparameter container supplied by the tuner.

    Returns:
        A compiled ``Sequential`` model (embedding -> LSTM -> 1-unit sigmoid
        Dense) using binary cross-entropy loss and the accuracy metric.
    """
    # Sample this trial's values (registration order: dropout, units, lr).
    dropout_rate = hp.Choice("dropout", [0.2, 0.5])
    n_lstm_units = hp.Choice("units", [15, 20, 50])
    learning_rate = hp.Choice("lr", [0.01, 0.001])

    network = Sequential([
        embedding_layer,
        LSTM(n_lstm_units, dropout=dropout_rate),
        Dense(1, activation='sigmoid'),
    ])

    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network


# Random search over the space declared in build_rnn_model, ranked by
# validation accuracy. overwrite=True discards results of any previous search
# stored in the same project directory.
tuner = kt.RandomSearch(
    build_rnn_model,
    objective='val_accuracy',
    max_trials=2,
    overwrite=True,
)

tuner.search_space_summary()

# Each trial calls build_rnn_model with a freshly sampled set of
# hyperparameter values, fits the resulting model, evaluates it and records
# the metrics; the tuner keeps track of the best set seen so far.
#
# The search runs on a small slice only: the first 50 training samples and
# the first 500 validation samples, for a single epoch per trial.
# NOTE(review): with epochs=1 the early-stopping patience of 10 can never
# trigger during the search.
search_inputs = train_tokens[:50]
search_labels = review_data.train_data[1][:50]
search_validation = (val_tokens[:500], review_data.val_data[1][:500])

tuner.search(
    search_inputs,
    search_labels,
    batch_size=128,
    epochs=1,
    validation_data=search_validation,
    callbacks=[stop_early, keras.callbacks.TensorBoard("./logs")],
)


Search space summary
Default search space size: 3
dropout (Choice)
{'default': 0.2, 'conditions': [], 'values': [0.2, 0.5], 'ordered': True}
units (Choice)
{'default': 15, 'conditions': [], 'values': [15, 20, 50], 'ordered': True}
lr (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001], 'ordered': True}

Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
dropout           |0.2               |?                 
units             |50                |?                 
lr                |0.01              |?                 



In [2]:
%load_ext tensorboard
%tensorboard --logdir="./untitled_project/"

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 13456), started 0:10:15 ago. (Use '!kill 13456' to kill it.)

In [32]:
# Get the best hyperparameter set found by the search.
best_hps = tuner.get_best_hyperparameters()[0]

# Stop once validation accuracy has not improved for 10 epochs.
# restore_best_weights=True rolls the model back to the epoch with the best
# val_accuracy; without it the model keeps the weights from the last epoch
# run, i.e. 10 epochs past the best one.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True,
)

# Build a fresh (untrained) model with the best hyperparameters.
h_model = tuner.hypermodel.build(best_hps)

# Train it on the first 5000 training samples, validating on the first 2000
# validation samples. epochs=500 is only an upper bound: early stopping ends
# the run. The History object is kept so the curves can be inspected later.
h_history = h_model.fit(
    train_tokens[:5000],
    review_data.train_data[1][:5000],
    epochs=500,
    validation_data=(val_tokens[:2000], review_data.val_data[1][:2000]),
    callbacks=[stop_early],
)


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500


<tensorflow.python.keras.callbacks.History at 0x192fb734640>

In [35]:
# Fetch the top-ranked model from the tuner search and print its architecture.
# NOTE(review): this is the model produced by the search itself, not the
# h_model built via tuner.hypermodel.build — confirm which one is intended.
# tuner.results_summary() can be used instead to list the per-trial results.
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         27857000  
_________________________________________________________________
lstm (LSTM)                  (None, 20)                9680      
_________________________________________________________________
dense (Dense)                (None, 1)                 21        
Total params: 27,866,701
Trainable params: 9,701
Non-trainable params: 27,857,000
_________________________________________________________________


In [25]:
#  TODO
# Hand-configured baseline: frozen pretrained embedding -> LSTM(15) ->
# Dropout -> sigmoid output, trained with RMSprop.
embedding_size = 32
batch_size = 128
word_vector_dim = 100

# Tokenizer vocabulary size plus one extra index.
vocab_size = 1 + len(tokenizer.word_index)
# Length of the longest tokenized training sequence.
max_length = max(len(sequence) for sequence in train_tokens)

# This rebinds the notebook-level embedding_layer that build_rnn_model reads.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=word_vector_dim,
    embeddings_initializer=Constant(e_matrix),
    trainable=False,
)

model = Sequential([
    embedding_layer,
    LSTM(15, dropout=0.5),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         27857000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 15)                6960      
_________________________________________________________________
dropout (Dropout)            (None, 15)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 16        
Total params: 27,863,976
Trainable params: 6,976
Non-trainable params: 27,857,000
_________________________________________________________________


In [26]:
# Train the baseline model on the whole training split for 5 epochs,
# evaluating on the whole validation split after each epoch.
history = model.fit(
    train_tokens,
    review_data.train_data[1],
    batch_size=batch_size,
    epochs=5,
    validation_data=(val_tokens, review_data.val_data[1]),
)

Epoch 1/5
Epoch 2/5