# TASK 1
Recognizing whether a review is positive or negative (sentiment classification).

In [4]:
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding
from sklearn.naive_bayes import MultinomialNB

from libraries.dataset import Dataset

import tensorflow.keras as keras

import libraries.preprocessing_utils as prep_utils
import libraries.models_builders as models_builders

import tensorflow as tf
import keras_tuner as kt

import constants as const

## Data retrieving and preprocessing

In [5]:
# Create the dataset wrapper for the 'review' data with the 'sentiment' target.
review_data = Dataset('review', 'sentiment')
# Split the data, passing ['text'] and 'sentiment' (presumably the feature
# columns and the label column — confirm in libraries.dataset).
# NOTE(review): the original note read "50_000 for element"; n_samples=50_000 is
# presumably the number of samples kept per element (class or split) — TODO confirm.
review_data.split(['text'], 'sentiment', n_samples=50_000)

Reading ./data_100_000/balanced_review_sentiment_train.csv...
File loaded in 0.03 minutes
Reading ./data_100_000/balanced_review_sentiment_val.csv...
File loaded in 0.0 minutes
Reading ./data_100_000/balanced_review_sentiment_test.csv...
File loaded in 0.0 minutes


In [None]:
# Fit the tokenizer on the training texts only, then tokenize the three splits
# with that same tokenizer (call order: train, test, val).
# NOTE(review): `set` and `task` are presumably used by get_set_tokens to pick a
# per-split cache file — confirm in libraries.preprocessing_utils.
tokenizer = prep_utils.get_tokenizer(review_data.train_data[0]['text'])

train_tokens, test_tokens, val_tokens = (
    prep_utils.get_set_tokens(
        split_data[0]['text'], tokenizer, set=split_name, task='task1')
    for split_name, split_data in (
        ('train', review_data.train_data),
        ('test', review_data.test_data),
        ('val', review_data.val_data),
    )
)

## Naive bayes

Let's use a different representation of the texts: the cleaned sentences, vectorized as bag-of-words counts.

In [6]:
from libraries import utils

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer

# Cleaned training sentences for the Naive Bayes baseline; only the first
# 50_000 are kept (the labels are sliced the same way when the model is fitted).
prep_train_data = prep_utils.preprocess_texts(
    review_data.train_data[0]['text'],
    path=utils.cleaned_sentences_file_name("train", "task1"))[:50_000]

# Cleaned test sentences (not truncated).
prep_test_data = prep_utils.preprocess_texts(
    review_data.test_data[0]['text'],
    path=utils.cleaned_sentences_file_name("test", "task1"))

# Bag-of-words counts; the vocabulary is learned from the training sentences only.
vectorizer = CountVectorizer()

# Keep the document-term matrix sparse. Calling .toarray() would allocate a
# dense (n_documents x vocabulary_size) array, which can take many gigabytes
# for 50_000 reviews, and MultinomialNB accepts sparse input directly.
nb_train_data = vectorizer.fit_transform(prep_train_data)

Loading pickled cleaned sentences data from ./data_100_000/pickled/task1_train_cleaned_sentences.pkl...
Loading pickled cleaned sentences data from ./data_100_000/pickled/task1_test_cleaned_sentences.pkl...


In [7]:
# Multinomial Naive Bayes baseline trained on the CountVectorizer features.
nb_model = MultinomialNB()
# The labels are truncated to the first 50_000 so they stay aligned with
# prep_train_data, which is sliced with the same [:50_000].
nb_model.fit(nb_train_data, review_data.train_data[1][:50_000])

MultinomialNB()

In [22]:
# Vectorize the test sentences with the vocabulary fitted on the training set.
# The result is kept sparse: a dense copy via .toarray() only costs memory, and
# both score() and predict() of MultinomialNB accept sparse matrices.
nb_test_data = vectorizer.transform(prep_test_data)

# Mean accuracy of the baseline on the test split.
nb_model.score(nb_test_data, review_data.test_data[1])

0.849

In [32]:
# Index of the test review to inspect.
i = 0
# predict() expects a 2-D input, so the selected row is reshaped to (1, n_features).
res = nb_model.predict(nb_test_data[i].reshape(1, -1))

# `res` holds a single predicted label, so its truthiness is that of the label.
if res:
    predicted_label = 'positive'
else:
    predicted_label = 'negative'

print(f'''
REVIEW:
{review_data.test_data[0]['text'][i]}

REAL SENTIMENT: {review_data.test_data[1][i]} 
PREDICTED SENTIMENT: {res} - {predicted_label}''')



REVIEW:
Being a Chinese, I have been to a fair share of Chinese restaurants all over the lower mainland for dim sum with my family and friends.  By far this place has been the ABSOLUTE WORST one!  The service at this place is absolutely horrendous!  Took forever for the food to come out and they just literally throw it onto your table.  The servers have such bad attitude and treated us with hatred and distaste.  After repeatedly asking for extra sauce, we had to go to their stations to get extra soy sauce ourselves.  Everything turned to self-serve, even refilling our own tea!! The servers would treat their regulars with all smiles but completely ignored my table.  I would absolutely never come back here again!  Would not recommend this place to anyone I know!

REAL SENTIMENT: 0.0 
PREDICTED SENTIMENT: [0.] - negative


## RNN - LSTM

In [23]:
# Embedding matrix for the 'task1' tokenizer vocabulary, built from the word
# vectors at const.word_embedding_filepath (the helper may load a cached copy).
# The last argument is the vocabulary size plus one.
# NOTE(review): the +1 presumably accounts for index 0 being reserved for
# padding, and index_word is presumably the inverse of word_index, so this
# matches vocab_size used for the Embedding layer — TODO confirm.
e_matrix = prep_utils.get_embedding_matrix(const.word_embedding_filepath, 'task1',
                                            tokenizer, len(tokenizer.index_word)+1)

Loading pickled embedding matrix from ./data_100_000/embedding/task1_embedding_matrix.npy...
...embedding matrix loaded


In [24]:
# Size of each word vector.
# NOTE(review): presumably must equal the second dimension of e_matrix — confirm.
word_vector_dim = 100

# Number of known words plus one extra index.
vocab_size = len(tokenizer.word_index) + 1
# Length of the longest tokenized training sequence.
max_length = max(len(tokens) for tokens in train_tokens)

# Frozen embedding layer whose weights are initialised from e_matrix.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=word_vector_dim,
    embeddings_initializer=Constant(e_matrix),
    trainable=False,
)

Let's define the functions that return the hypermodel, each with a specific hyperparameter search space.

Hyperparameters:
- number of units
- dropout rate (to prevent overfitting)
- learning rate

NOTE:
The cell and hidden states are vectors with a fixed dimension, set by the `units` parameter.

In [25]:
# Custom callbacks shared by the hyperparameter searches below.
# Early stopping: training halts once val_accuracy has not improved for
# 15 consecutive epochs.
stop_early_cb = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=15)

#### First hyperparams trial

In [26]:
# Name of this run; used for the tuner project and for the TensorBoard log folder.
project_name = "task1_lstm_adam_128"

# Hypermodel builder: dropout, number of units and learning rate are the tuned
# hyperparameters; the optimizer class and the embedding layer are fixed.
builder = models_builders.get_rnn_builder(
    drop=[0.2, 0.5], units=[15, 20, 50, 80], lrate=[0.01, 0.001],
    optimizer=keras.optimizers.Adam, embedding_layer=embedding_layer,
)

# Random search over the space above, maximising validation accuracy.
tuner = kt.RandomSearch(
    hypermodel=builder,
    objective='val_accuracy',
    max_trials=10,
    directory=const.tuner_path,
    project_name=project_name,
)

tuner.search_space_summary()

Search space summary
Default search space size: 3
dropout (Choice)
{'default': 0.2, 'conditions': [], 'values': [0.2, 0.5], 'ordered': True}
units (Choice)
{'default': 15, 'conditions': [], 'values': [15, 20, 50, 80], 'ordered': True}
lr (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001], 'ordered': True}


In [None]:
# Validation split used to score every trial.
validation_set = (val_tokens, review_data.val_data[1])
# Per-epoch TensorBoard logs, written under a folder named after the run.
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    const.logs_path + project_name, update_freq='epoch')

# epochs=1000 is only an upper bound: stop_early_cb ends each trial earlier.
tuner.search(
    train_tokens, review_data.train_data[1],
    batch_size=128, epochs=1000,
    validation_data=validation_set,
    callbacks=[stop_early_cb, tensorboard_cb],
    verbose=0,
)

#  executed

#### Second hyperparams trial

In [27]:
# Name of this run. It is the single source of truth for both the tuner project
# folder and the TensorBoard log folder, as in the first trial.
# NOTE(review): the tuner previously hard-coded "task1_lstm_adam_64" while the
# logs went to project_name. If the intent is to resume that earlier tuner
# project, set project_name to "task1_lstm_adam_64" here instead.
project_name = "task1_lstm_adam_64_new"

# Second search space: larger layers and a wider range of learning rates.
builder = models_builders.get_rnn_builder(
    drop=[0.2, 0.5],
    units=[100, 150],
    lrate=[0.1, 0.01, 0.001],
    optimizer=keras.optimizers.Adam,
    embedding_layer=embedding_layer)

# Random search over the space above, maximising validation accuracy.
tuner1 = kt.RandomSearch(
    builder,
    objective='val_accuracy',
    max_trials=10,
    directory=const.tuner_path, project_name=project_name
)

tuner1.search_space_summary()

# executed
# epochs=1000 is only an upper bound: stop_early_cb ends each trial earlier.
tuner1.search(train_tokens, review_data.train_data[1],
              batch_size=64, epochs=1000,
              validation_data=(val_tokens, review_data.val_data[1]),
              callbacks=[
                  stop_early_cb,
                  tf.keras.callbacks.TensorBoard(const.logs_path + project_name, update_freq='epoch')],
              verbose=0)



Search space summary
Default search space size: 3
dropout (Choice)
{'default': 0.2, 'conditions': [], 'values': [0.2, 0.5], 'ordered': True}
units (Choice)
{'default': 100, 'conditions': [], 'values': [100, 150], 'ordered': True}
lr (Choice)
{'default': 0.1, 'conditions': [], 'values': [0.1, 0.01, 0.001], 'ordered': True}


In [None]:
# # Get the optimal hyperparameters from the results
# best_hps=tuner.get_best_hyperparameters()[0]

# # Build model
# h_model = tuner.hypermodel.build(best_hps)

# # Train the hypertuned model
# h_model.fit(train_tokens, review_data.train_data[1], epochs=1000, validation_data=(val_tokens, review_data.val_data[1]), callbacks=[stop_early_cb])

In [None]:
# best_model = tuner.get_best_models()[0]

# # tuner.results_summary()
# best_model.summary()