# TASK 1
Riconoscimento review positiva o negativa

In [1]:
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding
from sklearn.naive_bayes import MultinomialNB

from libraries.dataset import Dataset

import tensorflow.keras as keras

import libraries.preprocessing_utils as prep_utils
import libraries.models_builders as models_builders

import tensorflow as tf
import keras_tuner as kt

import constants as const

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\aless_vzq3wiu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aless_vzq3wiu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aless_vzq3wiu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data retrieving and preprocessing

In [2]:
# Build the project-local Dataset wrapper for the 'review' data / 'sentiment' task.
review_data = Dataset('review', 'sentiment')
# Split using the 'text' column as input and 'sentiment' as target.
# n_samples=50_000: per the original note, 50_000 elements for each class.
review_data.split(['text'], 'sentiment', n_samples=50_000)

Reading ./data_100_000/balanced_review_sentiment_train.csv...
File loaded in 0.0391 minutes
Reading ./data_100_000/balanced_review_sentiment_val.csv...
File loaded in 0.0006 minutes
Reading ./data_100_000/balanced_review_sentiment_test.csv...
File loaded in 0.0005 minutes


In [None]:
# Preprocess text -> list[str].
# The tokenizer is obtained from the training texts only and then reused to
# tokenize every split (train / test / val) for task1.
tokenizer = prep_utils.get_tokenizer(review_data.train_data[0]['text'])

train_tokens = prep_utils.get_set_tokens(
    review_data.train_data[0]['text'],
    tokenizer,
    set='train',
    task='task1',
)

test_tokens = prep_utils.get_set_tokens(
    review_data.test_data[0]['text'],
    tokenizer,
    set='test',
    task='task1',
)

val_tokens = prep_utils.get_set_tokens(
    review_data.val_data[0]['text'],
    tokenizer,
    set='val',
    task='task1',
)

## Naive bayes

Let's use a different version of the texts: the cleaned sentences.

In [3]:
import libraries.filenames_generator as filenames  
# 
from sklearn.feature_extraction.text import CountVectorizer

# Cleaned sentences for each split. The `path` argument points at the
# per-split pickle file for task1 (the captured output shows the data being
# loaded from those pickles when they already exist).
prep_train_data = prep_utils.preprocess_texts(
    review_data.train_data[0]['text'],
    path=filenames.picked_cleaned_sentences("train", "task1"),
)

prep_test_data = prep_utils.preprocess_texts(
    review_data.test_data[0]['text'],
    path=filenames.picked_cleaned_sentences("test", "task1"),
)

prep_val_data = prep_utils.preprocess_texts(
    review_data.val_data[0]['text'],
    path=filenames.picked_cleaned_sentences("val", "task1"),
)


Loading pickled cleaned sentences data from ./data_100_000/pickled/task1_train_cleaned_sentences.pkl...
Loading pickled cleaned sentences data from ./data_100_000/pickled/task1_test_cleaned_sentences.pkl...
Loading pickled cleaned sentences data from ./data_100_000/pickled/task1_val_cleaned_sentences.pkl...


In [None]:

# Bag-of-words features for the Naive Bayes baseline, fitted on the first
# 30_000 cleaned training sentences (the labels passed to fit() must use the
# same slice).
vectorizer = CountVectorizer()

# Keep the document-term matrix sparse: MultinomialNB accepts scipy sparse
# input directly, whereas .toarray() would allocate a dense
# (30_000 x vocabulary size) array for no benefit.
nb_train_data = vectorizer.fit_transform(prep_train_data[:30_000])

In [None]:
# Multinomial Naive Bayes baseline trained on the bag-of-words counts.
nb_model = MultinomialNB()
# Labels are sliced to the same first 30_000 rows used to build nb_train_data.
nb_model.fit(nb_train_data, review_data.train_data[1][:30_000])

In [None]:
# Vectorize the test sentences with the vocabulary already fitted on the
# training data (transform only, no re-fit) and convert the result to a dense array.
nb_test_data = vectorizer.transform(prep_test_data).toarray()

# Mean accuracy of the classifier on the test split (shown as the cell output).
nb_model.score(nb_test_data, review_data.test_data[1])

In [None]:
# Sanity check on a single test example: print the raw review, its true label
# and the Naive Bayes prediction.
i = 0
# predict() returns an array with one entry per input row; a single row is
# passed here, so the scalar prediction is its first element. Using the scalar
# avoids printing the array repr and relying on the truthiness of an ndarray.
res = nb_model.predict(nb_test_data[i].reshape(1, -1))
predicted = res[0]

# NOTE(review): assumes a truthy label means 'positive' -- confirm against the
# label encoding of the dataset.
print(f'''
REVIEW:
{review_data.test_data[0]['text'][i]}

REAL SENTIMENT: {review_data.test_data[1][i]} 
PREDICTED SENTIMENT: {predicted} - {'positive' if predicted else 'negative'}''')


## RNN - LSTM

In [None]:
# Embedding matrix for task1 built from the word-embedding file configured in
# `const`. The last argument is the tokenizer's index size plus one
# (presumably the number of rows of the matrix -- verify against the helper).
e_matrix = prep_utils.get_embedding_matrix(
    const.word_embedding_filepath,
    'task1',
    tokenizer,
    len(tokenizer.index_word) + 1,
)

In [None]:
# Dimensionality of each word vector (must agree with the width of e_matrix).
word_vector_dim = 100

# One extra slot on top of the tokenizer vocabulary
# (presumably index 0 is reserved for padding -- TODO confirm).
vocab_size = len(tokenizer.word_index) + 1
# Length of the longest tokenized training sequence.
max_length = max(map(len, train_tokens))

# Frozen embedding layer initialised with the pre-computed embedding matrix.
embedding_layer = Embedding(
    vocab_size,
    word_vector_dim,
    embeddings_initializer=Constant(e_matrix),
    trainable=False,
)

Let's define the function that returns the hypermodel with a specific hyperparameter search space.

Hyperparameters:
- number of units
- dropout (yes/no) in order to prevent overfitting
- learning rate

NOTE:
The cell state and the hidden state are vectors with a fixed dimension (the `units` parameter).

In [None]:
# Shared callback: stop a training run once val_accuracy has not improved
# for 15 consecutive epochs.
stop_early_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=15,
)

### First hyperparams trial

In [None]:
# First random search: small LSTM sizes, Adam optimizer.
# project_name names both the tuner results folder and the TensorBoard log dir.
project_name = "task1_lstm_adam_128"

# Hypermodel builder with the search space for dropout, units and learning rate.
builder = models_builders.get_rnn_builder(
    drop=[0.2, 0.5],
    units=[15, 20, 50, 80],
    lrate=[0.01, 0.001],
    optimizer=keras.optimizers.Adam,
    embedding_layer=embedding_layer,
)

# Random search over the space above, ranked by validation accuracy.
tuner = kt.RandomSearch(
    builder,
    objective='val_accuracy',
    max_trials=10,
    directory=const.tuner_path,
    project_name=project_name,
)

tuner.search_space_summary()

In [None]:
# Run the search. epochs=1000 is only an upper bound: each run is cut short by
# the early-stopping callback. Per-epoch metrics are written for TensorBoard.
tuner.search(
    train_tokens,
    review_data.train_data[1],
    batch_size=128,
    epochs=1000,
    validation_data=(val_tokens, review_data.val_data[1]),
    callbacks=[
        stop_early_cb,
        tf.keras.callbacks.TensorBoard(const.logs_path + project_name, update_freq='epoch'),
    ],
    verbose=0,
)

#  executed

### Second hyperparameters trial

In [None]:
# Second random search: larger LSTM sizes, a wider learning-rate range and a
# smaller batch size (64).
project_name = "task1_lstm_adam_64_new"

# Hypermodel builder with the search space for dropout, units and learning rate.
builder = models_builders.get_rnn_builder(
    drop=[0.2, 0.5],
    units=[100, 150],
    lrate=[0.1, 0.01, 0.001],
    optimizer=keras.optimizers.Adam,
    embedding_layer=embedding_layer,
)

# Pass the project_name variable (as the first trial does) so the tuner results
# folder and the TensorBoard log dir refer to the same run. The original
# hard-coded "task1_lstm_adam_64" here while logging under
# "task1_lstm_adam_64_new".
tuner1 = kt.RandomSearch(
    builder,
    objective='val_accuracy',
    max_trials=10,
    directory=const.tuner_path,
    project_name=project_name,
)

tuner1.search_space_summary()

# executed
# epochs=1000 is only an upper bound: each run is cut short by early stopping.
tuner1.search(
    train_tokens,
    review_data.train_data[1],
    batch_size=64,
    epochs=1000,
    validation_data=(val_tokens, review_data.val_data[1]),
    callbacks=[
        stop_early_cb,
        tf.keras.callbacks.TensorBoard(const.logs_path + project_name, update_freq='epoch'),
    ],
    verbose=0,
)



In [None]:
# # Get the optimal hyperparameters from the results
# best_hps=tuner.get_best_hyperparameters()[0]

# # Build model
# h_model = tuner.hypermodel.build(best_hps)

# # Train the hypertuned model
# h_model.fit(train_tokens, review_data.train_data[1], epochs=1000, validation_data=(val_tokens, review_data.val_data[1]), callbacks=[stop_early_cb])

In [None]:
# best_model = tuner.get_best_models()[0]

# # tuner.results_summary()
# best_model.summary()

## BERT transformer

In [4]:
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization

In [5]:
#  load models from tf hub, small Bert chosen
handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [6]:
# NOTE: hub.KerasLayer wraps a SavedModel (downloaded from TF Hub) in a Keras layer.

def build_classifier_model():
    """Assemble the BERT-based binary classifier as a Keras functional model.

    Pipeline: raw string input -> TF Hub preprocessing layer -> trainable BERT
    encoder -> dropout (0.1) -> single-unit dense layer without activation,
    so the model emits one raw score per example.

    Returns:
        A tf.keras.Model mapping the string input named 'text' to the output
        of the 'classifier' dense layer.
    """
    # Symbolic string tensor standing in for the input; required to build the
    # Keras model in the first place.
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Preprocessing layer: its output on the raw sentences is what the
    # encoder consumes.
    preprocess = hub.KerasLayer(handle_preprocess, name='preprocessing')
    bert_inputs = preprocess(raw_text)

    # Encoder layer, created with trainable=True.
    bert_encoder = hub.KerasLayer(handle_encoder, trainable=True, name='BERT_encoder')
    bert_outputs = bert_encoder(bert_inputs)

    # Classification head: only the pooled output of the encoder is used,
    # followed by dropout and the output layer.
    pooled = bert_outputs['pooled_output']
    dropped = tf.keras.layers.Dropout(0.1)(pooled)
    scores = tf.keras.layers.Dense(1, activation=None, name='classifier')(dropped)

    return tf.keras.Model(raw_text, scores)

In [7]:
# Instantiate the BERT-based classifier (downloads/loads the TF Hub models).
model = build_classifier_model()

In [8]:
# The classifier head has no activation, so the model outputs raw logits and
# the loss must be told so.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# BinaryAccuracy thresholds the raw model outputs. On logits the decision
# boundary (sigmoid(x) == 0.5) is x == 0.0, not the default threshold of 0.5,
# which would count logits in (0, 0.5] as negative predictions.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

In [9]:
#  prepariamo i dati
train_df = tf.data.Dataset.from_tensor_slices((prep_train_data[:1000], review_data.train_data[1][:1000]))
val_df = tf.data.Dataset.from_tensor_slices((prep_val_data[:10], review_data.val_data[1][:10]))

In [10]:
epochs = 5
# Must match the batch size used when fitting (model.fit uses train_df.batch(32)).
batch_size = 32

# The optimizer takes one step per *batch*, so the number of steps per epoch
# is the cardinality of the batched dataset. Counting the unbatched examples
# (1000) instead of the batches (32) would make the warmup alone (500 steps)
# longer than the whole training run (160 steps), so the learning rate would
# never finish warming up and never decay.
steps_per_epoch = tf.data.experimental.cardinality(train_df.batch(batch_size)).numpy()
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of the training steps.
num_warmup_steps = int(0.1 * num_train_steps)

init_lr = 3e-5
# AdamW with learning-rate warmup over num_warmup_steps; num_train_steps is
# the total number of steps the schedule is built for.
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')
                                          

In [12]:
# Peek at the first (text, label) pair of the training dataset.
for text_sample, label_sample in train_df.take(1):
    print(text_sample)
    print(label_sample)

tf.Tensor(b'yellow dog eats cool funky cafe lot crazy funky food went first time visiting friend orlando area past november looking casual place grab lunch couple beer came radar say yelp app menu large offer lot sandwich option many slathered lot delicious unhealthy thing almost went lighter dish thought hell vacation got fig sandwich pulled pork goat cheese bacon fried onion house made jalapeno fig sauce let tell thing fricken yuge massive piled pulled pork super messy sticky bc cheese sauce could barely eat yet somehow managed girl mess around come good food nice selection draft beer wash yummy sammies including one local fave reef donkey tampa bay brewing friend couple yuenglings definitely place would go diet pretty awesome atmosphere part hippy part hipster floridian sweet outdoor patio imagine size two bar likely get pretty busy weekend food counter service go place order call name pick counter service pretty friendly would definitely go back area', shape=(), dtype=string)
tf.Te

In [11]:
# Attach the optimizer, loss and metric defined in the previous cells.
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [12]:
# Fine-tune the model: training batches of 32 examples, validation run one
# example at a time.
history = model.fit(
    x=train_df.batch(32),
    validation_data=val_df.batch(1),
    epochs=epochs,
)

Epoch 1/5
Epoch 2/5
 7/32 [=====>........................] - ETA: 3:41 - loss: 0.7139 - binary_accuracy: 0.5402

KeyboardInterrupt: 