<a href="https://colab.research.google.com/github/mateopolancecofficial/NLP/blob/main/WineReviews/SequenceTextModelWithPretrainedEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install -q -U tensorflow-text

[?25l[K     |                                | 10kB 23.6MB/s eta 0:00:01[K     |▏                               | 20kB 21.4MB/s eta 0:00:01[K     |▎                               | 30kB 16.5MB/s eta 0:00:01[K     |▍                               | 40kB 14.3MB/s eta 0:00:01[K     |▌                               | 51kB 8.5MB/s eta 0:00:01[K     |▋                               | 61kB 9.2MB/s eta 0:00:01[K     |▊                               | 71kB 9.3MB/s eta 0:00:01[K     |▉                               | 81kB 10.4MB/s eta 0:00:01[K     |▉                               | 92kB 9.7MB/s eta 0:00:01[K     |█                               | 102kB 8.1MB/s eta 0:00:01[K     |█                               | 112kB 8.1MB/s eta 0:00:01[K     |█▏                              | 122kB 8.1MB/s eta 0:00:01[K     |█▎                              | 133kB 8.1MB/s eta 0:00:01[K     |█▍                              | 143kB 8.1MB/s eta 0:00:01[K     |█▌                    

In [2]:
pip install -q -U tf-models-official

[K     |████████████████████████████████| 1.1MB 8.7MB/s 
[K     |████████████████████████████████| 358kB 33.2MB/s 
[K     |████████████████████████████████| 102kB 10.0MB/s 
[K     |████████████████████████████████| 37.6MB 74kB/s 
[K     |████████████████████████████████| 1.2MB 51.9MB/s 
[K     |████████████████████████████████| 174kB 54.4MB/s 
[K     |████████████████████████████████| 51kB 8.3MB/s 
[K     |████████████████████████████████| 706kB 39.1MB/s 
[K     |████████████████████████████████| 645kB 54.6MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
pip install -q -U keras-tuner

[?25l[K     |█████▏                          | 10kB 25.4MB/s eta 0:00:01[K     |██████████▍                     | 20kB 19.3MB/s eta 0:00:01[K     |███████████████▋                | 30kB 15.2MB/s eta 0:00:01[K     |████████████████████▉           | 40kB 12.0MB/s eta 0:00:01[K     |██████████████████████████      | 51kB 8.8MB/s eta 0:00:01[K     |███████████████████████████████▎| 61kB 9.0MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 5.5MB/s 
[?25h  Building wheel for keras-tuner (setup.py) ... [?25l[?25hdone
  Building wheel for terminaltables (setup.py) ... [?25l[?25hdone


In [4]:
!git clone -l -s https://github.com/mateopolancecofficial/NLP.git

Cloning into 'NLP'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 64 (delta 29), reused 25 (delta 6), pack-reused 0[K
Unpacking objects: 100% (64/64), done.


In [35]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras.layers import (
    Dense,
    Dropout,
)
import kerastuner as kt
from official.nlp import optimization
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

tf.get_logger().setLevel('ERROR')
os.environ["TFHUB_MODEL_LOAD_FORMAT"]="UNCOMPRESSED"

In [13]:
import os

if tf.test.is_gpu_available():
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')

Using GPU


In [30]:
# set parameters
AUTOTUNE = tf.data.AUTOTUNE
path_v1 = "/content/NLP/WineReviews/data/winemag-data-130k-v2.csv" 
path_v2 = "/content/NLP/WineReviews/data/winemag-data_first150k.csv"
checkpoint_path = "/content/NLP/WineReviews/checkpoints/cp-{epoch:04d}.ckpt"
batch_size = 1024
seed = 42
col_idx = 0
train_size, test_size, val_size = 0.9, 0.1, 0.1
columns = ['description', 'points']

In [15]:
def load_data(path_v1: str, path_v2: str, columns: list, col_idx: int):
  """
  Load and concatenate two datasets with removing duplicates.
  param path_v1: import path of first dataset
  param path_v2: import path of second dataset
  param columns: list of columns to preserve in dataframe
  param col_idx: index of column given in input columns list 
                 on which look for duplicates in dataframe
  return:        pandas dataframe
  """
  
  df_v1 = pd.read_csv(path_v1, index_col=0)
  df_v1 = df_v1[columns]
  # remove numbers form column description from first dataframe
  df_v1.description = df_v1.description.str.replace('\d+', '')

  df_v2 = pd.read_csv(path_v2, index_col=0)
  df_v2 = df_v2[columns]
  # remove numbers form column description from second dataframe
  df_v2.description = df_v2.description.str.replace('\d+', '')

  df = pd.concat([df_v1, df_v2])

  # dropping duplicte values
  df.drop_duplicates(subset = columns[col_idx],
                       keep = 'first', inplace = True)
  
  return df

In [16]:
def split_data(df: pd.DataFrame, train_size: float, test_size: float, val_size: float):
  """
  Split dataset on train, test and validation subsets.
  param df:          input dataframe
  param train_size:  fraction of train size
  param test_size:   fraction of test size
  param val_size:    fraction of validation size
  return:            dictionary, keys=names of dataframes, columns=dataframes
  """
  
  # shuffle dataset
  df = df.sample(frac = 1)
  
  # split on test and train set
  text_train, text_test, y_train, y_test = train_test_split(df.description, df.points,
                                           test_size=test_size, train_size=train_size)
  
  y_train, y_test = y_train.astype('float'), y_test.astype('float')
  
  # split train set on train and validation subsets
  text_train, text_val, y_train, y_val = train_test_split(text_train, y_train,
                                                  test_size=val_size, train_size=train_size)
  
  y_train, y_val = y_train.astype('float'), y_val.astype('float')
  

  dataset_dict = {
      'text_train': text_train,
      'y_train': y_train,
      'text_val': text_val,
      'y_val': y_val,
      'text_test': text_test,
      'y_test': y_test
  }

  return dataset_dict

In [17]:
def create_input_datasets(df_data: dict):
  """
  Create tensorflow datasets based on input dataframes for train, validation 
  and test subsets.
  param df_data: dictionary, keys=names of dataframes, columns=dataframes
  return:        dictionary, keys=names of datasets, columns=datasets
  """

  # create train dataset for input in tensorflow model
  train_dataset = tf.data.Dataset.from_tensor_slices((df_data['text_train'], 
                                                      df_data['y_train']))
  train_dataset = train_dataset.batch(batch_size)
  train_ds = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)

  # create validation dataset for input in tensorflow model
  val_dataset = tf.data.Dataset.from_tensor_slices((df_data['text_val'], 
                                                    df_data['y_val']))
  val_dataset = val_dataset.batch(batch_size)
  val_ds = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)

  # create validation dataset for input in tensorflow model
  test_dataset = tf.data.Dataset.from_tensor_slices((df_data['text_test'], 
                                                     df_data['y_test']))
  test_dataset = test_dataset.batch(batch_size)
  test_ds = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)

  datasets = {
      'train_dataset': train_dataset,
      'val_dataset': val_dataset,
      'test_dataset': test_dataset
  } 

  return datasets

In [18]:
# define loss functions
from tensorflow.keras import backend as K

def rmse():
  def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 
  return root_mean_squared_error

def rmsle():
  def root_mean_squared_log_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.log(1+y_pred) - K.log(1+y_true))))
  return root_mean_squared_log_error

In [36]:
# disable eager execution
tf.config.run_functions_eagerly(False)

def model_builder(hp):
  # use pretrained embeddings for input layer
  hub_model = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
  # 'trainable=True' - boolean controlling whether this layer is trainable
  hub_layer = hub.KerasLayer(hub_model, input_shape=[], dtype=tf.string, 
                             trainable=True)
  model = tf.keras.Sequential()
  model.add(hub_layer)
  hp_units_1 = hp.Int('units_1', min_value=64, max_value=128, step=16)
  activation=hp.Choice(
        'dense_activation',
        values=['relu', 'tanh', 'sigmoid'],
        default='relu'
    )
  model.add(Dense(units=hp_units_1, activation=activation))
  model.add(
            Dropout(rate=hp.Float(
                'dropout_1',
                min_value=0.1,
                max_value=0.5,
                default=0.25,
                step=0.1,
            ))
        )
  hp_units_2 = hp.Int('units_2', min_value=8, max_value=64, step=16)
  model.add(Dense(units=hp_units_2, activation=activation))
  model.add(
            Dropout(rate=hp.Float(
                'dropout_2',
                min_value=0.1,
                max_value=0.5,
                default=0.25,
                step=0.1,
            ))
        )
  model.add(Dense(1))

  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
              loss=rmse(),
              metrics=['mean_absolute_error'])

  return model


In [20]:
# call data transformation functions
df = load_data(path_v1, path_v2, columns, col_idx)
df_data = split_data(df, train_size, test_size, val_size)
datasets = create_input_datasets(df_data)

In [37]:
tuner = kt.Hyperband(model_builder,
                     objective='mean_absolute_error',
                     max_epochs=10,
                     directory='SequenceTextPretrained'
                    )
                    

In [None]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

tuner.search(df_data['text_train'], df_data['y_train'], 
             validation_data=(df_data['text_val'], df_data['y_val']), 
             epochs=50, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
best_hps.get('units')

Trial 5 Complete [00h 10m 34s]
mean_absolute_error: 12.956561088562012

Best mean_absolute_error So Far: 5.738969326019287
Total elapsed time: 00h 52m 45s

Search: Running Trial #6

Hyperparameter    |Value             |Best Value So Far 
units             |80                |96                
units_1           |80                |112               
dense_activation  |sigmoid           |tanh              
dropout_1         |0.2               |0.3               
units_2           |8                 |24                
dropout_2         |0.4               |0.2               
learning_rate     |0.01              |0.01              
tuner/epochs      |2                 |2                 
tuner/initial_e...|0                 |0                 
tuner/bracket     |2                 |2                 
tuner/round       |0                 |0                 

Epoch 1/2
Epoch 2/2
 278/4289 [>.............................] - ETA: 4:43 - loss: 18.4942 - mean_absolute_error: 14.7847

In [None]:
model = tuner.hypermodel.build(best_hps)

In [None]:
history = model.fit(x=train_ds,
                    validation_data=val_ds,
                    epochs=50,
                    callbacks=[es_callback]
                    )