<a href="https://colab.research.google.com/github/mateopolancecofficial/WineReviews/blob/main/RNNWordEmbeddingsRegressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone -l -s https://github.com/mateopolancecofficial/WineReviews.git

Cloning into 'WineReviews'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 22 (delta 9), reused 11 (delta 3), pack-reused 0[K
Unpacking objects: 100% (22/22), done.


In [2]:
pip install -q -U tensorflow-text

[K     |████████████████████████████████| 3.4MB 11.3MB/s 
[?25h

In [3]:
pip install -q -U keras-tuner

[?25l[K     |█████▏                          | 10kB 20.0MB/s eta 0:00:01[K     |██████████▍                     | 20kB 22.1MB/s eta 0:00:01[K     |███████████████▋                | 30kB 16.4MB/s eta 0:00:01[K     |████████████████████▉           | 40kB 14.5MB/s eta 0:00:01[K     |██████████████████████████      | 51kB 12.2MB/s eta 0:00:01[K     |███████████████████████████████▎| 61kB 13.6MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 6.4MB/s 
[?25h  Building wheel for keras-tuner (setup.py) ... [?25l[?25hdone
  Building wheel for terminaltables (setup.py) ... [?25l[?25hdone


In [7]:
pip install -q -U tf-models-official

[K     |████████████████████████████████| 1.1MB 10.9MB/s 
[K     |████████████████████████████████| 358kB 36.3MB/s 
[K     |████████████████████████████████| 174kB 38.9MB/s 
[K     |████████████████████████████████| 706kB 37.1MB/s 
[K     |████████████████████████████████| 51kB 6.8MB/s 
[K     |████████████████████████████████| 37.6MB 77kB/s 
[K     |████████████████████████████████| 102kB 11.5MB/s 
[K     |████████████████████████████████| 1.2MB 38.5MB/s 
[K     |████████████████████████████████| 645kB 38.6MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone


## DNN regressors

 In this notebook we will use three different ML models. Two of them will use word embeddings for input while the last one wil use TF-IDF features

 First implementation
   - transform input data into TF-IDF features and fed into ML model

Second implementation
   - create dense feature representation vector
   - use trainable embeding layer which will create word embeddings
   - use RNN layer to predict sentence embeddings based on given word embeddings
   - add few Dense and Dropout layers
  
Third implementation
   - use BERT preprocessing and encoder models for creating sentence embeddings
   - add few Dense and Dropout layers

All models will be fine tuned and trained with best parameters.

Finally, R-squared and mean absolute error will be visualize for each model.

In [8]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text
from tensorflow import keras
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Input
)
import kerastuner as kt
from official.nlp import optimization
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

tf.get_logger().setLevel('ERROR')
os.environ["TFHUB_MODEL_LOAD_FORMAT"]="UNCOMPRESSED"


In [9]:
import os

if tf.test.is_gpu_available():
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')

Using GPU


### Load dataset and set global parameters

Before text encoding and generating worde embeddings we need to load dataset, make train, validation and test splits and load it into tensorflow dataset.

In [16]:
# set parameters
AUTOTUNE = tf.data.AUTOTUNE
path_v1 = "/content/WineReviews/Data/winemag-data-130k-v2.csv" 
path_v2 = "/content/WineReviews/Data/winemag-data_first150k.csv"
batch_size = 2048
col_idx = 0
train_size, test_size, val_size = 0.8, 0.2, 0.2
transform = 'normalize'
columns = ['description', 'points']

In [17]:
def load_data(path_v1: str, path_v2: str, columns: list, col_idx: int):
  """
  Load and concatenate two datasets with removing duplicates.
  param path_v1: import path of first dataset
  param path_v2: import path of second dataset
  param columns: list of columns to preserve in dataframe
  param col_idx: index of column given in input columns list 
                 on which look for duplicates in dataframe
  return:        pandas dataframe
  """
  
  df_v1 = pd.read_csv(path_v1, index_col=0)
  df_v1 = df_v1[columns]
  # remove numbers form column description from first dataframe
  df_v1.description = df_v1.description.str.replace('\d+', '')

  df_v2 = pd.read_csv(path_v2, index_col=0)
  df_v2 = df_v2[columns]
  # remove numbers form column description from second dataframe
  df_v2.description = df_v2.description.str.replace('\d+', '')

  df = pd.concat([df_v1, df_v2])

  # dropping duplicte values
  df.drop_duplicates(subset = columns[col_idx],
                       keep = 'first', inplace = True)
  
  return df

In [18]:
def split_data(df: pd.DataFrame, train_size: float, test_size: float, 
               val_size: float):
  """
  Split dataset on train, test and validation subsets.
  param df:          input dataframe
  param train_size:  fraction of train size
  param test_size:   fraction of test size
  param val_size:    fraction of validation size
  return:            dictionary, keys=names of dataframes, columns=dataframes
  """
  
  # shuffle dataset
  df = df.sample(frac = 1)
  
  # split on test and train set
  text_train, text_test, y_train, y_test = train_test_split(df.description, df.points,
                                           test_size=test_size, train_size=train_size)
  
  y_train, y_test = y_train.astype('float'), y_test.astype('float')
  
  # split train set on train and validation subsets
  text_train, text_val, y_train, y_val = train_test_split(text_train, y_train,
                                                  test_size=val_size, train_size=train_size)
  
  y_train, y_val = y_train.astype('float'), y_val.astype('float')

  y_train = y_train.to_numpy().reshape(-1, 1)
  y_val = y_val.to_numpy().reshape(-1, 1)
  y_test = y_test.to_numpy().reshape(-1, 1)

  # apply target variable transformation
  if transform == 'normalize':
    scaler = MinMaxScaler()
    scaler.fit(y_train)
    y_train = scaler.transform(y_train)
    y_val = scaler.transform(y_val)
    y_test = scaler.transform(y_test)
    
  elif transform == 'standardize':
    scaler = StandardScaler()
    scaler.fit(y_train)
    y_train = scaler.transform(y_train)
    y_val = scaler.transform(y_val)
    y_test = scaler.transform(y_test)
    
  else:
    scaler = MinMaxScaler()
    scaler.fit(y_train)
    y_train = scaler.transform(y_train)
    y_val = scaler.transform(y_val)
    y_test = scaler.transform(y_test)
  
  y_train, y_val, y_test = y_train.ravel(), y_val.ravel(), y_test.ravel()

  dataset_dict = {
      'text_train': text_train,
      'y_train': y_train,
      'text_val': text_val,
      'y_val': y_val,
      'text_test': text_test,
      'y_test': y_test,
      'scaler': scaler
  }

  return dataset_dict

In [19]:
def create_input_datasets(df_data: dict):
  """
  Create tensorflow datasets based on input dataframes for train, validation 
  and test subsets.
  param df_data: dictionary, keys=names of dataframes, columns=dataframes
  return:        dictionary, keys=names of datasets, columns=datasets
  """

  # create train dataset for input in tensorflow model
  train_dataset = tf.data.Dataset.from_tensor_slices((df_data['text_train'], 
                                                      df_data['y_train']))
  train_dataset = train_dataset.batch(batch_size)
  train_ds = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)

  # create validation dataset for input in tensorflow model
  val_dataset = tf.data.Dataset.from_tensor_slices((df_data['text_val'], 
                                                    df_data['y_val']))
  val_dataset = val_dataset.batch(batch_size)
  val_ds = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)

  # create validation dataset for input in tensorflow model
  test_dataset = tf.data.Dataset.from_tensor_slices((df_data['text_test'], 
                                                     df_data['y_test']))
  test_dataset = test_dataset.batch(batch_size)
  test_ds = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)

  datasets = {
      'train_dataset': train_dataset,
      'val_dataset': val_dataset,
      'test_dataset': test_dataset
  } 

  return datasets

In [22]:
# call data transformation functions
df = load_data(path_v1, path_v2, columns, col_idx)
df_data = split_data(df, train_size, test_size, val_size)
datasets = create_input_datasets(df_data)

In [24]:
for example, label in datasets['train_dataset'].take(1):
  print('label: ', label.numpy())

label:  [0.5  0.55 0.4  ... 0.45 0.35 0.45]


In [None]:
# define loss functions
from tensorflow.keras import backend as K

def rmse():
  def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 
  return root_mean_squared_error

def rmsle():
  def root_mean_squared_log_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.log(1+y_pred) - K.log(1+y_true))))
  return root_mean_squared_log_error

### RNN

In [26]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(datasets['train_dataset'].map(lambda text, label: text))

In [36]:
# disable eager execution
tf.config.run_functions_eagerly(False)

def rnn_model_builder(hp):
  model = tf.keras.Sequential()
  model.add(encoder)
  model.add(tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True))
  model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,  return_sequences=True)))
  model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
  hp_units_1 = hp.Int('units_1', min_value=64, max_value=128, step=16)
  activation=hp.Choice(
        'dense_activation',
        values=['relu', 'tanh', 'sigmoid'],
        default='relu'
    )
  model.add(Dense(units=hp_units_1, activation=activation))
  model.add(
            Dropout(rate=hp.Float(
                'dropout_1',
                min_value=0.1,
                max_value=0.5,
                default=0.25,
                step=0.1,
            ))
        )
  hp_units_2 = hp.Int('units_2', min_value=8, max_value=64, step=16)
  model.add(Dense(units=hp_units_2, activation=activation))
  model.add(
            Dropout(rate=hp.Float(
                'dropout_2',
                min_value=0.1,
                max_value=0.5,
                default=0.25,
                step=0.1,
            ))
        )
  model.add(Dense(1))

  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
              loss=rmse(),
              metrics=['mean_absolute_error'])

  return model

In [37]:
# define early stop callback to prevent overfitting
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=15)

In [40]:
tuner = kt.Hyperband(rnn_model_builder,
                     objective='mean_absolute_error',
                     max_epochs=5,
                     directory='RNN'
                    )

In [None]:
tuner.search(df_data['text_train'], df_data['y_train'], 
             validation_data=(df_data['text_val'], df_data['y_val']), 
             epochs=5, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 9 Complete [00h 26m 55s]
mean_absolute_error: 0.07307424396276474

Best mean_absolute_error So Far: 0.0705166757106781
Total elapsed time: 02h 21m 17s

Search: Running Trial #10

Hyperparameter    |Value             |Best Value So Far 
units_1           |64                |112               
dense_activation  |tanh              |tanh              
dropout_1         |0.4               |0.3               
units_2           |8                 |40                
dropout_2         |0.5               |0.2               
learning_rate     |0.01              |0.001             
tuner/epochs      |5                 |5                 
tuner/initial_e...|0                 |2                 
tuner/bracket     |0                 |1                 
tuner/round       |0                 |1                 

Epoch 1/5
Epoch 2/5
Epoch 3/5

In [None]:
model_rnn = tuner.hypermodel.build(best_hps)

In [33]:
# checkpoints callback is not defined because a lack of disk space on Google Colab
history_rnn = model_rnn.fit(datasets['train_dataset'],
                    validation_data=datasets['val_dataset'],
                    epochs=100,
                    callbacks=[stop_early]
                    )



### BERT

BERT model will be loaded from TensorFlow Hub and fine-tuned. There are multiple BERT models available.

We will use Small BERT which have the same general architecture but fewer and/or smaller Transformer blocks than BERT Base.

Text inputs need to be transformed to numeric token ids and arranged in several Tensors before being input to BERT. TensorFlow Hub provides a matching preprocessing model for each of the BERT available.

In [None]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [None]:
# disable eager execution
tf.config.run_functions_eagerly(False)

def bert_model_builder(hp):
  """ Use and fine tune BERT model for regression task. """

  text_input = Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = Dropout(rate=hp.Float(
                'dropout_1',
                min_value=0.1,
                max_value=0.5,
                default=0.25,
                step=0.1,
            ))(net)
  net = Dense(units=1, activation=None)(net)
  
  model = tf.keras.Model(text_input, net)

  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=rmse(),
                metrics=['mean_absolute_error'])

  return model

In [None]:
bert_tuner = kt.Hyperband(bert_model_builder,
                          objective='mean_absolute_error',
                          max_epochs=5,
                          directory='BERT'
                         )

In [None]:
bert_tuner.search(df_data['text_train'], df_data['y_train'], 
                  validation_data=(df_data['text_val'], df_data['y_val']), 
                  epochs=5, callbacks=[stop_early])

# Get the optimal hyperparameters
bert_best_hps = bert_tuner.get_best_hyperparameters(num_trials=1)[0]

In [None]:
model_bert = bert_tuner.hypermodel.build(bert_best_hps)

history_bert = model_bert.fit(datasets['train_dataset'],
                              validation_data=datasets['val_dataset'],
                              epochs=100,
                              callbacks=[stop_early]
                             )

### Visualize scores

Get predictions of all models for points target variable.

In [None]:
y_real = df_data['scaler'].inverse_transform(df_data['y_test'].reshape(-1, 1))

rnn_result = model_rnn.predict(df_data['text_test'])
y_predict_rnn = df_data['scaler'].inverse_transform(rnn_result.reshape(-1, 1))

bert_result = model_bert.predict(df_data['text_test'])
y_predict_bert = df_data['scaler'].inverse_transform(bert_result.reshape(-1, 1))

In [None]:
estimators = {'RNN': 
              {
                  'mae': mean_absolute_error(y_real, y_predict_rnn),
                  'r2_score': r2_score(y_real, y_predict_rnn),
                  'y_predict': y_predict_rnn
              },
              'BERT': 
              {
                  'mae': mean_absolute_error(y_real, y_predict_bert),
                  'r2_score': r2_score(y_real, y_predict_bert),
                  'y_predict': y_predict_bert
              }
             }

In [34]:
from sklearn.metrics import mean_absolute_error

rnn_result = model_rnn.predict(df_data['text_test'])
y_real = df_data['scaler'].inverse_transform(df_data['y_test'].reshape(-1, 1))
y_predict_rnn = df_data['scaler'].inverse_transform(rnn_result.reshape(-1, 1))

mean_absolute_error(y_real, y_predict_rnn)

Test Loss: 0.10764910280704498
Test Accuracy: 0.08494479954242706


In [None]:
from sklearn.metrics import r2_score

r2_score(y_real, y_predict_rnn)

In [None]:
def plot_regression_results(ax, y_true, y_pred, title, scores):
    """Scatter plot of the predicted vs true targets. """
    
    ax.plot([y_true.min(), y_true.max()],
            [y_true.min(), y_true.max()],
            '--r', linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlim([y_true.min(), y_true.max()])
    ax.set_ylim([y_true.min(), y_true.max()])
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    extra = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
                          edgecolor='none', linewidth=0)
    ax.legend([extra], [scores], loc='upper left')
    title = title
    ax.set_title(title)

In [None]:
fig, axs = plt.subplots(1, 1, figsize=(9, 7))
axs = np.ravel(axs)

for ax, name in zip(axs, list(estimators.keys())):
    
    plot_regression_results(
        ax, y_real, estimators[name]['y_predict'],
        name,
        (r'r2_score={:.2f}' + '\n' + r'mae={:.2f}')
        .format(estimators[name]['r2_score'],
                estimators[name]['mae']))

plt.suptitle('Predictors comparison ')
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()