# NLP Task 2-b
In this step you will implement a “vanilla” model of the architecture. 
For this you will need to use PyTorch or the TensorFlow/Keras functional API and various layers
like Input, Embedding, Conv1D, Dropout, MaxPooling1D, Flatten, concatenate, Dense, etc. as well as
other utility functions such as Tokenizer. For some of the parameters, you should consider the values
suggested in Table 1. Please note that those values are “typical” but not necessarily optimal. First, you need to tokenize the texts and cut/pad them to a common max length size. Then you derive the train and test samples and labels. The “vanilla” model should contain the Embedding layer, a single convolution layer of only one block followed by a max-pooling layer and a single dense layer. You will report the classification accuracy of this model. (18 points)


In [1]:
#!pip install -r requirements.txt

In [10]:
import numpy as np
import pandas as pd
import random, sys
import tensorflow
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Conv1D, GlobalMaxPooling1D, MaxPooling1D, AveragePooling1D, Flatten, Concatenate, LeakyReLU, Dropout
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.utils import plot_model 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pydot
import graphviz
# Collect Text 
import keras_tuner as kt

#### Parameters (Recommendation)

W or number of convolution blocks in each layer: 2-5.  
L or number of consecutive convolution-pooling layers: 2-4  
Maximal length of each review sequence: 300-500    
Dimension of word embeddings: 150-300.  
Train : Test split of the data samples: 4:1 or 9:1.  
Number of filters in each convolution layer: 10-50.  
Kernel size in each convolution block: 1-5.  

Here we set up the parameters for our model.

In [11]:
# The vanilla model is built from a single convolution block (W = 1) in a single
# convolution-pooling layer (L = 1), so W and L are not defined as constants here.

# Maximum length of each review sequence after padding/truncation
UNIFORM_LENGTH = 300
# Dimension of the word embeddings
WORD_EMBEDDING_DIM = 150
# Number of filters in each convolution layer
CONV_FILTERS = 10
# Kernel size of each convolution block
KERNEL_SIZE = 1
# Vocabulary size: number of most frequent words
VOCABULARY = 30000
# Pooling window used by the pooling layers to downsample their input
POOL_SIZE = 2
# Training and evaluation settings.
# NOTE(review): the fit/search calls visible in this notebook pass literal epoch
# counts (2 or 5) instead of EPOCHS.
EPOCHS, BATCH_SIZE, VERBOSE = 3, 64, 1

### Loading and preparing data

Loading the pre-processed data and splitting them into train and test sets. Random state is fixed for reproducibility.

In [12]:
# Read the pre-processed reviews from disk.
df = pd.read_csv('review_preprocessed.csv')

# Draw 80 % of the rows at random (fixed random_state) for training; every row
# whose index label was not drawn becomes the test set.
training_data = df.sample(frac=0.8, random_state=25)
testing_data = df.drop(index=training_data.index)

print(f"No. of training examples: {len(training_data)}")
print(f"No. of testing examples: {len(testing_data)}")

No. of training examples: 40000
No. of testing examples: 10000


In [13]:
# Reviews become plain Python lists of texts (input to the tokenizer);
# polarity labels become NumPy arrays.
X_train = training_data['review'].tolist()
X_test = testing_data['review'].tolist()
y_train = training_data['polarity'].to_numpy()
y_test = testing_data['polarity'].to_numpy()

The Tokenizer class helps us vectorize a text corpus by turning each text into a sequence of integers. 

In [14]:
# The word index is fitted on the training reviews only; the test reviews are
# encoded with that same index.
t = Tokenizer(num_words=VOCABULARY)
t.fit_on_texts(X_train)

X_train_enc, X_test_enc = (
    t.texts_to_sequences(texts) for texts in (X_train, X_test)
)

Here we force a uniform length for each review. Longer reviews are truncated and shorter reviews are padded with zeros.

In [15]:
# pad_sequences defaults written out explicitly: zeros are added at the front of
# short reviews and tokens are dropped from the front of long ones.
X_train_pad = pad_sequences(X_train_enc, maxlen=UNIFORM_LENGTH, padding='pre', truncating='pre')
X_test_pad = pad_sequences(X_test_enc, maxlen=UNIFORM_LENGTH, padding='pre', truncating='pre')

# NOTE(review): X_train / X_test are rebound from lists of review texts to the
# padded integer arrays, so the tokenizer cell must not be re-run after this one
# without re-running the conversion cell first.
X_train, X_test = X_train_pad, X_test_pad

## CNN vanilla model with keras tuner

In [9]:
def call_existing_code(VOCABULARY, WORD_EMBEDDING_DIM, UNIFORM_LENGTH, CONV_FILTERS, KERNEL_SIZE):
    """Build and compile the vanilla CNN classifier.

    Architecture: Input -> Embedding -> one Conv1D (ReLU) -> MaxPooling1D ->
    Flatten -> Dense(100, ReLU) -> Dense(1, sigmoid).

    Args:
        VOCABULARY: first argument of the Embedding layer (vocabulary size).
        WORD_EMBEDDING_DIM: second argument of the Embedding layer (embedding size).
        UNIFORM_LENGTH: length of every input sequence.
        CONV_FILTERS: number of filters of the convolution layer.
        KERNEL_SIZE: kernel size of the convolution layer.

    Returns:
        The compiled Keras model (Adam optimizer, binary cross-entropy loss,
        accuracy metric).

    Side effects:
        Writes a diagram of the model to 'model.png' on every call. The pooling
        window is read from the module-level POOL_SIZE.
    """
    token_ids = Input(shape=(UNIFORM_LENGTH,))
    embedded = Embedding(VOCABULARY, WORD_EMBEDDING_DIM, input_length=UNIFORM_LENGTH)(token_ids)

    features = Conv1D(filters=CONV_FILTERS, kernel_size=KERNEL_SIZE, activation='relu')(embedded)
    features = MaxPooling1D(pool_size=POOL_SIZE)(features)
    features = Flatten()(features)

    hidden = Dense(100, activation='relu')(features)
    probability = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=token_ids, outputs=probability)
    plot_model(model, to_file='model.png', show_shapes=True)

    # To tune the learning rate, replace the 'adam' string with an Adam optimizer
    # instance that is given an explicit learning_rate.
    model.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])
    return model


def build_model(hp):
    """Hypermodel function for KerasTuner: build the vanilla CNN for one trial.

    Three hyperparameters are registered on ``hp``, in this order:
    ``output_dim`` (150-300, step 50), ``filters`` (10-50, step 10) and
    ``kernel_size`` (1-5, step 1). The vocabulary size and sequence length are
    not tuned; they are taken from the module-level VOCABULARY and
    UNIFORM_LENGTH constants so the model input always matches the padded data
    (previously they were re-hard-coded here as 30000 and 300).

    Args:
        hp: the hyperparameter container supplied by the tuner.

    Returns:
        The compiled model returned by ``call_existing_code``.
    """
    embedding_dim = hp.Int("output_dim", min_value=150, max_value=300, step=50)
    n_filters = hp.Int("filters", min_value=10, max_value=50, step=10)
    kernel_size = hp.Int("kernel_size", min_value=1, max_value=5, step=1)

    # Reuse the existing model-building code with the sampled values.
    return call_existing_code(
        VOCABULARY=VOCABULARY,
        WORD_EMBEDDING_DIM=embedding_dim,
        UNIFORM_LENGTH=UNIFORM_LENGTH,
        CONV_FILTERS=n_filters,
        KERNEL_SIZE=kernel_size,
    )

# Build the model once with a fresh HyperParameters object as a quick check
# that build_model runs before handing it to the tuner.
build_model(kt.HyperParameters())


<keras.engine.functional.Functional at 0x7fd526057a00>

In [45]:
# Random search over the space declared in build_model: 5 trials, ranked by
# validation accuracy. overwrite=True starts from scratch instead of resuming a
# previous search.
# NOTE(review): no seed is passed, so the sampled trials can differ between runs.
tuner = kt.RandomSearch(
    hypermodel=build_model,
    objective='val_accuracy',
    max_trials=5,
    overwrite=True,
)

tuner.search_space_summary()

Search space summary
Default search space size: 3
output_dim (Int)
{'default': None, 'conditions': [], 'min_value': 150, 'max_value': 300, 'step': 50, 'sampling': None}
filters (Int)
{'default': None, 'conditions': [], 'min_value': 10, 'max_value': 50, 'step': 10, 'sampling': None}
kernel_size (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 5, 'step': 1, 'sampling': None}


In [46]:
# NOTE(review): the test split doubles as the validation set here, so the
# hyperparameters are selected against the same data the accuracy is reported on.
tuner.search(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=BATCH_SIZE,
    epochs=2,
)

# Model of the best-scoring trial.
best_model = tuner.get_best_models()[0]

Trial 5 Complete [00h 02m 28s]
val_accuracy: 0.9053000211715698

Best val_accuracy So Far: 0.9053000211715698
Total elapsed time: 00h 10m 31s
INFO:tensorflow:Oracle triggered exit


In [47]:
# Hyperparameter values of the best trial, reported under the constant name
# each one corresponds to.
best_hps = tuner.get_best_hyperparameters()[0]

for constant_name, hp_name in (
    ("CONV_FILTERS", "filters"),
    ("WORD_EMBEDDING_DIM", "output_dim"),
    ("KERNEL_SIZE", "kernel_size"),
):
    print(f"Optimal parameter for {constant_name}: ", best_hps.get(hp_name))

Optimal parameter for CONV_FILTERS:  30
Optimal parameter for WORD_EMBEDDING_DIM:  200
Optimal parameter for KERNEL_SIZE:  4


## Conclusion of Task 2(b)
The optimal parameters obtained in Task 2(b) were a word embedding dimension of 200, 30 convolution filters, and a kernel size of 4. The embedding dimension and the number of filters are reused in Task 2(c).

# Task 2(c)

In [22]:
def set_elaborate_model(vocabulary, word_embedding_dim, uniform_length, conv_filters, W, L):
    """Build and compile the multi-branch CNN with max pooling.

    The embedded input feeds W parallel branches. Branch ``w`` (0-based) uses
    kernel size ``w + 1`` and stacks L consecutive Conv1D (ReLU) -> MaxPooling1D
    layers. The branch outputs are concatenated along axis 1, flattened and
    passed through Dense(100, ReLU) and Dense(1, sigmoid).

    Args:
        vocabulary: first argument of the Embedding layer (vocabulary size).
        word_embedding_dim: second argument of the Embedding layer (embedding size).
        uniform_length: length of every input sequence.
        conv_filters: number of filters of every convolution layer.
        W: number of parallel convolution branches (>= 1).
        L: number of consecutive convolution-pooling layers per branch (>= 1).

    Returns:
        The compiled Keras model (Adam optimizer, binary cross-entropy loss,
        accuracy metric).

    Raises:
        ValueError: if W or L is smaller than 1.

    Side effects:
        Writes a diagram of the model to 'elaborate_model.png'. The pooling
        window is read from the module-level POOL_SIZE.
    """
    # Without this check L=0 fails later with an unbound-variable error and W=0
    # reaches Concatenate with an empty list.
    if W < 1 or L < 1:
        raise ValueError(f"W and L must both be >= 1, got W={W} and L={L}")

    inputs = Input(shape=(uniform_length,))
    x = Embedding(vocabulary, word_embedding_dim, input_length=uniform_length)(inputs)

    # Every branch starts from the shared embedding output; afterwards branch w
    # keeps consuming its own pooled output from the previous layer.
    branch_outputs = [x] * W
    for _ in range(L):
        conv_outputs = [
            Conv1D(filters=conv_filters, kernel_size=branch + 1, activation='relu')(source)
            for branch, source in enumerate(branch_outputs)
        ]
        branch_outputs = [MaxPooling1D(pool_size=POOL_SIZE)(feat) for feat in conv_outputs]

    # With a single branch there is nothing to join, so it is passed through
    # unchanged (some Keras versions reject a one-element list in Concatenate --
    # TODO confirm for the installed version).
    if W == 1:
        x = branch_outputs[0]
    else:
        x = Concatenate(axis=1)(branch_outputs)

    x = Flatten()(x)
    x = Dense(100, activation='relu')(x)
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)
    plot_model(model, to_file='elaborate_model.png', show_shapes=True)

    # To tune the learning rate, replace the 'adam' string with an Adam optimizer
    # instance that is given an explicit learning_rate.
    model.compile(
        optimizer='adam',
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

# Embedding dimension 200 and 30 filters come from the Task 2(b) tuning result;
# the model uses W=3 branches and L=2 convolution-pooling layers.
elaborate_model = set_elaborate_model(
    vocabulary=VOCABULARY,
    word_embedding_dim=200,
    uniform_length=UNIFORM_LENGTH,
    conv_filters=30,
    W=3,
    L=2,
)

In [23]:
# Train for two epochs; the test split is passed as validation data.
history = elaborate_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=BATCH_SIZE,
)

Epoch 1/2
Epoch 2/2


In [68]:
# Validation accuracy recorded after the last epoch.
final_val_accuracy = history.history['val_accuracy'][-1]
print(final_val_accuracy)

0.8991000056266785


In [25]:
# Grid search over the number of branches (W) and the number of layers (L).
# Each combination is trained from scratch for two epochs and its final
# validation accuracy is printed.

W = [2, 3, 4, 5]
L = [2, 3, 4]

grid = [(n_branches, n_layers) for n_branches in W for n_layers in L]
for n_branches, n_layers in grid:
    elaborate_model = set_elaborate_model(VOCABULARY, 200, UNIFORM_LENGTH, 30, n_branches, n_layers)
    history = elaborate_model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=2,
        batch_size=BATCH_SIZE,
    )
    val_accuracy = history.history['val_accuracy'][-1]
    print(f"Validation Accuracy with W={n_branches} and L={n_layers}: {val_accuracy}.")

Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=2: 0.8970999717712402.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=3: 0.8880000114440918.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=4: 0.8917999863624573.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=3 and L=2: 0.9010000228881836.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=3 and L=3: 0.9003999829292297.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=3 and L=4: 0.8906999826431274.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=4 and L=2: 0.8988999724388123.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=4 and L=3: 0.9053000211715698.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=4 and L=4: 0.9003000259399414.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=5 and L=2: 0.9059000015258789.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=5 and L=3: 0.9010999798774719.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=5 and L=4: 0.8973000049591064.


Best Validation Accuracy with W=5 and L=2: 0.9059000015258789.

In [24]:
# Grid search over the number of branches (W), the number of layers (L) and the
# number of filters per convolution layer. Each combination is trained from
# scratch for two epochs and its final validation accuracy is printed.

W = [2, 3, 4, 5]
L = [2, 3, 4]
Conv_Layer = [20, 30, 40]

for n_branches in W:
    for n_layers in L:
        for n_filters in Conv_Layer:
            elaborate_model = set_elaborate_model(
                VOCABULARY, 200, UNIFORM_LENGTH, n_filters, n_branches, n_layers
            )
            history = elaborate_model.fit(
                X_train,
                y_train,
                validation_data=(X_test, y_test),
                epochs=2,
                batch_size=BATCH_SIZE,
            )
            val_accuracy = history.history['val_accuracy'][-1]
            print(f"Validation Accuracy with W={n_branches} and L={n_layers} and Conv_Layer={n_filters}: {val_accuracy}.")

Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=2 and Conv_Layer=20: 0.8970999717712402.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=2 and Conv_Layer=30: 0.8981999754905701.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=2 and Conv_Layer=40: 0.8974999785423279.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=3 and Conv_Layer=20: 0.8928999900817871.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=3 and Conv_Layer=30: 0.8999000191688538.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=3 and Conv_Layer=40: 0.9009000062942505.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=4 and Conv_Layer=20: 0.8776999711990356.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=4 and Conv_Layer=30: 0.8925999999046326.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=2 and L=4 and Conv_Layer=40: 0.8973000049591064.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=3 and L=2 and Conv_Layer=20: 0.8988999724388123.
Epoch 1/2
Epoch 2/2
Validation Accuracy 

Validation Accuracy with W=4 and L=3 and Conv_Layer=30: 0.9021000266075134.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=4 and L=3 and Conv_Layer=40: 0.8985999822616577.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=4 and L=4 and Conv_Layer=20: 0.8952999711036682.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=4 and L=4 and Conv_Layer=30: 0.8944000005722046.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=4 and L=4 and Conv_Layer=40: 0.8919000029563904.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=5 and L=2 and Conv_Layer=20: 0.9046000242233276.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=5 and L=2 and Conv_Layer=30: 0.907800018787384.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=5 and L=2 and Conv_Layer=40: 0.8912000060081482.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=5 and L=3 and Conv_Layer=20: 0.904699981212616.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=5 and L=3 and Conv_Layer=30: 0.90420001745224.
Epoch 1/2
Epoch 2/2
Validation Accuracy with W=5 and L=3 and Con

Best Validation Accuracy with W=5 and L=2 and Conv_Layer=30: 0.907800018787384.

In [16]:
# Average Pooling
def set_elaborate_model(vocabulary, word_embedding_dim, uniform_length, conv_filters, W, L):
    """Build and compile the multi-branch CNN with average pooling.

    The embedded input feeds W parallel branches. Branch ``w`` (0-based) uses
    kernel size ``w + 1`` and stacks L consecutive Conv1D (ReLU) ->
    AveragePooling1D layers. The branch outputs are concatenated along axis 1,
    flattened and passed through Dense(100, ReLU) and Dense(1, sigmoid).

    Args:
        vocabulary: first argument of the Embedding layer (vocabulary size).
        word_embedding_dim: second argument of the Embedding layer (embedding size).
        uniform_length: length of every input sequence.
        conv_filters: number of filters of every convolution layer.
        W: number of parallel convolution branches (>= 1).
        L: number of consecutive convolution-pooling layers per branch (>= 1).

    Returns:
        The compiled Keras model (Adam optimizer, binary cross-entropy loss,
        accuracy metric).

    Raises:
        ValueError: if W or L is smaller than 1.

    Side effects:
        Writes a diagram of the model to 'elaborate_model_average.png'. The
        pooling window is read from the module-level POOL_SIZE.
    """
    # Without this check L=0 fails later with an unbound-variable error and W=0
    # reaches Concatenate with an empty list.
    if W < 1 or L < 1:
        raise ValueError(f"W and L must both be >= 1, got W={W} and L={L}")

    inputs = Input(shape=(uniform_length,))
    x = Embedding(vocabulary, word_embedding_dim, input_length=uniform_length)(inputs)

    # Every branch starts from the shared embedding output; afterwards branch w
    # keeps consuming its own pooled output from the previous layer.
    branch_outputs = [x] * W
    for _ in range(L):
        conv_outputs = [
            Conv1D(filters=conv_filters, kernel_size=branch + 1, activation='relu')(source)
            for branch, source in enumerate(branch_outputs)
        ]
        branch_outputs = [AveragePooling1D(pool_size=POOL_SIZE)(feat) for feat in conv_outputs]

    # With a single branch there is nothing to join, so it is passed through
    # unchanged (some Keras versions reject a one-element list in Concatenate --
    # TODO confirm for the installed version).
    if W == 1:
        x = branch_outputs[0]
    else:
        x = Concatenate(axis=1)(branch_outputs)

    x = Flatten()(x)
    x = Dense(100, activation='relu')(x)
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)
    plot_model(model, to_file='elaborate_model_average.png', show_shapes=True)

    # To tune the learning rate, replace the 'adam' string with an Adam optimizer
    # instance that is given an explicit learning_rate.
    model.compile(
        optimizer='adam',
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

# Same settings as the max-pooling run: embedding dimension 200, 30 filters,
# W=3 branches, L=2 convolution-pooling layers.
elaborate_model = set_elaborate_model(
    vocabulary=VOCABULARY,
    word_embedding_dim=200,
    uniform_length=UNIFORM_LENGTH,
    conv_filters=30,
    W=3,
    L=2,
)

In [17]:
# Train for two epochs; the test split is passed as validation data.
history = elaborate_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=BATCH_SIZE,
)

Epoch 1/2
Epoch 2/2


In [16]:
# with a lot of regularization

def set_elaborate_model(vocabulary, word_embedding_dim, uniform_length, conv_filters, W, L):
    """Build and compile the heavily regularized multi-branch CNN.

    The embedded input passes through Dropout(0.4) and then feeds W parallel
    branches. Branch ``w`` (0-based) uses kernel size ``w + 3`` and stacks L
    consecutive Conv1D -> MaxPooling1D layers; every convolution uses
    LeakyReLU(alpha=0.1) and L2 penalties (1e-4) on kernel, bias and activity.
    The branch outputs are concatenated along axis 1, flattened and passed
    through Dense(1000) -> Dropout(0.4) -> Dense(100) -> Dropout(0.2) ->
    Dense(1, sigmoid); both hidden Dense layers use LeakyReLU(alpha=0.1) and an
    L2 activity penalty of 1e-2.

    Each branch consumes its own pooled output from the previous layer.
    Previously layers after the first read ``p[w-1]``, which fed branch 0 from
    the last branch and shifted every other branch by one.

    Args:
        vocabulary: first argument of the Embedding layer (vocabulary size).
        word_embedding_dim: second argument of the Embedding layer (embedding size).
        uniform_length: length of every input sequence.
        conv_filters: number of filters of every convolution layer.
        W: number of parallel convolution branches (>= 1).
        L: number of consecutive convolution-pooling layers per branch (>= 1).

    Returns:
        The compiled Keras model (Adam optimizer, binary cross-entropy loss,
        accuracy metric).

    Raises:
        ValueError: if W or L is smaller than 1.

    Side effects:
        Writes a diagram of the model to 'elaborate_model.png'. The pooling
        window is read from the module-level POOL_SIZE.
    """
    # Without this check L=0 fails later with an unbound-variable error and W=0
    # reaches Concatenate with an empty list.
    if W < 1 or L < 1:
        raise ValueError(f"W and L must both be >= 1, got W={W} and L={L}")

    inputs = Input(shape=(uniform_length,))
    x = Embedding(vocabulary, word_embedding_dim, input_length=uniform_length)(inputs)
    x = Dropout(0.4)(x)

    # Every branch starts from the shared (dropped-out) embedding output;
    # afterwards branch w keeps consuming its own pooled output.
    branch_outputs = [x] * W
    for _ in range(L):
        conv_outputs = [
            Conv1D(
                filters=conv_filters,
                kernel_size=branch + 3,
                activation=LeakyReLU(alpha=0.1),
                kernel_regularizer=l2(1e-4),
                bias_regularizer=l2(1e-4),
                activity_regularizer=l2(1e-4),
            )(source)
            for branch, source in enumerate(branch_outputs)
        ]
        branch_outputs = [MaxPooling1D(pool_size=POOL_SIZE)(feat) for feat in conv_outputs]

    # With a single branch there is nothing to join, so it is passed through
    # unchanged (some Keras versions reject a one-element list in Concatenate --
    # TODO confirm for the installed version).
    if W == 1:
        x = branch_outputs[0]
    else:
        x = Concatenate(axis=1)(branch_outputs)

    x = Flatten()(x)
    x = Dense(1000, activation=LeakyReLU(alpha=0.1), activity_regularizer=l2(1e-2))(x)
    x = Dropout(0.4)(x)
    x = Dense(100, activation=LeakyReLU(alpha=0.1), activity_regularizer=l2(1e-2))(x)
    x = Dropout(0.2)(x)
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)
    plot_model(model, to_file='elaborate_model.png', show_shapes=True)

    # To tune the learning rate, replace the 'adam' string with an Adam optimizer
    # instance that is given an explicit learning_rate.
    model.compile(
        optimizer='adam',
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [17]:
# Regularized variant: embedding dimension 200, 30 filters, W=2 branches and
# L=2 convolution-pooling layers, trained for five epochs with the test split
# passed as validation data.
elaborate_model = set_elaborate_model(
    vocabulary=VOCABULARY,
    word_embedding_dim=200,
    uniform_length=UNIFORM_LENGTH,
    conv_filters=30,
    W=2,
    L=2,
)
history = elaborate_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=BATCH_SIZE,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
