<a href="https://colab.research.google.com/github/mavho/CSE143/blob/master/A2_Part3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

Installs the correct packages.

First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0.

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically: as plain strings, "0.9" >= "0.20" would
# wrongly pass because string comparison is lexicographic.
_sklearn_major_minor = tuple(int(n) for n in re.findall(r"\d+", sklearn.__version__)[:2])
assert _sklearn_major_minor >= (0, 20)

# Record whether the notebook runs on Colab: IS_COLAB is True when the magic
# below succeeds and False when it raises.
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    #pip install -q -U tensorflow-addons
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
# Compare the major version as a number; as strings, "10.0" >= "2.0" is False.
assert int(tf.__version__.split(".")[0]) >= 2

# tf.test.is_gpu_available() is deprecated in favour of listing the physical
# devices. Fall back to the experimental alias for TF releases that predate
# tf.config.list_physical_devices.
_list_physical_devices = (getattr(tf.config, "list_physical_devices", None)
                          or tf.config.experimental.list_physical_devices)
if not _list_physical_devices("GPU"):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension); also echoed in the log line.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

TensorFlow 2.x selected.
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


# Sentiment Analysis

Below is an example of how to do custom preprocessing.

In [2]:
import tensorflow_datasets as tfds

# Three datasets are requested: the first 60% of "train", the remaining 40% of
# "train" (used as the development set) and the whole "test" split.
train_data, dev_data, test_data = tfds.load(
    "imdb_reviews",
    split=("train[:60%]", "train[60%:]", "test"),
    as_supervised=True,
)

[1mDownloading and preparing dataset imdb_reviews (80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…






HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteK8EDEY/imdb_reviews-train.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteK8EDEY/imdb_reviews-test.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteK8EDEY/imdb_reviews-unsupervised.tfrecord


HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [3]:
# Show the first two raw reviews (cut to 200 characters) next to their labels
for X_batch, y_batch in train_data.batch(2).take(1):
    for raw_text, sentiment in zip(X_batch.numpy(), y_batch.numpy()):
        snippet = raw_text.decode("utf-8")[:200]
        verdict = "= Positive" if sentiment else "= Negative"
        print("Review:", snippet, "...")
        print("Label:", sentiment, verdict)
        print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



In [0]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a padded tensor of word tokens.

    Steps, in order: keep only the first 300 units of each review (as counted by
    ``tf.strings.substr``), replace ``<br>``-style tags with a space, replace
    every character that is not a letter or an apostrophe with a space, then
    split on whitespace. The ragged result is padded with ``b"<pad>"`` so all
    rows of the batch have the same width.

    Args:
        X_batch: String tensor of reviews.
        y_batch: Labels; returned unchanged.

    Returns:
        A ``(tokens, y_batch)`` tuple.
    """
    text = tf.strings.substr(X_batch, 0, 300)
    text = tf.strings.regex_replace(text, rb"<br\s*/?>", b" ")
    text = tf.strings.regex_replace(text, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(text)
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

Construct the vocabulary by counting the occurrences of each word in the training set.

In [0]:
from collections import Counter

# Count every token of the preprocessed training reviews; the b"<pad>" filler
# produced by preprocess() is counted like any other token.
vocabulary = Counter()
for X_batch, y_batch in train_data.batch(32).map(preprocess):
    for review in X_batch:
        tokens_in_review = review.numpy().tolist()
        vocabulary.update(tokens_in_review)

Keep only the 10,000 most common words in the vocabulary.

In [0]:
vocab_size = 10000
# most_common(n) yields the n highest-count (word, count) pairs, most frequent
# first; only the words are kept, so a word's list position is its frequency rank.
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

# Using GloVe Embeddings


Download and extract the pretrained GloVe word vectors.

In [7]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!rm glove.6B.zip
!mkdir glove
!mv *.txt glove

--2020-02-18 00:03:03--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-02-18 00:03:04--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-02-18 00:03:04--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [8]:
# Map every kept word to its position in truncated_vocabulary
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"This This this".split():
    # Use get() with a default instead of `get(word) or vocab_size`: id 0 is
    # falsy, so the `or` form would report the first vocabulary entry as unknown.
    print(word_to_id.get(word, vocab_size))



22
22
9


In [0]:
# Lookup table: known words map to their position in truncated_vocabulary,
# anything else is assigned to one of num_oov_buckets out-of-vocabulary buckets.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets=num_oov_buckets)

In [10]:
train_set = train_data.batch(32).map(preprocess)
# Every element of train_set is already a batch of 32 reviews padded to that
# batch's own longest review. Stacking two of them with .batch(2) only works if
# both happen to share the same padded width, so take two batches directly.
for X_batch, y_batch in train_set.take(2):
    print("Review:", X_batch.numpy()[:200], "...")
    print("Label:", y_batch.numpy())
    print()

Review: [[b'This' b'was' b'an' ... b'<pad>' b'<pad>' b'<pad>']
 [b'I' b'have' b'been' ... b'<pad>' b'<pad>' b'<pad>']
 [b'Mann' b'photographs' b'the' ... b'<pad>' b'<pad>' b'<pad>']
 ...
 [b'This' b'movie' b'never' ... b'went' b'st' b'<pad>']
 [b'Mike' b'Brady' b'Michael' ... b'<pad>' b'<pad>' b'<pad>']
 [b'Honestly' b'Barbra' b'I' ... b'<pad>' b'<pad>' b'<pad>']] ...
Label: [0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0]

Review: [[b'The' b'Confidential' b'part' ... b'<pad>' b'<pad>' b'<pad>']
 [b'I' b'was' b'thirteen' ... b'an' b'<pad>' b'<pad>']
 [b"I'm" b'sorry' b'I' ... b'<pad>' b'<pad>' b'<pad>']
 ...
 [b'A' b'hilarious' b'and' ... b'<pad>' b'<pad>' b'<pad>']
 [b'Judging' b'from' b'this' ... b'<pad>' b'<pad>' b'<pad>']
 [b'I' b"didn't" b'expect' ... b'<pad>' b'<pad>' b'<pad>']] ...
Label: [0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1]



In [0]:
def encode_words(X_batch, y_batch):
    """Replace each token of X_batch with its id from ``table``.

    Args:
        X_batch: String tensor of tokens, as produced by ``preprocess``.
        y_batch: Labels; returned unchanged.

    Returns:
        A ``(ids, y_batch)`` tuple.
    """
    return table.lookup(X_batch), y_batch

# Build the encoded pipeline from train_data rather than from train_set itself:
# `train_set = train_set.map(encode_words)` would, when the cell is re-run, try
# to look up ids that have already been encoded.
train_set = train_data.batch(32).map(preprocess).map(encode_words).prefetch(1)


In [12]:
# Show one encoded batch: the token ids of its reviews followed by their labels
for X_batch, y_batch in train_set.batch(1).take(1):
    id_batches, label_batches = X_batch.numpy(), y_batch.numpy()
    for encoded_reviews, labels in zip(id_batches, label_batches):
        print("Review:", encoded_reviews[:200], "...")
        print("Label:", labels)
        print()

Review: [[  22   11   28 ...    0    0    0]
 [   6   21   71 ...    0    0    0]
 [3278 6289    1 ...    0    0    0]
 ...
 [  22   12  120 ...  332 1030    0]
 [1810 3594  490 ...    0    0    0]
 [2997 5393    6 ...    0    0    0]] ...
Label: [0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0]



In [13]:
# Parse the 100-dimensional GloVe file into a {word: vector} dictionary.
# The path is relative to the working directory, matching where the download
# cell creates glove/, so the notebook is not tied to Colab's /content.
embeddings_index = dict()
glove_path = os.path.join("glove", "glove.6B.100d.txt")
# `with` closes the file even if parsing fails; the encoding is given explicitly
# so the result does not depend on the platform's default.
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        # Each line is a word followed by its vector components
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [0]:
embed_size = vocab_size+num_oov_buckets
# One row per word id (vocabulary ids plus OOV buckets). Rows with no GloVe
# vector, including every OOV bucket, stay all-zero.
embedding_matrix = np.zeros((embed_size, 100))
# Every id in word_to_id is below vocab_size and therefore below embed_size, so
# no bounds check is needed inside the loop.
for word, index in word_to_id.items():
    token = word.decode('utf8')
    embedding_vector = embeddings_index.get(token)
    if embedding_vector is None:
        # The glove.6B vocabulary is uncased while this notebook's vocabulary is
        # case-sensitive; retry in lower case so tokens like "This" get a vector.
        embedding_vector = embeddings_index.get(token.lower())
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector


In [46]:
# Sanity check: display the embedding row stored for the word "the"
the_id = word_to_id[b'the']
print(embedding_matrix[the_id])

[-0.038194   -0.24487001  0.72812003 -0.39961001  0.083172    0.043953
 -0.39140999  0.3344     -0.57545     0.087459    0.28786999 -0.06731
  0.30906001 -0.26383999 -0.13231    -0.20757     0.33395001 -0.33848
 -0.31742999 -0.48335999  0.1464     -0.37303999  0.34577     0.052041
  0.44946    -0.46970999  0.02628    -0.54154998 -0.15518001 -0.14106999
 -0.039722    0.28277001  0.14393     0.23464    -0.31020999  0.086173
  0.20397     0.52623999  0.17163999 -0.082378   -0.71787    -0.41531
  0.20334999 -0.12763     0.41367     0.55186999  0.57907999 -0.33476999
 -0.36559001 -0.54856998 -0.062892    0.26583999  0.30204999  0.99774998
 -0.80480999 -3.0243001   0.01254    -0.36941999  2.21670008  0.72201002
 -0.24978     0.92136002  0.034514    0.46744999  1.10790002 -0.19358
 -0.074575    0.23353    -0.052062   -0.22044     0.057162   -0.15806
 -0.30798    -0.41624999  0.37972     0.15006    -0.53211999 -0.20550001
 -1.25259995  0.071624    0.70564997  0.49744001 -0.42063001  0.26148
 -

In [0]:
# Binary sentiment classifier: frozen GloVe embeddings -> GRU -> sigmoid.
glove_model = keras.Sequential([
    # input_shape=[None] declares variable-length sequences. The batches built
    # by preprocess() are padded per batch, so their width is not a fixed 100
    # as input_length=100 claimed. trainable=False keeps the GloVe vectors fixed.
    keras.layers.Embedding(embed_size, output_dim=100, weights=[embedding_matrix],
                           input_shape=[None], trainable=False),
    # The GRU returns only its final state, shape (batch, 100), so no Flatten
    # layer is needed before the output layer.
    keras.layers.GRU(100, activation="tanh", dropout=0.25),
    keras.layers.Dense(1, activation="sigmoid")
])
glove_model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])

In [26]:

# train_set already yields batches of 32, so batch_size is not passed to fit()
batch_size = 32
history = glove_model.fit(x=train_set, epochs=13)

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


In [0]:
# Print an overview of the model's layers and parameter counts
glove_model.summary()

In [0]:
# Run the held-out splits through the same pipeline as the training data:
# batches of 32 -> preprocess -> encode_words -> prefetch.
test_set = test_data.batch(32).map(preprocess).map(encode_words).prefetch(1)
dev_set = dev_data.batch(32).map(preprocess).map(encode_words).prefetch(1)


In [28]:
# Evaluate on the development set; steps=10 limits this to the first 10 batches.
# (The result variable was previously misspelled "accurracy".)
loss, accuracy = glove_model.evaluate(dev_set, steps=10, verbose=True)



In [29]:
# Evaluate on the test set; steps=10 limits this to the first 10 batches
loss, accuracy = glove_model.evaluate(x=test_set, steps=10, verbose=True)



In [0]:
sentence = b"the movie was so good I loved it"
max_len = 100  # number of ids in the sequence passed to the model

# Padding must be an integer id, not the raw b"<pad>" token: mixing bytes and
# ints makes np.array() build a bytes array, which the Embedding layer cannot
# consume.
pad_id = word_to_id.get(b"<pad>", 0)

# Unknown words fall back to vocab_size. get() with a default is used instead of
# `get(i) or vocab_size` so that a legitimate id of 0 is kept.
arr = [word_to_id.get(i, vocab_size) for i in sentence.split()][:max_len]
# Pad on the right, which is where preprocess() puts padding for training
arr += [pad_id] * (max_len - len(arr))

print(arr)
nup = np.array(arr)
print(nup)
# Shape (1, max_len): a batch containing this single sentence
nup = nup.reshape(1, max_len)

In [0]:
# Output of the model's sigmoid unit for the encoded sentence in `nup`
# (in this dataset label 1 is a positive review, label 0 a negative one)
glove_model.predict(nup)

# Using pretrained embeddings