<a href="https://colab.research.google.com/github/narsym/deep-learning-with-tensorflow-2.0/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Many to one 

Imports 

In [0]:
import numpy as np
import os
import shutil
import tensorflow as tf

from sklearn.metrics import confusion_matrix, accuracy_score

download and read

In [0]:
def download_and_read(url):
    local_file = url.split('/')[-1]
    local_file = local_file.replace("%20", " ")
    p = tf.keras.utils.get_file(local_file, url, 
        extract=True, cache_dir=".")
    local_folder = os.path.join("datasets", local_file.split('.')[0])
    labeled_sentences = []
    for labeled_filename in os.listdir(local_folder):
        if labeled_filename.endswith("_labelled.txt"):
            with open(os.path.join(local_folder, labeled_filename), "r") as f:
                for line in f:
                    sentence, label = line.strip().split('\t')
                    labeled_sentences.append((sentence, label))
    return labeled_sentences

labeled_sentences = download_and_read(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip")
sentences = [s for (s, l) in labeled_sentences]
labels = [int(l) for (s, l) in labeled_sentences]

Tokenizing

In [9]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_counts)
print(f'vocabulary size: {vocab_size}')

word2idx = tokenizer.word_index
idx2word = {v:k for (k, v) in word2idx.items()}

vocabulary size: 5271


Fixing maximum sequence length

In [10]:
seq_length = np.array([len(s.split()) for s in sentences])
print([(p, np.percentile(seq_length, p)) for p in [75, 80, 90, 95, 99, 100]])

[(75, 16.0), (80, 18.0), (90, 22.0), (95, 26.0), (99, 36.0), (100, 71.0)]


99% of the sentences are under 36.0

In [0]:
max_seqlen = 64
sentences_as_ints = tokenizer.texts_to_sequences(sentences)
sentences_as_ints = tf.keras.preprocessing.sequence.pad_sequences(sentences_as_ints, maxlen = max_seqlen)
labels_as_ints = np.array(labels)
dataset = tf.data.Dataset.from_tensor_slices((sentences_as_ints, labels_as_ints))

Train_test_split

In [0]:
dataset = dataset.shuffle(10000)
test_size = len(sentences) // 3
val_size = (len(sentences) - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(val_size + test_size)

In [0]:
batch_size = 64
train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

Chapter 8[ 303 ]Next we define our model. As you can see, the model is fairly straightforward, each input sentence is a sequence of integers of size max_seqlen (64). This is input into an Embedding layer that converts each word into a vector given by the size of the vocabulary + 1. The additional word is to account for the padding integer 0 that was introduced during the pad_sequences() call above. The vector at each of the 64 time steps are then fed into a bidirectional LSTM layer, which coverts each word to a vector of size (64,). The output of the LSTM at each time step is fed into a Dense layer, which produces a vector of size (64,) with ReLU activation. The output of this Dense layer is then fed into another Dense layer, which outputs a vector of (1,) at each time step, modulated through a sigmoid activation.

In [18]:
class SentimentAnalysisModel(tf.keras.Model):
    def __init__(self, vocab_size, max_seqlen, **kwargs):
        super(SentimentAnalysisModel, self).__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_size, max_seqlen)
        self.bilstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(max_seqlen)
        )
        self.dense = tf.keras.layers.Dense(64, activation="relu")
        self.out = tf.keras.layers.Dense(1, activation="sigmoid")

    def call(self, x):
        x = self.embedding(x)
        x = self.bilstm(x)
        x = self.dense(x)
        x = self.out(x)
        return x

model = SentimentAnalysisModel(vocab_size + 1, max_seqlen)
model.build(input_shape = (batch_size, max_seqlen))#send input shape as parameter always, for model subclassing
model.summary()

Model: "sentiment_analysis_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  337408    
_________________________________________________________________
bidirectional_1 (Bidirection multiple                  66048     
_________________________________________________________________
dense_2 (Dense)              multiple                  8256      
_________________________________________________________________
dense_3 (Dense)              multiple                  65        
Total params: 411,777
Trainable params: 411,777
Non-trainable params: 0
_________________________________________________________________


compile the model

In [0]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

Train

In [20]:
data_dir = './data'
logs_dir = os.path.join('./logs')
best_model_file = os.path.join(data_dir, 'best_model.h5')
checkpoint = tf.keras.callbacks.ModelCheckpoint(best_model_file, save_weights_only = True, save_best_only = True)
tensorboard = tf.keras.callbacks.TensorBoard(logs_dir)
num_epochs = 10
history = model.fit(train_dataset, epochs = num_epochs, validation_data = val_dataset, callbacks = [checkpoint, tensorboard])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
best_model = SentimentAnalysisModel(vocab_size + 1, max_seqlen)
best_model.build(input_shape = (batch_size, max_seqlen))
best_model.load_weights(best_model_file)
best_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [22]:
test_loss, test_accuracy = best_model.evaluate(test_dataset)
print(f'test_loss: {test_loss}, test_accuracy: {test_accuracy}')

test_loss: 0.032562725245952606, test_accuracy: 0.9940000176429749


Prediction manually

In [24]:
# predict on batches
labels, predictions = [], []
idx2word[0] = "PAD"
is_first_batch = True
for test_batch in test_dataset:
    inputs_b, labels_b = test_batch
    pred_batch = best_model.predict(inputs_b)
    predictions.extend([(1 if p > 0.5 else 0) for p in pred_batch])
    labels.extend([l for l in labels_b])
    if is_first_batch:
        for rid in range(inputs_b.shape[0]):
            words = [idx2word[idx] for idx in inputs_b[rid].numpy()]
            words = [w for w in words if w != "PAD"]
            sentence = " ".join(words)
        is_first_batch = False

print("accuracy score: {:.3f}".format(accuracy_score(labels, predictions)))
print("confusion matrix")
print(confusion_matrix(labels, predictions))

accuracy score: 0.990
confusion matrix
[[485   4]
 [  6 505]]


We got 99% accuracy