# Recurrent Neural Network for Modeling Sentences

In this task, we will use RNNs to model sentences. The task is to predict the next character in a sentence. 

In [1]:
# As usual, a bit of setup
import time
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
# Default image rendering: no interpolation between pixels, gray colormap.
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
# Ask the notebook to autosave every 180 seconds.
%autosave 180


Autosaving every 180 seconds


## Load the data


In [28]:

import csv
import string
import numpy as np

def load_data(data_file, with_labels = False):
    """Load the sentences (and optionally the labels) of a data file.

    The file is read as a comma-separated CSV whose first row is a header.
    Every non-ASCII character of a sentence is replaced by a space and a
    trailing '\n' is appended to mark the end of the sentence.

    Args:
        data_file: path to the data file. Its base name must be 'train.csv'
            (rows of `sentence,label`) or 'test.csv' (sentence in the first
            column); any directory prefix is accepted.
        with_labels: if True, also return the labels. Only valid for
            'train.csv'.

    Returns:
        A list of cleaned sentences, or a `(sentences, labels)` tuple when
        `with_labels` is True. Labels are 0 for the string "False" and 1 for
        anything else.

    Raises:
        ValueError: if the file is neither 'train.csv' nor 'test.csv', or if
            labels are requested for 'test.csv'.
    """
    import os  # local import: only needed to inspect the file name

    file_name = os.path.basename(data_file)
    if file_name not in ('train.csv', 'test.csv'):
        # Fail loudly instead of printing and crashing later on an unbound name.
        raise ValueError("Can only load 'train.csv' or 'test.csv'")
    if with_labels and file_name != 'train.csv':
        raise ValueError("Labels are only available in 'train.csv'")

    with open(data_file) as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        rows = list(reader)

    # Skip the header row and ignore completely empty lines.
    records = [row for row in rows[1:] if row]

    raw_sentences = [row[0] for row in records]
    labels = None
    if file_name == 'train.csv':
        labels = [0 if row[1] == "False" else 1 for row in records]

    # replace non ascii chars to spaces
    count = 0
    sentences = []
    for sen in raw_sentences:
        count += sum(1 for ch in sen if ord(ch) >= 128)

        # '\n' indicates the end of the sentence
        sentences.append(''.join(ch if ord(ch) < 128 else ' ' for ch in sen) + '\n')

    print('The total of ', count, 'non-ascii chars are removed \n')

    if not with_labels:
        return sentences
    else:
        return sentences, labels

def char_to_index(sentence, str_voc):
    """Map every character of `sentence` to its position in `str_voc`.

    Returns a numpy array holding one vocabulary index per character, in the
    order the characters appear. A character missing from `str_voc` makes
    `str.index` raise ValueError.
    """
    indices = []
    for ch in sentence:
        indices.append(str_voc.index(ch))
    return np.array(indices)

def convert_sen_to_data(sentences, str_voc):
    """Encode each sentence as a numpy array of vocabulary indices.

    Returns a list with one array per input sentence, in the same order as
    `sentences`. The original string can be recovered from an array `a` with
    "".join(str_voc[k] for k in a).
    """
    return [char_to_index(sentence, str_voc) for sentence in sentences]


# Read the labelled training sentences.
train_sentences, labels = load_data('train.csv', with_labels = True)

# NOTE: you need to use the same vocabulary to handle your test sentences
# The vocabulary is the sorted set of characters seen in the training text.
vocabulary = sorted(set("".join(train_sentences)))
str_voc = "".join(vocabulary)

# One integer array per sentence.
train_data = convert_sen_to_data(train_sentences, str_voc)

# Basic corpus statistics.
num_sen = len(train_data)
sen_lengths = [len(sen) for sen in train_data]
max_len, min_len = max(sen_lengths), min(sen_lengths)
num_chars = sum(sen_lengths)

print('Data statistics:')
print('Number of sentences: ', num_sen)
print('Maximum and minimum sentence lengths:', max_len, min_len)
print('Total number of characters:', num_chars)
print('Vocabulary size: ', len(vocabulary))

# Count how often every vocabulary index occurs over the whole corpus and
# scatter the counts into an array indexed by vocabulary position.
all_indices = np.concatenate(train_data)
uniq, uniq_counts = np.unique(all_indices, return_counts=True)
freq = np.zeros_like(uniq_counts)
freq[uniq] = uniq_counts

print('Chars in vocabulary and their frequencies:')
print(list(zip(vocabulary, freq.tolist())))
    

The total of  4328 non-ascii chars are removed 

Data statistics:
Number of sentences:  160000
Maximum and minimum sentence lengths: 100 32
Total number of characters: 10954565
Vocabulary size:  95
Chars in vocabulary and their frequencies:
[('\n', 160000), (' ', 1762678), ('!', 12100), ('#', 496), ('$', 1212), ('%', 450), ('&', 1366), ("'", 88729), ('(', 8734), (')', 8890), ('*', 4310), ('+', 123), (',', 33680), ('-', 20064), ('.', 108694), ('/', 1586), ('0', 11139), ('1', 10960), ('2', 7690), ('3', 3517), ('4', 2882), ('5', 4272), ('6', 2673), ('7', 2496), ('8', 2071), ('9', 2801), (':', 22223), (';', 607), ('<', 12), ('=', 103), ('>', 9), ('?', 48816), ('@', 34), ('A', 8259), ('B', 4063), ('C', 5317), ('D', 6787), ('E', 2239), ('F', 3232), ('G', 2668), ('H', 11482), ('I', 15839), ('J', 2999), ('K', 2315), ('L', 2612), ('M', 7724), ('N', 3017), ('O', 2211), ('P', 3722), ('Q', 1036), ('R', 2942), ('S', 7281), ('T', 15062), ('U', 1014), ('V', 720), ('W', 37161), ('X', 17), ('Y', 2381),

### Implement an RNN and a GRU with tensorflow

**Q7 (10 points)** In this problem, you are supposed to train a recurrent neural network to model sentences. Particularly, your model will receive 10 starting characters and should predict the rest of the sentence. The model will be evaluated by per-character cross-entropy loss. You will get 
* 5 points if your per-character cross-entropy loss is less than 3.13 (the loss by predicting with character frequencies). 
* 8 points if your per-character cross-entropy loss is less than 2
* 10 points if your per-character cross-entropy loss is less than 1.5

\*The performance from a [paper](https://arxiv.org/pdf/1808.04444.pdf) indicates that an LSTM can achieve performance of 1.43 * ln(2) = 0.991. 
\*The `zip` program for compressing files can achieve roughly 3.522 bits per character, which corresponds to a per-character loss of 3.522 * ln(2) = 2.441.

In [43]:
## Create RNN and train the model
## NOTE: you may want to put this part of code in a separate .py file

from rnn_lm import masked_lm_loss

voc_size = len(str_voc)


def build_lm_model(batch_size, stateful):
    """Build the demo character-level language model.

    The same builder is used for the trained (stateless) model and for its
    stateful copy, so the two architectures cannot drift apart.

    Args:
        batch_size: fixed batch size baked into the input layer.
        stateful: value passed to the `stateful` flag of the recurrent layer.

    Returns:
        An uncompiled `tf.keras.Sequential` model whose output has shape
        `[batch_size, max_len, voc_size]`.
    """
    lm = tf.keras.Sequential()
    lm.add(tf.keras.layers.InputLayer(batch_input_shape=(batch_size, max_len, 1)))
    # Shift every index by one (so the -1 used for padding / sentence start
    # becomes 0) and drop the trailing singleton axis.
    lm.add(tf.keras.layers.Lambda(lambda x: tf.squeeze(x + 1, axis=[-1])))
    # `voc_size + 1` rows: one per character plus one for the shifted -1 marker.
    lm.add(tf.keras.layers.Embedding(input_dim=voc_size + 1, output_dim=10, input_length=max_len))
    # One output unit per vocabulary entry.
    lm.add(tf.keras.layers.SimpleRNN(voc_size, activation='tanh', return_sequences=True,
                                     stateful=stateful))
    return lm


# You don't have to do padding yourself if your model supports varied lengths of sequences.
# Sentences are padded at the end with -1 up to `max_len`.
train_mat = tf.keras.preprocessing.sequence.pad_sequences(train_data, maxlen=max_len,
                                                         padding='post', truncating='post',
                                                         value=-1)
# I use a small fraction of data to train the model for a quick demo
# You probably want to use all the data
train_mat = train_mat[:1600]

# prepare the input and the desired output
# The input is the target shifted right by one step: a -1 start marker is
# prepended and the last column is dropped, so input position t holds the
# character that precedes target position t.
train_x = np.concatenate([- np.ones([train_mat.shape[0], 1]), train_mat[:, :-1]], axis=1)
train_y = train_mat


# construct the model
# Here I include a Lambda layer and an embedding layer for your reference
batch_size = 32
model_batch = build_lm_model(batch_size, stateful=False)

# NOTE: the output of the model should be `[batch_size, seq_length, voc_size]`
# `seq_length` can be either the original length if you do not pad, or the
# length after padding
model_batch.summary()
model_batch.compile(optimizer="Adam", loss=masked_lm_loss)
model_batch.fit(x=train_x, y=train_y, epochs=4, batch_size=batch_size)


# NOTE: The following code converts the trained model to a "stateful" one so it can do stepwise
# predictions without forgetting previous hidden states. We do this by allocating
# a new model and copying weights from the trained model to this new model.

# TODO: you need to do the same thing for your own model. This example only works for this example

# NOTE: the batch size needs to be one because your model will be used to generate
# a single sentence below.

batch_size = 1
# NOTE: You need to use exactly the same way to construct this model as your trained model BUT set
# `stateful=True` to EVERY recurrent layer
model = build_lm_model(batch_size, stateful=True)


# Then copy weights from the trained model to this new model, layer by layer
for trained_layer, stateful_layer in zip(model_batch.layers, model.layers):
    stateful_layer.set_weights(trained_layer.get_weights())


model.save('rnn_lm.mod')


Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda_26 (Lambda)           (32, 100)                 0         
_________________________________________________________________
embedding_27 (Embedding)     (32, 100, 10)             960       
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (32, 100, 95)             10070     
Total params: 11,030
Trainable params: 11,030
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
INFO:tensorflow:Assets written to: rnn_lm.mod/assets


### Test the trained model

In [4]:

from rnn_lm import masked_lm_loss

# load the test data. NOTE: need to use the same vocabulary as the training data
sentences = load_data('test.csv')

# NOTE: To speed up the testing speed, I randomly select 1000 sentences as the test set. 
# Let me know if you get a much better performance on the entire test set. 
# The seed is fixed so the same 1000 sentences are selected on every run.
np.random.seed(137)
selection = np.random.choice(len(sentences), size=1000, replace=False)

# prepare the input
# Encode the selected sentences with the training vocabulary and pad them at
# the end with -1 up to `max_len`.
test_sentences = [sentences[i] for i in selection]
test_data = convert_sen_to_data(test_sentences, str_voc)
test_mat = tf.keras.preprocessing.sequence.pad_sequences(test_data, maxlen=max_len, 
                                                         padding='post', truncating='post',
                                                         value=-1)

# The model input is the target shifted right by one step: a -1 is prepended
# and the last column is dropped.
test_x = np.concatenate([- np.ones([test_mat.shape[0], 1]), test_mat[:, :-1]], axis=1)

# Load your powerful model and compile it with the loss I have defined.
# NOTE: compiling your model with my loss should not matter because I only use 
# your model for prediction. 
model = tf.keras.models.load_model('rnn_lm.mod', compile=False)
model.compile(optimizer="adam", loss=masked_lm_loss)

# set batch size to 1
batch_size = 1

# Evaluate the model on test sentences in batch mode
# NOTE(review): the saved model is built with `stateful=True` and its states
# are reset only once here, before all sentences are predicted with
# batch_size=1. Presumably the hidden state then carries over from one test
# sentence to the next -- confirm whether that is intended for the reported loss.
model.reset_states()
batch_pred = model.predict(test_x, batch_size=1)
losses = masked_lm_loss(test_mat, batch_pred)
per_char_loss = np.mean(losses.numpy())

# Your points will be decided by the per-char-loss
print('predict and calculate loss:')
print('The per-char-loss is about %f' % per_char_loss)


# make sure that stepwise predictions are the same as batch predictions
# test the model on a single sentence

test_x_single = test_x[0:1]
test_single = test_mat[0:1]

# batch prediction
# States are reset first so the whole sentence is processed from a fresh state.
model.reset_states()
batch_pred = model.predict(test_x_single, batch_size = batch_size)

# step-wise prediction
# States are reset once, then the sentence is fed one time step per call; the
# largest absolute difference to the batch prediction is tracked in `diff`.
model.reset_states()
diff = 0
for t in range(max_len):
        
    predict = model.predict(test_x_single[0:1, t:t+1], batch_size=1)
       
    max_per_entry_diff = np.max(np.abs(predict[0, 0] - batch_pred[0, t]))

    if diff < max_per_entry_diff:
        diff = max_per_entry_diff

# The difference should be zero
print('Difference between the two types of predictions is ', diff)
    

The total of  1131 non-ascii chars are removed 

predict and calculate loss:
The per-char-loss is about 3.548162
Difference between the two types of predictions is  0


### Use the model to generate sentences

Now we can use the trained model to generate text from a starting string. The naive model just predicts frequent characters in the text, so there is no meaningful generation yet. See what you get from your models.

In [1]:
def generate_text(model, start_string, str_voc, max_len=100, temperature=1.0):
    """Generate random text from a starting string.

    The code is modified from this
    [example](https://www.tensorflow.org/tutorials/text/text_generation).

    Args:
        model: callable model that also exposes `reset_states()`. It is called
            with an integer array of shape `[1, steps, 1]`; its output is
            indexed as `[batch, step, vocabulary entry]` and passed to
            `tf.random.categorical` as logits.
        start_string: text the generation starts from. Every character must be
            in `str_voc`. If it is empty, generation is seeded with the -1
            marker that the notebook prepends to every training input.
        str_voc: vocabulary string; a character's position is its index.
        max_len: total length (start string included) of the returned text.
            Nothing is generated if the start string is already that long.
        temperature: positive scaling of the model outputs before sampling.
            Low temperature results in more predictable text, higher
            temperature in more surprising text.

    Returns:
        `start_string` followed by the generated characters. Generation does
        not stop at '\n'; the caller decides where the sentence ends.

    Raises:
        ValueError: if `temperature` is not positive or `start_string`
            contains a character that is not in `str_voc`.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got %r" % (temperature,))

    unknown = sorted(set(start_string) - set(str_voc))
    if unknown:
        raise ValueError(
            "start_string contains characters outside the vocabulary: %r" % (unknown,))

    # Number of characters to generate
    num_generate = max_len - len(start_string)

    # Converting our start string to numbers (vectorizing)
    if start_string:
        seed_ids = [str_voc.index(s) for s in start_string]
    else:
        # An empty prompt would give a zero-length input; use the -1 marker instead.
        seed_ids = [-1]
    input_eval = np.reshape(np.array(seed_ids), [1, -1, 1])

    # Generated characters, joined at the end
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):

        predictions = model(input_eval)
        # remove the batch dimension
        predictions = predictions[0]

        # using a categorical distribution to predict the character returned by the model;
        # only the sample drawn for the last time step is kept
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.reshape([predicted_id], [1, 1, 1])

        text_generated.append(str_voc[predicted_id])

    return (start_string + ''.join(text_generated))


# Generate from a fixed prompt and keep only the text before the first
# end-of-sentence marker ('\n').
start_string = 'There '
gen_sen = generate_text(model, start_string, str_voc).split('\n')[0]

print('Starting from "' + start_string + '", the generated sentence is:')
print('"' + gen_sen + '"')

NameError: name 'model' is not defined

## Extra Credit: Implement A Sentence Classifier using RNN

(Q8) In this OPTIONAL problem, you need to perform sentence classification using an RNN. The dataset used here is the same one that we used in the earlier text generation problem. You will get
* 5 additional points if the test accuracy is above 70%
* 7 additional points if the test accuracy is above 80%
* 9 additional points if the test accuracy is above 85%

In [106]:
# Generating Test and Train Datasets
all_sentences, all_labels = load_data('train.csv', with_labels = True)
# NOTE: you need to use the same vocabulary to handle your test sentences
vocabulary = list(set("".join(all_sentences))) 
vocabulary.sort()
str_voc = "".join(vocabulary)
# Defined here as well so the classifier cells do not depend on the
# language-model cell having been run first.
voc_size = len(str_voc)

# Encode the sentences loaded in THIS cell (not a variable left over from an
# earlier cell) and pad them at the end with -1 up to 100 characters.
train_data = convert_sen_to_data(all_sentences, str_voc)
train_mat = tf.keras.preprocessing.sequence.pad_sequences(train_data, maxlen=100, 
                                                         padding='post', truncating='post',
                                                         value=-1)

print(f"The shape of the complete dataset: {train_mat.shape}")
# Every encoded sentence must have exactly one label.
assert train_mat.shape[0] == len(all_labels), "sentences and labels are misaligned"

# The training and test examples: first 80000 rows for training, the next
# 80000 rows are held out for testing.
x_train  = train_mat[:80000]
x_test   = train_mat[80000:160000]

# The training and test labels
y_train  = np.array(all_labels[:80000])
y_test   = np.array(all_labels[80000:160000])


The total of  4328 non-ascii chars are removed 

The shape of the complete dataset: (160000, 100)


In [107]:
# Example Model
# You can update your model here
model_c = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(100,)),
    # Shift indices by one so the -1 padding value becomes 0
    tf.keras.layers.Lambda(lambda x: x+1),
    tf.keras.layers.Embedding(
        input_dim=voc_size + 1,
        output_dim=10,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.SimpleRNN(10),
    tf.keras.layers.Dense(10, activation='relu'),
    # Sigmoid output: the loss below is configured with `from_logits=False`
    # and the predictions are thresholded at 0.5, so the model has to emit a
    # probability rather than an unbounded logit.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_c.summary()

# Compile and Fit the model
model_c.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

# Training on only 100 datasets: You need to run the model on all the training data for 
# higher classification accuracy
model_c.fit(x=x_train[:100], y = y_train[:100], epochs = 200, batch_size = 32)


Model: "sequential_60"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda_58 (Lambda)           (None, 100)               0         
_________________________________________________________________
embedding_60 (Embedding)     (None, 100, 10)           960       
_________________________________________________________________
simple_rnn_9 (SimpleRNN)     (None, 10)                210       
_________________________________________________________________
dense_108 (Dense)            (None, 10)                110       
_________________________________________________________________
dense_109 (Dense)            (None, 1)                 11        
Total params: 1,291
Trainable params: 1,291
Non-trainable params: 0
_________________________________________________________________
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/

Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x7f9a469a4890>

## Saving the Classifier

In [108]:
# Save the trained classifier under "rnn_classifier.kmod".
model_c.save("rnn_classifier.kmod")

INFO:tensorflow:Assets written to: rnn_classifier.kmod/assets


## Testing the classifier

In [110]:
# Computing test accuracy
model_cl = tf.keras.models.load_model("rnn_classifier.kmod")

# Run inference in mini-batches with `predict` instead of pushing the whole
# test set through the model in a single call.
y_prob = model_cl.predict(x_test, batch_size=1024, verbose=0)

# NOTE(review): the 0.5 threshold assumes the model outputs probabilities;
# confirm that the classifier's last layer is a sigmoid.
y_pred = (y_prob > 0.5).flatten().astype(int)
accuracy = np.sum(y_test == y_pred) / y_pred.shape[0]
print(f"Accuracy : {accuracy}")

Accuracy : 0.501175
