## Sequence Generation 

### Preprocessing

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from keras.preprocessing.sequence import pad_sequences
import nltk
import numpy as np
import random

Using TensorFlow backend.


In [2]:
with open("./trump_speeches.txt") as f:
    data = f.read()
    data = data.lower()

In [3]:
import re
sentences = re.sub(r"[^a-zA-Z0-9. ]+", " ", data)
#sentences = re.sub(r"...",".", sentences)

#### Tokenization and padding sentences

In [4]:
sent_tokens = sent_tokenize(sentences)
word_tokens = word_tokenize(sentences)

In [5]:
for i in range(len(sent_tokens)):
    sent_tokens[i] = "<s> " + sent_tokens[i] + " </s>"

Split data for easier use

In [6]:
from sklearn.model_selection import train_test_split

train, test, _, _ = train_test_split(sent_tokens, range(len(sent_tokens)),test_size = 0.2)

In [7]:
train[:10], test[:10]

(['<s> i have many  many executives upstairs and in different buildings that i have that are women. </s>',
  '<s> and now  to top it off  we have isis. </s>',
  '<s> it doesn t have to be this way. </s>',
  '<s> we don t win anymore. </s>',
  '<s> but to chart our path forward  we must first briefly take a look back. </s>',
  '<s> that s the end of it. </s>',
  '<s> i don t know. </s>',
  '<s> you see him. </s>',
  '<s> they think it s  2.5 trillion. </s>',
  '<s> i don t want to do. </s>'],
 ['<s> it s not a friend even to the united states of america  where as you know  it has its home. </s>',
  '<s> and i watched the next door neighbor saying   oh  well we didn t report them because we didn t want to racially profile or we didn t want to profile. </s>',
  '<s> he understand. </s>',
  '<s> but  you know  i started and i started at like 2  or 3  and then it went to 6  and then it went to 9 . </s>',
  '<s> he ll never make it. </s>',
  '<s> and people knew that they were radicalized an

#### Extimation

In [8]:
# Predict choice wrt multinomial distribution

def predict(str, N, freq_count):
    pvals=[]
    words=[]
    seq = ' '.join(str.split()[-(N-1):])
    if N!=1:
        choices = freq_count[seq].items()
    else:
        choices = freq_count[''].items()

    total = sum(weight for choice, weight in choices)
    
    for key,values in choices:
        words.append(key)
        pvals.append(values/total)
        
    distribution = np.random.multinomial(1, pvals)
    idx = np.argmax(distribution)
    choice = words[idx]
    
    return choice

In [9]:
# Find distribution of ngrams to return frequecny distribution

def ngrams(lines, N):

    ngrams = []
    freq_count = {}
#     print sentences[:10]
    for line in lines:
        tokens = line.split()
        for i in range(len(tokens)-N+1):
            ngrams.append(tokens[i:i+N])  
    
    for i in ngrams:
        seq  = ' '.join(i[:-1])
        last = str(i[-1])

        if seq not in freq_count:
            freq_count[seq] = {};
        
        if last not in freq_count[seq]:
            freq_count[seq][last] = 0;

        freq_count[seq][last] += 1;
        
    return freq_count

### Evaluation

We will do smoothing to some contsant value which is smalles in perplexity

In [10]:
# Smoothing
def smoothing(perp, c):
    for i in range(len(perp)):
        if perp[i]==1:
            perp[i]= c
    return perp

#### Determine Perplexity based on N

In [11]:
def PerpMat(N):
    perp = []
    freq = ngrams(test, N)
    
    for sent in test:
        n_grams = []
        tokens = sent.split()

        for i in range(len(tokens)-N+1):
            n_grams.append(tokens[i:i+N])

        for ngram in n_grams:
            seq  = ' '.join(ngram[:-1])
            last = str(ngram[-1])

            if seq not in freq:
                perp.append(1.0)

            elif last not in freq[seq]:
                perp.append(1.0)

            else:
                if N!=1:
                    choices = freq[seq].items()
                else:
                    choices = freq[''].items()

                pvals=[]
                key_words=[]
                total = sum(weight for choice, weight in choices)
                perp.append(freq[seq][last]*1.0/total)
    
    c = min(perp)
    perp = smoothing(perp,c)

    return perp

### Obtaining counts for each MLE
General study of counts - unigrams, bigram, trigram, quadgram

In [12]:
len(PerpMat(1)), len(PerpMat(2)), len(PerpMat(3)), len(PerpMat(4))

(40921, 37927, 34933, 31939)

In [15]:
for N in range(2,5):
    perp = PerpMat(N)
    log_perp = abs(np.log(perp))
    log_perp = sum(log_perp)/len(perp)
    perplexity = np.exp(log_perp)
    print("Perplexity for",N,"gram: ",perplexity)

('Perplexity for', 2, 'gram: ', 47.409962590543735)
('Perplexity for', 3, 'gram: ', 38.14224260140589)
('Perplexity for', 4, 'gram: ', 64.04565812702502)


### Text Generation

In [16]:
# Function to generate sequence of words from MLE model
# Eg. Inputs: N=2 (bigram), freq_count is got from ngrams()

def generator(N, freq_count):
    start_seq = None
    taglist = []
    
    for i in freq_count.keys():
        a = i.split()
        if N!=1 and a[0]=="<s>":
            taglist.append(i)
    
    if(start_seq == None) and N!=1: 
        start_seq = random.choice(taglist)
    elif(start_seq == None) and N==1:
        start_seq="<s>"
    out = start_seq.lower()
    
    next_word = ""
    
    while next_word!= "</s>":
        next_word = predict(out, N, freq_count)
        out += ' ' + next_word
        
    return out


### Text prediction

In [18]:
for N in range(2,5):
    print("Generated sentences with model complexity  ",str(N))
    freq = ngrams(train, N)
    print(generator(N, freq))
    print(generator(N, freq))
    print(generator(N, freq))
    print(generator(N, freq))
    print(generator(N, freq))

('Generated sentences with model complexity  ', '2')
<s> once he left out. </s>
<s> once he left out. </s>
<s> once he left out. </s>
<s> once he left out. </s>
<s> once he left out. </s>
('Generated sentences with model complexity  ', '3')
<s> number one problem is he intelligent then i mean bernie sanders gave her a second class citizen will ever change. </s>
<s> 38 to 12. that s having a celebration. </s>
<s> oh my brother was such that i mean bernie sanders gave her a second class citizen will ever change. </s>
<s> frankly when i first came in second. </s>
<s> houdini couldn t care they re strong. </s>
('Generated sentences with model complexity  ', '4')
<s> i will. </s>
<s> now with that being said the small money comes in and it s disgraceful. </s>
<s> there were many times that. </s>
<s> the list of humiliations go on and on and on. </s>
<s> so go back. </s>


## Neural Method

In [19]:
with open('./trump_speeches.txt', 'r') as file:
    data = file.read().replace('\n', '')
    
data=data.lower()
x = re.split(r'(\.|,)', data)

In [20]:
from keras.preprocessing.text import Tokenizer

final_data=[]
for i in x:
    if(i not in [",", ".", " "]):
        final_data.append(i.strip())

train_data=final_data[:13500]
test_data=final_data[13500:]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(final_data)
tot_words = len(tokenizer.word_index) + 1


input_sequences = []
for i in train_data:
    token_list = tokenizer.texts_to_sequences([i])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
        
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences,   
                          maxlen=15, padding='pre'))

predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
label = utils.to_categorical(label, num_classes=tot_words)

NameError: name 'utils' is not defined

In [None]:
def generate_text(seed_text, next_words, max_sequence_len, model):
    for j in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen= 
                             max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
  
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

## RNN

In [21]:
from keras.layers import Embedding, LSTM, Dense, Dropout,SimpleRNN 
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as utils

input_size = 14
rnn=Sequential()
rnn.add(Embedding(tot_words, 100, input_length=input_size))
rnn.add(SimpleRNN(300))
rnn.add(Dropout(0.2))
rnn.add(Dense(tot_words, activation='softmax'))

print(rnn.summary())

rnn.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
rnn.fit(predictors, label, batch_size=1024, epochs=5)

W1007 02:20:05.555737 140346571187968 deprecation_wrapper.py:119] From /home/manas/.local/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1007 02:20:05.831521 140346571187968 deprecation_wrapper.py:119] From /home/manas/.local/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1007 02:20:05.866033 140346571187968 deprecation_wrapper.py:119] From /home/manas/.local/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1007 02:20:06.197400 140346571187968 deprecation_wrapper.py:119] From /home/manas/.local/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 14, 100)           619800    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 300)               120300    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6198)              1865598   
Total params: 2,605,698
Trainable params: 2,605,698
Non-trainable params: 0
_________________________________________________________________
None


ValueError: Error when checking target: expected dense_1 to have shape (6198,) but got array with shape (1,)

### Predicated Text

In [None]:
print(generate_text("i am", 4, 15, rnn))
print(generate_text("this is", 9, 15, rnn))
print(generate_text("why this", 5, 15, rnn))
print(generate_text("obama", 7, 15, rnn))
print(generate_text("she can't", 6, 15, rnn))

### Evaluation

In [22]:
output_sequences = []
for line in test_data:
    token_list=tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        output_sequences.append(n_gram_sequence)

max_sequence_len = max([len(x) for x in output_sequences])
output_sequences = np.array(pad_sequences(output_sequences,   
                          maxlen=15, padding='pre'))

print(output_sequences.shape)
x_test, y_test = output_sequences[:,:-1],output_sequences[:,-1]
y_test = utils.to_categorical(y_test, num_classes=tot_words)

score = rnn.evaluate(x_test, y_test, verbose=False) 
print('Test score: ', score[0])    
print('Test accuracy: ', score[1])
print("Perplexity ", np.exp(score[0]))

(67767, 15)
('Test score: ', 8.730955104635646)
('Test accuracy: ', 4.426933463190048e-05)
('Perplexity ', 6191.6389510813815)


## LSTM

In [23]:
input_len = 14
lstm=Sequential()
lstm.add(Embedding(tot_words, 50, input_length=input_len))
lstm.add(LSTM(300))
lstm.add(Dense(tot_words, activation='softmax'))

lstm.summary()

lstm.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
lstm.fit(predictors, label, batch_size=1024, epochs=5)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 14, 50)            309900    
_________________________________________________________________
lstm_1 (LSTM)                (None, 300)               421200    
_________________________________________________________________
dense_2 (Dense)              (None, 6198)              1865598   
Total params: 2,596,698
Trainable params: 2,596,698
Non-trainable params: 0
_________________________________________________________________


ValueError: Error when checking target: expected dense_2 to have shape (6198,) but got array with shape (1,)

In [None]:
print(generate_text("i don't", 11, 15, model))
print(generate_text("how", 9, 15, model))
print(generate_text("that was", 6, 15, model))
print(generate_text("trump", 6, 15, model))
print(generate_text("he", 7, 15, model))

In [None]:
output_sequences = []
for line in test_data:
    token_list=tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        output_sequences.append(n_gram_sequence)

max_sequence_len = max([len(x) for x in output_sequences])
output_sequences = np.array(pad_sequences(output_sequences,   
                          maxlen=15, padding='pre'))

x_test, y_test = output_sequences[:,:-1],output_sequences[:,-1]
y_test = utils.to_categorical(y_test, num_classes=tot_words)

score = model.evaluate(x_test, y_test, verbose=False)
print('Test score: ', score[0])    
print('Test accuracy: ', score[1])
print("Perplexity ", np.exp(score[0]))

## Does Neural performs better than Classical, if so, why?

Yes, the neural network perfroms better than calssical.
The output of classical is highly reprititive. With neural networks trained at higher epochs,
the lesser are the amount of repititions. However, grammatically,
neural networks take more data and time to attain perfection.