# Text Generation using LSTM Seq-to-Seq Network

![title](https://camo.githubusercontent.com/9a5b885799c2d8e50f3f049fde2ada7696e974ca/68747470733a2f2f692e696d6775722e636f6d2f484646575674432e706e673f32)

In [None]:
import pandas as pd
import numpy as np

In [None]:
import os
path = os.getcwd()
print(path)

/content


In [None]:
ls

[0m[01;34msample_data[0m/


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
headlines = []
for filename in os.listdir(path+"/drive/MyDrive"):
    if "Articles" in filename:
        article_df = pd.read_csv(path + "/drive/MyDrive/" + filename)
        headlines.extend(list(article_df["headline"].values))
        break
        
headlines = [ h for h in headlines if h != "Unknown" ]
print("The number of headline is :", len(headlines))

The number of headline is : 831


In [None]:
headlines

['Finding an Expansive View  of a Forgotten People in Niger',
 'And Now,  the Dreaded Trump Curse',
 'Venezuela’s Descent Into Dictatorship',
 'Stain Permeates Basketball Blue Blood',
 'Taking Things for Granted',
 'The Caged Beast Awakens',
 'An Ever-Unfolding Story',
 'O’Reilly Thrives as Settlements Add Up',
 'Mouse Infestation',
 'Divide in G.O.P. Now Threatens Trump Tax Plan',
 'Variety Puzzle: Acrostic',
 'They Can Hit a Ball 400 Feet. But Play Catch? That’s Tricky.',
 'In Trump Country, Shock at Trump Budget Cuts',
 'Why Is This Hate Different From All Other Hate?',
 'Pick Your Favorite Ethical Offender',
 'My Son’s Growing Black Pride',
 'Jerks and the Start-Ups They Ruin',
 'Trump  Needs  a Brain',
 'Manhood in the Age of Trump',
 'The Value of a Black College',
 'Initial Description',
 'Rough Estimates',
 'El Pasatiempo Nacional',
 'Cooling Off on a Hot Day at Yankee Stadium',
 'Trump’s Staff Mixed Politics and Paydays',
 'A Virtuoso Rebuilding Act Requires Everyone in Tune',

### 3.1 Dataset cleaning

In [None]:
import string
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

def clean_text(headline):
    text = "".join( word for word in headline if word not in string.punctuation ).lower()
    text = text.encode("utf8").decode("ascii", "ignore")
    return text

corpus = [ clean_text(headline) for headline in headlines ]

In [None]:
corpus

['finding an expansive view  of a forgotten people in niger',
 'and now  the dreaded trump curse',
 'venezuelas descent into dictatorship',
 'stain permeates basketball blue blood',
 'taking things for granted',
 'the caged beast awakens',
 'an everunfolding story',
 'oreilly thrives as settlements add up',
 'mouse infestation',
 'divide in gop now threatens trump tax plan',
 'variety puzzle acrostic',
 'they can hit a ball 400 feet but play catch thats tricky',
 'in trump country shock at trump budget cuts',
 'why is this hate different from all other hate',
 'pick your favorite ethical offender',
 'my sons growing black pride',
 'jerks and the startups they ruin',
 'trump  needs  a brain',
 'manhood in the age of trump',
 'the value of a black college',
 'initial description',
 'rough estimates',
 'el pasatiempo nacional',
 'cooling off on a hot day at yankee stadium',
 'trumps staff mixed politics and paydays',
 'a virtuoso rebuilding act requires everyone in tune',
 'homeland seaso

### 3.2 Generating Sequence of N-gram Tokens



In [None]:
vocab = []
for line in corpus:
    words = line.split()
    for word in words:
        vocab.append(word)

vocabulary = set(vocab)

In [None]:
len(vocabulary)

2421

In [None]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

tokenizer = Tokenizer(2000)
tokenizer.fit_on_texts(corpus)
word2index = tokenizer.word_index
len(word2index)

2421

In [None]:
word2index

{'the': 1,
 'a': 2,
 'to': 3,
 'of': 4,
 'in': 5,
 'and': 6,
 'on': 7,
 'for': 8,
 'is': 9,
 'trump': 10,
 'new': 11,
 'at': 12,
 'what': 13,
 'trumps': 14,
 'with': 15,
 'be': 16,
 'an': 17,
 'from': 18,
 'season': 19,
 'you': 20,
 'how': 21,
 'as': 22,
 'episode': 23,
 'us': 24,
 'this': 25,
 'it': 26,
 'but': 27,
 'its': 28,
 'not': 29,
 'york': 30,
 'today': 31,
 'that': 32,
 'up': 33,
 'good': 34,
 '3': 35,
 'health': 36,
 'who': 37,
 'why': 38,
 'all': 39,
 'my': 40,
 'when': 41,
 'by': 42,
 'can': 43,
 'your': 44,
 '2': 45,
 'have': 46,
 'or': 47,
 'out': 48,
 'are': 49,
 'syria': 50,
 'plan': 51,
 'about': 52,
 'will': 53,
 'questions': 54,
 'tax': 55,
 '5': 56,
 'could': 57,
 'mr': 58,
 'i': 59,
 'war': 60,
 'do': 61,
 'heart': 62,
 'over': 63,
 'day': 64,
 'no': 65,
 'his': 66,
 'love': 67,
 'college': 68,
 'house': 69,
 'dont': 70,
 'like': 71,
 'recap': 72,
 'our': 73,
 'has': 74,
 'days': 75,
 'policy': 76,
 'work': 77,
 'better': 78,
 'home': 79,
 'now': 80,
 'they': 81,


In [None]:
dictionary = {}
rev_dictionary = {}
for word, idx in word2index.items():
    if idx > 1406:
        continue
    dictionary[word] = idx
    rev_dictionary[idx] = word

In [None]:
max(rev_dictionary.keys())

1406

In [None]:
input_seqences = tokenizer.texts_to_sequences(corpus)

In [None]:
input_seqences

[[169, 17, 665, 367, 4, 2, 666, 170, 5, 667],
 [6, 80, 1, 668, 10, 669],
 [670, 671, 129, 672],
 [673, 674, 368, 675, 676],
 [105, 171, 8, 677],
 [1, 678, 679, 680],
 [17, 681, 227],
 [130, 682, 22, 683, 369, 33],
 [684, 685],
 [228, 5, 229, 80, 686, 10, 55, 51],
 [131, 172, 230],
 [81, 43, 687, 2, 688, 689, 690, 27, 691, 692, 231, 693],
 [5, 10, 694, 370, 12, 10, 371, 372],
 [38, 9, 25, 106, 107, 18, 39, 232, 106],
 [233, 44, 695, 696, 373],
 [40, 374, 697, 132, 698],
 [699, 6, 1, 700, 81, 701],
 [10, 702, 2, 133],
 [703, 5, 1, 375, 4, 10],
 [1, 376, 4, 2, 132, 68],
 [704, 705],
 [706, 707],
 [708, 709, 710],
 [711, 173, 7, 2, 174, 64, 12, 712, 377],
 [14, 713, 714, 234, 6, 378],
 [2, 715, 716, 717, 718, 719, 5, 720],
 [235, 19, 108, 23, 379, 9, 721, 134, 2, 722, 723],
 [109, 236, 724, 6, 1, 135, 4, 725],
 [726, 2, 727],
 [237, 175, 6, 238, 23, 56, 105, 1, 380],
 [136, 19, 45, 23, 381, 728, 9, 34, 382, 41, 28, 29],
 [110, 82, 7, 5, 25, 176, 137, 35, 138],
 [46, 20, 239, 729, 730, 42, 

In [None]:
len(input_seqences)

831

### 3.3 Padding the Sequences and obtain Variables

In [None]:
input_data = []
target = []
for line in input_seqences:
    for i in range(1, len(line)-1):
        input_data.append(line[:i])
        target.append(line[i+1])

In [None]:
input_data[:5]

[[169], [169, 17], [169, 17, 665], [169, 17, 665, 367], [169, 17, 665, 367, 4]]

In [None]:
target[:5]

[665, 367, 4, 2, 666]

In [None]:
MAX_LEN = 0
for seq in input_data:
    if len(seq) > MAX_LEN:
        MAX_LEN = len(seq)
MAX_LEN

16

In [None]:
input_data = pad_sequences(input_data, maxlen=MAX_LEN, padding="post", truncating="post")

In [None]:
len(input_data[0])

16

In [None]:
input_data.shape

(3583, 16)

In [None]:
target = to_categorical(target, num_classes=len(word2index))

In [None]:
target

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
target.shape

(3583, 2421)

In [None]:
VOCAB_SIZE = 2001
VOCAB_SIZE

2001

In [None]:
MAX_LEN

16

## 4. LSTMs for Text Generation

### 4.1 LSTM ( Long Short-Term Memory  )    
  
1. Input Layer : Takes the sequence of words as input  
2. LSTM Layer : Computes the output using LSTM units. I have added 100 units in the layer, but this number can be fine tuned later.  
3. Dropout Layer : A regularisation layer which randomly turns-off the activations of some neurons in the LSTM layer.  
4. Output Layer : Computes the probability of the best possible next word as output  

![title](https://cdn-images-1.medium.com/max/1600/1*yBXV9o5q7L_CvY7quJt3WQ.png)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping

In [None]:
model = Sequential()

model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=100, input_length=MAX_LEN))

model.add(LSTM(units=100))
model.add(Dropout(rate=0.1))

model.add(Dense(units=target.shape[1], activation="softmax"))

In [None]:
model.compile(loss="categorical_crossentropy", optimizer="adam")

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 100)           200100    
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 2421)              244521    
                                                                 
Total params: 525,021
Trainable params: 525,021
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

In [None]:
model.fit(input_data, target, batch_size=10, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa73403cc10>

### 4.2 GRU ( Gated recurrent unit )

In [None]:
gru_model = Sequential()
gru_model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=100, input_length=MAX_LEN))
gru_model.add(GRU(units=100))
gru_model.add(Dropout(rate=0.1))
gru_model.add(Dense(units=target.shape[1], activation="softmax"))

In [None]:
gru_model.compile(loss="categorical_crossentropy", optimizer="adam")

In [None]:
gru_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 16, 100)           200100    
                                                                 
 gru (GRU)                   (None, 100)               60600     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 2421)              244521    
                                                                 
Total params: 505,221
Trainable params: 505,221
Non-trainable params: 0
_________________________________________________________________


#### GRU

In [None]:
gru_model.fit(input_data, target, batch_size=10, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa734017ac0>

## 5. Generating the text 

In [None]:
import tensorflow
from numpy.random import seed
tensorflow.random.set_seed(2)
seed(1)

In [None]:
def text_generater(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding="post")
        #predicted = model.predict_classes(token_list, verbose=0)
        predicted=model.predict(token_list) 
        predicted=np.argmax(predicted,axis=1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
text1 = "Trump decided"
text_generater(text1, 5, model, MAX_LEN)



'Trump Decided The The The The The'

In [None]:
text_generater(text1, 5, gru_model, MAX_LEN)



'Trump Decided A A A A A'

## I need more data to training I guess.... 