# Using Deep Learning (LSTMs) to produce new quotes

### 1. Preparing the data

In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
# Load the raw quotes file into a DataFrame.
data = pd.read_json(path_or_buf='Data/data.json')

In [3]:
# Keep each distinct quote text once (result is an array, not a DataFrame).
data = data.loc[:, 'Quote'].unique()

In [4]:
# Wrap the de-duplicated quotes back into a single-column frame.
df = pd.DataFrame(data, columns=['Quote'])

In [5]:
# Preview the first rows; a bare last expression gets the notebook's rich
# table display instead of the truncated plain-text repr that print() gives.
df.head()

                                               Quote
0  Don't cry because it's over, smile because it ...
1  I'm selfish, impatient and a little insecure. ...
2       Be yourself; everyone else is already taken.
3  Two things are infinite: the universe and huma...
4  Be who you are and say what you feel, because ...


In [6]:
# Load the base vocabulary and extend it with punctuation tokens plus the two
# special markers UNKNOWN and ENDPAD.
# keep_default_na=False stops pandas from converting words such as "null" or
# "nan" into float NaN, which could never match a string token later on.
vocab = pd.read_csv('Data/vocab_list.txt', header=None, sep=" ", keep_default_na=False)
extra_tokens = [".", ",", "\"", ";", "?", "\'", "-", ":", "UNKNOWN", "ENDPAD"]
# np.append flattens the frame into a 1-D array; a single call avoids
# re-copying the whole vocabulary once per added token.
vocab = np.append(vocab, extra_tokens)

In [7]:
# Size of the extended vocabulary.
print(f"{vocab.shape}")

(3010,)


In [8]:
# Number of vocabulary entries (length of the 1-D vocab array).
n_words = len(vocab)

In [9]:
# Two lookup tables over the vocabulary: position -> word and word -> position.
word_map_rev = dict(enumerate(vocab))
word_map = {token: position for position, token in word_map_rev.items()}

In [None]:
# Spot-check both lookup directions: word -> id, then id -> word.
for table, key in ((word_map, "great"), (word_map_rev, 1202)):
    print(table[key])

In [11]:
#Format x_data and y_data
x_data = []
y_data = []

def get_matrix_ids(s):
    """Convert a sentence into a list of vocabulary ids.

    The sentence is tokenized with ``nltk.word_tokenize`` and every token is
    lower-cased before being looked up in ``word_map``. Tokens that are not
    in the vocabulary are replaced by the id of the ``"UNKNOWN"`` entry.

    Parameters
    ----------
    s : str
        Raw sentence (quote) text.

    Returns
    -------
    list of int
        One vocabulary id per token, in sentence order.
    """
    unknown_id = word_map["UNKNOWN"]
    # A dict lookup is O(1) per token; testing `token in vocab` would scan the
    # whole vocabulary array for every single token.
    return [word_map.get(token.lower(), unknown_id) for token in nltk.word_tokenize(s)]

# Slide a 3-word window over every quote longer than 10 tokens: the window is
# the input sequence and the token right after it is the one-hot target.
for quote in df['Quote']:
    cur_row = get_matrix_ids(quote)
    if len(cur_row) > 10:
        # i is the index of the last word in the window and runs from 2 to
        # len(cur_row) - 3.
        # NOTE(review): with these bounds the final token of a quote is never
        # used as a target — confirm this is intended.
        for i in range(2, len(cur_row) - 2):
            x_data.append(np.array(cur_row[i-2:i+1]))
            # Allocate uint8 directly rather than float64 followed by a cast.
            y = np.zeros(n_words, dtype=np.uint8)
            y[cur_row[i+1]] = 1
            y_data.append(y)

KeyboardInterrupt: 

We now have a dataset in which each training input x is a sequence of 3 consecutive words (stored as vocabulary ids) and its target y is the word that follows them, i.e. the 4th word of the sequence.
The inputs are kept as sequences of 3 word ids, whereas each target (the 4th word) is a one-hot encoded vector over the vocabulary.
Let's check it out.

In [None]:
# Inspect one training pair: the input window first, then its one-hot target.
for dataset in (x_data, y_data):
    print(dataset[69])

In [None]:
# Decode the same example back into words: the three inputs, then the target.
sample_ids = list(x_data[69]) + [np.argmax(y_data[69])]
for word_id in sample_ids:
    print(word_map_rev[word_id])

In [None]:
# Stack the lists of per-example arrays into single numpy arrays.
x_data, y_data = np.array(x_data), np.array(y_data)

We have 942,728 training examples.

### 2. Building the model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

# Next-word classifier: embed the 3 input word ids, feed them through an LSTM,
# then emit a softmax distribution over the whole vocabulary.
model = Sequential([
    Embedding(input_dim=n_words, output_dim=65, input_length=3),
    LSTM(units=100, recurrent_dropout=0.1),
    Dense(n_words, activation='softmax'),
])

# One-hot targets, hence categorical cross-entropy.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line after the table.
model.summary()

### 3. Training the model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for evaluation. A fixed random_state makes the
# shuffle identical on every run, so the split (and any metric computed on it)
# is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=42
)

print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

In [None]:
# Scratch: start from an empty 1-D float array.
a = np.zeros(0)

In [None]:
# Scratch: append three values in one call (np.append returns a new array).
a = np.append(a, [3, 4, 33])