# Using Deep Learning (LSTMs) to produce new quotes

### 1. Preparing the data

In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
# Load the raw quotes dataset from a JSON file (path is relative to the notebook's working directory).
data = pd.read_json('Data/data.json')

In [3]:
# Keep only the distinct quote texts (order of first appearance is preserved).
data = pd.unique(data['Quote'])

In [4]:
# Wrap the de-duplicated quotes in a single-column DataFrame.
df = pd.DataFrame(data, columns=['Quote'])

In [5]:
# Preview the first five quotes.
print(df.head(n=5))

                                               Quote
0  Don't cry because it's over, smile because it ...
1  I'm selfish, impatient and a little insecure. ...
2       Be yourself; everyone else is already taken.
3  Two things are infinite: the universe and huma...
4  Be who you are and say what you feel, because ...


In [6]:
# Build the vocabulary: the word list from disk, followed by punctuation marks
# and the special tokens. The order matters: ids are assigned by position, and
# ENDPAD must stay last because sequences are padded with id n_words - 1.
extra_tokens = [".", ",", "\"", ";", "?", "\'", "-", ":", "UNKNOWN", "STARTPAD", "ENDPAD"]

# keep_default_na=False stops pandas from turning vocabulary words such as
# "null", "nan" or "none" into float NaN, which could never match a token.
vocab = pd.read_csv('Data/vocab_list.txt', header=None, sep=" ", keep_default_na=False)

# A single append flattens the DataFrame into a 1-D array and adds every extra
# token at once, instead of reallocating the whole array once per token.
vocab = np.append(vocab, extra_tokens)

In [7]:
# Vocabulary size check: a 1-D array with one entry per token.
print(np.shape(vocab))

(3011,)


In [8]:
# Total number of tokens (words, punctuation and special tokens) in the vocabulary.
n_words = len(vocab)

In [9]:
# Lookup tables between tokens and their integer ids (the position in vocab).
# word_map:     token -> id
# word_map_rev: id    -> token
word_map = {token: token_id for token_id, token in enumerate(vocab)}
word_map_rev = dict(enumerate(vocab))

In [10]:
# Round-trip sanity check: token -> id -> token. The id is taken from word_map
# rather than hard-coded, so the check stays valid if the vocabulary changes.
great_id = word_map["great"]
print(great_id)
print(word_map_rev[great_id])

1202
great


In [11]:
def get_matrix_ids(s):
    """Convert a quote into a list of vocabulary ids.

    The text is split with ``nltk.word_tokenize`` and every token is
    lower-cased before being looked up in ``word_map``.

    Parameters
    ----------
    s : str
        The quote text to encode.

    Returns
    -------
    list of int
        One id per token, in order. Tokens that are not in the vocabulary
        are replaced by the id of the ``"UNKNOWN"`` token.
    """
    unknown_id = word_map["UNKNOWN"]
    # word_map holds every vocab entry as a key, so a dict lookup gives the
    # same answer as `token in vocab` without scanning the whole NumPy array
    # for each token.
    return [
        word_map.get(token.lower(), unknown_id)
        for token in nltk.word_tokenize(s)
    ]

In [12]:
from keras.preprocessing.sequence import pad_sequences

# Format x_data: encode every quote as a fixed-length sequence of word ids.

max_len = 20

# Each sequence is STARTPAD followed by the ids of the quote's tokens.
# The sequences stay in a plain list: quotes have different lengths, and
# wrapping a ragged list in np.array() builds an object array on old NumPy
# and raises ValueError on NumPy >= 1.24. pad_sequences accepts the list as is.
start_id = word_map["STARTPAD"]
x_data = [[start_id] + list(get_matrix_ids(quote)) for quote in df['Quote']]

# padding="post": short quotes are filled at the end with ENDPAD, whose id is
# n_words - 1 (the last vocabulary entry).
# truncating="post": long quotes are cut at the end. The default ("pre") would
# instead drop the leading STARTPAD token and the first words of the quote.
x_data = pad_sequences(
    maxlen=max_len,
    sequences=x_data,
    padding="post",
    truncating="post",
    value=n_words - 1,
)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
# Format y_data: one-hot encode every word id in x_data.
#
# Indexing an identity matrix with x_data builds the whole
# (n_samples, max_len, n_words) tensor in one vectorized step: row k of the
# identity matrix is exactly the one-hot vector for id k.
#
# int8 is enough to hold 0/1. With the platform default integer (usually
# 64-bit) a (36937, 20, 3011) tensor needs roughly 17.8 GB; int8 needs about
# 2.2 GB.
#
# NOTE(review): the target at each time step is the *same* word the model
# receives as input at that step, not the following word. If the goal is
# next-word prediction for text generation, the targets probably need to be
# shifted by one position -- confirm the intended training objective.
y_data = np.eye(n_words, dtype=np.int8)[x_data]


In [14]:
# Make sure the targets are a single NumPy array (samples, time steps, vocabulary).
y_data = np.asarray(y_data)

In [15]:
# Show the shapes of the inputs and of the one-hot targets, one per line.
for dataset in (x_data, y_data):
    print(dataset.shape)

(36937, 20)
(36937, 20, 3011)


We now have a dataset in which each x sample is a quote encoded as a fixed-length sequence of word ids (one id per time step), and each y sample holds, for every time step, the one-hot encoding of the word at that time step.

In [16]:
# Inspect one encoded quote and its one-hot targets.
sample_idx = 690
print(x_data[sample_idx])
print(y_data[sample_idx])

[3009 2685 1524 2281 2685 1684 1533 3000 3010 3010 3010 3010 3010 3010
 3010 3010 3010 3010 3010 3010]
[[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]


### 2. Building the model

In [17]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Flatten,Dense, TimeDistributed, Dropout

# Sequence-to-sequence word model:
#   1. Embedding: maps each of the n_words ids to a 55-dimensional vector.
#   2. LSTM: 100 units, returns its output at every time step.
#   3. Dropout applied to each time step's LSTM output.
#   4. Dense softmax over the vocabulary at each time step.
#
# NOTE(review): recurrent_activation is the activation of the LSTM gates, and
# Keras' default for it is a sigmoid variant. 'tanh' lets gate values go
# negative -- confirm this is intentional.
model = Sequential([
    Embedding(input_dim=n_words, output_dim=55, input_length=max_len),
    LSTM(
        units=100,
        return_sequences=True,
        recurrent_activation='tanh',
        recurrent_dropout=0.35,
    ),
    TimeDistributed(Dropout(0.37)),
    TimeDistributed(Dense(n_words, activation='softmax')),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [18]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 55)            165605    
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 100)           62400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 20, 100)           0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 20, 3011)          304111    
Total params: 532,116
Trainable params: 532,116
Non-trainable params: 0
_________________________________________________________________
None


### 3. Training the model

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the quotes as a test set. A fixed random_state makes the
# shuffle, and therefore the split, identical on every run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(29549, 20) (29549, 20, 3011)
(7388, 20) (7388, 20, 3011)


In [None]:
# Train for a single epoch in mini-batches of 10 quotes; 10% of the training
# data is held out for validation.
model.fit(
    x_train,
    y_train,
    batch_size=10,
    epochs=1,
    verbose=1,
    validation_split=0.1,
)

Train on 26594 samples, validate on 2955 samples
Epoch 1/1
