In [1]:
import numpy as np
import pandas as pd
import re
import random
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.models import Model 
from keras.layers import Input, Activation, Embedding, LSTM
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical


In [5]:
df = pd.read_csv('shortjokes.csv')

In [7]:
text  = df['Joke'].values

In [8]:
 tokenizer = Tokenizer()
 tokenizer.fit_on_texts(text)
 vocab_size = len(tokenizer.word_counts) + 1
 print("Total Vocabulary: ", vocab_size )

Total Vocabulary:  70649


In [10]:
 N = text.shape[0]
 print("Total Row Count: ", N)
 prop_train = 0.8
 train = int(N*prop_train)
 print("Training Data Count: ", train)
 test = N - train
 print("Test Data Coount: ", test)

Total Row Count:  231657
Training Data Count:  185325
Test Data Coount:  46332


In [11]:
sequences, index_train, index_test = [], [], []
count = 0
for irow,line in enumerate(text):
    #print(irow, line)
    encoded = tokenizer.texts_to_sequences([line])[0]    
    #print(encoded)
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)
        if irow < train:        
            index_train.append(count)     
        else:         
            index_test.append(count)
        count += 1
print('Total Sequences: %d' % (len(sequences)))

Total Sequences: 3850485


In [12]:
from keras_preprocessing.sequence import pad_sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % (max_length))

Max Sequence Length: 93


In [13]:
sequences = np.array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
print(y.shape)
y = to_categorical(y, num_classes=vocab_size)
X_train, y_train, X_test, y_test = X[index_train], y[index_train],X[index_test],  y[index_test]
print(X_train.shape)
print(y_train.shape)
#make it smaller for testing
X_train = X_train[:1000]
y_train = y_train[:1000]
X_test = X_test[:100]
y_test = y_test[:100]

(3850485,)


MemoryError: Unable to allocate 1013. GiB for an array with shape (3850485, 70649) and data type float32

In [14]:
def build_model(vocab_size,
                  input_length=1,
                  dim_dense_embedding=10,
                  hidden_unit_LSTM=5):
    main_input = Input(shape=(input_length,),dtype='int32',name='main_input')
    embedding = Embedding(vocab_size, dim_dense_embedding, 
                          input_length=input_length)(main_input)
    x = LSTM(hidden_unit_LSTM)(embedding)
    main_output = Dense(vocab_size, activation='softmax')(x)
    model = Model(inputs=[main_input],
                   outputs=[main_output])
    return(model)

In [None]:
model = build_model(vocab_size,
                   input_length=X.shape[1],
                   dim_dense_embedding=30,
                   hidden_unit_LSTM=64)
 ##compile network
model.compile(loss='categorical_crossentropy', 
               optimizer='adam', metrics=['accuracy'])
 ##fit network
tf_model = model.fit(X_train, y_train, 
                  validation_data = (X_test,y_test),
                  epochs=20, verbose=2,
                  batch_size=128)

In [None]:
model.save_weights('model.h5')

In [None]:
model.load_weights('model.h5')

In [None]:
def generate_text():
    in_text = "America"
    for _ in range(50):
        # encode the text as integer
        enc = tokenizer.texts_to_sequences([in_text])[0]
        #print(enc)
        # pre-pad sequences to a fixed length
        enc_padding = pad_sequences([enc], maxlen=max_length-1, padding='pre')
        #print(enc_padding)
        probs = model.predict(enc_padding, verbose=0).flatten()
        #print(probs)
        index = np.random.choice(range(len(probs)),p=probs)
        #print(index)
        index_word = {v: k for k,v in tokenizer.word_index.items()}
        word = index_word[index] 
        in_text += ' ' + word
    print(in_text)

In [None]:
for _ in range(100):
    generate_text()