In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorboard.plugins.hparams import api as hp
import time
import pickle
import bcolz

tf.random.set_seed(1234)
%load_ext tensorboard

logdir = './logs/'
modeldir = './models/'
rundir = './run/'
glove_path = './glove/'

tf.test.is_built_with_cuda()

True

In [2]:

max_words = 10000  # vocabulary cap handed to build_vocab (only the most frequent words are kept)
sentence_len = 20  # length of each sliding window cut from the token sequence
pred_len = 1  # fixed: the windowing code uses only the last token of a window as the label
train_len = sentence_len - pred_len  # number of input tokens per sample

batch_size = 512  # batch size of the tf.data datasets (model.fit / evaluate below pass their own value)


## load glove 
(If the GloVe files do not exist yet, run `glove_embedding.ipynb` first.)

In [3]:
# pre trained word embeddings

# `[:]` reads the complete on-disk bcolz array
vectors = bcolz.open(f'{glove_path}6B.50.dat')[:]

# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced by a trusted source.
# `with` makes sure the file handles are closed again after loading.
with open(f'{glove_path}6B.50_words.pkl', 'rb') as words_file:
    words = pickle.load(words_file)
with open(f'{glove_path}6B.50_idx.pkl', 'rb') as idx_file:
    word2idx = pickle.load(idx_file)

# word -> embedding vector (word2idx gives the row of each word in `vectors`)
glove = {w: vectors[word2idx[w]] for w in words}

embedding_dim = 50

## build vocabulary and text representation

In [4]:

def build_vocab(words, max_words=None):
    """Build a vocabulary from a sequence of words.

    Args:
        words: iterable of (hashable) words; may contain duplicates.
        max_words: if None, every distinct word is returned (in arbitrary
            order). Otherwise only the ``max_words`` most frequent words are
            returned.

    Returns:
        A list of distinct words. With ``max_words`` set, the list is sorted
        by descending frequency; words with equal counts keep the order of
        their first occurrence.
    """
    if max_words is None:
        return list(set(words))

    # Count the occurrences in the sequence that was passed in (not in a
    # notebook-level global), so the function works for any input.
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1

    most_freq = sorted(counts, key=counts.get, reverse=True)
    return most_freq[:max_words]


def replace_by_tbl(text, tbl):
    """Apply all substitutions of a replacement table to a string.

    The entries of ``tbl`` (``old -> new``) are applied one after another in
    the table's iteration order; each replacement works on the result of the
    previous one. Returns the resulting string.
    """
    result = text
    for old in tbl:
        result = result.replace(old, tbl[old])
    return result


def preprocess_text(text):
    """Normalise apostrophes and expand English short forms in ``text``.

    Steps, in order:
      1. map apostrophe look-alikes to a plain ``'``
      2. expand short forms (``n't`` -> `` not``, ``'s`` -> `` is``, ...);
         every ``'s`` is expanded, possessives included
      3. drop all remaining apostrophes
      4. restore ``o'clock``, which step 3 turned into ``oclock``

    Returns the processed string.
    """
    # Characters that are turned into a plain apostrophe. The backslash is
    # spelled out as a proper escape ('\\'); it is part of the set on purpose
    # so that the function keeps mapping backslashes exactly as it always did,
    # without relying on invalid escape sequences in the literal.
    apostrophe_like = '\\`´’'
    text = text.translate({                 # uniform apostrophe
        ord(char): "'" for char in apostrophe_like
    })
    text = replace_by_tbl(text, {           # split shortforms
        "n't": ' not',
        "'ve": ' have',
        "'ll": ' will',
        "'m": ' am',
        "'re": ' are',
        "'s": ' is',
        "'d": ' would',
    })
    text = text.translate({                 # remove rest of '
        ord("'"): None
    })
    text = replace_by_tbl(text, {           # restore o'clock
        'oclock': "o'clock"
    })
    return text


def text2words(text, vocab_set=None):
    """Split ``text`` into a list of known words.

    The text is preprocessed, split with Keras' ``text_to_word_sequence`` and
    reduced to the words that have an entry in ``word2idx``. If ``vocab_set``
    is given, words that are not contained in it are dropped as well.
    """
    tokens = text_to_word_sequence(preprocess_text(text))

    def is_kept(token):
        # must be a known embedding word and, if requested, part of the vocabulary
        if token not in word2idx:
            return False
        return vocab_set is None or token in vocab_set

    return [token for token in tokens if is_kept(token)]

In [5]:
# one text per line
with open('processed_texts.csv', 'r', encoding='UTF-8') as csv_file:
    texts = [line.strip('\n') for line in csv_file]

# sequences of words (one list of words per text)
word_seqs = list(map(text2words, texts))

# flatten sequences to one long sequence
word_seq = []
for sequence in word_seqs:
    word_seq.extend(sequence)

# vocabulary - list of words that are used
vocab = build_vocab(word_seq, max_words)
vocab_set = set(vocab)
vocab_size = len(vocab)

# remove all words that are not in the vocabulary
word_seq = [word for word in word_seq if word in vocab_set]

# word -> token; a token is the position of the word in the vocabulary list
w2tk = {word: token for token, word in enumerate(vocab)}

# create token sequence
tkn_seq = np.array([w2tk[word] for word in word_seq])


def tk2emb(token):
    """Return the GloVe vector of the vocabulary word stored at index ``token``."""
    word = vocab[token]
    return glove[word]

print(f'size of vocablary: {vocab_size}')

size of vocablary: 10000


In [None]:
# Sliding window to generate train data: every run of `sentence_len`
# consecutive tokens is one sample.
# The `+ 1` includes the final window (the last `sentence_len` tokens); the
# range is empty if the token sequence is shorter than one window.
seq = [tkn_seq[start:start + sentence_len]
       for start in range(len(tkn_seq) - sentence_len + 1)]

# set data, label: the first `train_len` tokens of a window are the input,
# the last token of the window is the word to predict
X = [window[:train_len] for window in seq]
y = [window[-1] for window in seq]

num_samples = len(y)
print('training samples: ', num_samples)

## create the datasets

In [None]:
# to numpy
X = np.array(X)
y = np.array(y)

# split data: 60 % train, 20 % validation, 20 % test.
# random_state pins both splits, so re-running the notebook yields the same
# partitions (same seed value as tf.random.set_seed at the top).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.4, random_state=1234)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size=.5, random_state=1234)


def make_dataset(features, labels, shuffle=False):
    """Wrap a (features, labels) pair in a batched tf.data pipeline.

    The stages are ordered cache -> shuffle -> batch -> prefetch:
    caching has to happen before shuffling, otherwise the order drawn in the
    first epoch is stored and replayed in every later epoch, and prefetching
    goes last so the next batch is prepared while the current one is in use.

    Args:
        features, labels: arrays with the same first dimension.
        shuffle: reshuffle the samples on every pass (wanted for training
            only; the order is irrelevant for evaluation).

    Returns:
        A tf.data.Dataset yielding batches of ``batch_size`` samples.
    """
    dataset = tf.data.Dataset.from_tensor_slices((features, labels)).cache()
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1000)
    return dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)


# make the tf datasets
train_ds = make_dataset(X_train, y_train, shuffle=True)
valid_ds = make_dataset(X_valid, y_valid)
test_ds = make_dataset(X_test, y_test)

## create the weights for the glove embedding layer

In [None]:
# one row per vocabulary word, in vocabulary (token) order
weights_matrix = np.zeros((vocab_size, embedding_dim))
words_found = 0

for row, word in enumerate(vocab):
    if word in glove:
        weights_matrix[row] = glove[word]
        words_found += 1
    else:
        # no pre-trained vector for this word: use a random one instead
        weights_matrix[row] = np.random.normal(scale=0.6, size=(embedding_dim, ))

print(weights_matrix.shape)

# models

In [None]:
def plot_hist(history):
    """Plot accuracy and loss per epoch for the training and validation data.

    Args:
        history: object whose ``history`` attribute is a mapping that contains
            the per-epoch series 'accuracy', 'val_accuracy', 'loss' and
            'val_loss' (e.g. the return value of ``model.fit``).
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    for ax, metric in zip(axes, ('accuracy', 'loss')):
        ax.plot(history.history[metric], label='train')
        # the 'val_*' series are measured on the validation data passed to
        # fit, not on the held-out test set, and are labelled accordingly
        ax.plot(history.history[f'val_{metric}'], label='validation')
        ax.set_title(f'model {metric}')
        ax.set_ylabel(metric)
        ax.set_xlabel('epoch')
        ax.legend(loc='upper left')
    plt.show()

## model 2

In [None]:
# define model: frozen GloVe embedding -> two stacked LSTMs -> dense head
# with a softmax over the vocabulary

model_2 = Sequential()
model_2.add(Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=train_len,
    trainable=False,                # keep the pre-trained vectors fixed
    weights=[weights_matrix],
))
model_2.add(LSTM(100, return_sequences=True))   # passes the full sequence on to the next LSTM
model_2.add(LSTM(100))
model_2.add(Dense(100, activation='relu'))
model_2.add(Dropout(0.1))
model_2.add(Dense(vocab_size, activation='softmax'))

# labels are integer tokens, hence the sparse variant of the loss
model_2.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# file name suffix "_<year>_<month>_<day>_<hour>" of the current local time
# (numbers are not zero-padded)
now = time.localtime()
date_str = ''.join(f'_{part}' for part in now[:4])
filepath = f'{modeldir}model_2{date_str}.hdf5'

# keep only the model with the lowest validation loss on disk
checkpoint1 = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)
# stop once the validation loss has not improved for 3 epochs
checkpoint2 = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=3,
)

callbacks_list = [checkpoint1, checkpoint2]

In [None]:
# train on the in-memory arrays; the callbacks handle checkpointing and early stopping
m2_history = model_2.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=30,
    batch_size=1024,
    callbacks=callbacks_list,
)
plot_hist(m2_history)

In [None]:
# evaluate returns the loss followed by the compiled metrics (here: accuracy)
test_metrics = model_2.evaluate(X_test, y_test, batch_size=1024)
test_loss, test_accuracy = test_metrics
print(f'The model achived a loss of {test_loss:.4f} and a accuracy of {test_accuracy * 100:.2f}% on the test-set')

# Text Generation

In [6]:
def gen(model, sentence, max_len=30, context_len=None):
    '''Continue a start sentence word by word using ``model``.

    Args:
        model: model whose ``predict`` maps a ``(1, context_len)`` array of
            tokens to a score per vocabulary entry for the next word.
        sentence: start string; words outside the vocabulary are dropped.
        max_len: number of words to generate (the result therefore contains
            the known words of ``sentence`` plus ``max_len`` new words).
        context_len: number of most recent tokens fed to the model; must be
            positive. Defaults to ``train_len``, the input length the model
            was built with.

    Returns:
        The start words followed by the generated words, separated by
        blanks and terminated by a newline.
    '''
    if context_len is None:
        context_len = train_len

    # Tokenize the input string
    tokenized_sent = [w2tk[word] for word in text2words(sentence, vocab_set)]
    target_len = len(tokenized_sent) + max_len

    while len(tokenized_sent) < target_len:
        # Only the last `context_len` tokens are used as input. Shorter
        # contexts are left-padded with zeros by `pad_sequences`, so the
        # array always has the shape (1, context_len) the model expects.
        context = tokenized_sent[-context_len:]
        padded_sentence = tf.keras.preprocessing.sequence.pad_sequences([context], maxlen=context_len)
        op = model.predict(padded_sentence)
        # greedy decoding: append the highest-scoring token
        tokenized_sent.append(int(op.argmax()))

    return " ".join([vocab[tk] for tk in tokenized_sent]) + '\n'

In [7]:
# test start-sentences (used as prompts for the text generation below)

sen1, sen2, sen3, sen4 = (
    "Good evening everyone. My name",
    "Donald Trump",
    "Climate change",
    "My research about",
)


In [15]:

# checkpoint written by an earlier training run (the name carries that run's timestamp)
model_name = 'model_2_2020_7_23_21'
model_path = f'{modeldir}{model_name}.hdf5'

model = tf.keras.models.load_model(model_path)

# generate a continuation for every start sentence
for start_sentence in (sen1, sen2, sen3, sen4):
    print(gen(model, start_sentence))


good evening everyone my name is the first thing is the first time i am going to show you a little bit of the other things i am going to show you a little bit

donald trump is the first thing is the first thing is the first thing is the first thing is the first time i am going to show you a little bit of

climate change is the same thing that is the same thing that is the same thing that is the same thing that is the same thing that is the same thing that

my research about the world is the first thing that is the first thing that is the first thing that is the same thing that is the same thing that is the same

