In [6]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import time
import pickle

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('pytorch using ', device.type)

pytorch using  cuda


In [7]:
# hyperparameter

max_words = 10000
sentence_len = 20
pred_len = 1  # fix
train_len = sentence_len - pred_len

max_samples = int(4e5) #6e6 max

In [8]:
# pre trained word embeddings

import bcolz

glove_path = './'
vectors = bcolz.open(f'{glove_path}/6B.50.dat')[:]
words = pickle.load(open(f'{glove_path}/6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'{glove_path}/6B.50_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

embedding_dim = 50

# if files do not exist run glove_embedding.ipynb first

In [9]:

def build_vocab(words, max_words=None):
    if max_words is None:
        return list(set(words))
    
    vocab = {}
    for word in word_seq:
        try:
            vocab[word] += 1
        except KeyError:
            vocab[word] = 1
    most_freq = sorted(vocab.keys(), key=vocab.get, reverse=True)
    return most_freq[:max_words]


def replace_by_tbl(text, tbl):
    for k,v in tbl.items():
        text = text.replace(k,v)
    return text


# preprossing mapping tables
apostrophe_tbl = {ord(key): '\'' for key in '\`\´\’'}
shortform_tbl = {
    'n\'t' : ' not',
    '\'ve' : ' have',
    '\'ll' : ' will',
    '\'m' : ' am',
    '\'re' : ' are',
    '\'s' : ' is',
    '\'d' : ' would',
}
remove_apostrophe_tbl = {ord('\''): None}
restore_oclock = {'oclock': 'o\'clock'}


def preprocess_text(text):
    text = text.translate(apostrophe_tbl)       # uniform apostrophe
    text = replace_by_tbl(text, shortform_tbl)  # split shortforms
    text = text.translate(remove_apostrophe_tbl)  # remove rest of '
    text = replace_by_tbl(text, restore_oclock)   # restore o'clock
    return text


def text2words(text, vocab_set=None):
    words = text_to_word_sequence(preprocess_text(text))
    words = [ word for word in words if word in word2idx]
    if not vocab_set is None:
        words = [ word for word in words if word in vocab_set]
    return words 

In [10]:
with open('processed_texts.csv', 'r', encoding='UTF-8') as file:
    texts = [line.strip('\n') for line in file]

# sequences of words
word_seqs = [text2words(text) for text in texts]

# flatten seqneces to one long sequence 
word_seq = [inner for outer in word_seqs for inner in outer]

# vocabulary - list of words that are used
vocab = build_vocab(word_seq, max_words)
vocab_size = len(vocab)
print(f'size of vocablary: {vocab_size}')

# remove all words that are not in the vocabulary
vocab_set = set(vocab)
word_seq = [word for word in word_seq if word in vocab_set]

# tokenized sequence of words
w2tk = {}
for i, word in enumerate(vocab):
    w2tk[word] = i

tkn_seq = [w2tk[word] for word in word_seq]
tkn_seq = np.array(tkn_seq)

def tk2emb(token):
    return glove[vocab[token]]

size of vocablary: 10000


In [12]:
# Sliding window to generate train data
seq = []
for i in range(len(tkn_seq)-sentence_len):
    seq.append(tkn_seq[i:i+sentence_len])

# reduce length of seq for performance reasons and of cause test_set
print('available samples: ', len(seq))
seq = seq[:max_samples]

# set data, label
X = []
y = []
for i in seq:
    X.append(i[:train_len])
    y.append(i[-1])

num_samples = len(y)

X = np.array(X)
y = np.array(y)

print('training samples: ', num_samples)

# split data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.4)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size=.5)

available samples:  5070539
training samples:  400000


In [None]:
# create the weights for the embedding layer

weights_matrix = np.zeros((vocab_size, embedding_dim))
words_found = 0

for i, word in enumerate(vocab):
    try: 
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim, ))

print(weights_matrix.shape)

In [None]:
# define model
model_2 = Sequential([
    Embedding(vocab_size+1, 50, input_length=train_len),
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation='relu'),
    Dropout(0.1),
    Dense(vocab_size, activation='softmax')
])

In [None]:
model_2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
filepath = "./model_2_weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
x = np.asarray(trainX)
y = np.asarray(trainy)

In [None]:
model_2.fit(x,y, epochs = 10, batch_size = 512, callbacks = callbacks_list)

In [None]:
def gen(model, sentence, iterations=10):

    seq = [w2tk[word] for word in text2words(sentence, vocab_set)]
    #if(len(seq) < train_len): return -1
    
    for i in range(iterations):
        iput = np.array(seq[-19:])
        iput = np.vstack(iput).T
        iput = torch.from_numpy(iput).type(X_type)

        # oput = F.log_softmax( model(iput) )  # prob dist
        oput = model(iput)
        
        oput = oput.cpu().detach().numpy()[0]
        oput = np.argmax(oput) # max of softmax to long
        seq.append(oput)

    pred_sen = ''
    for tk in seq:
        pred_sen += vocab[tk] + ' '
    return pred_sen

In [None]:
sen = "Good evening today i will talk about something of high importance. I hope you will enjoy hearing about"
print(gen(model_2, sen))