In [2]:
!nvidia-smi

Sun May 23 14:09:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.31       Driver Version: 465.31       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   53C    P8     7W /  N/A |     47MiB /  6078MiB |     29%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [19]:
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import tensorflow.keras.callbacks as callbacks
from tensorflow.keras.utils import plot_model 


from nltk import word_tokenize, sent_tokenize
import nltk
from tqdm import tqdm 
import numpy as np
import re
import string
from collections import OrderedDict

In [4]:
# nltk.download('punkt')
# !wget https://www.corpusdata.org/now/samples/text.zip
# !unzip -q text.zip

In [7]:
path_to_file = 'data/text.txt'
# path_to_file = 'text.txt'

In [8]:
with open(path_to_file) as f:
    corpus = f.read().splitlines()
len(corpus)

2917

### Preprocessing 

In [9]:
punctuation_string = '!"\\\\#\\$%\\&\\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~\d'

In [10]:
def preprocess(corpus):
    sentences = sent_tokenize(' '.join(w for w in corpus))
    sentences = [sent.lower() for sent in sentences]
    sentences = [re.sub('<\w+>', '', sent) for sent in sentences]
    sentences = [sent.decode() for sent in 
                 tf.strings.regex_replace(sentences,f'[{punctuation_string}]', '').numpy()]
    
    tokens = [word_tokenize(sent) for sent in tqdm(sentences)]
    
    # return tokenized sentences
    return tokens

In [11]:
tokens_list = preprocess(corpus[3:])

100%|██████████| 77044/77044 [00:11<00:00, 6716.99it/s]


In [12]:
len(tokens_list)

77044

In [13]:
print(tokens_list[0][:5])

['sol', 'yurick', 'the', 'writer', 'whose']


In [20]:
def build_vocabulary(tokens_list, top_n=10000):
    freqs = {}

    # get the freqs
    for tokens in tqdm(tokens_list):
        for word in tokens:
            if word not in freqs:
                freqs[word] = 1
            else:
                freqs[word] += 1
    # sort the freqs
    freqs = OrderedDict(sorted(freqs.items(), key=lambda x: x[1], reverse=True))
    
    # get the first top_n words
    top_n_words = list(freqs)[:top_n]
    
    
    stoi = {'<pad>': 0, '<unk>': 1}
    itos = {0: '<pad>', 1: '<unk>'}
    idx = 2
    
    for word in tqdm(top_n_words):
        if word not in stoi:
            stoi[word] = idx
            itos[idx] = word
            idx += 1
    
    
    return stoi, itos

In [21]:
stoi, itos = build_vocabulary(tokens_list)

100%|██████████| 77044/77044 [00:00<00:00, 235862.29it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1278868.19it/s]


In [22]:
def numericalize(tokens_list):
    numericalized = []
    for sent in tokens_list:
        n_set = []
        for w in sent:
            if w in stoi:
                n_set.append(stoi[w])
            else:
                n_set.append(stoi['<unk>'])
        numericalized.append(n_set)
    
    return numericalized

In [23]:
numericalized = numericalize(tokens_list)

In [24]:
vocab_size = len(stoi)
embed_size = 300
window_size = 2

print(f"Vocab Size  : {vocab_size}")
print(f"Vocab Sample: {list(stoi.items())[:5]}")

Vocab Size  : 10002
Vocab Sample: [('<pad>', 0), ('<unk>', 1), ('the', 2), ('to', 3), ('of', 4)]


In [31]:
# save stoi
# Save Labels separately on a line-by-line manner.
with open('logs/metadata_10k.tsv', "w") as f:
    for subwords in stoi:
        f.write("{}\n".format(subwords))

In [32]:
def generate_context_target_pairs(corpus=numericalized, window_size=2, vocab_size=vocab_size):
    
    context_length = window_size * 2
    
    for sentence in corpus:
        sentence_length = len(sentence)
        
        for index, word in enumerate(sentence):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([sentence[i] for i in range(start, end)
                                 if 0 <= i < sentence_length and i != index])
            label_word.append(word)
            
            x = pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, vocab_size)
            yield(x[0], y[0])

In [33]:
i = 0
for x, y in generate_context_target_pairs(numericalized, 2, vocab_size):
    if 0 not in x:
        if i == 10: break
        i+=1

        print(f'Target: {itos[np.argmax(y)]}, Context: {[itos[w] for w in x]}')

Target: the, Context: ['<unk>', '<unk>', 'writer', 'whose']
Target: writer, Context: ['<unk>', 'the', 'whose', 'novel']
Target: whose, Context: ['the', 'writer', 'novel', 'the']
Target: novel, Context: ['writer', 'whose', 'the', 'warriors']
Target: the, Context: ['whose', 'novel', 'warriors', 'was']
Target: warriors, Context: ['novel', 'the', 'was', 'adapted']
Target: was, Context: ['the', 'warriors', 'adapted', 'into']
Target: adapted, Context: ['warriors', 'was', 'into', 'a']
Target: into, Context: ['was', 'adapted', 'a', 'film']
Target: a, Context: ['adapted', 'into', 'film', 'years']


In [34]:
output_shapes = ([None, window_size*2],[None, vocab_size])
output_shapes

([None, 4], [None, 10002])

In [35]:
output_shapes = ([window_size*2],[vocab_size])
dataset = tf.data.Dataset.from_generator(generate_context_target_pairs, (tf.int32, tf.int32), output_shapes=output_shapes)
dataset = dataset.batch(1000)

In [39]:
for batch, (x, y) in enumerate(dataset):
    break
    #print("batch: ", batch)
    #if x.shape != (1000, 4) or y.shape != (1000, 37188):
print(x.shape, y.shape)

(1000, 4) (1000, 10002)


In [41]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2),
    Lambda(lambda x: keras.backend.mean(x, axis=1), output_shape=(embed_size)),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [42]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 300)            3000600   
_________________________________________________________________
lambda (Lambda)              (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 10002)             3010602   
Total params: 6,011,202
Trainable params: 6,011,202
Non-trainable params: 0
_________________________________________________________________


In [45]:
plot_model(model,show_shapes=True, show_layer_names=True, rankdir='TB', expand_nested=True)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [46]:
google_drive_path = 'models/local_en_10k_test.h5'
# google_drive_path = '/content/drive/MyDrive/weights/cbow/now_1000.h5'

In [47]:
early_stopping_cb = callbacks.EarlyStopping(monitor='loss', patience=4, verbose=True)
checkpoint_cb = callbacks.ModelCheckpoint(google_drive_path, monitor='loss', verbose=True, save_best_only=True)
tensorboard_callback = callbacks.TensorBoard(log_dir="logs")

callbacks_list = [early_stopping_cb, checkpoint_cb, tensorboard_callback]

history = model.fit(dataset, epochs=1, callbacks=callbacks_list)


Epoch 00001: loss improved from inf to 6.48802, saving model to models/local_en_10k_test.h5
