<a href="https://colab.research.google.com/github/oliverquintana/CBOWWordPrediction/blob/main/NextWordPredictionCBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CBOW word predictor based on 1-word context



# Load dependencies

In [None]:
import json
import nltk
import spacy
import numpy as np
import tensorflow as tf
from utils import *
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# Fetch the NLTK resources used by this notebook. The 'stopwords' corpus is
# read further down through nltk.corpus.stopwords.words('spanish').
# NOTE(review): nothing visible in this notebook uses a punkt-based tokenizer
# (tokenization relies on RegexpTokenizer) - confirm whether utils needs 'punkt'.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
!python -m spacy download es_core_news_sm

# Corpus Preprocessing

Load corpus

In [None]:
# Load the raw corpus from 'corpus.txt'. readFile is not defined in this
# notebook; presumably it comes from the `from utils import *` star-import.
# NOTE(review): the tokenizer below calls .tokenize(corpus), so readFile is
# assumed to return a single string - confirm, along with the file encoding.
corpus = readFile('corpus.txt')

Tokenize corpus

In [None]:
# Split the raw text into maximal runs of word characters (regex \w+), so
# whitespace and punctuation between words are discarded at this stage.
tokenizer = RegexpTokenizer(r'\w+')
tokens_str = tokenizer.tokenize(corpus)
print('Tokens in corpus: {}'.format(len(tokens_str)))

Clean corpus

In [None]:
print('Input Corpus Size: {}'.format(len(tokens_str)))

# Characters stripped from inside every token. Written as a raw string because
# the backslash-comma pair is not a valid escape sequence in a normal literal
# (newer Python versions warn about it); the resulting characters are unchanged.
punc = r'''!()-[]{};:'"\, <>./?@#$%^&*_~'''
strip_table = str.maketrans('', '', punc)

# Strip symbols, then keep only tokens that are at least two characters long
# and contain no decimal digit. Empty tokens are covered by the length check.
# The previous in-place version called pop(i) once per digit character, so a
# token with several digits also removed neighbouring tokens that had already
# passed the check, and a bare `except` hid the resulting IndexError.
# str.isdecimal() accepts exactly the single characters that int() accepts.
stripped_tokens = (token.translate(strip_table) for token in tokens_str)
tokens_str = [
    token for token in stripped_tokens
    if len(token) >= 2 and not any(char.isdecimal() for char in token)
]

# Remove stop words. Tokens are not lowercased until the lemmatization step,
# so compare case-insensitively; otherwise capitalised stop words at the start
# of a sentence slip through. A set makes each membership test O(1).
stop_words = nltk.corpus.stopwords.words('spanish')
stop_word_set = set(stop_words)
tokens_str = [token for token in tokens_str if token.lower() not in stop_word_set]

print('Ouput Corpus Size: {}'.format(len(tokens_str)))

Lemmatization

In [None]:
# Replace every token, in place, by the lowercase lemma of its first spaCy
# sub-token, reporting progress every 1000 tokens.
nlp = spacy.load('es_core_news_sm')
total_tokens = len(tokens_str)
for position, raw_token in enumerate(tokens_str):
    lowered = raw_token.lower()
    tokens_str[position] = lowered
    analysed = nlp(lowered)
    tokens_str[position] = analysed[0].lemma_.lower()

    if position % 1000 == 0:
        print('Progress: {} / {}'.format(position, total_tokens))

Preview corpus sample

In [None]:
# Print ten tokens drawn uniformly at random (with replacement) from the corpus.
corpus_length = len(tokens_str)
for _ in range(10):
    picked = np.random.randint(corpus_length)
    print(tokens_str[picked])

Save clean corpus

In [None]:
# Write the cleaned tokens to disk, each followed by a single space. The
# context manager guarantees the handle is closed even if a write fails.
# NOTE(review): no explicit encoding is given, so the platform default is
# used - confirm it matches what readFile expects when the file is reloaded.
with open('corpus_clean.txt', 'w') as file:
    for word in tokens_str:
        file.write(word + ' ')


Save unique tokens with fixed indices

In [None]:
# Build the token -> index dictionary from the cleaned, lemmatized tokens and
# persist it as JSON. The previous version passed `tokens`, a name that is only
# assigned in a later cell, so this cell raised NameError on a fresh
# Restart & Run All.
# NOTE(review): vectDict comes from utils and is not visible here; it is
# presumed to map each unique token to an integer index - confirm. Also confirm
# that every lemma survives the later re-tokenization with r'\w+' unchanged,
# since training looks tokens up in this dictionary.
dictTokens = vectDict(tokens_str)
with open('dictTokens.txt', 'w') as outfile:
    json.dump(dictTokens, outfile)

# CBOW

Define CBOW model

In [None]:
class Cbow:
    """Next-word predictor: a bag-of-words input feeding a two-layer Keras network.

    The network maps a vocabulary-sized context vector through a linear hidden
    layer of width ``eSize`` to a softmax distribution over the vocabulary.
    Only the words *before* the target are used as context.
    """

    def __init__(self, vSize, cSize = 3, eSize = 100, lr = 0.001):
        """Build, compile and summarise the network.

        Args:
            vSize: Vocabulary size; width of the input and output layers.
            cSize: Number of preceding words used as context during training.
            eSize: Width of the hidden layer.
            lr: Learning rate for the Adam optimizer.
        """
        self.cSize = cSize
        self.eSize = eSize
        self.model = Sequential()
        # eSize used to be ignored: the hidden width was hard-coded to 100.
        self.model.add(Dense(eSize, input_dim = vSize))
        self.model.add(Dense(vSize, activation = 'softmax'))

        self._compile(lr)

        self.model.summary()

    def _compile(self, lr):
        """Compile ``self.model`` with a new Adam optimizer at learning rate ``lr``."""
        opt = tf.keras.optimizers.Adam(learning_rate = lr)
        self.model.compile(loss = 'categorical_crossentropy', optimizer = opt)

    def update_lr(self, lr = 0.0001):
        """Recompile the model with a fresh Adam optimizer using learning rate ``lr``.

        The method previously lacked ``self`` and therefore could not be called
        on an instance.
        """
        self._compile(lr)

    def _sample_batch(self, corpus, dictTokens, batch_size):
        """Draw a random training batch.

        Args:
            corpus: Sequence of tokens.
            dictTokens: Mapping from token to its vocabulary index.
            batch_size: Number of (context, target) pairs to draw.

        Returns:
            ``(X, y)`` where both arrays have shape ``(batch_size, len(dictTokens))``.
            Each row of ``X`` counts the ``cSize`` words preceding the target
            and each row of ``y`` is the one-hot encoding of the target.

        Raises:
            ValueError: If the corpus is too short to sample from.
        """
        if len(corpus) <= 2 * self.cSize:
            raise ValueError(
                'corpus has {} tokens; more than {} are required for a context size of {}'.format(
                    len(corpus), 2 * self.cSize, self.cSize))

        vocab_size = len(dictTokens)
        # The upper bound is inherited from the symmetric-context variant, so
        # the last cSize tokens are never chosen as targets.
        positions = np.random.randint(self.cSize, len(corpus) - self.cSize, batch_size)

        # Accumulate the context counts directly instead of materialising a
        # (batch, cSize, vocab) one-hot tensor and summing it afterwards; the
        # resulting values are the same with far less memory.
        X = np.zeros([batch_size, vocab_size], dtype = 'float32')
        y = np.zeros([batch_size, vocab_size], dtype = 'float32')

        for row, position in enumerate(positions):
            y[row, dictTokens[corpus[position]]] = 1
            for context_word in corpus[position - self.cSize : position]:
                X[row, dictTokens[context_word]] += 1

        return X, y

    def train(self, corpus, dictTokens, epochs = 10, batch_size = 10, fname = 'cbow.h5'):
        """Train on randomly sampled batches and save the model after every epoch.

        Args:
            corpus: Sequence of tokens to train on.
            dictTokens: Mapping from token to its vocabulary index.
            epochs: Number of passes; each runs
                ``floor((len(corpus) - cSize) / batch_size)`` steps.
            batch_size: Number of samples per training step.
            fname: Path the model is saved to at the end of each epoch.

        Prints the loss after every step. Returns ``None``.
        """
        steps = int(np.floor((len(corpus) - self.cSize) / batch_size))
        for epoch in range(epochs):
            for step in range(steps):
                X_batch, y_batch = self._sample_batch(corpus, dictTokens, batch_size)
                loss = self.model.train_on_batch(X_batch, y_batch)
                print('Epoch: {}/{} Step: {}/{} Loss: {}'.format(epoch, epochs, step, steps, loss))

            self.model.save(fname)

        return

    def predict(self, indices, dictTokens, nPredictions = 3):
        """Print the most probable next words for each single-word context.

        Args:
            indices: Vocabulary indices of the context words, one per sample.
            dictTokens: Mapping from token to its vocabulary index.
            nPredictions: How many candidates to print per context.

        Prints one line per distinct context word. Returns ``None``.
        """
        # Invert token -> index explicitly. Relying on the dict's key order is
        # only correct when keys happen to be stored in index order.
        index_to_word = {index: word for word, index in dictTokens.items()}

        X = np.zeros([len(indices), len(dictTokens)], dtype = 'ushort')
        for row, index in enumerate(indices):
            X[row, index] = 1

        pred = self.model.predict(X)
        dPred = {}

        for row in range(pred.shape[0]):
            # Stable sort on the negated probabilities: highest first, ties
            # resolved towards the lowest index, without mutating `pred`.
            best = np.argsort(-pred[row], kind = 'stable')[:nPredictions]
            wPred = [[index_to_word[int(k)], pred[row, k]] for k in best]
            dPred[index_to_word[int(indices[row])]] = wPred

        for key, value in dPred.items():
            s = ''
            for x in value:
                s += x[0] + '-' + str(np.round(x[1]*100, 3)) + '%' + ' '

            print('Context: {} Predictions: {}'.format(key, s))

        return

Load the cleaned corpus and the token dictionary

In [None]:
# Reload the cleaned corpus written earlier and tokenize it again.
corpus = readFile('corpus_clean.txt')
tokenizer = RegexpTokenizer(r'\w+')

# Drop stop words once more: lemmatization lowercases tokens and can map them
# onto stop words that were not matched before. A set gives O(1) membership
# tests instead of scanning the stop-word list for every token.
stop_words = nltk.corpus.stopwords.words('spanish')
stop_word_set = set(stop_words)
tokens = [token for token in tokenizer.tokenize(corpus) if token not in stop_word_set]

# Token -> index dictionary saved by the preprocessing section.
with open('dictTokens.txt') as json_file:
    dictTokens = json.load(json_file)

Build CBOW model

In [None]:
# Hyper-parameters for the network, then the network itself.
cSize = 1                   # Context size
vSize = len(dictTokens)     # Vocabulary size for units in input and output layers
lr = 0.001                  # Learning rate
model = Cbow(vSize = vSize, cSize = cSize, lr = lr)     # Build model

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               4947800   
_________________________________________________________________
dense_1 (Dense)              (None, 49477)             4997177   
Total params: 9,944,977
Trainable params: 9,944,977
Non-trainable params: 0
_________________________________________________________________


Load pre-trained model weights (optional: skip this cell on a first run, before `cbow.h5` has been saved)

In [None]:
# Replace the freshly built network inside the wrapper with the one stored in
# 'cbow.h5'. The file must already exist; it is the name used when this
# notebook saves the model (Cbow.train's default fname and the final cell).
model.model = tf.keras.models.load_model('cbow.h5')

Model training

In [None]:
# Train on the reloaded token stream. Cbow.train prints the loss at every step
# and saves the model to its default path 'cbow.h5' after each epoch.
model.train(tokens, dictTokens, epochs = 100, batch_size = 5000)

Predict next word from context

In [None]:
# Pick random vocabulary entries as one-word contexts and print the model's
# most probable next words for each of them.
samples = 5                 # Number of examples to predict
nPredictions = 3            # Predictions per sample
vocabulary_size = len(dictTokens)
indices = np.random.randint(0, vocabulary_size, samples)
model.predict(indices, dictTokens = dictTokens, nPredictions = nPredictions)

Context: grotesco Predictions: joven-32.12% comer-20.331% solo-14.18% 
Context: cautivo Predictions: guapo-22.512% ahí-16.757% contar-9.144% 
Context: olor Predictions: particular-5.579% dermatólogo-3.626% fragante-3.007% 
Context: difuso Predictions: fusión-20.399% nebuloso-6.105% esconder-5.811% 
Context: corporal Predictions: hp-3.09% poder-2.977% solubilidad-2.511% 


Save trained model

In [None]:
# Persist the trained network to 'cbow.h5', then remove the `model` name from
# the namespace. Any cell that uses `model` afterwards needs the build cell to
# be run again first.
model.model.save('cbow.h5')
del model