# Word prediction - neural network approach

In [1]:
import os
import codecs
import numpy as np
import random
from datetime import datetime
import pickle
import json
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [2]:
from config import START_SYMBOL, PADDING_SYMBOL, UNK_SYMBOL, NUM_PREDICTIONS, EXTRINSIC_EVAL_SIZE
from prepare import tokenize_sentences, train_val_test_split, buildWordIdMappings
from evaluate import evaluate_extrinsic
from gui import get_gui

In [3]:
import gc
# Force a full garbage-collection pass; the value displayed below the cell is
# gc.collect()'s return value (the number of unreachable objects it found).
gc.collect()

0

In [4]:
# Hyperparameters
BATCH_SIZE = 64
HIDDEN_SIZE = 25
LEARNING_RATE = 0.001
NUM_EPOCHS = 10
USE_GRU = True
TUNE_EMBEDDINGS = True

# Seed every random number generator used in this notebook so that the random
# embedding initialisation, weight initialisation and batch shuffling are
# reproducible under "Restart Kernel -> Run All".
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Decide the device once and reuse the decision for the status message.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    print("Current device: {}".format(torch.cuda.get_device_name(0)))
else:
    print('Running on CPU')
print()

Current device: NVIDIA H100 80GB HBM3 MIG 1g.10gb



In [5]:
# Tokenize the corpus, split it into train/validation/test parts and build the
# word -> id (w2i) and id -> word (i2w) mappings from the training part only.
sentences = tokenize_sentences("data/HP_all.txt", [START_SYMBOL])
train_sentences, validation_sentences, test_sentences = train_val_test_split(sentences)
w2i, i2w = buildWordIdMappings(train_sentences)
# Size of the training vocabulary at this point. i2w is extended later by
# load_glove_embeddings, and WordPredictorModel uses train_vocab to restrict
# candidates when predict_from_all_words is False.
train_vocab = len(i2w)

Tokenizing input corpus...
Tokenization ready.
Train dataset size: 51035
Validation dataset size: 17012
Test dataset size: 17012


In [6]:
class WordDataset(Dataset):
    """
    Next-word prediction samples built from tokenized sentences.

    For every position i > 0 of every sentence one sample is produced: the ids
    of all words before position i (the prefix) and the id of the word at
    position i (the target). Words missing from the global `w2i` mapping are
    replaced by the id of UNK_SYMBOL.
    """

    def __init__(self, sentences):
        self.previous_words = []
        self.next_word = []
        for sentence in sentences:
            # A sentence with fewer than two tokens yields no (prefix, target) pair.
            if len(sentence) < 2:
                continue
            unk_id = w2i[UNK_SYMBOL]
            # Map the sentence to ids once, then take growing prefixes of it.
            token_ids = [w2i.get(token, unk_id) for token in sentence]
            for position in range(1, len(token_ids)):
                self.previous_words.append(token_ids[:position])
                self.next_word.append(token_ids[position])

    def __len__(self):
        """Number of (prefix, target) samples."""
        return len(self.previous_words)

    def __getitem__(self, idx):
        """Return the pair (list of prefix ids, target id) for sample `idx`."""
        prefix = self.previous_words[idx]
        target = self.next_word[idx]
        return prefix, target

In [7]:
# Collate function for the DataLoaders: batches contain prefixes of unequal
# lengths, so every prefix is right-padded to the longest one in the batch.
def pad_sequence(batch, pad=w2i[PADDING_SYMBOL]):
    """
    Turn a list of (prefix ids, target id) samples into a padded batch.

    Returns a pair: a list of equally long id lists (shorter prefixes are
    filled up at the end with `pad`) and a tuple with the target ids.
    The default `pad` id is looked up once, when this function is defined.
    """
    prefixes, targets = zip(*batch)
    longest = max(len(prefix) for prefix in prefixes)
    padded = []
    for prefix in prefixes:
        missing = longest - len(prefix)
        padded.append(list(prefix) + [pad] * missing)
    return padded, targets

In [8]:
# One dataset of (prefix, next word) samples per split.
training_dataset = WordDataset(train_sentences)
validation_dataset = WordDataset(validation_sentences)
test_dataset = WordDataset(test_sentences)

# All loaders pad their batches through pad_sequence; only the training loader
# shuffles, validation and test are iterated in dataset order.
train_loader = DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_sequence)
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, collate_fn=pad_sequence)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=pad_sequence)

In [9]:
def load_glove_embeddings(embedding_file):
    """
    Reads pre-made embeddings from a file.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace. Words are lower-cased. Words that are
    not yet in the global `w2i` are ADDED to `w2i` and `i2w` (side effect), so
    after this call the vocabulary also contains every word in the file.

    Vocabulary words without a vector in the file get a random vector with
    components in [-0.5, 0.5); the padding symbol gets an all-zero vector.

    :param embedding_file: path to a UTF-8 text file with the embeddings
    :return: (D, embeddings) where D is the vector dimension and embeddings is
             a list of D-long lists indexed by word id
    :raises ValueError: if the file contains no embedding vectors
    """
    N = len(w2i)
    # None marks "no vector seen yet" for the corresponding word id.
    embeddings = [None] * N
    D = None
    with codecs.open(embedding_file, 'r', 'utf-8') as f:
        for line in f:
            data = line.split()
            # Skip blank lines and lines that carry a word but no vector.
            if len(data) < 2:
                continue
            word = data[0].lower()
            if word not in w2i:
                w2i[word] = N
                i2w.append(word)
                N += 1
                embeddings.append(None)
            vec = [float(x) for x in data[1:]]
            D = len(vec)
            embeddings[w2i[word]] = vec
    if D is None:
        raise ValueError("No embedding vectors found in '{}'".format(embedding_file))
    # Add a zero embedding for the padding symbol. Use the same id that
    # pad_sequence pads with; fall back to id 0 if the symbol is not mapped.
    embeddings[w2i.get(PADDING_SYMBOL, 0)] = [0.0] * D
    # Check if there are words that did not have a ready-made Glove embedding
    # For these words, add a random vector
    for index in w2i.values():
        if embeddings[index] is None:
            embeddings[index] = (np.random.random(D) - 0.5).tolist()
    return D, embeddings


# Load the 50-dimensional GloVe vectors. Note that load_glove_embeddings also
# extends w2i / i2w with every word found in the embedding file.
print("Loading GloVe embeddings...")
embedding_size, embeddings = load_glove_embeddings('data/glove.6B.50d.txt')
print("Word vectors loaded.")

Loading GloVe embeddings...
Word vectors loaded.


In [10]:
class RNNModel(nn.Module):
    """
    Recurrent next-word predictor: embedding -> GRU/RNN -> linear layer.

    :param vocab_size: number of word ids (size of the output layer)
    :param embeddings: optional pre-trained vectors (one row per word id) that
                       replace the randomly initialised embedding weights
    :param embedding_size: dimension of the word vectors
    :param hidden_size: dimension of the recurrent state
    :param device: device the module is moved to on construction
    :param use_gru: use nn.GRU if True, otherwise a plain nn.RNN
    :param tune_embeddings: whether pre-trained embeddings receive gradients
                            (only applies when `embeddings` is given)
    """

    def __init__(self, vocab_size, embeddings=None, embedding_size=16, hidden_size=25, device='cpu', use_gru=True, tune_embeddings=False):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        # Attribute names (embedding / rnn / output) are part of the saved
        # state_dict keys, so they must not be renamed.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        if embeddings is not None:
            self.embedding.weight = nn.Parameter(torch.tensor(embeddings, dtype=torch.float), requires_grad=tune_embeddings)
        if use_gru:
            self.rnn = nn.GRU(embedding_size, hidden_size, batch_first=True)
        else:
            self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.output = nn.Linear(hidden_size, vocab_size)
        self.device = device
        self.to(device)

    def forward(self, x, hidden=None):
        """
        Score the next word from the final recurrent state.

        :param x: word ids, either (batch, seq_len) or an unbatched (seq_len,)
        :param hidden: optional initial recurrent state
        :return: (logits, hidden) where logits are the vocabulary scores
                 computed from the final state, shaped (1, batch, vocab_size)
                 for batched input, and hidden is that final state, which can
                 be passed back in as `hidden`
        """
        # nn.GRU / nn.RNN return (per-step outputs, final state) in that order.
        # The original code unpacked them the other way round, which returned
        # the per-step outputs under the name `hidden`.
        # NOTE(review): batches are right-padded by the collate function, so for
        # shorter prefixes the final state has also consumed padding tokens.
        _, final_state = self.rnn(self.embedding(x), hidden)
        logits = self.output(final_state)
        return logits, final_state

In [14]:
def evaluate(data_loader, model):
    """
    Print the next-word accuracy of `model` on the batches of `data_loader`.

    Each batch is a pair (padded prefix ids, target ids). The model is put
    into eval mode and left there; callers that continue training must call
    model.train() again.

    :param data_loader: iterable of (x, y) batches
    :param model: callable returning (scores, state); scores must have the
                  vocabulary as last dimension
    """
    model.eval()
    correct = 0
    total = 0
    print('Evaluating RNN model...')
    with torch.no_grad():
        for x, y in data_loader:
            x = torch.tensor(x).to(device)
            y = torch.tensor(y).to(device)
            y_pred, _ = model(x)
            # Flatten to (batch, vocab). A bare squeeze() would also drop the
            # batch dimension when the batch holds a single sample.
            y_pred = y_pred.reshape(-1, y_pred.size(-1))
            _, predicted = torch.max(y_pred, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        print('Accuracy: n/a (no samples)')
        return
    print('Accuracy: {:.2f}%'.format(100 * correct / total))

In [13]:
# Build the network from the hyperparameters and the pre-trained embeddings,
# then set up the loss and the optimizer used by the training loop.
model = RNNModel(
    vocab_size=len(w2i),
    embeddings=embeddings,
    embedding_size=embedding_size,
    hidden_size=HIDDEN_SIZE,
    device=device,
    use_gru=USE_GRU,
    tune_embeddings=TUNE_EMBEDDINGS,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)



In [14]:
for epoch in range(NUM_EPOCHS):
    print('Epoch: {}/{}'.format(epoch + 1, NUM_EPOCHS))
    # evaluate() switches the model to eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    for x, y in train_loader:
        x = torch.tensor(x).to(device)
        y = torch.tensor(y).to(device)
        optimizer.zero_grad()
        y_pred, _ = model(x)
        # Flatten the scores to (batch, vocab). A bare squeeze() would also
        # remove the batch dimension when the last batch holds one sample,
        # which makes the loss computation fail.
        loss = criterion(y_pred.reshape(-1, y_pred.size(-1)), y)
        loss.backward()
        optimizer.step()
    # Report validation accuracy after each epoch.
    evaluate(validation_loader, model)

Epoch: 1/10
Evaluating RNN model...
Accuracy: 16.45%
Epoch: 2/10
Evaluating RNN model...
Accuracy: 18.98%
Epoch: 4/10
Evaluating RNN model...
Accuracy: 18.90%
Epoch: 5/10
Evaluating RNN model...
Accuracy: 19.35%
Epoch: 6/10
Evaluating RNN model...
Accuracy: 19.76%
Epoch: 7/10
Evaluating RNN model...
Accuracy: 19.72%
Epoch: 8/10
Evaluating RNN model...
Accuracy: 19.71%
Epoch: 9/10
Evaluating RNN model...
Accuracy: 19.73%
Epoch: 10/10
Evaluating RNN model...
Accuracy: 20.24%


In [15]:
def save_model(base_dir='nn'):
    """
    Save the global `model`, the word mappings and the hyperparameters.

    A new directory `<base_dir>/model_<timestamp>` is created (including
    `base_dir` itself if it does not exist yet) and filled with:
      - 'model'         : the model's state_dict (torch.save)
      - 'w2i', 'i2w'    : the pickled word/id mappings
      - 'settings.json' : the hyperparameters needed to rebuild the model

    :param base_dir: parent directory for the timestamped model directory
    :return: path of the directory that was created
    """
    dt = str(datetime.now()).replace(' ', '_').replace(':', '_').replace('.', '_')
    newdir = os.path.join(base_dir, 'model_' + dt)
    # makedirs also creates the parent directory on a fresh checkout.
    os.makedirs(newdir)
    torch.save(model.state_dict(), os.path.join(newdir, 'model'))
    # The with-statement closes the files; no explicit close() is needed.
    with open(os.path.join(newdir, 'w2i'), 'wb') as f:
        pickle.dump(w2i, f)
    with open(os.path.join(newdir, 'i2w'), 'wb') as f:
        pickle.dump(i2w, f)

    settings = {
        'epochs': NUM_EPOCHS,
        'learning_rate': LEARNING_RATE,
        'batch_size': BATCH_SIZE,
        'hidden_size': HIDDEN_SIZE,
        'embedding_size': embedding_size,
        'use_gru': USE_GRU,
        'tune_embeddings': TUNE_EMBEDDINGS
    }
    with open(os.path.join(newdir, 'settings.json'), 'w') as f:
        json.dump(settings, f)
    return newdir


# Persist the model trained above into a new timestamped directory.
save_model()

## Evaluation

In [11]:
# Load model
def load_model(model_dir):
    """
    Rebuild an RNNModel from a directory written by save_model.

    The architecture is taken from `settings.json`; the vocabulary size is
    taken from the CURRENT global `w2i`, which therefore has to have the same
    size as when the model was saved (the pickled mappings in the directory
    are not read here).

    :param model_dir: directory containing 'settings.json' and 'model'
                      (with or without a trailing slash)
    :return: the model with restored weights, in eval mode, on `device`
    """
    with open(os.path.join(model_dir, 'settings.json')) as f:
        settings = json.load(f)
    m = RNNModel(len(w2i), None, settings['embedding_size'], settings['hidden_size'], device, settings['use_gru'], settings['tune_embeddings'])
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # machine. NOTE: torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    state_dict = torch.load(os.path.join(model_dir, 'model'), map_location=device)
    m.load_state_dict(state_dict)
    m.eval()
    return m

In [12]:
# Rebind `model` to the weights of a previously saved run. The directory name
# is hard-coded and has to exist locally.
model = load_model('nn/model_2024-05-21_21_14_32_520995/')

In [15]:
# Report next-word accuracy of the current `model` on the test split.
evaluate(test_loader, model)

Evaluating RNN model...
Accuracy: 20.36%


In [16]:
class WordPredictorModel:
    """
    Word-completion front end for a trained model.

    Given the words typed so far and the first characters of the current word,
    it proposes the k highest-scoring vocabulary words that start with those
    characters. The model scores and the candidate list are cached between
    calls so that typing further characters of the same word is cheap.

    :param model: callable mapping a 1-D tensor of word ids to (scores, state)
    :param predict_from_all_words: if True, candidates come from the whole
        `i2w` vocabulary; otherwise only from the first `train_vocab` ids
    """

    def __init__(self, model, predict_from_all_words=True):
        self.model = model
        self.predict_from_all_words = predict_from_all_words
        self.last_previous_words = None
        self.last_typed_characters = None
        self.y_pred = None
        self.possible_words = None

    def predict(self, previous_words, typed_characters, k):
        """
        Suggest completions for the word currently being typed.

        :param previous_words: the words preceding the current word
        :param typed_characters: characters of the current word typed so far
        :param k: maximum number of suggestions
        :return: (suggestions ordered best first, number of candidate words)
        """
        # Work on a copy: if the caller later mutates its list in place, a
        # stored reference would always compare equal and the cached scores
        # would never be refreshed.
        context = list(previous_words)
        if self.y_pred is None or self.last_previous_words != context:
            x = [w2i.get(w, w2i[UNK_SYMBOL]) for w in context]
            x = torch.tensor(x).to(device)
            # Inference only; do not build an autograd graph even when the
            # caller forgot to disable gradients.
            with torch.no_grad():
                y_pred, _ = self.model(x)
            self.y_pred = y_pred.reshape(-1)
            self.last_previous_words = context
            # The cached candidates belong to the old context.
            self.possible_words = None
            self.last_typed_characters = None

        can_narrow = (
            self.possible_words is not None
            and self.last_typed_characters is not None
            and typed_characters.startswith(self.last_typed_characters)
        )
        if can_narrow:
            # The user typed more characters: filter the previous candidates.
            self.possible_words = [i for i in self.possible_words if i2w[i].startswith(typed_characters)]
        else:
            # New context, deleted characters or a different prefix: the old
            # candidate list may be too narrow, so rebuild it.
            if self.predict_from_all_words:
                self.possible_words = [i for (i, w) in enumerate(i2w) if w.startswith(typed_characters)]
            else:
                self.possible_words = [i for i in range(train_vocab) if i2w[i].startswith(typed_characters)]
        self.last_typed_characters = typed_characters

        if not self.possible_words:
            return [], 0
        possible_pred = self.y_pred[self.possible_words]
        _, best = possible_pred.topk(min(k, len(self.possible_words)))
        best = [i2w[self.possible_words[b]] for b in best.tolist()]
        return best, len(self.possible_words)

In [17]:
# Make sure the model is in inference mode, then wrap it in the predictor
# (predict_from_all_words keeps its default, True).
model.eval()
word_predictor = WordPredictorModel(model)

In [None]:
# Run the extrinsic evaluation on the first EXTRINSIC_EVAL_SIZE test sentences
# with gradients disabled. evaluate_extrinsic comes from evaluate.py.
with torch.no_grad():
    evaluate_extrinsic(test_sentences[:EXTRINSIC_EVAL_SIZE], word_predictor, NUM_PREDICTIONS)

In [18]:
# Start the interactive demo for the predictor with gradients disabled.
# get_gui comes from gui.py; the output below shows it serving a local and a
# public URL.
with torch.no_grad():
    get_gui(word_predictor, 'recurrent neural network')

Running on local URL:  http://127.0.0.1:7862
Running on public URL: https://5a843cd8b6633298cc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


In [34]:
# Extrinsic evaluation on a second corpus, using the vocabulary and model
# built above.
guardian_test = tokenize_sentences("data/guardian_test.txt", [START_SYMBOL])
# Only the test part of the split is kept; the name guardian_test is rebound
# from the full tokenized corpus to that part.
_, _, guardian_test = train_val_test_split(guardian_test)
with torch.no_grad():
    # NUM_PREDICTIONS[-1:] restricts the run to the last entry of NUM_PREDICTIONS.
    evaluate_extrinsic(guardian_test[:EXTRINSIC_EVAL_SIZE], word_predictor, NUM_PREDICTIONS[-1:])

Tokenizing input corpus...
Tokenization ready.
Train dataset size: 20203
Validation dataset size: 6734
Test dataset size: 6735
k=4 suggestion(s)


Keystrokes evaluation: 100%|██████████| 1000/1000 [45:00<00:00,  2.70s/it] 

 Keystrokes: 67488
 All characters: 112965
 Keystroke savings: 40.26%
 Average number of possible words when correctly guessed: 55884



