In [1]:
import urllib.request
from os.path import isfile, isdir, join
from os import listdir
import numpy as np
from collections import defaultdict, Counter
import pandas as pd
import itertools
import time
import random

import nltk, string, re
nltk.download("punkt")
from nltk.tokenize import sent_tokenize, word_tokenize
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer

# RegEx to capture punctuation and numbers
punct = re.compile(f'^[{re.escape(string.punctuation)}]+$')
num = re.compile("^[\d.]+$")

[nltk_data] Downloading package punkt to /home/max/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
!pip install python-datamuse



In [3]:
from datamuse import datamuse
import re

if not isfile("word_replacements.csv"):
  word_replacements = {}
else:
  word_replacements = pd.read_csv("word_replacements.csv", header=None, index_col=0, squeeze=True).to_dict()

api = datamuse.Datamuse()

def get_similar_words(word, n=25):

  if word not in word_replacements.keys():
    query_result = api.words(sl=word)

    score = 100
    sim_words = []
    i = 0
    while (score > 90 and len(sim_words) < n):
      try:
          word = query_result[i]['word']
      except IndexError:
          break;
      word = re.sub('[^A-Za-z0-9]+', '', word)
      sim_words.append(word)
      score = query_result[i]['score']
      i += 1
    word_replacements[word] = list(set(sim_words))
  return word_replacements[word]

In [6]:
vocab = list(set(itertools.chain(*trn_texts)))

In [7]:
def context_target_pairs(data, window_size, total_vocab):
    tot_length = window_size*2
    for seq in data:
        text_len = len(seq)
        for idx, word in enumerate(seq):
            context_word = []
            target   = []            
            begin = idx - window_size
            end = idx + window_size + 1
            context_word.append([seq[i] for i in range(begin, end) if 0 <= i < text_len and i != idx])
            target.append(word)
            contextual = sequence.pad_sequences(context_word, total_length=tot_length)
            target = np_utils.to_categorical(target, total_vocab)
            yield(contextual, target) 

In [8]:
import torch, torch.nn, torch.utils, torch.optim
import torchtext
from torchtext.datasets import imdb
from datasets import load_dataset

In [9]:
tokenizer = Tokenizer(WordLevel(unk_token="UNK"))
tokenizer.add_special_tokens(['<pad>', 'UNK'])
tokenizer.pre_tokenizer = Whitespace()
trainer = WordLevelTrainer()
tokenizer.train(['arxiv/trn-arxiv.txt'], trainer)
tokenizer.enable_padding(pad_id=tokenizer.token_to_id('<pad>'), pad_token='<pad>')

dataset = load_dataset('text', data_files={'train': 'arxiv/trn-arxiv.txt', 'dev': 'arxiv/dev-arxiv.txt'})

def collate(examples):
    texts = [ex['text'] for ex in examples[:1000]]
    return torch.LongTensor([text.ids for text in tokenizer.encode_batch(texts)])

Using custom data configuration default-61e398a81f070b0e
Reusing dataset text (/home/max/.cache/huggingface/datasets/text/default-61e398a81f070b0e/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
class cbow_classifier(torch.nn.Module):
    
    def __init__(self, vocab_size, input_dim=16, hidden_dim=64, dropout=0):
        super().__init__()               
        self.embedding = torch.nn.Embedding(vocab_size, input_dim)
        self.hidden_layer = torch.nn.Linear(input_dim, hidden_dim)
        self.top_layer = torch.nn.Linear(hidden_dim, vocab_size)
        self.dropout = torch.nn.Dropout(dropout)
    
    def forward(self, texts):
        embedded = self.embedding(texts)
        cbow = embedded.mean(dim=0)
        cbow_drop = self.dropout(cbow)
        hidden = torch.relu(self.hidden_layer(cbow_drop))
        scores = self.top_layer(hidden)
        return scores 



In [19]:
def evaluate_validation(scores, loss_function, truth):
    guesses = scores.argmax(dim=1)
    n_correct = (guesses == truth).sum().item()
    for i in range(int(guesses.cpu().numpy().shape[0])):
        scores[i][guesses[i]] = 0
    guesses_2 = scores.argmax(dim=1)
    n_correct_2 = n_correct + (guesses == truth).sum().item()
    
    return n_correct, n_correct_2, loss_function(scores, truth).item()


def eval_with_sound(scores, sounds_like, truth):
    max_guesses = []
    max_guesses_2 = []
    for i, score_vec in enumerate(scores):
        sound_like_vec = sounds_like[i]
        sound_scores = score_vec[sound_like_vec.ids]
        if len(sound_scores) == 0:
            max_guesses.append(0)
            max_guesses_2.append(0)
            continue
        max_guesses.append(sound_like_vec.ids[sound_scores.argmax(dim=0)])
        if len(sound_scores) == 1:
            max_guesses_2.append(0)
            continue
        sound_scores[sound_scores.argmax(dim=0)] = 0
        max_guesses_2.append(sound_like_vec.ids[sound_scores.argmax(dim=0)])
    n_correct = (torch.LongTensor(max_guesses).cuda() == truth).sum().item()
    n_correct_2 = n_correct + (torch.LongTensor(max_guesses_2).cuda() == truth).sum().item()
    return n_correct, n_correct_2

In [31]:
model = cbow_classifier(vocab_size=len(vocab))   
model.train()

model.to('cuda')

train_iter = torch.utils.data.DataLoader(dataset["train"], batch_size=128, collate_fn=collate)
dev_iter = torch.utils.data.DataLoader(dataset["dev"], batch_size=64, collate_fn=collate)

loss_function = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=0.001)


history = defaultdict(list)

def train_model(model, epochs, reset=False):
    
    if reset:
        model = cbow_classifier(vocab_size=len(vocab))   
        model.train()

        model.to('cuda')

    for i in range(epochs):

        t0 = time.time()

        loss_sum = 0
        n_batches = 0
        model.train()

        for b, batch in enumerate(train_iter):

            label_idxs = [random.randint(0,11) for seq in batch]
            labels = torch.cat(tuple([batch[i][label_idxs[i]:label_idxs[i]+1] for i in range(len(label_idxs))]))
            for j in range(len(label_idxs)):
                batch[j] = torch.cat((batch[j][:label_idxs[j]], batch[j][label_idxs[j]+1:], batch[j][label_idxs[j]:label_idxs[j]+1]))

            batch = batch.transpose(1,0).to('cuda')

            scores = model(batch.cuda())

            loss = loss_function(scores, labels.cuda())

            optimizer.zero_grad()            
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            n_batches += 1

        train_loss = loss_sum / n_batches
        history['train_loss'].append(train_loss)

        n_correct = 0
        n_valid = len(dev_iter) * 64
        loss_sum = 0
        n_batches = 0

        model.eval()

        for b, batch in enumerate(dev_iter):
            label_idxs = [random.randint(0,11) for seq in batch]
            labels = torch.cat(tuple([batch[i][label_idxs[i]:label_idxs[i]+1] for i in range(len(label_idxs))]))
            for j in range(len(label_idxs)):
                batch[j] = torch.cat((batch[j][:label_idxs[j]], batch[j][label_idxs[j]+1:], batch[j][label_idxs[j]:label_idxs[j]+1]))
            scores = model(batch.transpose(1,0).cuda())
            n_corr_batch, _, loss_batch = evaluate_validation(scores, loss_function, labels.cuda())
            loss_sum += loss_batch
            n_correct += n_corr_batch
            n_batches += 1
        val_acc = n_correct / n_valid
        val_loss = loss_sum / n_batches

        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)        

        t1 = time.time()

        if (i+1) % 10 == 0:

            print(f'Epoch {i+1}: train loss = {train_loss:.4f}, val loss = {val_loss:.4f}, val acc: {val_acc:.4f}, time = {t1-t0:.4f}')


In [32]:

epochs_per_step = 20
total_epochs = 0

for i in range(5):
    valid_seen = 0
    n_correct = 0
    n_correct_s = 0
    n_correct_2 = 0
    n_correct_2_s = 0
    train_model(model, epochs_per_step)
    model.eval()
    for b, batch in enumerate(dev_iter):
            label_idxs = [random.randint(0,11) for seq in batch]
            labels = torch.cat(tuple([batch[i][label_idxs[i]:label_idxs[i]+1] for i in range(len(label_idxs))]))

            sims = [get_similar_words(tokenizer.decode([label])) for label in labels]
            labels_sound_like = [tokenizer.encode(" ".join(sim)) for sim in sims]


            for j in range(len(label_idxs)):
                batch[j] = torch.cat((batch[j][:label_idxs[j]], batch[j][label_idxs[j]+1:], batch[j][label_idxs[j]:label_idxs[j]+1]))
            scores = model(batch.transpose(1,0).cuda())
            n_corr_batch, n_corr_batch_2, _ = evaluate_validation(scores, loss_function, labels.cuda())
            n_correct += n_corr_batch
            n_correct_2 += n_corr_batch_2
            n_corr_batch_s, n_corr_batch_2_s = eval_with_sound(scores, labels_sound_like, labels.cuda())
            n_correct_s += n_corr_batch_s
            n_correct_2_s += n_corr_batch_2_s
            if b% 10 == 0:
                pass
            valid_seen = b * 64
            if b >=30:
                break
    val_acc = n_correct / valid_seen
    print("Epoch", (i+1)*epochs_per_step, " Top-1 Accuracy, No Filter:", val_acc)
    val_acc_2 = n_correct_2 / valid_seen
    print("Epoch", (i+1)*epochs_per_step, " Top-2 Accuracy, No Filter:", val_acc_2)
    val_acc_s = n_correct_s / valid_seen
    print("Epoch", (i+1)*epochs_per_step, " Top-1 Accuracy, With Filter:", val_acc_s)
    val_acc_2_s = n_correct_2_s / valid_seen
    print("Epoch", (i+1)i*epochs_per_step, " Top-2 Accuracy, With Filter:", val_acc_2_s)
0.

Epoch 10: train loss = 7.3366, val loss = 7.8378, val acc: 0.0843, time = 5.5491
Epoch 20: train loss = 6.8520, val loss = 7.3830, val acc: 0.0866, time = 5.4632
Epoch 0  Top-1 Accuracy, No Filter: 0.08177083333333333
Epoch 0  Top-2 Accuracy, No Filter: 0.16354166666666667
Epoch 0  Top-1 Accuracy, With Filter: 0.6447916666666667
Epoch 0  Top-2 Accuracy, With Filter: 0.8442708333333333
Epoch 10: train loss = 6.6207, val loss = 7.2157, val acc: 0.0853, time = 5.4295
Epoch 20: train loss = 6.4575, val loss = 7.1387, val acc: 0.0897, time = 5.4296
Epoch 20  Top-1 Accuracy, No Filter: 0.08489583333333334
Epoch 20  Top-2 Accuracy, No Filter: 0.16979166666666667
Epoch 20  Top-1 Accuracy, With Filter: 0.6984375
Epoch 20  Top-2 Accuracy, With Filter: 0.91875
Epoch 10: train loss = 6.3587, val loss = 6.9904, val acc: 0.0868, time = 5.3490
Epoch 20: train loss = 6.2818, val loss = 6.9272, val acc: 0.0896, time = 5.3845
Epoch 40  Top-1 Accuracy, No Filter: 0.08802083333333334
Epoch 40  Top-2 Accur