In [1]:
#!conda install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch -c nvidia -y
#!pip install --upgrade --force-reinstall allennlp==2.5.0
#!git clone https://github.com/mhagiwara/realworldnlp.git
#%cd realworldnlp

In [2]:
# Optional AllenNLP installation check, left disabled.
#!allennlp test-install
#%cd /home/jupyter/realworldnlp/examples/embeddings/realworldnlp/
# Make the realworldnlp checkout the working directory.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# clone of the repository before running.
%cd /home/admin_paulhykim_altostrat_com/realworldnlp

/home/admin_paulhykim_altostrat_com/realworldnlp


In [3]:
from collections import Counter

import torch
import torch.optim as optim
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.token_embedders import Embedding
from allennlp.training import GradientDescentTrainer
from torch.nn import CosineSimilarity
from torch.nn import functional

In [4]:
from examples.embeddings.word2vec import SkipGramReader

In [5]:
# Dimensionality of each word vector (passed to the Embedding constructor).
EMBEDDING_DIM = 256
# Number of instances per batch (passed to SimpleDataLoader).
BATCH_SIZE = 256

In [6]:
class SkipGramModel(Model):
    """Skip-gram model with a full-softmax output layer.

    Each input token is looked up in ``embedding_in`` and projected by a
    bias-free linear layer to one logit per entry of the ``'token_out'``
    vocabulary namespace. ``forward`` returns the cross-entropy between those
    logits and the observed output tokens.

    Args:
        vocab: Vocabulary; the size of its ``'token_out'`` namespace sets the
            number of output logits.
        embedding_in: AllenNLP token embedder for the input tokens. Its
            ``get_output_dim()`` sets the input width of the linear layer.
        cuda_device: GPU index to move the sub-modules to. Any value below 0
            leaves them on their current device.
    """

    def __init__(self, vocab, embedding_in, cuda_device=-1):
        super().__init__(vocab)
        self.embedding_in = embedding_in
        # Size the projection from the embedder itself rather than from a
        # notebook-level constant, so the two can never disagree.
        self.linear = torch.nn.Linear(
            in_features=embedding_in.get_output_dim(),
            out_features=vocab.get_vocab_size('token_out'),
            bias=False)
        if cuda_device > -1:
            self.linear = self.linear.to(cuda_device)
            self.embedding_in = self.embedding_in.to(cuda_device)

    def forward(self, token_in, token_out):
        """Compute the training loss for a batch of (input, output) token ids.

        Args:
            token_in: Tensor of input token ids
                (assumes shape ``(batch,)`` - TODO confirm against the reader).
            token_out: Tensor of target token ids in the ``'token_out'``
                namespace, aligned with ``token_in``.

        Returns:
            A dict with a single key ``'loss'`` holding the cross-entropy loss.
        """
        embedded_in = self.embedding_in(token_in)
        logits = self.linear(embedded_in)
        loss = functional.cross_entropy(logits, token_out)

        return {'loss': loss}

In [7]:
def get_related(token: str, embedding: torch.nn.Module, vocab: Vocabulary, num_synonyms: int = 10):
    """Return the tokens whose embeddings are most cosine-similar to ``token``.

    Args:
        token: Query word. Its row is located with
            ``vocab.get_token_index(token, 'token_in')``; whatever index that
            call returns is used as-is.
        embedding: Module exposing a 2-D ``weight`` tensor with one row per
            index of the ``'token_in'`` namespace.
        vocab: Vocabulary providing the ``'token_in'`` index/token mappings.
        num_synonyms: Number of results to return (``None`` returns all).

    Returns:
        A list of ``(token, similarity)`` tuples sorted by descending
        similarity. The query token itself is included in the ranking.
    """
    token_id = vocab.get_token_index(token, 'token_in')
    index_to_token = vocab.get_index_to_token_vocabulary('token_in')

    # This is a pure lookup, so no autograd graph is needed.
    with torch.no_grad():
        weight = embedding.weight
        query = weight[token_id].unsqueeze(0).expand_as(weight)
        # Score every row in one batched call and transfer the result once,
        # instead of one similarity call and one `.item()` sync per row.
        scores = CosineSimilarity(dim=1)(weight, query).tolist()

    sims = Counter()
    # `candidate` (not `token`) so the query argument is not shadowed.
    for index, candidate in index_to_token.items():
        sims[candidate] = scores[index]

    return sims.most_common(num_synonyms)

In [8]:
# Read the text8 corpus from the given URL with the SkipGramReader imported
# from examples/embeddings/word2vec.py.
reader = SkipGramReader()
text8 = reader.read('https://realworldnlpbook.s3.amazonaws.com/data/text8/text8')

In [9]:
# Materialize everything the reader yields so it can be counted and sliced.
text8 = list(text8)
print(len(text8))
# Keep only the first 1,000,000 instances.
# NOTE(review): `text8` is rebound here, so re-running this cell prints the
# truncated length rather than the full count.
text8 = text8[:1000000]

Your label namespace was 'token_in'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.
Your label namespace was 'token_out'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.


9999970


In [10]:
# Build the vocabulary from the instances, applying a minimum count of 5 to
# both the 'token_in' and 'token_out' namespaces.
vocab = Vocabulary.from_instances(
    text8, min_count={'token_in': 5, 'token_out': 5})

building vocab:   0%|          | 0/1000000 [00:00<?, ?it/s]

In [11]:
# Batch the instances BATCH_SIZE at a time and attach the vocabulary to the loader.
# NOTE(review): no `shuffle` argument is passed; confirm that in-order batches
# are intended.
data_loader = SimpleDataLoader(text8, batch_size=BATCH_SIZE)
data_loader.index_with(vocab)

In [12]:
# Input embedding table: one EMBEDDING_DIM-sized vector per entry of the
# 'token_in' namespace.
embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                         embedding_dim=EMBEDDING_DIM)

In [13]:
# Build the model on GPU 0 when CUDA is available, otherwise keep it on the
# CPU (-1 is the "no GPU" value SkipGramModel checks for).
model = SkipGramModel(vocab=vocab,
                      embedding_in=embedding_in,
                      cuda_device=0 if torch.cuda.is_available() else -1)

In [14]:
# Adam with its default hyperparameters over all of the model's parameters.
optimizer = optim.Adam(model.parameters())

In [15]:
# Train for five epochs, on GPU 0 when CUDA is available and on the CPU (-1)
# otherwise.
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=data_loader,
    num_epochs=5,
    cuda_device=0 if torch.cuda.is_available() else -1)

In [16]:
# Run training; the returned metrics dict is shown as the cell output.
trainer.train()

  0%|          | 0/3907 [00:00<?, ?it/s]

  0%|          | 0/3907 [00:00<?, ?it/s]

  0%|          | 0/3907 [00:00<?, ?it/s]

  0%|          | 0/3907 [00:00<?, ?it/s]

  0%|          | 0/3907 [00:00<?, ?it/s]

{'best_epoch': 4,
 'peak_worker_0_memory_MB': 5502.75,
 'peak_gpu_0_memory_MB': 144.255859375,
 'training_duration': '0:01:22.019050',
 'epoch': 4,
 'training_loss': 6.703777892588959,
 'training_worker_0_memory_MB': 5502.75,
 'training_gpu_0_memory_MB': 144.255859375}

In [17]:
# Nearest neighbours of 'one' by cosine similarity over the input embeddings
# (default of 10 results, the query word included).
print(get_related('one', embedding_in, vocab))

[('one', 1.0), ('six', 0.6718567609786987), ('nine', 0.6529061198234558), ('seven', 0.6342512965202332), ('eight', 0.6255412101745605), ('print', 0.5833466053009033), ('ca', 0.5806261301040649), ('babenberg', 0.5540953278541565), ('comics', 0.5510942339897156), ('five', 0.5507804751396179)]


In [18]:
# Nearest neighbours of 'december' by cosine similarity over the input
# embeddings (default of 10 results, the query word included).
print(get_related('december', embedding_in, vocab))

[('december', 1.0), ('april', 0.8366215229034424), ('banquet', 0.8100413680076599), ('wedding', 0.7887839674949646), ('freeman', 0.7791705131530762), ('births', 0.7750635743141174), ('dienne', 0.7722298502922058), ('july', 0.7683294415473938), ('breed', 0.7642992734909058), ('montana', 0.7613508105278015)]


In [19]:
# Sanity check: create a random 5x3 tensor, move it to the CUDA device and
# print it. This cell requires a working GPU.
import torch
x = torch.rand(5, 3).to("cuda")
print(x)

tensor([[0.1642, 0.9040, 0.2277],
        [0.2589, 0.9153, 0.7911],
        [0.6089, 0.0236, 0.8649],
        [0.0117, 0.5031, 0.7691],
        [0.2925, 0.6072, 0.5689]], device='cuda:0')
