In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
from collections import Counter
import math
import numpy as np
import os
import random
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename


def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words"""
  with zipfile.ZipFile(filename) as f:
    data = f.read(f.namelist()[0]).decode().split()
  return data


url = 'http://mattmahoney.net/dc/'
filename = maybe_download('./data/text8.zip', 31344016)
words = read_data(filename)
print(words[:10])
print('Data size %d' % len(words))

Found and verified ./data/text8.zip
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
Data size 17005207


In [3]:
vocabs = set(words)
vocab_size = len(vocabs)
print(vocab_size)
print(Counter(words).most_common(10))
print(len(set(words[:1000])))

253854
[('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430), ('two', 192644)]
442


In [4]:
def build_dataset(words, vocab_size):
    counter = [('UNK', -1)]
    counter.extend(Counter(words).most_common(vocab_size-1))
    
    word2index = dict()
    index = 1
    for word, _ in counter:
        word2index[word] = index
        index += 1
    
    data = list()
    for w in words:
        if w in word2index:
            index = word2index[w]
        else:
            index = 0
        data.append(index)
    
    index2word = dict(zip(word2index.values(), word2index.keys()))
    
    return data, word2index, index2word


vocab_size = 50000
data, word2index, index2word = build_dataset(words, vocab_size)
print(len(data))
print(data[:10])
print([index2word[i] for i in data[:10]])

17005207
[5235, 3082, 13, 7, 196, 3, 3135, 47, 60, 157]
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [42]:
def generate_batch(data, begin_index, batch_size, bag_size=2):
    assert begin_index >= bag_size
    end_index = min(len(data)-2, begin_index+batch_size)
    train_batch, train_labels = list(), list()
    
    for i in range(begin_index, end_index):
        context = list()
        target = data[i]
        for j in range(1, bag_size+1):
            context.append(data[i-j])
            context.append(data[i+j])
        train_batch.append(context)
        train_labels.append(target)
    if len(train_batch) < batch_size:
        end_index, attach_batch, attach_labels = generate_batch(data, bag_size, 
                                        batch_size-len(train_batch), bag_size)
        train_batch.extend(attach_batch)
        if len(train_batch) < batch_size:
            print('error')
            print(len(train_batch))
            print(attach_batch)
        train_labels.extend(attach_labels)
    return end_index, train_batch, train_labels

begin_index = 3
batch_size = 1000
begin_index, train_batch, train_labels = generate_batch(data, begin_index, batch_size, bag_size=2)
print(begin_index)
print([index2word[i] for i in data[:10]])
print([[index2word[i] for i in bi] for bi in train_batch[:10]])
print([index2word[t] for t in train_labels[:10]])
print(train_batch[0], train_labels[0])
print(torch.max(torch.Tensor(train_labels)))
print(len(train_batch))

1003
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
[['as', 'term', 'originated', 'of'], ['a', 'of', 'as', 'abuse'], ['term', 'abuse', 'a', 'first'], ['of', 'first', 'term', 'used'], ['abuse', 'used', 'of', 'against'], ['first', 'against', 'abuse', 'early'], ['used', 'early', 'first', 'working'], ['against', 'working', 'used', 'class'], ['early', 'class', 'against', 'radicals'], ['working', 'radicals', 'early', 'including']]
['a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class']
[13, 196, 3082, 3] 7
44613.0
1000


In [45]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size, bag_size, batch_size):
        super(CBOW, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.bag_size = bag_size
        self.batch_size = batch_size
        
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        self.linear1 = nn.Linear(2*self.bag_size*self.embedding_size, self.embedding_size)
        self.linear2 = nn.Linear(self.embedding_size, self.vocab_size)
        
    def forward(self, inputs):
        embeded = self.embedding(inputs)
#         print('embed: ', embeded.size())
        embeded = embeded.view(1, -1)
#         print('embeded size: ', embeded.size())
        out = F.relu(self.linear1(embeded))
        out = self.linear2(out)
        log_probs = F.log_softmax(out)
        return log_probs

    
embedding_size = 128
bag_size = 2
batch_size = 8

cbow = CBOW(vocab_size=vocab_size, embedding_size=embedding_size, 
            bag_size=bag_size, batch_size=batch_size)
print(cbow)

CBOW (
  (embedding): Embedding(50000, 128)
  (linear1): Linear (512 -> 128)
  (linear2): Linear (128 -> 50000)
)


In [58]:
x = autograd.Variable(torch.LongTensor(train_batch[0]))
print(x)
true_y = autograd.Variable(torch.LongTensor([train_labels[0]]))
print(true_y)
y = cbow(x)
print(y.size())
_, predict_y = torch.max(y.data, 1)
print(predict_y)
print(train_labels[0])

Variable containing:
   13
  196
 3082
    3
[torch.LongTensor of size 4]

Variable containing:
 7
[torch.LongTensor of size 1]

torch.Size([1, 50000])

 6463
[torch.LongTensor of size 1x1]

7


In [72]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(cbow.parameters(), lr=0.001)

num_steps = 2
begin_index = 3
for i in range(num_steps):
#     begin_index, train_batch, train_labels = generate_batch(data, begin_index, batch_size, bag_size=2)
    c = 0
    total_loss = 0
    for j in range(1,batch_size):        
        inputs = autograd.Variable(torch.LongTensor(train_batch[j]))
    #     print('input size: ', inputs.size())
        cbow.zero_grad()
        out = cbow(inputs)
        print(out.size())
        _, predict_y = torch.max(y.data, 1)
        print(predict_y)
        try:
            true_y = cbow.embedding(autograd.Variable(torch.LongTensor([train_labels[j]])))
            predict_y = cbow.embedding(autograd.Variable(predict_y))
            predict_y = predict_y.view(-1, 128)
            print(type(true_y))
            print(true_y.size())
            print(type(predict_y))
            print(predict_y.size())
            loss = loss_function(predict_y, true_y)
            
            loss.backward()
            optimizer.step()
            c += 1
            total_loss += loss.data
            if c % 1000 == 0:
                print('total loss', total_loss)
                total_loss = 0
        except Exception as e:
            print(e)
            print('error：', train_labels[j])
        



torch.Size([1, 50000])

 6463
[torch.LongTensor of size 1x1]

<class 'torch.autograd.variable.Variable'>
torch.Size([1, 128])
<class 'torch.autograd.variable.Variable'>
torch.Size([1, 128])
nn criterions don't compute the gradient w.r.t. targets - please mark these variables as volatile or not requiring gradients
error： 196
torch.Size([1, 50000])

 6463
[torch.LongTensor of size 1x1]

<class 'torch.autograd.variable.Variable'>
torch.Size([1, 128])
<class 'torch.autograd.variable.Variable'>
torch.Size([1, 128])
nn criterions don't compute the gradient w.r.t. targets - please mark these variables as volatile or not requiring gradients
error： 3
torch.Size([1, 50000])

 6463
[torch.LongTensor of size 1x1]

<class 'torch.autograd.variable.Variable'>
torch.Size([1, 128])
<class 'torch.autograd.variable.Variable'>
torch.Size([1, 128])
nn criterions don't compute the gradient w.r.t. targets - please mark these variables as volatile or not requiring gradients
error： 3135
torch.Size([1, 50000])


In [None]:
num_points = 200

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
two_d_embeddings = tsne.fit_transform(cbow.embedding([1:num_points+1]))

In [None]:
def plot(embeddings, labels):
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  pylab.figure(figsize=(15,15))  # in inches
  for i, label in enumerate(labels):
    x, y = embeddings[i,:]
    pylab.scatter(x, y)
    pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
  pylab.show()

words = [index2word[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)