In [0]:
# http://pytorch.org/
!pip3 install torch -U # --force
!pip3 install torchvision -U  #--force
!pip3 install --force https://github.com/chengs/tqdm/archive/colab.zip
!pip3 install bokeh

https://github.com/theeluwin/pytorch-sgns

In [0]:
import pandas as pd
import numpy as np

from collections import Counter
import operator
import random

import urllib.request

import os

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import torch.nn.functional as F

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.io import show, output_notebook, push_notebook, export_png

In [0]:
class SkipGram(nn.Module):
    
    def __init__(self, embedding_dim, vocab_size):
        super(SkipGram, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        
        self.in_embedding = nn.Embedding(num_embeddings = self.vocab_size, embedding_dim = self.embedding_dim)
        self.out_embedding = nn.Embedding(num_embeddings = self.vocab_size, embedding_dim =self.embedding_dim)
        
        self.in_embedding.weight = nn.Parameter(torch.cat([torch.zeros(1, self.embedding_dim), torch.FloatTensor(self.vocab_size - 1, self.embedding_dim).uniform_(-0.5 / self.embedding_dim, 0.5 / self.embedding_dim)]))
        self.out_embedding.weight = nn.Parameter(torch.cat([torch.zeros(1, self.embedding_dim), torch.FloatTensor(self.vocab_size - 1, self.embedding_dim).uniform_(-0.5 / self.embedding_dim, 0.5 / self.embedding_dim)]))
        
        self.in_embedding.weight.requires_grad = True
        self.out_embedding.weight.requires_grad = True
        
    def forward(self, center, context):
#         batch_size = center.size()[0]
#         context_size = context.size()[1]        
        center_vectors = self.in_embedding(center).unsqueeze(2)
        context_vectors = self.out_embedding(context)
        
        loss = torch.bmm(context_vectors, center_vectors).squeeze().sigmoid().log().mean(1)
        
#         loss = torch.mean(torch.bmm(context_vectors, center_vectors) * center.shape[0])
            
        return -loss.mean()
        

In [0]:
class Preprocess(object):
    
    def __init__(self, window_size, unk):
        self.window = window_size
        self.unk = unk
        
    def skipgram(self, sentence, i):
        center = sentence[i]
        left = sentence[max(i-self.window, 0): i]
        right = sentence[i:i+1: i+1+self.window]
                
        return center, [self.unk for _ in range(self.window - len(left))] + left + right + [self.unk for _ in range(self.window - len(right))]
    
    def build(self, file, url=False, subsampling=True, threshold=1e-5, word2idx=None):
        print("Creating vocab")
        
        self.sentences = []
        self.wc = {self.unk: 1}
        for line in file.readlines():
            sent = []
            for word in line.split():
                if url:
                    w = word.decode()
                else:
                    w = word
                sent.append(w)
                    
                self.wc[w] = self.wc.get(w, 0) + 1
            self.sentences.append(sent)
                                
        self.idx2word = [self.unk] + sorted(self.wc, key=self.wc.get, reverse=True)      
        self.word2idx = {self.idx2word[idx]: idx for idx, _ in enumerate(self.idx2word)}
        
        if word2idx is not None:
            self.word2idx = word2idx
        
        self.vocab = set([word for word in self.word2idx])
        self.vocab_size = len(self.vocab)
                
        print("Done with building vocab")
        
        if subsampling:
            self.subsampling(self.wc, threshold)
        
        self.convert(subsampling, url)
        
    def subsampling(self, counts, threshold):
        
        N = sum(counts.values())
        
        freqs = {w: c/N for (w,c) in counts.items()}
        
        discard_table = {w:1-np.sqrt(threshold/f) for (w,f) in freqs.items()}
        
        self.discard_table = discard_table
        
    def discard(self, word_id):
        return random.random() > self.discard_table[word_id]
        
    def convert(self, subsampling, url):
        print("Converting corpus")
        data = []
                
        for sent in self.sentences:
            for i in range(len(sent)):
                center, contexts = self.skipgram(sent, i)
                data.append((self.word2idx[center], np.array([self.word2idx[context] for context in contexts])))
            
        self.data = data

        print("Done")


In [0]:
window_size = 5
unk = "<unk>"

print("Train...")
# url_train = 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt'
# response = urllib.request.urlopen(url_train)

f = open(filedir + 'proteins.train.txt', 'r')
preprocess_train = Preprocess(window_size=window_size, unk=unk)
preprocess_train.build(file=f, subsampling=False)
f.close()

print("Valid...")

f = open(filedir + 'proteins.val.txt', 'r')
# url_valid = 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt'
preprocess_valid = Preprocess(window_size=window_size, unk=unk)
preprocess_valid.build(file=f, subsampling=False, word2idx=preprocess_train.word2idx)
f.close()

Train...
Creating vocab
Done with building vocab
Converting corpus


In [0]:
output_notebook()

source_train = ColumnDataSource(data=dict(aa=list(preprocess_train.wc.keys()), count=list(preprocess_train.wc.values())))
source_valid = ColumnDataSource(data=dict(aa=list(preprocess_valid.wc.keys()), count=list(preprocess_valid.wc.values())))
p = figure(plot_width = 600, plot_height = 600, x_range = source.data['aa'], title="Amino acid frequencies")

p.vbar(x='aa', top='count', source=source_train, width=0.9, legend='train')
p.vbar(x='aa', top='count', source=source_valid, width=0.9, color='orange', legend='valid')

show(p)

In [0]:
net = SkipGram(vocab_size = preprocess_train.vocab_size+1, embedding_dim=50)
print(net)

SkipGram(
  (in_embedding): Embedding(22, 50)
  (out_embedding): Embedding(22, 50)
)


In [0]:
optimizer = optim.Adam(net.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
# optimizer.param_groups

In [0]:
train_loader = data_utils.DataLoader(preprocess_train.data, batch_size=128, shuffle=True)
valid_loader = data_utils.DataLoader(preprocess_valid.data, batch_size=128, shuffle=True)

In [0]:
use_cuda = torch.cuda.is_available()

if use_cuda:
    print("Cuda available")
    net.cuda()

Cuda available


In [0]:
from tqdm import tqdm

valid_loss = []
train_loss = []

epochs = 100
for epoch in range(epochs):
    pbar_train = tqdm(train_loader,position=0)
    pbar_valid = tqdm(valid_loader,position=0)

    pbar_train.set_description("[Epoch {}, train]".format(epoch+1))
    pbar_valid.set_description("[Epoch {}, valid]".format(epoch+1))
    
    running_loss, running_length = 0, 0
    
    net.train()
    for center, contexts in pbar_train:
        
        center = center.long()
        contexts = contexts.long()
        
        if use_cuda:
            center = center.cuda()
            contexts = contexts.cuda()
        
        loss = net(center, contexts)                
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        running_length += center.shape[0]
        
        pbar_train.set_postfix(loss=running_loss, perplexity=np.exp(running_loss))
        
    train_loss.append(running_loss)

    net.eval()
    running_loss, running_length = 0, 0 
    
    for center, context in pbar_valid:
        center = center.long()
        context = context.long()
        
        if use_cuda:
            center = center.cuda()
            context = context.cuda()
        
        loss = net(center, context)
        
        running_loss += loss.item()
        running_length += center.shape[0]
        
        pbar_valid.set_postfix(loss=running_loss, perplexity=np.exp(running_loss))
        
    valid_loss.append(running_loss)
    
    # early stopping check
    if len(valid_loss) > 5 and all([round(np.exp(i), 4) == 1.0 for i in valid_loss[-5:]]):
        print("Perplexity has been 1 for last 5 rounds, early stopping.")
        break
    
#     print("Epoch: {}, Train loss: {:.3f}, Perplexity: {:.3f}".format(epoch+1, train_loss[-1], np.exp(train_loss[-1])))

#     print("Epoch: {}, Valid loss: {:.3f}, Perplexity: {:.3f}".format(epoch+1, valid_loss[-1], np.exp(valid_loss[-1])))

[Epoch 1, valid]: : 0it [00:00, ?it/s]

[Epoch 2, valid]: : 0it [00:00, ?it/s]

[Epoch 3, valid]: : 0it [00:00, ?it/s]

[Epoch 4, valid]: : 0it [00:00, ?it/s]

[Epoch 5, valid]: : 0it [00:00, ?it/s]

[Epoch 6, valid]: : 0it [00:00, ?it/s]

Perplexity has been 1 for last 5 rounds, early stopping.






In [0]:
# save model
import datetime
import os
try:
    os.mkdir('data')
except: pass

name = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M") + "_model.pt"

torch.save(net.state_dict(), 'data/' + name)

# check where file is saved
print( os.getcwd() )
print( os.listdir('data') )

# save to local machine
from google.colab import files
files.download('data/'+name) 

/content
['2018_12_04-13_23_model.pt']


In [0]:
output_notebook()

runned_epochs = list(range(0, len(train_loss)+1))

train_loss = ColumnDataSource(data=dict(epochs=runned_epochs, loss=train_loss))
valid_loss = ColumnDataSource(data=dict(epochs=runned_epochs, loss=valid_loss))

p = figure(plot_width=400, plot_height=400, title = "Losses")
p.line(x='epochs', y='loss', color='blue', alpha=0.7, source=train_loss, legend='Training')
p.line(x='epochs', y='loss', color='red', alpha=0.7, source=valid_loss, legend='Validation')

tooltips = [("epoch", "@epochs"), ("loss", "@loss")]

p.add_tools(HoverTool(tooltips=tooltips))

show(p)

TypeError: ignored

In [0]:
url_train = 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt'
preprocess_test = Preprocess(window_size=window_size, unk=unk)
preprocess_test.build(url_train, subsampling=False, word2idx=preprocess_train.word2idx)

Creating vocab
Done with building vocab
Converting corpus
Done


In [0]:
model = TSNE(n_components=2, perplexity=30, n_iter=5000, method='exact', verbose=1)

idx2vec = net.in_embedding.weight.data.cpu().numpy()
word2idx = preprocess_train.word2idx

top_k_words = 1000

words_train = sorted(preprocess_train.wc, key=preprocess_train.wc.get, reverse=True)
words_train_array = np.array(words_train)
sampled_words_train = words_train_array[:top_k_words] #np.random.choice(words_train_array, size=1000)

X_train = np.array([idx2vec[word2idx[word]] for word in sampled_words_train])
X_train = model.fit_transform(X_train)

words_test = sorted(preprocess_test.wc, key=preprocess_test.wc.get, reverse=True)
words_test_array = np.array(words_test)
sampled_words_test = words_test_array[:top_k_words] # np.random.choice(words_test_array, size=1000)

X_test=np.array([idx2vec[word2idx[word]] for word in sampled_words_test])
X_test=model.fit_transform(X_test)


In [0]:
output_notebook()
source_train = ColumnDataSource(
    data=dict(x=X_train[:,0], y=X_train[:,1], w=sampled_words_train)
)
source_test = ColumnDataSource(
    data=dict(x=X_test[:,0], y=X_test[:,1], w=sampled_words_test)
)
hover_tool = HoverTool(tooltips=[("word", "@w")])

p = figure(plot_width=600, plot_height=600, title="t-SNE, top {} words".format(top_k_words))
p.scatter('x', 'y', color='blue', alpha=0.7, source=source_train, name="train", legend="train")
p.scatter('x', 'y', color='orange', alpha=0.7, source=source_test, name="test", legend="test")

p.add_tools(hover_tool)

show(p)

In [0]:
test_loader = data_utils.DataLoader(preprocess_test.data, batch_size=128, shuffle=True)

test_loss = 0

for center, contexts in test_loader:
    
    test_loss += net(center.cuda(), contexts.cuda()).item()
    

print(test_loss, np.exp(test_loss))


1.958566935467143e-06 1.0000019585688535


In [0]:
net_cpu = SkipGram(vocab_size = preprocess_train.vocab_size+1, embedding_dim=50)
net_cpu.load_state_dict(torch.load('data/2018_12_04-13_23_model.pt', map_location=torch.device('cpu'))) #os.listdir('data')
net_cpu.eval()

SkipGram(
  (in_embedding): Embedding(10000, 50)
  (out_embedding): Embedding(10000, 50)
)

In [0]:
center
torch.tensor(center).long()

tensor(3)

In [0]:
random_idx = np.random.randint(low=0, high=len(preprocess_test.sentences))
random_sentence = preprocess_test.sentences[random_idx]
print("Sentence")
print(" ".join(random_sentence), "\n")

for i in range(len(random_sentence)):
    center, contexts = preprocess_test.skipgram(random_sentence, i)
        
    center = preprocess_train.word2idx[center]
    contexts = [preprocess_train.word2idx[w] for w in contexts]
    
    center = torch.tensor(center)
    contexts = torch.tensor(contexts)
        
    center_vec = net_cpu.in_embedding(center)
    context_vec = net_cpu.out_embedding(contexts)
    
    output = torch.matmul(context_vec, center_vec)
    
    print("Center: %s" % preprocess_train.idx2word[center.item()])
    print("Contexts")
    
    for i, w in enumerate(contexts):
        print("Word {}\t output {:.3f}".format(preprocess_train.idx2word[w.item()], output[i].item()))
        
    
    


Sentence
european drama has had better though still mixed fortunes 

Center: european
Contexts
Word <unk>	 output 25.416
Word <unk>	 output 25.416
Word <unk>	 output 25.416
Word <unk>	 output 25.416
Word <unk>	 output 25.416
Word european	 output 19.724
Word <unk>	 output 25.416
Word <unk>	 output 25.416
Word <unk>	 output 25.416
Word <unk>	 output 25.416
Center: drama
Contexts
Word <unk>	 output 23.455
Word <unk>	 output 23.455
Word <unk>	 output 23.455
Word <unk>	 output 23.455
Word european	 output 18.263
Word drama	 output 15.684
Word <unk>	 output 23.455
Word <unk>	 output 23.455
Word <unk>	 output 23.455
Word <unk>	 output 23.455
Center: has
Contexts
Word <unk>	 output 31.316
Word <unk>	 output 31.316
Word <unk>	 output 31.316
Word european	 output 24.323
Word drama	 output 20.873
Word has	 output 29.232
Word <unk>	 output 31.316
Word <unk>	 output 31.316
Word <unk>	 output 31.316
Word <unk>	 output 31.316
Center: had
Contexts
Word <unk>	 output 29.901
Word <unk>	 output 29.901
W

In [0]:
net_

ValueError: ignored