In [1]:
import argparse
import os
import shutil

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data.dataloader import DataLoader

from dpp_nets.utils.io import make_embd, make_tensor_dataset
from dpp_nets.my_torch.utilities import pad_tensor

from dpp_nets.layers.layers import DeepSetBaseline
from torch.utils.data import DataLoader


import string
import nltk
import string
import numpy as np
import torch
import nltk

from nltk.corpus import stopwords
import torch
import torch.nn as nn
from collections import OrderedDict

import gzip
from torch.utils.data import Dataset

In [96]:
data_path = '/Users/Max/data/beer_reviews/reviews.all.train.chunks.txt.gz'
word_path = '/Users/Max/data/beer_reviews/reviews.all.train.words.txt.gz'
embd_path = '/Users/Max/data/beer_reviews/review+wiki.filtered.200.txt.gz'

In [126]:
class Vocabulary:
    
    def __init__(self):
        
        # Basic Indexing
        self.word2index = {}
        self.index2word = {}
        
        # Keeping track of vocabulary
        self.vocab_size = 0 
        self.word2count = {}
        
        # Vector Dictionaries
        self.pretrained = {}
        self.random = {}
        self.word2vec = {}
        self.index2vec = {}

        # Set of Stop Words
        self.stop_words = set()
        
        self.Embedding = None
        self.EmbeddingBag = None
    
    def setStops(self):
        
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
        make_stops = set(string.punctuation + '\n' + '\t' + '...')
        unmake_stops = set(('no', 'not'))

        self.stop_words = self.stop_words.union(make_stops)
        self.stop_words = self.stop_words.difference(unmake_stops)      
        
    def loadPretrained(self, embd_path):
        
        self.pretrained = {}
        with gzip.open(embd_path, 'rt') as f:
            for line in f:
                line = line.strip()
                if line:
                    word, *embd = line.split()
                    vec = torch.FloatTensor([float(dim) for dim in embd])            
                    self.pretrained[word]  = vec
                    
    def loadCorpus(self, word_path):
        
        with gzip.open(data_path, 'rt') as f:

            for line in f:
                _, review = line.split('\D')
                review = tuple(tuple(chunk.split('\W')) for chunk in review.split('\T'))

                for words in review:
                    vocab.addWords(words)
            
    def addWords(self, words):
        """
        words: seq containing variable no of words
        """
        for word in words:
            self.addWord(word)

    def addWord(self, word):

        if word not in self.word2index:
            
            # Keeping track of vocabulary
            self.vocab_size += 1
            self.word2count[word] = 1
            
            # Basic Indexing
            self.word2index[word] = self.vocab_size
            self.index2word[self.vocab_size] = word
            
            # Add word vector
            if word in self.pretrained:
                vec = self.pretrained[word]
                self.word2vec[word] = vec
                self.index2vec[self.vocab_size] = vec
                
            else:
                vec = torch.randn(200)
                self.random[word] = vec
                self.word2vec[word] = vec
                self.index2vec[self.vocab_size] = vec
        else:
            self.word2count[word] += 1
            
    def updateEmbedding(self):
        
        vocab_size = len(self.index2vec) + 1
        EMBD_DIM = 200
        
        self.Embedding = nn.Embedding(vocab_size, EMBD_DIM, padding_idx=0)
        self.EmbeddingBag = nn.EmbeddingBag(vocab_size, EMBD_DIM)
        embd_matrix = torch.zeros(vocab_size, EMBD_DIM)
        
        for ix, vec in vocab.index2vec.items():
            embd_matrix[ix] = vec
        
        embd_dict = OrderedDict([('weight', embd_matrix)])
        self.Embedding.load_state_dict(embd_dict)
        self.EmbeddingBag.load_state_dict(embd_dict)
    
    def checkWord(self, word, min_count):
        if word not in vocab.stop_words and word in vocab.word2index and vocab.word2index[word] > min_count:
            return word
            
    def filterReview(self, review):
        """
        review should be like our data set
        """
        f_review = []
        seen = set()
        
        for tup in review:
            f_tuple = []
            
            for word in tup:
                word = self.checkWord(word, 10)
                if word:
                    f_tuple.append(word)
            
            f_tuple = tuple(f_tuple)    
            
            if f_tuple and f_tuple not in seen:
                seen.add(f_tuple)
                f_review.append(f_tuple)
                
        return f_review
    
    def mapIndicesBatch(self, reviews):
        
        f_review = []
        offset = []
        i = 0

        for review in reviews:
            seen = set()
            
            for tup in review: 
                f_tuple = []
                
                for word in tup:
                    word = vocab.checkWord(word, 10)
                    if word:
                        f_tuple.append(word)

                f_tuple = tuple(f_tuple)    

                if f_tuple and f_tuple not in seen:
                    seen.add(f_tuple)
                    f_review.extend([vocab.word2index[word] for word in f_tuple])
                    offset.append(i)
                    i += len(f_tuple)
            
        f_review, offset = torch.LongTensor(f_review), torch.LongTensor(offset)   
        return f_review, offset
    
    def mapIndices(self, review):
        
        f_review = []
        offset = []
        seen = set()
        i = 0

        for tup in review:
            f_tuple = []

            for word in tup:
                word = vocab.checkWord(word, 10)
                if word:
                    f_tuple.append(word)

            f_tuple = tuple(f_tuple)    

            if f_tuple and f_tuple not in seen:
                seen.add(f_tuple)
                f_review.extend([vocab.word2index[word] for word in f_tuple])
                offset.append(i)
                i += len(f_tuple)

        f_review, offset = torch.LongTensor(f_review), torch.LongTensor(offset)   
        return f_review, offset
    
    def returnEmbds(self, review):
        
        f_review = []
        offset = []
        seen = set()
        i = 0

        for tup in review:
            f_tuple = []

            for word in tup:
                word = vocab.checkWord(word, 10)
                if word:
                    f_tuple.append(word)

            f_tuple = tuple(f_tuple)    

            if f_tuple and f_tuple not in seen:
                seen.add(f_tuple)
                f_review.extend([vocab.word2index[word] for word in f_tuple])
                offset.append(i)
                i += len(f_tuple)

        f_review, offset = Variable(torch.LongTensor(f_review)), Variable(torch.LongTensor(offset))
        embd = self.EmbeddingBag(f_review, offset)

        return embd

In [127]:
class BeerDataset(Dataset):
    """BeerDataset."""

    def __init__(self, data_path, aspect='all'):
        
        # Compute size of the data set      
        self.aspect = aspect
        self.vocab = vocab
        
        with gzip.open(data_path, 'rt') as f:
            self.lines = f.readlines()

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        
        # Decode
        target, review = self.lines[idx].split('\D')
        
        # Target
        target = torch.FloatTensor([float(t) for t in target.split()[:3]])
        
        # Review
        review = tuple(tuple(chunk.split('\W')) for chunk in review.split('\T'))
        #ixs, offset = self.vocab.mapIndices(review)
        
        #sample = {'ixs': ixs, 'offset': offset, 'target': target}
        sample = {'review': review, 'target': target}
        return sample

In [128]:
vocab = Vocabulary()
vocab.loadPretrained(embd_path)
vocab.setStops()
vocab.loadCorpus(word_path)
vocab.updateEmbedding()

ds = BeerDataset(data_path, vocab)

In [None]:
def my_collate(batch):
    "Puts each data field into a tensor with outer dimension batch size"
    return {'review': [d['review']for d in batch], 'target': torch.stack([d['target'] for d in batch], 0)}

In [23]:
def my_collate(batch, vocab=vocab):

    # Count sizes
    max_no_chunks = 0
    for d in batch:
        max_no_chunks = max(max_no_chunks, len(vocab.filterReview(d['review'])))
    
    # Map to Embeddings
    reps = []
    for d in batch:
        rep = vocab.returnEmbds(d['review'])
        rep = torch.cat([rep, Variable(torch.zeros(max_no_chunks + 1 - rep.size(0), rep.size(1)))], dim=0)
        reps.append(rep)
    
    data_tensor = torch.stack(reps) 
    
    # Create target vector
    # target_tensor = Variable(torch.stack([d['target'] for d in batch]))
    target_tensor = Variable(torch.stack([d['target'] for d in batch]))
    
    return data_tensor, target_tensor

In [75]:
# Solution 1 using my_collate

dl = DataLoader(ds, batch_size=50, collate_fn=my_collate)
for batch in dl:
    break
words = batch[0]

from dpp_nets.layers.layers import KernelVar, MarginalSampler, PredNet

kernel_net = KernelVar(200,500,200)
trainer = MarginalTrainer()

embd_dim = 200
hidden_dim = 500
kernel_dim = 200
enc_dim = 200
target_dim = 3

kernel_net = KernelVar(embd_dim, hidden_dim, kernel_dim)
sampler = MarginalSampler()
pred_net = PredNet(embd_dim, hidden_dim, enc_dim, target_dim)

criterion = nn.MSELoss()
activation = nn.Sigmoid()

pred = None

pred_loss = None 
reg_loss = None
loss = None

reg = 10
reg_mean = 0.1

kernel, words = kernel_net(words) # returned words are masked now!

sampler.s_ix = kernel_net.s_ix
sampler.e_ix = kernel_net.e_ix

weighted_words = sampler(kernel, words) 

pred_net.s_ix = sampler.s_ix
pred_net.e_ix = sampler.e_ix

pred = pred_net(weighted_words)

target = batch[1]

if activation:
    pred = activation(pred)

pred_loss = criterion(pred, target)

if reg:
    reg_loss = reg * (torch.stack(sampler.exp_sizes) - reg_mean).pow(2).mean()
    loss = pred_loss + reg_loss
else:
    loss = pred_loss


loss.backward()

In [47]:
vocab.EmbeddingBag.weight.grad.data.nonzero()


   11     0
   11     1
   11     2
     ⋮      
  877   197
  877   198
  877   199
[torch.LongTensor of size 154400x2]

In [15]:
from torch.autograd import Variable
words = Variable(torch.FloatTensor([[[1,2,3,4],[3,4,5,6],[0,0,0,0]],[[1,2,3,4],[0,0,0,0],[0,0,0,0]]]))

In [14]:
torch.utils.backcompat.broadcast_warning.enabled = True
torch.utils.backcompat.keepdim_warning.enabled = True
words = Variable(torch.FloatTensor([[[1,2,3,4],[3,4,5,6],[0,0,0,0]],[[1,2,3,4],[0,0,0,0],[0,0,0,0]]]))

In [177]:
# Solution 2 - using mycollate2 + new KernelNetwork
def my_collate2(batch, vocab=vocab):

    # Create indices
    s_ix, e_ix, i = [], [], 0

    for l in [len(vocab.filterReview(d['review'])) for d in batch]:
        s_ix.append(i)
        i += l
        e_ix.append(i)
    
    # Map to Embeddings
    batch_review = [review['review'] for review in batch]
    ixs, offsets =  vocab.mapIndicesBatch(batch_review)
    embd = vocab.EmbeddingBag(Variable(ixs), Variable(offsets))

    # Create target vector
    target_tensor = Variable(torch.stack([d['target'] for d in batch]))
    
    return embd, target_tensor, s_ix, e_ix

In [165]:
batch = [ds[ix] for ix in range(4)]
batch_review = [review['review'] for review in batch]

In [180]:
embd, target_tensor, s_ix, e_ix = my_collate2(batch, vocab)

In [191]:
context = []
for s, e in zip(s_ix, e_ix):
    text = embd[s:e].sum(0, keepdim=True).expand_as(embd[s:e])
    context.append(text)
context = torch.cat(context, dim=0)


In [192]:
context

Variable containing:
 -2.9850  -1.7892   2.4351  ...    1.6741  -2.4876  -5.4286
 -2.9850  -1.7892   2.4351  ...    1.6741  -2.4876  -5.4286
 -2.9850  -1.7892   2.4351  ...    1.6741  -2.4876  -5.4286
           ...               ⋱              ...            
 -0.8701  -2.0587   2.1200  ...    0.8378  -2.6831  -2.2940
 -0.8701  -2.0587   2.1200  ...    0.8378  -2.6831  -2.2940
 -0.8701  -2.0587   2.1200  ...    0.8378  -2.6831  -2.2940
[torch.FloatTensor of size 389x200]

In [167]:
ixs, offsets =  vocab.mapIndicesBatch(batch_review)
print(len(ixs), len(offsets))
for review in batch_review:
    ixs, offsets = vocab.mapIndices(review)
    print(len(ixs), len(offsets))

1117 389
356 125
141 52
389 143
231 69


In [None]:
class KernelVar(nn.Module):

    def __init__(self, embd_dim, hidden_dim, kernel_dim):
        """
        Currently, this creates a 2-hidden-layer network 
        with ELU non-linearities.

        """
        super(KernelVar, self).__init__()
        self.embd_dim = embd_dim
        self.hidden_dim = hidden_dim
        self.kernel_dim = kernel_dim

        self.layer1 = nn.Linear(2 * embd_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, kernel_dim)

        self.net = nn.Sequential(self.layer1, nn.Tanh(), self.layer2, nn.Tanh(), self.layer3)

        self.s_ix = None
        self.e_ix = None


    def forward(self, embd):
        """
        Given words, returns batch_kernel of dimension
        [-1, kernel_dim]
        """
        
        # Create context
        context = []
        for s, e in zip(self.s_ix, self.e_ix):
            text = embd[s:e].sum(0, keepdim=True).expand_as(embd[s:e])
            context.append(text)
        context = torch.cat(context, dim=0)
        batch_x = torch.cat([embd, context], dim=1)
        
        batch_kernel = self.net(batch_x)

        return batch_kernel , words 

In [172]:
# Next check
l_ixs, l_offsets =  vocab.mapIndicesBatch(batch_review)

review = batch_review[2]
ixs, offsets = vocab.mapIndices(review)

a, b = 111, 49
print(l_ixs[a+356+141],l_offsets[b+125 + 52])
print(ixs[a], offsets[b]+356+141)

233 634
233 634


In [162]:
vocab.EmbeddingBag(Variable(l_ixs), Variable(l_offsets))

Variable containing:
 3.8239e-02 -1.1865e-01 -8.4999e-02  ...  -4.0149e-02 -1.0817e-01 -8.7149e-02
-1.0188e-01 -8.4496e-02 -3.7914e-02  ...  -3.8677e-02 -5.3894e-02  4.0471e-02
-1.3008e-02  4.8421e-03  1.0499e-02  ...  -1.2304e-02 -4.7805e-02 -2.2940e-02
                ...                   ⋱                   ...                
-2.0457e-02 -6.7305e-02  5.4419e-02  ...  -1.3410e-02 -7.4654e-02 -4.6580e-02
-6.0656e-02 -2.6737e-02  1.1477e-01  ...   4.4280e-02 -9.8683e-02 -4.5009e-02
-6.0656e-02 -2.6737e-02  1.1477e-01  ...   4.4280e-02 -9.8683e-02 -4.5009e-02
[torch.FloatTensor of size 389x200]

In [148]:
a = 12

In [115]:
lengths = [len(vocab.filterReview(d['review'])) for d in batch]
cum_sum = [for i in range(len(lengths))]

In [116]:
result = [sum(numbers[:i]) for i in range(1, len(numbers)+1)]

[125, 52, 143]

In [120]:
for i, l in enumerate(lengths):
    print(i, l)

0 125
1 52
2 143


In [121]:
s_ix

[0, 2]

In [122]:
e_ix

[2, 3]

In [168]:
## 
s_ix = []
e_ix = []
i = 0
for l in [len(vocab.filterReview(d['review'])) for d in batch]:
    s_ix.append(i)
    i += l
    e_ix.append(i)

In [169]:
print(s_ix)
print(e_ix)

[0, 125, 177, 320]
[125, 177, 320, 389]


In [171]:
[len(vocab.filterReview(d['review'])) for d in batch]

[125, 52, 143, 69]