In [1]:
import argparse
import os
import shutil

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data.dataloader import DataLoader

from dpp_nets.utils.io import make_embd, make_tensor_dataset
from dpp_nets.my_torch.utilities import pad_tensor

from dpp_nets.layers.layers import DeepSetBaseline
from torch.utils.data import DataLoader


import string
import nltk
import string
import numpy as np
import torch
import nltk

from nltk.corpus import stopwords
import torch
import torch.nn as nn
from collections import OrderedDict

import gzip
from torch.utils.data import Dataset

In [2]:
# Directory holding the beer-review files. Set the BEER_DATA_DIR environment
# variable to point somewhere else; the default keeps the original location.
DATA_DIR = os.environ.get('BEER_DATA_DIR', '/Users/Max/data/beer_reviews')

# Gzipped training reviews (chunked), training reviews (words) and word vectors.
data_path = os.path.join(DATA_DIR, 'reviews.all.train.chunks.txt.gz')
word_path = os.path.join(DATA_DIR, 'reviews.all.train.words.txt.gz')
embd_path = os.path.join(DATA_DIR, 'review+wiki.filtered.200.txt.gz')

In [3]:
class Vocabulary:
    """
    Word vocabulary with per-word vectors and torch embedding layers.

    Words are numbered from 1 in order of first appearance; index 0 is never
    assigned to a word and is used as the padding row of ``Embedding``.
    A word found in the pretrained table gets that vector, any other word gets
    a random normal vector of length ``embd_dim``.
    """

    # A word must occur more often than this in the corpus to survive
    # filterReview / mapIndices / mapIndicesBatch / returnEmbds.
    MIN_COUNT = 10

    def __init__(self, embd_dim=200):
        """
        embd_dim: length of every word vector. Pretrained vectors are assumed
                  to have the same length -- TODO confirm against the file used.
        """
        self.embd_dim = embd_dim

        # Basic Indexing
        self.word2index = {}
        self.index2word = {}

        # Keeping track of vocabulary
        self.vocab_size = 0
        self.word2count = {}

        # Vector Dictionaries
        self.pretrained = {}   # word -> vector read by loadPretrained
        self.random = {}       # word -> randomly drawn vector (no pretrained entry)
        self.word2vec = {}
        self.index2vec = {}

        # Set of Stop Words
        self.stop_words = set()

        # Built by updateEmbedding
        self.Embedding = None
        self.EmbeddingBag = None

    def setStops(self):
        """
        Set ``stop_words`` to NLTK's English stop words plus every punctuation
        character, newline, tab and the token '...', minus 'no' and 'not'.
        """
        base_stops = set(nltk.corpus.stopwords.words('english'))
        # Build the extras as whole tokens. Calling set() on one concatenated
        # string would split it into characters, and '...' would be lost.
        make_stops = set(string.punctuation) | {'\n', '\t', '...'}
        unmake_stops = {'no', 'not'}

        self.stop_words = (base_stops | make_stops) - unmake_stops

    def loadPretrained(self, embd_path):
        """
        Read pretrained vectors from a gzipped text file.

        Every non-empty line is split on whitespace: the first field is the
        word, the remaining fields are parsed as floats. Replaces any
        previously loaded table.
        """
        self.pretrained = {}
        with gzip.open(embd_path, 'rt') as f:
            for line in f:
                line = line.strip()
                if line:
                    word, *embd = line.split()
                    self.pretrained[word] = torch.FloatTensor([float(dim) for dim in embd])

    def loadCorpus(self, word_path):
        """
        Add every word of a gzipped review file to the vocabulary.

        Each line must contain exactly one literal two-character marker ``\\D``;
        the text after it is split into chunks on ``\\T`` and each chunk into
        words on ``\\W`` (all three are a backslash followed by a letter).

        word_path: path of the file to read.
        """
        # NOTE(review): lines are not stripped, so the last word of a line keeps
        # its trailing newline -- the dataset class splits lines the same way.
        with gzip.open(word_path, 'rt') as f:
            for line in f:
                _, review = line.split('\\D')
                for chunk in review.split('\\T'):
                    self.addWords(chunk.split('\\W'))

    def addWords(self, words):
        """
        words: seq containing variable no of words
        """
        for word in words:
            self.addWord(word)

    def addWord(self, word):
        """
        Count one occurrence of ``word``; on first sight also assign the next
        index and a vector (pretrained if available, random otherwise).
        """
        if word in self.word2index:
            self.word2count[word] += 1
            return

        # Keeping track of vocabulary
        self.vocab_size += 1
        self.word2count[word] = 1

        # Basic Indexing
        self.word2index[word] = self.vocab_size
        self.index2word[self.vocab_size] = word

        # Add word vector
        if word in self.pretrained:
            vec = self.pretrained[word]
        else:
            vec = torch.randn(self.embd_dim)
            self.random[word] = vec

        self.word2vec[word] = vec
        self.index2vec[self.vocab_size] = vec

    def updateEmbedding(self):
        """
        (Re)create ``Embedding`` and ``EmbeddingBag`` and load the current word
        vectors into both. Row 0 of the weight matrix stays all zeros.
        """
        # +1 because word indices start at 1
        vocab_size = len(self.index2vec) + 1

        self.Embedding = nn.Embedding(vocab_size, self.embd_dim, padding_idx=0)
        self.EmbeddingBag = nn.EmbeddingBag(vocab_size, self.embd_dim)
        embd_matrix = torch.zeros(vocab_size, self.embd_dim)

        for ix, vec in self.index2vec.items():
            embd_matrix[ix] = vec

        embd_dict = OrderedDict([('weight', embd_matrix)])
        self.Embedding.load_state_dict(embd_dict)
        self.EmbeddingBag.load_state_dict(embd_dict)

    def checkWord(self, word, min_count):
        """
        Return ``word`` if it is not a stop word, is in the vocabulary and was
        counted more than ``min_count`` times; otherwise return None.
        """
        if (word not in self.stop_words
                and word in self.word2index
                and self.word2count[word] > min_count):
            return word
        return None

    def _uniqueChunks(self, review):
        """
        Yield the filtered chunks of one review as tuples of words, skipping
        chunks that end up empty or that repeat an earlier chunk of the review.
        """
        seen = set()
        for tup in review:
            f_tuple = tuple(word for word in tup if self.checkWord(word, self.MIN_COUNT))
            if f_tuple and f_tuple not in seen:
                seen.add(f_tuple)
                yield f_tuple

    def filterReview(self, review):
        """
        review should be like our data set (a sequence of chunks, each a
        sequence of words).

        Returns the list of filtered, de-duplicated, non-empty chunks.
        """
        return list(self._uniqueChunks(review))

    def mapIndicesBatch(self, reviews):
        """
        Flatten several reviews into the (indices, offsets) pair that
        ``EmbeddingBag`` takes: ``indices`` holds the word indices of all kept
        chunks back to back, ``offsets`` the start position of each chunk.
        Duplicate chunks are only removed within a review, not across reviews.

        Returns two LongTensors.
        """
        f_review = []
        offset = []

        for review in reviews:
            for f_tuple in self._uniqueChunks(review):
                # the chunk starts where the flat index list currently ends
                offset.append(len(f_review))
                f_review.extend(self.word2index[word] for word in f_tuple)

        return torch.LongTensor(f_review), torch.LongTensor(offset)

    def mapIndices(self, review):
        """
        Same as mapIndicesBatch for a single review.
        """
        return self.mapIndicesBatch([review])

    def returnEmbds(self, review):
        """
        Run the kept chunks of one review through ``EmbeddingBag`` and return
        its output (one row per kept chunk). updateEmbedding must have been
        called first.
        """
        f_review, offset = self.mapIndices(review)
        return self.EmbeddingBag(Variable(f_review), Variable(offset))

In [4]:
class BeerDataset(Dataset):
    """
    Beer-review dataset backed by a gzipped text file, one review per line.

    Every line is expected to contain exactly one literal ``\\D`` marker
    (backslash + D) separating the targets from the review text.
    """

    def __init__(self, data_path, aspect='all', vocab=None):
        """
        data_path: gzipped text file; all lines are read into memory.
        aspect:    stored on the instance as ``self.aspect``.
        vocab:     vocabulary object stored as ``self.vocab``.

        For backward compatibility a non-string second positional argument is
        taken to be the vocabulary, and if no vocabulary is given at all the
        module-level name ``vocab`` is used when it exists.
        """
        if vocab is None and not isinstance(aspect, str):
            # old call style: BeerDataset(path, vocab)
            aspect, vocab = 'all', aspect
        if vocab is None:
            # old behaviour: pick up the notebook-level vocabulary
            vocab = globals().get('vocab')

        self.aspect = aspect
        self.vocab = vocab

        with gzip.open(data_path, 'rt') as f:
            self.lines = f.readlines()

    def __len__(self):
        """Number of lines (reviews) in the file."""
        return len(self.lines)

    def __getitem__(self, idx):
        """
        Decode line ``idx`` into ``{'review': ..., 'target': ...}``.

        'target' is a FloatTensor of the first three whitespace-separated
        numbers before the ``\\D`` marker. 'review' is a tuple of chunks (split
        on ``\\T``), each a tuple of words (split on ``\\W``).
        """
        # Decode
        # NOTE(review): the line is not stripped, so the final word keeps the
        # line's trailing newline.
        target, review = self.lines[idx].split('\\D')

        # Target
        target = torch.FloatTensor([float(t) for t in target.split()[:3]])

        # Review
        review = tuple(tuple(chunk.split('\\W')) for chunk in review.split('\\T'))

        return {'review': review, 'target': target}

In [5]:
# Build the vocabulary. Pretrained vectors are loaded before the corpus because
# a word's vector is chosen at the moment the word is first added.
vocab = Vocabulary()
vocab.loadPretrained(embd_path)
vocab.setStops()
# The corpus is read from the chunked review file, the same file the dataset
# below is built on.
vocab.loadCorpus(data_path)
vocab.updateEmbedding()

# The dataset's second positional parameter is `aspect`, so the vocabulary is
# not passed here; the dataset picks up the notebook-level `vocab`.
ds = BeerDataset(data_path)

In [10]:
# Scratch check: how os.path.join combines two path components.
a, b = 'abc', 'def'
os.path.join(a, b)

'abc/def'

In [6]:
def my_collate(batch):
    """
    Collate a list of samples into one dict.

    'review' becomes a plain list with one entry per sample (left untouched);
    'target' becomes the samples' target tensors stacked along a new first
    dimension.
    """
    reviews = []
    targets = []
    for sample in batch:
        reviews.append(sample['review'])
        targets.append(sample['target'])

    return {'review': reviews, 'target': torch.stack(targets, 0)}

In [7]:
# Solution 2 - using mycollate2 + new KernelNetwork
from dpp_nets.layers.layers import MarginalSampler, PredNet

def my_collate2(batch, vocab=vocab):
    """
    Collate samples into one flat matrix of chunk embeddings.

    Returns (embd, target_tensor, s_ix, e_ix):
      embd          - output of vocab.EmbeddingBag over the kept chunks of all
                      reviews in the batch, in batch order
      target_tensor - the samples' targets stacked along a new first dimension
      s_ix, e_ix    - per-review start (inclusive) and end (exclusive) row
                      positions, derived from the filtered chunk counts
    """
    # How many chunks each review keeps after filtering
    chunk_counts = [len(vocab.filterReview(sample['review'])) for sample in batch]

    # Running totals give each review's [start, end) range
    s_ix, e_ix = [], []
    cursor = 0
    for count in chunk_counts:
        s_ix.append(cursor)
        cursor += count
        e_ix.append(cursor)

    # Word indices + chunk offsets for the whole batch, then embed
    reviews = [sample['review'] for sample in batch]
    ixs, offsets = vocab.mapIndicesBatch(reviews)
    embd = vocab.EmbeddingBag(Variable(ixs), Variable(offsets))

    # Targets
    targets = [sample['target'] for sample in batch]
    target_tensor = Variable(torch.stack(targets))

    return embd, target_tensor, s_ix, e_ix

class KernelVar(nn.Module):
    """
    Three linear layers with Tanh between them, applied to every row of the
    input concatenated with the sum of the rows in that row's segment.

    ``s_ix`` / ``e_ix`` (segment start / end row positions) are not constructor
    arguments; they must be assigned on the instance before calling forward.
    """

    def __init__(self, embd_dim, hidden_dim, kernel_dim):
        """
        embd_dim:   width of an input row (the first layer takes 2 * embd_dim)
        hidden_dim: width of the two hidden layers
        kernel_dim: width of the output rows
        """
        super().__init__()

        self.embd_dim = embd_dim
        self.hidden_dim = hidden_dim
        self.kernel_dim = kernel_dim

        # input is [row, segment sum], hence twice the embedding width
        self.layer1 = nn.Linear(2 * embd_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, kernel_dim)

        self.net = nn.Sequential(
            self.layer1,
            nn.Tanh(),
            self.layer2,
            nn.Tanh(),
            self.layer3,
        )

        # set by the caller before forward()
        self.s_ix = None
        self.e_ix = None

    def forward(self, embd):
        """
        embd: 2-D tensor of rows; rows s:e for each (s, e) pair from
              s_ix / e_ix form one segment.

        Returns (batch_kernel, embd): the network output with kernel_dim
        columns, and the input tensor unchanged.
        """
        # For every segment, repeat the segment's row sum once per row
        pieces = []
        for start, end in zip(self.s_ix, self.e_ix):
            segment = embd[start:end]
            pieces.append(segment.sum(0, keepdim=True).expand_as(segment))
        context = torch.cat(pieces, dim=0)

        batch_x = torch.cat([embd, context], dim=1)
        batch_kernel = self.net(batch_x)

        return batch_kernel, embd

from timeit import default_timer
# Time one forward/backward pass of "solution 2" on a single batch.
# The timer covers everything below: building the DataLoader, collating the
# first batch with my_collate2, the forward pass and loss.backward().
start = default_timer()

dl = DataLoader(ds, batch_size=500, collate_fn=my_collate2)
# Fetch only the first batch
for batch in dl:
    break

# my_collate2 returns (embeddings, targets, segment starts, segment ends)
embd, target, s_ix, e_ix = batch

embd_dim = 200
hidden_dim = 500
kernel_dim = 200
enc_dim = 200
target_dim = 3

kernel_net = KernelVar(embd_dim, hidden_dim, kernel_dim)
# KernelVar reads the segment boundaries from attributes, not from arguments
kernel_net.s_ix, kernel_net.e_ix = s_ix, e_ix

sampler = MarginalSampler()
pred_net = PredNet(embd_dim, hidden_dim, enc_dim, target_dim)

criterion = nn.MSELoss()
activation = nn.Sigmoid()

# Placeholders; every one of them is reassigned further down
pred = None

pred_loss = None 
reg_loss = None
loss = None

# Weight and target value of the squared penalty on sampler.exp_sizes
reg = 10
reg_mean = 0.1

kernel, words = kernel_net(embd) # the KernelVar defined in this cell returns embd unchanged as `words`

# The same segment boundaries are handed on to the sampler ...
sampler.s_ix = kernel_net.s_ix
sampler.e_ix = kernel_net.e_ix

weighted_words = sampler(kernel, words) 

# ... and to the prediction network
pred_net.s_ix = sampler.s_ix
pred_net.e_ix = sampler.e_ix

pred = pred_net(weighted_words)

# Same object as `target` unpacked from the batch above
target = batch[1]

if activation:
    pred = activation(pred)

pred_loss = criterion(pred, target)

if reg:
    # sampler.exp_sizes is filled in by MarginalSampler -- presumably one
    # expected sample size per review; TODO confirm in dpp_nets.layers.layers
    reg_loss = reg * (torch.stack(sampler.exp_sizes) - reg_mean).pow(2).mean()
    loss = pred_loss + reg_loss
else:
    loss = pred_loss

loss.backward()
duration = default_timer() - start
print(duration)

64.01037289399937


In [8]:
def my_collate(batch, vocab=vocab):
    """
    Collate samples into one zero-padded 3-D tensor of chunk embeddings.

    NOTE: this reuses the name of the dict-returning my_collate defined in an
    earlier cell and replaces it once this cell has run.

    Each review is embedded with vocab.returnEmbds (one row per kept chunk) and
    padded with zero rows to ``max_no_chunks + 1`` rows, where max_no_chunks is
    the largest number of kept chunks in the batch. Every review therefore ends
    with at least one all-zero row.

    Returns (data_tensor, target_tensor): the stacked padded representations
    and the samples' targets stacked along a new first dimension.
    """
    # Embed every review exactly once. The number of rows of a representation
    # is the number of chunks kept for that review, so the pad length can be
    # read off the result without filtering each review a second time.
    reps = [vocab.returnEmbds(d['review']) for d in batch]
    max_no_chunks = max((rep.size(0) for rep in reps), default=0)

    # Pad to a common height
    padded = []
    for rep in reps:
        filler = Variable(torch.zeros(max_no_chunks + 1 - rep.size(0), rep.size(1)))
        padded.append(torch.cat([rep, filler], dim=0))

    data_tensor = torch.stack(padded)

    # Create target vector
    target_tensor = Variable(torch.stack([d['target'] for d in batch]))

    return data_tensor, target_tensor

# Solution 1 using my_collate
from timeit import default_timer
from dpp_nets.layers.layers import KernelVar, MarginalSampler, PredNet


# Time one forward/backward pass of "solution 1" on a single batch.
# The timer covers everything below: building the DataLoader, collating the
# first batch with my_collate, the forward pass and loss.backward().
start = default_timer()

dl = DataLoader(ds, batch_size=500, collate_fn=my_collate)
# Fetch only the first batch
for batch in dl:
    break
# First element of the collated batch is the padded embedding tensor
words = batch[0]

# NOTE(review): KernelVar here is the class imported from dpp_nets.layers.layers
# at the top of this cell, which replaces the KernelVar class defined earlier in
# the notebook. This first instance is discarded: kernel_net is rebuilt with the
# same sizes a few lines below.
kernel_net = KernelVar(200,500,200)

embd_dim = 200
hidden_dim = 500
kernel_dim = 200
enc_dim = 200
target_dim = 3

kernel_net = KernelVar(embd_dim, hidden_dim, kernel_dim)
sampler = MarginalSampler()
pred_net = PredNet(embd_dim, hidden_dim, enc_dim, target_dim)

criterion = nn.MSELoss()
activation = nn.Sigmoid()

# Placeholders; every one of them is reassigned further down
pred = None

pred_loss = None 
reg_loss = None
loss = None

# Weight and target value of the squared penalty on sampler.exp_sizes
reg = 10
reg_mean = 0.1

kernel, words = kernel_net(words) # returned words are masked now!

# s_ix / e_ix are read back from kernel_net here; nothing in this cell assigns
# them, so they are presumably set inside the library KernelVar -- TODO confirm
sampler.s_ix = kernel_net.s_ix
sampler.e_ix = kernel_net.e_ix

weighted_words = sampler(kernel, words) 

pred_net.s_ix = sampler.s_ix
pred_net.e_ix = sampler.e_ix

pred = pred_net(weighted_words)

# Second element of the collated batch is the target tensor
target = batch[1]

if activation:
    pred = activation(pred)

pred_loss = criterion(pred, target)

if reg:
    # sampler.exp_sizes is filled in by MarginalSampler -- presumably one
    # expected sample size per review; TODO confirm in dpp_nets.layers.layers
    reg_loss = reg * (torch.stack(sampler.exp_sizes) - reg_mean).pow(2).mean()
    loss = pred_loss + reg_loss
else:
    loss = pred_loss


loss.backward()
duration = default_timer() - start
print(duration)

93.63630560999991


In [9]:
# Turn on PyTorch's backward-compatibility warnings (keepdim and broadcasting).
torch.utils.backcompat.keepdim_warning.enabled = True
torch.utils.backcompat.broadcast_warning.enabled = True

# Toy batch: 2 reviews x 3 rows x 4 features; all-zero rows act as padding.
words = Variable(torch.FloatTensor([
    [[1, 2, 3, 4], [3, 4, 5, 6], [0, 0, 0, 0]],
    [[1, 2, 3, 4], [0, 0, 0, 0], [0, 0, 0, 0]],
]))

In [12]:
# Take only the first batch from the loader: the loop exits right after
# `batch` is bound. Uses whichever `dl` was built last.
for batch in dl:
    break

In [15]:
# Inspection only: parameters() hands back a generator, so nothing is listed
# unless it is iterated.
vocab.EmbeddingBag.parameters()

<generator object Module.parameters at 0x1319bd9e8>

In [29]:
# Inspection only: one entry (row 3, column 3) of the EmbeddingBag weight matrix.
vocab.EmbeddingBag.weight[3,3]

Variable containing:
1.00000e-02 *
 -2.3152
[torch.DoubleTensor of size 1]

In [28]:
# NOTE(review): double() converts the module's floating-point parameters to
# double precision on the module itself, so every later use of
# vocab.EmbeddingBag sees double weights -- hidden state if cells are re-run
# in a different order.
vocab.EmbeddingBag.double()

EmbeddingBag(112232, 200, mode=mean)

In [30]:
# Show the function object. Calling my_collate2 with no arguments would raise
# TypeError, because its `batch` parameter has no default.
my_collate2

<function __main__.my_collate2>