In [None]:
import argparse
import os
import shutil
import gzip

import torch
import torch.nn as nn
from torch.autograd import Variable

import spacy

In [72]:
import json
import gzip
import ast
from collections import defaultdict
from collections import namedtuple
import spacy 

class EvalSet():
    """Annotated reviews aligned to spaCy sentences and indexed at three granularities.

    Each line of the annotation file is parsed as a JSON review. The review text is
    segmented into sentences by spaCy, every annotated span is attached to the
    sentence it overlaps most, and per review three dictionaries are built that map
    a tuple of token texts to the set of labels it inherited:

    * ``words``  -- one key per single token,
    * ``chunks`` -- one key per token subtree (newline / tab tokens dropped),
    * ``sents``  -- one key per sentence.

    After construction ``self.words``, ``self.chunks`` and ``self.sents`` are lists
    (one entry per review) of ``DictCollect(all, clean, rev)`` namedtuples, where
    ``all`` is the raw dictionary and ``clean`` / ``rev`` are whatever
    ``vocab.createDicts`` returns for it.
    """

    # Built once here instead of on every __complementDict call.
    _DictCollect = namedtuple('DictCollect', ['all', 'clean', 'rev'])

    def __init__(self, rat_path, vocab, mode='words'):
        """Load the annotation file and build the word / chunk / sentence dictionaries.

        Args:
            rat_path: Path to a file with one JSON-encoded review per line.
            vocab: Object providing ``createDicts(dict)``, which must return a
                ``(clean, rev)`` pair.
            mode: Accepted for interface compatibility; not used by this class.
        """
        # NOTE(review): 'en' is a shortcut model name; newer spaCy releases expect the
        # full package name (e.g. 'en_core_web_sm') -- confirm against the installed version.
        nlp = spacy.load('en')
        self.reviews = []
        self.labelled_docs = []
        self.vocab = vocab
        self.words = []
        self.chunks = []
        self.sents = []

        # Explicit encoding so parsing does not depend on the platform default.
        with open(rat_path, encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                self.reviews.append(json.loads(line))

        for review in self.reviews:
            # review['raw'] holds a Python-literal dict serialised as a string.
            doc = nlp(ast.literal_eval(review['raw'])['review/text'])
            sens_with_labels = self.__gatherLabels(review)
            self.labelled_docs.append(self.__curateDoc(doc, sens_with_labels))

        for labelled_doc in self.labelled_docs:
            words, chunks, sents = defaultdict(set), defaultdict(set), defaultdict(set)
            for sen, labels in labelled_doc:
                for token in sen:
                    words[(token.text,)].update(labels)
                    chunk = tuple(w.text for w in token.subtree
                                  if w.text not in ('\n', '\t'))
                    chunks[chunk].update(labels)
                sents[tuple(token.text for token in sen)].update(labels)
            self.words.append(words)
            self.chunks.append(chunks)
            self.sents.append(sents)

        # Wrap every raw dictionary together with its vocabulary-derived views.
        self.words = [self.__complementDict(defdict) for defdict in self.words]
        self.chunks = [self.__complementDict(defdict) for defdict in self.chunks]
        self.sents = [self.__complementDict(defdict) for defdict in self.sents]

    def __complementDict(self, def_dict):
        """Bundle ``def_dict`` with the two dictionaries ``vocab.createDicts`` derives from it.

        Returns:
            ``DictCollect(all=def_dict, clean=..., rev=...)``.
        """
        cleanDict, revDict = self.vocab.createDicts(def_dict)
        return self._DictCollect(def_dict, cleanDict, revDict)

    def __curateDoc(self, doc, sens_with_labels):
        """Attach every annotated span's label to the best-matching sentence of ``doc``.

        Args:
            doc: Object whose ``sents`` attribute iterates over sentences.
            sens_with_labels: Iterable of ``(word_set, label)`` pairs.

        Returns:
            List of ``(sentence, label_set)`` tuples in sentence order. On a tie the
            earliest sentence receives the label.
        """
        labelled_doc = [(sen, set()) for sen in doc.sents]

        if not labelled_doc:
            # No sentences to attach labels to; max() over an empty list would raise.
            return labelled_doc

        for words, label in sens_with_labels:
            scores = [self.__computeScore(sen, words) for sen, _ in labelled_doc]
            labelled_doc[scores.index(max(scores))][1].add(label)

        return labelled_doc

    def __computeScore(self, sen, words):
        """Return the fraction of distinct lower-cased tokens of ``sen`` found in ``words``.

        ``words`` must be a set. Its entries are compared as-is, so they are presumably
        already lower-cased -- TODO confirm against the annotation format.
        An empty sentence scores 0.0 instead of dividing by zero.
        """
        tokens = {str(token).lower() for token in sen}
        if not tokens:
            return 0.0
        return len(tokens & words) / len(tokens)

    def __gatherLabels(self, review, labels=('0', '1', '2')):
        """Collect the annotated spans of ``review`` as ``(word_set, label)`` pairs.

        Args:
            review: Dict with the token list under ``'x'`` and, for every label, a
                list of ``(start, end)`` index pairs into that token list.
            labels: Keys of ``review`` to read spans from (immutable default).

        Returns:
            List of ``(set_of_words_in_span, label)`` tuples, grouped by label in the
            order given by ``labels``.
        """
        all_words = review['x']
        return [(set(all_words[s:e]), label)
                for label in labels
                for s, e in review[label]]

In [73]:
# Build the evaluation set from the annotated beer reviews.
# NOTE(review): `vocab` is not defined in the cells visible above this one; it presumably
# lingers from earlier kernel state -- TODO confirm and define it before this cell runs.
# NOTE(review): the path is absolute and machine-specific.
rat_path = '/Users/Max/data/beer_reviews/annotations.json'
evalset = EvalSet(rat_path, vocab)

In [93]:
import torch
import torch.nn as nn
from torch.autograd import Variable

from dpp_nets.utils.language import Vocabulary, BeerDataset, custom_collate
from dpp_nets.layers.layers import ChunkTrainer

# Load saved checkpoint
# NOTE: torch.load unpickles the file, which can execute arbitrary code --
# only load checkpoints that come from a trusted source.
model_dir = '/Users/Max/checkpoints/beer_reviews/' 
# Separate name for the filename so `model` is only ever bound to the loaded checkpoint.
model_file = 'allchunksreg0.01reg_mean10.0lr0.001marginal_best_ckp.pth.tar'
model_path = model_dir + model_file
# Returning `storage` unchanged from map_location keeps the tensors on the CPU.
model = torch.load(model_path, map_location=lambda storage, loc: storage)

In [96]:
# List the top-level entries stored in the loaded checkpoint.
# Note: the recorded output shows the epoch entry is keyed 'epoch:' (with a trailing colon).
model.keys()

dict_keys(['epoch:', 'model', 'state_dict', 'lowest_loss', 'optimizer'])

In [104]:
# Display the saved optimizer state: its 'param_groups' list and the per-parameter 'state'.
# The full dump is very long; index into ['param_groups'] for a compact view.
model['optimizer']

{'param_groups': [{'betas': (0.9, 0.999),
   'eps': 1e-08,
   'lr': 0.0001,
   'params': [139774206458520],
   'weight_decay': 0},
  {'betas': (0.9, 0.999),
   'eps': 1e-08,
   'lr': 0.0001,
   'params': [139774206458432,
    139774206458344,
    139774206458256,
    139774206458168,
    139774206458080,
    139774206457992,
    139774206457904,
    139774206457816,
    139774206457728,
    139774206457640,
    139774206457552,
    139774206457464,
    139774206457376,
    139774206457288,
    139774206457200,
    139774206457112,
    139774206457024,
    139774206456936],
   'weight_decay': 0}],
 'state': {139774206456936: {'exp_avg': 
   1.00000e-04 *
     0.1757
    -8.0492
    -2.1988
   [torch.FloatTensor of size 3], 'exp_avg_sq': 
   1.00000e-06 *
     3.4333
     6.1171
     5.9175
   [torch.FloatTensor of size 3], 'step': 35700}, 139774206457024: {'exp_avg': 
   1.00000e-03 *
   -0.0018 -0.0351 -0.0339  ...   0.0136  0.0364 -0.5372
    0.7836  0.7990  0.7991  ...  -0.8353 -0.79

In [None]:
### TODO: save a checkpoint that supports both evaluation
### and resuming training

In [78]:
# Scratch cell: inspect one dictionary and the vocabulary lookups for its keys.
# NOTE(review): `defdict`, `newdict`, `reverse_dict` and `vocab` are not defined in the
# cells visible here -- they presumably linger from earlier kernel state; TODO confirm.
# Only the last expression of a cell is displayed, so the two bare names below show nothing.
defdict
newdict
# Print every key of `newdict` next to its entry in `reverse_dict`.
for k in newdict.keys():
    print(k, reverse_dict[k])
# Result is discarded because this is not the last expression of the cell.
vocab.returnEmbds(newdict.keys())
# Last expression: the value stored for the token "'s" is what the cell displays.
vocab.word2vec["'s"]

defaultdict(set,
            {("'m",
              'getting',
              'plenty',
              'dark',
              'chocolate',
              'bitter',
              'espresso'): set(),
             ('finishes', 'hop', 'bitterness'): set(),
             ('little', 'sweet', 'chocolate', 'follows'): {'1'},
             ('malts', 'hit', 'nose'): {'1'},
             ('nice', 'stout', 'would', 'love', 'maybe', 'age'): set(),
             ('night',): {'0'},
             ('roasty', 'character', 'taste'): set(),
             ('smooth', 'mouthfeel', 'perfect', 'carbonation', 'style'): {'2'},
             ('snifter',
              'small',
              'coffee',
              'head',
              'reduces',
              'quickly'): {'0'},
             ('stout',): set(),
             ('typical', 'imp'): set()})