In [None]:
import argparse
import os
import shutil
import gzip

import torch
import torch.nn as nn
from torch.autograd import Variable

import spacy

In [None]:
# parsing in spacy style
words = [tuple([token.text]) for token in doc]
sents = [tuple([token.text for token in sent]) for sent in doc.sents]
chunks = [tuple([word.text for word in token.subtree]) for token in doc]

# creating encodings
enc_words  = target + '\D' + '\T'.join(['\W'.join(tup) for tup in words])
enc_sents  = target + '\D' + '\T'.join(['\W'.join(tup) for tup in sents])
enc_chunks = target + '\D' + '\T'.join(['\W'.join(tup) for tup in chunks])



In [15]:
import json
import gzip

def read_rationales(path):
    """
    This reads the json.annotations file. 
    Creates a list of dictionaries, which holds the 994 reviews for which
    sentence-level annotations are available. 
    """
    data = []
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path) as fin:
        for line in fin:
            item = json.loads(line)
            data.append(item)
    return data

In [16]:
import spacy
import ast
nlp = spacy.load('en')
anno = '/Users/Max/data/beer_reviews/annotations.json'
annotations = read_rationales(anno)

In [17]:
# Better
# For each sentence that deserves a label, we extract the sentence from doc that is closest. 

def map_sentence(sen, tup):
    sen = set([str(token).lower() for token in sen])
    n = len(sen) #+ len(tup[0])
    s = len(sen & tup[0])
    score = s / n
    return score

# Which sentences deserve a label and what label?
ix = 2
review = annotations[ix]
doc = nlp(ast.literal_eval(annotations[ix]['raw'])['review/text'])
all_words = review['x']
label_sens = []
for label in ['0','1','2']:
    label_sens.extend([(set(all_words[s:e]), label) for s, e in review[label]])

# Label the sentences in doc
sentences = [(sen, set()) for sen in doc.sents]
for tup in label_sens:
    scores = ([map_sentence(sen, tup) for sen, _ in sentences])
    # print(scores)
    sentences[scores.index(max(scores))][1].add(tup[1])

In [18]:
sentences

[(Very dark beer. Pours a nice finger and a half of creamy foam and stays throughout the beer.		,
  {'0'}),
 (Smells of coffee and roasted malt.		, {'1'}),
 (Has a major coffee-like taste with hints of chocolate., set()),
 (If you like black coffee, you will love this Porter.		, set()),
 (Creamy smooth mouthfeel and definitely gets smoother on the palate once it warms.		,
  {'2'}),
 (It's an ok Porter but I feel there are much better one's out there., set())]

In [19]:
# Process the sentences - create dictionaries - the keys can be fed to vocab
# - the values serve as look-up.
# how to exactly look-up? 

from collections import defaultdict

words = defaultdict(set)
chunks = defaultdict(set)
sents = defaultdict(set)

for tup in sentences:
    for token in tup[0]:
        words[tuple([token.text])].update([*tup[1]])
        chunks[tuple([word.text for word in token.subtree if word.text != '\n' and word.text != '\t'])].update([*tup[1]])
    sents[tuple([token.text for token in tup[0]])].update([*tup[1]])

In [20]:
words

defaultdict(set,
            {('\t\t',): {'0', '1', '2'},
             ("'s",): set(),
             (',',): set(),
             ('-',): set(),
             ('.',): {'0', '1', '2'},
             ('Creamy',): {'2'},
             ('Has',): set(),
             ('I',): set(),
             ('If',): set(),
             ('It',): set(),
             ('Porter',): set(),
             ('Pours',): {'0'},
             ('Smells',): {'1'},
             ('Very',): {'0'},
             ('a',): {'0'},
             ('an',): set(),
             ('and',): {'0', '1', '2'},
             ('are',): set(),
             ('beer',): {'0'},
             ('better',): set(),
             ('black',): set(),
             ('but',): set(),
             ('chocolate',): set(),
             ('coffee',): {'1'},
             ('creamy',): {'0'},
             ('dark',): {'0'},
             ('definitely',): {'2'},
             ('feel',): set(),
             ('finger',): {'0'},
             ('foam',): {'0'},
             ('gets',):

In [21]:
from dpp_nets.utils.language import Vocabulary, BeerDataset, custom_collate
from dpp_nets.layers.layers import ChunkTrainer

train_path = '/Users/Max/data/beer_reviews/' + 'reviews.' + 'all' + '.train.' + 'chunks' + '.txt.gz'
val_path = '/Users/Max/data/beer_reviews/' + 'reviews.' + 'all' + '.heldout.' + 'chunks' + '.txt.gz'
embd_path = '/Users/Max/data/beer_reviews/' + 'review+wiki.filtered.200.txt.gz'
word_path = '/Users/Max/data/beer_reviews/' + 'reviews.' + 'all' + '.train.' + 'words.txt.gz'

In [22]:
vocab = Vocabulary()
vocab.loadPretrained(embd_path)
vocab.setStops()
vocab.loadCorpus(word_path)
vocab.updateEmbedding()
vocab.setCuda(False)

In [23]:
newdict = defaultdict(set)
reverse_dict = defaultdict(list)

for tup, label in defdict.items():

    f_tuple = []

    for word in tup:
        word = vocab.checkWord(word, 10)
        if word:
            f_tuple.append(word)

    f_tuple = tuple(f_tuple)    

    if f_tuple: 
        newdict[f_tuple].update(*label)
        reverse_dict[f_tuple].append(tup)

NameError: name 'defdict' is not defined

In [None]:
defdict
newdict
for k in newdict.keys():
    print(k, reverse_dict[k])
vocab.returnEmbds(newdict.keys())
vocab.word2vec["'s"]

In [26]:
import json
import gzip
import ast
from collections import defaultdict
from collections import namedtuple
import spacy 

class EvalSet():
    
    def __init__(self, rat_path, vocab, mode='words'):
        
        nlp = spacy.load('en')
        self.reviews = []
        self.labelled_docs = []
        self.vocab = vocab
        self.words  = []
        self.chunks = []
        self.sents  = []
                
        DictCollect = namedtuple('DictCollect', ['allDict', 'cleanDict', 'revDict'])
        
        with open(rat_path) as f:
            for line in f:
                review = json.loads(line)
                self.reviews.append(review)
                        
        for review in self.reviews:
            doc = nlp(ast.literal_eval(review['raw'])['review/text'])
            sens_with_labels = self.__gatherLabels(review)
            labelled_doc = self.__curateDoc(doc, sens_with_labels)
            self.labelled_docs.append(labelled_doc)
        
        for labelled_doc in self.labelled_docs:
            words, chunks, sents = defaultdict(set), defaultdict(set), defaultdict(set)            
            for review, label in labelled_doc:
                for word in review:
                    words[tuple([word.text])].update([*label])
                    chunks[tuple([w.text for w in word.subtree if w.text != '\n' and w.text != '\t'])].update([*label])
                sents[tuple([word.text for word in review])].update([*label])
            self.words.append(words)
            self.chunks.append(chunks)
            self.sents.append(sents)
            
        self.words = [self.__complementDict(defdict) for defdict in self.words]
        self.chunks = [self.__complementDict(defdict) for defdict in self.chunks]
        self.sents = [self.__complementDict(defdict) for defdict in self.sents]
        
    def __complementDict(self, def_dict):
    
            cleanDict, revDict = self.vocab.createDicts(def_dict)
            dictcollect = DictCollect(def_dict, clanDict, revDict)
            
            return dictcollect
        
    def __curateDoc(self, doc, sens_with_labels):
    
        labelled_doc = [(sen, set()) for sen in doc.sents]
        
        for words, label in sens_with_labels:
            scores = ([self.__computeScore(sen, words) for sen, _ in labelled_doc])
            labelled_doc[scores.index(max(scores))][1].add(label)
        
        return labelled_doc    

    def __computeScore(self, sen, words):
        
        sen = set([str(token).lower() for token in sen])
        n = len(sen) 
        s = len(sen & words)
        score = s / n
        return score
    
    def __gatherLabels(self, review, labels=['0','1','2']):
        sens_with_labels = []
        all_words = review['x']
        for label in labels:
            for s, e in review[label]:
                sens_with_labels.append((set(all_words[s:e]), label))
        return sens_with_labels

In [27]:
rat_path = '/Users/Max/data/beer_reviews/annotations.json'
evalset = EvalSet(rat_path, vocab)

In [25]:
evalset.words[2].allDict

defaultdict(set,
            {('\t\t',): {'0', '1', '2'},
             ("'s",): set(),
             (',',): set(),
             ('-',): set(),
             ('.',): {'0', '1', '2'},
             ('Creamy',): {'2'},
             ('Has',): set(),
             ('I',): set(),
             ('If',): set(),
             ('It',): set(),
             ('Porter',): set(),
             ('Pours',): {'0'},
             ('Smells',): {'1'},
             ('Very',): {'0'},
             ('a',): {'0'},
             ('an',): set(),
             ('and',): {'0', '1', '2'},
             ('are',): set(),
             ('beer',): {'0'},
             ('better',): set(),
             ('black',): set(),
             ('but',): set(),
             ('chocolate',): set(),
             ('coffee',): {'1'},
             ('creamy',): {'0'},
             ('dark',): {'0'},
             ('definitely',): {'2'},
             ('feel',): set(),
             ('finger',): {'0'},
             ('foam',): {'0'},
             ('gets',):

In [30]:
for dictcollect in evalset.words:
    allDict, _, _ = dictcollect
    clean, rev = evalset.vocab.createDicts(allDict)
    break
    dictcollect._replace(cleanDict=clean)
    dictcollect._replace(revDict=rev)

In [38]:
dictcollect._replace(cleanDict=clean)

DictCollect(allDict=defaultdict(<class 'set'>, {('Clear',): {'0'}, (',',): {'0', '1', '2'}, ('burnished',): {'0'}, ('copper',): {'0'}, ('-',): {'0'}, ('brown',): {'0'}, ('topped',): {'0'}, ('by',): {'0', '1'}, ('a',): {'0'}, ('large',): {'0'}, ('beige',): {'0'}, ('head',): {'0'}, ('that',): {'0'}, ('displays',): {'0'}, ('impressive',): {'0'}, ('persistance',): {'0'}, ('and',): {'0', '1', '2'}, ('leaves',): {'0'}, ('small',): {'0'}, ('to',): {'0'}, ('moderate',): {'0'}, ('amount',): {'0'}, ('of',): {'0'}, ('lace',): {'0'}, ('in',): {'0'}, ('sheets',): {'0'}, ('when',): {'0'}, ('it',): {'0'}, ('eventually',): {'0'}, ('departs',): {'0'}, ('.',): {'0', '1', '2'}, ('\t\t',): {'0'}, ('The',): {'1', '2'}, ('nose',): {'1'}, ('is',): {'1', '2'}, ('sweet',): {'1'}, ('spicy',): {'1'}, ('the',): {'1'}, ('flavor',): {'1'}, ('malty',): {'1'}, ('accented',): {'1'}, ('nicely',): {'1'}, ('honey',): {'1'}, ('abundant',): {'1'}, ('caramel',): {'1'}, ('/',): {'1'}, ('toffee',): {'1'}, ('notes',): {'1'}, (

In [39]:
dictdictcollect.cleanDict

defaultdict(set, {})

In [40]:
clean

defaultdict(set,
            {("'ll",): set(),
             ("'m",): set(),
             ("'s",): set(),
             ('3',): set(),
             ('abundant',): {'1'},
             ('accented',): {'1'},
             ('alcohol',): set(),
             ('amount',): {'0'},
             ('beer',): set(),
             ('beige',): {'0'},
             ('best',): set(),
             ('bet',): set(),
             ('bitterness',): set(),
             ('brewery',): set(),
             ('brown',): {'0'},
             ('burnished',): {'0'},
             ('caramel',): {'1'},
             ('cinnamon',): set(),
             ('contains',): set(),
             ('copper',): {'0'},
             ('creamy',): {'2'},
             ('departs',): {'0'},
             ('displays',): {'0'},
             ('drinking',): set(),
             ('especially',): set(),
             ('eventually',): {'0'},
             ('exceeded',): set(),
             ('exemplary',): {'2'},
             ('expectations',): set(),
         