In [1]:
import argparse
import os
import shutil

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data.dataloader import DataLoader

from dpp_nets.utils.io import make_embd, make_tensor_dataset
from dpp_nets.my_torch.utilities import pad_tensor

from dpp_nets.layers.layers import DeepSetBaseline

# Command-line style configuration for the Deep Sets baseline trainer.
# Inside the notebook the arguments are parsed from a fixed string (see the
# parse_args call below) instead of from sys.argv.
parser = argparse.ArgumentParser(description='Baseline (Deep Sets) Trainer')

parser.add_argument('-a', '--aspect', type=str, choices=['aspect1', 'aspect2', 'aspect3', 'all'],
                    help='what is the target?', required=True)
parser.add_argument('--remote', type=int,
                    help='training locally or on cluster?', required=True)

parser.add_argument('--data_path_local', type=str, default='/Users/Max/data/beer_reviews',
                    help='where is the data folder locally?')
parser.add_argument('--data_path_remote', type=str, default='/cluster/home/paulusm/data/beer_reviews',
                    help='where is the data folder?')

# The two checkpoint options previously carried the data-folder help text.
parser.add_argument('--ckp_path_local', type=str, default='/Users/Max/checkpoints/beer_reviews',
                    help='where is the checkpoint folder locally?')

parser.add_argument('--ckp_path_remote', type=str, default='/cluster/home/paulusm/checkpoints/beer_reviews',
                    help='where is the checkpoint folder on the cluster?')

parser.add_argument('-b', '--batch-size', default=50, type=int,
                    metavar='N', help='mini-batch size (default: 50)')
parser.add_argument('--epochs', default=100, type=int, metavar='N',
                    help='number of total epochs to run')
#parser.add_argument('--lr-k', '--learning-rate-k', default=0.1, type=float,
#                    metavar='LRk', help='initial learning rate for kernel net')
#parser.add_argument('--lr-p', '--learning-rate-p', default=0.1, type=float,
#                    metavar='LRp', help='initial learning rate for pred net')
parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float,
                    metavar='LR', help='initial learning rate for baseline')
#parser.add_argument('--reg', type=float, required=True,
#                    metavar='reg', help='regularization constant')
#parser.add_argument('--reg-mean', type=float, required=True,
#                    metavar='reg_mean', help='regularization_mean')

# Notebook run: all aspects, local machine.
args = parser.parse_args("-a all --remote 0".split())

# Honour --remote when choosing the data folder; before, the local folder was
# used unconditionally even though the flag is required.
data_path = args.data_path_remote if args.remote else args.data_path_local

# Held-out split for the chosen aspect and the embedding file.
val_path = os.path.join(data_path, '.'.join(['reviews', args.aspect, 'heldout.txt.gz']))
embd_path = os.path.join(data_path, 'review+wiki.filtered.200.txt.gz')
#embd, word_to_ix = make_embd(embd_path)
#val_set = make_tensor_dataset(val_path, word_to_ix)
#val_loader = DataLoader(val_set, 10000)

In [109]:
# Training split for the selected aspect: reviews.<aspect>.train.txt.gz
train_path = os.path.join(
    args.data_path_local,
    ".".join(['reviews', args.aspect, 'train.txt.gz']),
)

In [2]:
from dpp_nets.utils.language import create_clean_vocabulary
# Build the three objects the rest of the notebook leans on from the embedding
# file and the training split. Elsewhere in this notebook `nlp` is called on
# review text, `vocab.word2index` maps words to indices and `embd` is applied
# to index tensors.
# NOTE(review): presumably a spaCy pipeline, a vocabulary object and an
# embedding layer -- confirm against dpp_nets.utils.language.
nlp, vocab, embd = create_clean_vocabulary(embd_path, train_path)

In [3]:
# Freeze the embedding weights: with requires_grad switched off autograd does
# not compute gradients for them.
embd.weight.requires_grad = False

In [5]:
from dpp_nets.utils.language import BeerDataset, process_batch
from torch.utils.data import DataLoader
# Held-out reviews wrapped in a shuffling loader with batch size 1.
ds = BeerDataset(val_path)
dl = DataLoader(ds, 1, shuffle=True)


In [94]:
# Scratch check of filter_stops on one sentence. The result is a lazy
# generator (see the recorded repr below); nothing is materialised here.
# NOTE(review): `doc` and `filter_stops` come from cells that appear later in
# the file -- this cell depends on out-of-order execution.
filter_stops(doc.sents[0], vocab)

<generator object filter_stops.<locals>.<genexpr> at 0x15e5433b8>

In [112]:
def yield_sen_vec(doc, vocab, embd):
    """Yield one averaged embedding per distinct, non-empty filtered sentence.

    Each sentence of ``doc.sents`` is reduced by ``filter_stops`` to a tuple of
    words. Empty tuples and tuples that were already emitted for this ``doc``
    are skipped. The surviving words are looked up in ``vocab.word2index``,
    passed through ``embd`` and averaged over the first axis.
    """
    emitted = set()
    for sentence in doc.sents:
        words = tuple(filter_stops(sentence, vocab))
        if not words or words in emitted:
            continue
        emitted.add(words)
        indices = [vocab.word2index[w] for w in words]
        yield embd(Variable(torch.LongTensor(indices))).mean(0)

In [111]:
def process_batch_sens(nlp, vocab, embd, batch, max_sens_no=397):
    """Turn a batch of raw reviews into padded sentence-embedding tensors.

    Parameters
    ----------
    nlp : callable
        Applied to each review; its result is handed to ``yield_sen_vec``.
    vocab, embd :
        Forwarded unchanged to ``yield_sen_vec``.
    batch : mapping
        ``batch['review']`` is iterated review by review; ``batch['target']``
        is stacked, transposed and cast to float.
        NOTE(review): presumably the default DataLoader collation of a
        per-sample target sequence -- confirm against BeerDataset.
    max_sens_no : int, optional
        Every review is zero-padded to ``max_sens_no + 1`` rows. Defaults to
        397, the value that used to be hard-coded here.

    Returns
    -------
    (data_tensor, target_tensor)
        ``data_tensor`` stacks the padded per-review matrices.

    Raises
    ------
    ValueError
        If a review yields more than ``max_sens_no + 1`` sentence vectors.
        A review that yields no sentence vector at all still fails inside
        ``torch.stack``, as before.
    """
    reps = []
    for review in batch['review']:
        doc = nlp(review)
        rep = torch.stack(list(yield_sen_vec(doc, vocab, embd)))
        # Flatten to (n_sentences, -1) explicitly. The former bare .squeeze()
        # also removed the sentence axis when a review had exactly one
        # sentence, after which rep.size(1) raised.
        rep = rep.view(rep.size(0), -1)

        n_pad = max_sens_no + 1 - rep.size(0)
        if n_pad < 0:
            raise ValueError(
                'review has %d sentences, more than the supported %d'
                % (rep.size(0), max_sens_no + 1))
        if n_pad > 0:
            padding = Variable(torch.zeros(n_pad, rep.size(1)))
            rep = torch.cat([rep, padding], dim=0)
        reps.append(rep)

    data_tensor = torch.stack(reps)
    target_tensor = Variable(torch.stack(batch['target']).t().float())

    return data_tensor, target_tensor


In [None]:
# Forward pass: build a ChunkTrainer from five positional sizes and call it on
# the batch tensors.
# NOTE(review): ChunkTrainer, data_tensor and target_tensor are not defined in
# any visible cell -- this relies on leftover kernel state.
loss = ChunkTrainer(200,500,200,200,3)(data_tensor, target_tensor)

In [None]:
# Backpropagate from `loss`, which must already exist in the kernel.
loss.backward()

In [None]:
# Scratch: zero-pad `rep` along dim 0 up to `maxi + 1` rows and display it.
# NOTE(review): `rep` and `maxi` must already exist in the kernel.
torch.cat([rep, Variable(torch.zeros(maxi + 1 - rep.size(0),rep.size(1)))],dim=0)

In [None]:
# Collect the mean embedding (over the first axis) of every chunk of `doc`.
# NOTE(review): yield_chunks is called with two arguments here, but the
# definition visible in this notebook takes a third, MAX_CHUNK_LENGTH --
# against that definition this cell raises TypeError.
l = []
for chunk in list(yield_chunks(doc, vocab)):
    c = embd(Variable(chunk)).mean(0)
    l.append(c)

In [None]:
# How many entries the list `l` currently holds.
len(l)

In [None]:
from torch.utils.data import Dataset, DataLoader
import json

# Location of the annotations file inside the local data folder.
anno_path = os.path.join(args.data_path_local, 'annotations.json')

class BeerDatasetAnnotated(Dataset):
    """Annotated beer reviews read from a JSON-lines file.

    Every non-blank line of ``anno_path`` is parsed as one JSON object with
    these fields:

    * ``'raw'`` -- the repr of a dict whose ``'review/text'`` entry is the
      review text,
    * ``'0'``, ``'1'``, ``'2'`` -- the annotations, kept as a 3-tuple,
    * ``'y'`` -- the target.

    Parameters
    ----------
    anno_path : str
        Path of the annotations file.
    aspect : str, optional
        Stored on the instance; not otherwise used by this class.
    tokenizer : callable, optional
        Applied to each review text. Defaults to the notebook-level ``nlp``.
    """

    def __init__(self, anno_path, aspect='all', tokenizer=None):
        # Imported here because the notebook only imports ast in a later cell.
        import ast

        self.aspect = aspect
        if tokenizer is None:
            tokenizer = nlp

        # One (doc, (a0, a1, a2), target) triple per annotated review.
        self.lines = []
        with open(anno_path) as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines, e.g. a trailing newline.
                    continue
                item = json.loads(line)

                # 'raw' is a dict repr, so literal_eval (not json) decodes it.
                raw = ast.literal_eval(item['raw'])
                doc = tokenizer(raw['review/text'])

                annotations = (item['0'], item['1'], item['2'])
                target = item['y']

                self.lines.append((doc, annotations, target))

    def __len__(self):
        """Number of annotated reviews that were loaded."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return ``{'review': doc, 'target': target}`` for entry ``idx``.

        The stored annotations are not part of the returned sample.
        """
        doc, _annotations, target = self.lines[idx]
        return {'review': doc, 'target': target}

In [None]:
# Display `item`.
# NOTE(review): no visible cell binds `item` at notebook level -- this relies
# on leftover kernel state.
item

In [None]:
# Rebind `ds`/`dl` to the held-out set, now shuffled in batches of 12.
ds = BeerDataset(val_path)
dl = DataLoader(ds, batch_size=12, shuffle=True)

In [None]:
# Scratch call of the imported process_batch on a single batch.
# NOTE(review): `batch` is not bound in any visible cell.
process_batch(batch)

In [None]:
from dpp_nets.my_torch.utilities import pad_tensor

def filter_stops(tree, vocab):
    """Lazily yield the text of every usable token in ``tree``.

    A token is kept when it is not flagged ``is_stop`` and its text is a key
    of ``vocab.word2index``. The result is a generator, not a list.
    """
    return (
        tok.text
        for tok in tree
        if tok.text in vocab.word2index and not tok.is_stop
    )

#def yield_words(doc, vocab)

def yield_chunks(doc, vocab, MAX_CHUNK_LENGTH):
    """Yield a padded index tensor for every distinct token subtree of ``doc``.

    For each token, the words of ``token.subtree`` that survive
    ``filter_stops`` form a tuple. Empty tuples and tuples already emitted for
    this ``doc`` are skipped. The remaining words are mapped through
    ``vocab.word2index`` into a ``LongTensor`` and handed to
    ``pad_tensor(ixs, 0, 0, MAX_CHUNK_LENGTH)``.
    NOTE(review): presumably that pads dim 0 with zeros up to
    MAX_CHUNK_LENGTH -- confirm against dpp_nets.my_torch.utilities.
    """
    emitted = set()
    for tok in doc:
        words = tuple(filter_stops(tok.subtree, vocab))
        if not words or words in emitted:
            continue
        emitted.add(words)
        indices = torch.LongTensor([vocab.word2index[w] for w in words])
        yield pad_tensor(indices, 0, 0, MAX_CHUNK_LENGTH)
            
def yield_sentences(doc, vocab, MAX_SENTENCE_LENGTH):
    """Yield a padded index tensor for every distinct sentence of ``doc``.

    Each sentence of ``doc.sents`` is reduced by ``filter_stops`` to a tuple
    of words. Empty tuples and tuples already emitted for this ``doc`` are
    skipped. The remaining words are mapped through ``vocab.word2index`` into
    a ``LongTensor`` and handed to
    ``pad_tensor(ixs, 0, 0, MAX_SENTENCE_LENGTH)``.
    NOTE(review): presumably that pads dim 0 with zeros up to
    MAX_SENTENCE_LENGTH -- confirm against dpp_nets.my_torch.utilities.
    """
    seen = set()
    for sen in doc.sents:
        # filter_stops needs the vocabulary; it used to be called as
        # filter_stops(sen), which raised TypeError on the first sentence.
        t = tuple(filter_stops(sen, vocab))
        if t and t not in seen:
            seen.add(t)
            ixs = torch.LongTensor([vocab.word2index[word] for word in t])
            ixs = pad_tensor(ixs, 0, 0, MAX_SENTENCE_LENGTH)
            yield ixs

In [110]:
import gzip
import tqdm
# Scan the whole training file for the largest number of sentences that `nlp`
# finds in any single review. Expensive: the recorded output below shows
# 210000 lines taking about 33 minutes.
MAX_SENS_NO = 0

with gzip.open(train_path, 'rt') as f:
    for line in tqdm.tqdm(f):
        # Each line is "<target>\t<review>"; only the review text is needed.
        target, sep, review = line.partition('\t')
        doc = nlp(review)
        MAX_SENS_NO = max(len(list(doc.sents)), MAX_SENS_NO)


210000it [33:12, 105.37it/s]


In [None]:
# Write the first 100 entries of `lines` to a gzip text file at `short_path`.
# NOTE(review): `short_path` and `lines` are not bound in any visible cell.
with gzip.open(short_path, 'wt') as f:
    for line in lines[:100]:
        f.write(line)

In [None]:
# Longest inner element and longest outer list of `measure_list`, a list of
# lists of sized items.
# NOTE(review): `measure_list` is not bound in any visible cell, and
# `Max_CHUNK_NO` breaks the UPPER_CASE naming used for the other constant.
MAX_CHUNK_LENGTH = max([len(m) for l in measure_list for m in l])
Max_CHUNK_NO = max([len(l) for l in measure_list])

In [None]:
from spacy.symbols import nsubj, VERB

# Inspect the parse of the sixth sentence of `doc`: print every token next to
# the tokens of its subtree. The commented-out prints are alternative views.
sentence = list(doc.sents)[5]
for token in sentence:
    #print(token, list(token.children), token.head, token.dep_, list(token.lefts), list(token.rights), list(token.subtree))
    #print(token, list(token.subtree)) #,list(token.lefts),list(token.rights),token.left_edge, token.right_edge)
    print((token, list(token.subtree)))
    #print(token, list(token.ancestors))
    #print(token, token.vector)

In [None]:
def data_iterator(data_path):
    """Iterate over a gzip text file of ``"<targets>\\t<words>"`` lines.

    Yields ``(words, target)`` pairs: ``words`` is the whitespace-split text
    after the first tab, ``target`` a ``torch.Tensor`` of the floats before
    it. Lines with no words after the tab are skipped.
    """
    with gzip.open(data_path, 'rt') as handle:
        for record in handle:
            scores, _, text = record.partition("\t")
            tokens = text.split()
            if not tokens:
                continue
            yield tokens, torch.Tensor([float(s) for s in scores.split()])

In [None]:
# Running statistics of the number of sentences per held-out review:
# `maxi` is the maximum, `mean`/`M2` follow Welford's online update. The
# variance itself is never finalised in this cell.
# NOTE(review): nltk is only imported in a later cell of this notebook.
i = 0
n = 0
maxi = 0
mean = M2 = 0.0

with gzip.open(val_path, 'rt') as f:
    for line in f:
        target, sep, review = line.partition("\t")
        n_sentences = len(nltk.sent_tokenize(review))
        x = n_sentences
        
        n += 1
        # Stops at the first review with exactly 101 sentences. `n` has
        # already counted that review; maxi/mean/M2 have not.
        if x == 101: 
            break
        maxi = max(x, maxi)
        delta = x - mean
        mean += delta/n
        delta2 = x - mean
        M2 += delta*delta2

        #i += 1
        #if i > 21:
         #   break

In [None]:
# Display the current value of `sentences`.
# NOTE(review): the name is bound by a later cell (and, further down, by a
# function of the same name), so the output depends on execution order.
sentences

In [None]:
# How to split a review into sentences with nltk.
# `review` is whatever the last executed loop left behind.
sentences = nltk.sent_tokenize(review)

In [None]:
# How to split a sentence into words with nltk (first sentence only).
words = nltk.word_tokenize(sentences[0])

In [None]:
# How to remove stop words from a sentence.
# The stop set is the English stop words plus punctuation, minus the negations
# 'no' and 'not', which are deliberately kept.
# NOTE(review): `string` is only imported in a later cell of this notebook.

from nltk.corpus import stopwords
operators = set(('no', 'not'))
punctuation = set(string.punctuation)
stop = (set(stopwords.words('english')) | punctuation) - operators 
# One list of surviving words per sentence of `sentences`.
fwords = [[word for word in words if word not in stop] for words in [nltk.word_tokenize(sen) for sen in sentences]]

In [None]:
# Display the filtered words.
fwords

In [None]:
# How to create bigrams (after stop-word removal? - would do so)
# NOTE(review): `fwords` is a list of per-sentence word lists, so the second
# call pairs up consecutive sentences rather than consecutive words.

print(list(nltk.bigrams(words)))
print(30*'-')
list(nltk.bigrams(fwords))

In [None]:
import json
# Location of the annotations file (same expression as in the dataset cell).
anno_path = os.path.join(args.data_path_local, 'annotations.json')
def read_rationales(path):
    """
    Read the annotations.json file, which holds one JSON object per line.

    Returns a list of dictionaries, one per non-blank line. According to the
    original note these are the 994 reviews for which sentence-level
    annotations are available.

    A path ending in ".gz" is read through gzip, anything else as a plain
    file. Both are decoded as UTF-8 text, so the result does not depend on
    the platform's locale encoding.
    """
    data = []
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path, 'rt', encoding='utf-8') as fin:
        for line in fin:
            if not line.strip():
                # json.loads rejects empty input; skip blank lines such as a
                # trailing newline instead of crashing.
                continue
            data.append(json.loads(line))
    return data

# Load the annotations and poke at a few entries. Only the last expression of
# the cell is displayed; the bare expressions in between are discarded.
rationales = read_rationales(anno_path)

rationales[0].keys()

nltk.sent_tokenize(rationales[1]['raw'])[10]

import string
# Re-join the token list rationales[2]['x'] into one string: a space goes in
# front of every token except punctuation and tokens starting with an
# apostrophe.
raw = "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in rationales[2]['x']]).strip()
nltk.sent_tokenize(raw)

# Compare the first entry under key '2' with a token slice of the same review.
print(rationales[2]['2'][0])
rationales[2]['x'][49:63]



In [None]:
import ast
# Parse the review text of the first record: 'raw' is a dict repr, decoded
# with literal_eval, and its 'review/text' entry is passed to `nlp`.
# NOTE(review): `data` is not bound at notebook level in any visible cell --
# presumably the list returned by read_rationales; confirm.
doc = nlp(ast.literal_eval(data[0]['raw']) ['review/text'])


In [None]:
# Display the 'y' field of the first record of `data`.
data[0]['y']

In [None]:
import gzip
import tqdm
import string
import nltk
from nltk.corpus import stopwords

# Held-out split for the selected aspect: reviews.<aspect>.heldout.txt.gz
path = os.path.join(
    args.data_path_local,
    ".".join(['reviews', args.aspect, 'heldout.txt.gz']),
)
def simple_iterator(path):
    """Yield ``(review, target)`` string pairs from a gzip text file.

    Every line is split at its first tab: the part before it is the target,
    the part after it (including the line's trailing newline) the review.
    Both are yielded unparsed.
    """
    with gzip.open(path, 'rt') as handle:
        for record in handle:
            head, _, tail = record.partition("\t")
            yield tail, head
            
# Create set of filtered sentences for each review:
# the stop set is English stop words, punctuation characters and the ellipsis
# token '...', with the negations 'no' and 'not' taken back out.
operators = set(('no', 'not'))
punctuation = set(string.punctuation)
# Parenthesised on purpose: `-` binds tighter than `|`, so without the
# parentheses `operators` was only subtracted from the last operand and
# 'no'/'not' stayed in the stop set. Also, set('...') is {'.'}; the
# three-character token needs a set literal.
stop = (set(stopwords.words('english')) | punctuation | {'...'}) - operators

# Tokenise and filter every held-out review. `sens` and `fwords` are
# overwritten on each iteration, so only the last review's result survives
# the loop.
for review, target in tqdm.tqdm(simple_iterator(path)):

    # Split review into sentences
    sens = nltk.sent_tokenize(review)
    
    # Split each sentence into words
    sens = [nltk.word_tokenize(sen) for sen in sens]
    
    # Remove stop words
    fwords = [[word for word in sen if word not in stop] for sen in sens]
    

In [None]:
# Noun-Chunks
def noun_chunks(doc):
    """Return the noun chunks of ``doc`` as tuples of non-stop-word texts.

    One tuple per entry of ``doc.noun_chunks``, in order. Chunks left with no
    words after stop-word removal are dropped.
    """
    kept = []
    for span in doc.noun_chunks:
        words = tuple(tok.text for tok in span if not tok.is_stop)
        if words:
            kept.append(words)
    return kept

# Sub-Trees
def sub_trees(doc):
    """Return the distinct, non-empty subtrees of ``doc`` without stop words.

    For every token of every sentence in ``doc.sents`` the tokens of
    ``token.subtree`` that are not ``is_stop`` form a tuple of token objects
    (not their texts). Empty tuples are dropped and duplicates are removed
    through a set, so the order of the returned list is not defined.
    """
    candidates = (
        tuple(node for node in tok.subtree if not node.is_stop)
        for sent in doc.sents
        for tok in sent
    )
    return list({chunk for chunk in candidates if chunk})

# Sentence-Level processing. 
def sentences(doc):
    """Return each sentence of ``doc`` as a tuple of non-stop-word texts.

    One tuple per sentence of ``doc.sents``, in order. Sentences left with no
    words after stop-word removal are dropped.
    """
    kept = []
    for sent in doc.sents:
        words = tuple(tok.text for tok in sent if not tok.is_stop)
        if words:
            kept.append(words)
    return kept


