In [1]:
# annotations contains the rationals. Roughly 1000, 994 to be precise.
# heldouts seem to be the validation sets
# what is the test set? just the 994 reviews that are also annotated? or all data?
# review + wiki_filtered-200 could be the word embeddings. 

filepath = "/Users/Max/data/beer_reviews/Beeradvocate.txt" # full-dataset
filtered =  "/Users/Max/data/beer_reviews/review+wiki.filtered.200.txt.gz" # review+wiki.filtered.200.txt.gz
anno = "/Users/Max/data/beer_reviews/annotations.json" # annotated commentaries
heldout = "/Users/Max/data/beer_reviews/reviews.aspect1.heldout.txt.gz" # selection of commentaries
count = 0

with open(anno, 'r') as f:
    for line in f:
        count += 1
print(count)

994


In [2]:
# Firstly, I need a PyTorch Dataset.
# Secondly, I can simply pass this PyTorch Dataset to DataLoader, to get batch functionality etc. 

###
# Let's have one embedding layer, which maps word incides to the corresponding embeddings. 
# I need one padding index!
# My data set consists of x, y.
# x is a list of indices, y is the sentinment vector
# Input: file_path to txt file


In [194]:
# build my custom embedding
# i.e. transfer the word embeddings to a PyTorch word embedding
# This works!
# Wrap into two functions!
# 1) Function to actually create the word embedding pytorch file, takes 10 sec., need only once forever
# 2) Function that instantiates and loads word embeddings and set requires_grad = False

import torch
import torch.nn as nn
from collections import OrderedDict
import gzip
import os
import json
import numpy as np
from dpp_nets.utils.io import embd_iterator, data_iterator

root = '/Users/Max/data/beer_reviews'
data_file = 'reviews.aspect1.heldout.txt.gz'
embd_file = 'review+wiki.filtered.200.txt.gz'

def make_embd(embd_path, word_to_ix=False, save_path=None):
    
    # Create dictionaries
    ix_to_word = {}
    ix_to_vecs = {}
    word_to_ix = {}
    
    for ix, (word, vecs) in enumerate(embd_iterator(embd_path)):
        ix_to_word[ix] = word
        ix_to_vecs[ix] = vecs
        word_to_ix = {word: ix}
    
    vocab_size, embd_dim = len(ix_to_word), len(ix_to_vecs[0])
    
    if word_to_ix:
        return word_to_ix
    
    embd = torch.zeros(1 + vocab_size, embd_dim)
    for i, vec in enumerate(ix_to_vecs.values(), 1): 
        embd[i] = vec

    embd_weight_dict = OrderedDict([('weight', embd)])

    if save:
        torch.save(embd_weight_dict, 'embeddings.pt')    
    else:
        embd_layer = nn.Embedding(1 + vocab_size, embd_dim, padding_idx=0)
        embd_layer.load_state_dict(embd_weight_dict)
        embd.weight.requires_grad = False
        return embd_layer

def load_embd(embd_dict_path):
    
    embd_weight_dict = torch.load(embd_dict_path)
    vocab_size, embd_dim = embd_weight_dict['weight'].size()
    embd_layer = nn.Embedding(1 + vocab_size, embd_dim, padding_idx=0)
    embd_layer.load_state_dict(embd_weight_dict)
    embd.weight.requires_grad = False

    return embd_layer

In [190]:
def make_tensor_dataset(data_path, word_to_ix):
    
    

200

In [193]:
def load_embd(embd_dict_path):
    
    embd_weight_dict = torch.load(embd_dict_path)
    vocab_size, embd_dim = embd_weight_dict['weight'].size()
    embd_layer = nn.Embedding(1 + vocab_size, embd_dim, padding_idx=0)
    embd_layer.load_state_dict(embd_weight_dict)
    embd.weight.requires_grad = False

    return embd_layer

In [148]:
   



def make_pytorch_ready(root, data_file, embd_file, max_set_size=None):  
    
    # Create data paths
    data_path = os.path.join(root, data_file)
    embd_path = os.path.join(root, embd_file)
    
    # Create dictionaries
    ix_to_word = {}
    ix_to_vals = {}
    
    for ix, (word, vals) in enumerate(embd_iterator(embd_path)):
        ix_to_word[ix] = word
        ix_to_vals[ix] = vals
    
    word_to_ix = {v: k for k,v in ix_to_word.items()}
    
    # Create stats
    vocab_size = len(ix_to_word)
    embd_dim = len(ix_to_vals[0])
    
    # Create word embedding
    my_embd = np.zeros([1 + vocab_size, embd_dim])
    for i, val in enumerate(ix_to_vals.values(),1): 
        my_embd[i] = val
    my_state_dict = OrderedDict([('weight',torch.Tensor(my_embd))])
    torch.save(my_state_dict, 'embeddings.pt')
    
    if not max_set_size:
        max_set_size = 915
        # calculate the maximum_set_size
        
    # Create Dataset
    data_x, data_y = [ ], [ ]
    for (words, target) in data_iterator(data_path):
        data_x.append(words)
        data_y.append(target)
        
    targets = torch.stack([torch.Tensor(i) for i in data_y])

    # one-loop too many
    data_x_enc = []
    errors = []
    for words in data_x:
        temp = []
        temp2 = []
        for word in words:
            if word in word_to_ix:
                temp.append(word_to_ix[word])
            else:
                temp2.append(word)
        temp = np.pad(np.array(temp),[0,max_set_size - len(temp)], 'constant', constant_values=(0, 0))
        data_x_enc.append(temp)
        errors.append(temp2)
    reviews = torch.stack([torch.LongTensor(i) for i in data_x_enc])
    
    return reviews, targets
# Consider changing the word-processing, i.e. split at -, split at /, etc. !!
# Right now 

In [169]:
root = '/Users/Max/data/beer_reviews'
data_file = 'reviews.aspect1.heldout.txt.gz'
embd_file = 'review+wiki.filtered.200.txt.gz'

reviews, targets = make_pytorch_ready(root, data_file, embd_file)

In [154]:
from torch.utils.data import TensorDataset


In [155]:
my_data_set = TensorDataset(reviews, targets)

In [158]:
torch.save(my_data_set, 'my_data_set.pt')


In [159]:
loaded_data_set = torch.load('my_data_set.pt')

In [161]:
my_data_set.data_tensor


 1.2000e+01  6.4300e+02  2.6700e+02  ...   0.0000e+00  0.0000e+00  0.0000e+00
 2.1037e+04  2.5109e+04  1.0740e+03  ...   0.0000e+00  0.0000e+00  0.0000e+00
 7.4300e+02  8.9000e+01  3.4000e+02  ...   0.0000e+00  0.0000e+00  0.0000e+00
                ...                   ⋱                   ...                
 6.6000e+01  1.4000e+01  2.2000e+01  ...   0.0000e+00  0.0000e+00  0.0000e+00
 1.1430e+03  3.5800e+02  1.0000e+00  ...   0.0000e+00  0.0000e+00  0.0000e+00
 2.6959e+04  1.1200e+02  6.1390e+03  ...   0.0000e+00  0.0000e+00  0.0000e+00
[torch.LongTensor of size 10000x915]

In [168]:
from torch.autograd import Variable
embd(Variable(reviews[1:7]))

Variable containing:
( 0 ,.,.) = 
 -0.0813 -0.0336  0.0406  ...  -0.0041 -0.0135 -0.0813
  0.1185  0.0688 -0.0062  ...   0.0312 -0.0398 -0.0726
 -0.1108 -0.0834  0.0851  ...   0.0742  0.0198  0.0375
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000

( 1 ,.,.) = 
 -0.0072 -0.0024  0.0411  ...  -0.0923 -0.0001 -0.0169
 -0.0515 -0.1156  0.1176  ...   0.0355  0.0461 -0.0511
 -0.0287 -0.0775  0.1195  ...   0.0786  0.0693  0.0316
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000

( 2 ,.,.) = 
 -0.0105 -0.0856  0.0691  ...   0.0508  0.0344 -0.0080
  0.0362  0.0103  0.0808  ...  -0.0468  0.0393  0.0518
 -0.1188  0.0190 -0.0323  ...  -0.0478 -0.1224  0.0610
   

In [122]:
import re
# Splitting on: , <space> - ! ? :
list(filter(None, re.split("[, \-!?:]+", "Hey, you - what are you doing here!?")))

['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']

In [117]:
data_path = os.path.join(root, data_file)
embd_path = os.path.join(root, embd_file)
gen = create_embd_iterator(embd_path)

In [120]:
count = 0
gen = create_data_iterator(data_path)
for instance in gen:
    count += 1
print(count)

10000


In [None]:
import json
import sys
import gzip
import numpy as np

def read_annotations(path):
    """
    This reads in the original data set.  
    """
    data_x, data_y = [ ], [ ]
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path, 'rt') as fin:
        for line in fin:
            y, sep, x = line.partition("\t")
            x, y = x.split(), y.split()
            if len(x) == 0: continue
            y = np.asarray([ float(v) for v in y ])
            data_x.append(x)
            data_y.append(y)
    say("{} examples loaded from {}\n".format(
            len(data_x), path
        ))
    say("max text length: {}\n".format(
        max(len(x) for x in data_x)
    ))
    return data_x, data_y

In [3]:
# Build word embedding
embd_path = "/Users/Max/data/beer_reviews/review+wiki.filtered.200.txt.gz"

def load_embedding_iterator(path):
    file_open = gzip.open if path.endswith(".gz") else open
    with file_open(path, 'rt') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                word = parts[0]
                vals = np.array([float(x) for x in parts[1:]])
                yield word, vals

ix_to_word = {}
ix_to_vals = {}

for ix, (word, vals) in enumerate(load_embedding_iterator(embd_path)):
    ix_to_word[ix] = word
    ix_to_vals[ix] = vals

word_to_ix = {v: k for k,v in ix_to_word.items()}

assert(len(ix_to_vals) == len(ix_to_word))
vocab_size = len(ix_to_word)
embd_dim = len(ix_to_vals[0])
    
my_embd = np.zeros([1 + vocab_size, embd_dim])
for i, val in enumerate(ix_to_vals.values(),1): 
    my_embd[i] = val
        
my_state_dict = OrderedDict([('weight',torch.Tensor(my_embd))])
torch.save(my_state_dict, 'embeddings.pt')

#########
embd = nn.Embedding(1 + vocab_size, embd_dim, padding_idx=0)
embd.load_state_dict(torch.load('embeddings.pt'))


In [4]:
import json
import sys
import gzip
import numpy as np

def read_annotations(path):
    """
    This reads in the original data set.  
    """
    data_x, data_y = [ ], [ ]
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path, 'rt') as fin:
        for line in fin:
            y, sep, x = line.partition("\t")
            x, y = x.split(), y.split()
            if len(x) == 0: continue
            y = np.asarray([ float(v) for v in y ])
            data_x.append(x)
            data_y.append(y)
    say("{} examples loaded from {}\n".format(
            len(data_x), path
        ))
    say("max text length: {}\n".format(
        max(len(x) for x in data_x)
    ))
    return data_x, data_y

def read_rationales(path):
    """
    This reads the json.annotations file. 
    Creates a list of dictionaries, which holds the 994 reviews for which
    sentence-level annotations are available. 
    """
    data = [ ]
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path) as fin:
        for line in fin:
            item = json.loads(line)
            data.append(item)
    return data

def read_corpus(path):
    with open(path) as fin:
        lines = fin.readlines()
    lines = [ x.strip().split() for x in lines ]
    lines = [ x for x in lines if x ]
    corpus_x = [ x[1:] for x in lines ]
    corpus_y = [ int(x[0]) for x in lines ]
    return corpus_x, corpus_y

def say(s, stream=sys.stdout):
    stream.write("{}".format(s))
    stream.flush()

In [6]:
#annotated_rationales = read_rationales(anno)
validation_x, validation_y = read_annotations(heldout)
targets = torch.stack([torch.Tensor(i) for i in validation_y])

10000 examples loaded from /Users/Max/data/beer_reviews/reviews.aspect1.heldout.txt.gz
max text length: 915


In [35]:
errors = []
data_x_enc =[]

for words in validation_x:
    temp = []
    temp2 = []
    for word in words:
        if word in word_to_ix:
            temp.append(word_to_ix[word])
        else:
            temp2.append(word)
    data_x_enc.append(temp)
    errors.append(temp2)