<a href="https://colab.research.google.com/github/madziejm/1e100-ibu/blob/master/1e100ibu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Loading dependencies

In [1]:
import torchtext
import io
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data import get_tokenizer

import torch

# Use the GPU only when one is really usable. `is_available` must be *called*:
# the bare function object is always truthy and would select 'cuda' on CPU-only hosts.
dev = 'cuda' if torch.cuda.is_available() else 'cpu'

#### Downloading the dataset

In [2]:
RATEBEER_FILE = '/content/SNAP-Ratebeer.txt'
! export RATEBEER_FILE='/content/SNAP-Ratebeer.txt'
! [ -e $RATEBEER_FILE ] || gdown --id '12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T' # https://drive.google.com/file/d/12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T/view?usp=sharing
! echo "Dataset head (trailing newline makes entry end): "
! head -n 16 $RATEBEER_FILE
! iconv -f ISO-8859-1 -t UTF-8 $RATEBEER_FILE -o {RATEBEER_FILE}.new && mv {RATEBEER_FILE}.new $RATEBEER_FILE

Downloading...
From: https://drive.google.com/uc?id=12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T
To: /content/SNAP-Ratebeer.txt
100% 1.74G/1.74G [00:12<00:00, 143MB/s]
Dataset head (trailing newline makes entry end): 
beer/name: John Harvards Simcoe IPA
beer/beerId: 63836
beer/brewerId: 8481
beer/ABV: 5.4
beer/style: India Pale Ale &#40;IPA&#41;
review/appearance: 4/5
review/aroma: 6/10
review/palate: 3/5
review/taste: 6/10
review/overall: 13/20
review/time: 1157587200
review/profileName: hopdog
review/text: On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.

beer/name: John Harvards Simcoe IPA
beer/beerId: 63836


### Vocab representation

In [3]:
UNKNOWN_TOKEN = "<unk>"

tokenizer = get_tokenizer('basic_english')
def create_vocab(dataset_path, min_freq=1):
  """Build a torchtext vocabulary from the review texts of a dataset file.

  Only lines that start with ``review/text: `` contribute tokens; the prefix is
  stripped and the remainder is split with the module-level ``tokenizer``.

  Args:
    dataset_path: path to the dataset file, read as UTF-8.
    min_freq: minimum number of occurrences a token needs to be kept, forwarded to
      ``build_vocab_from_iterator``. The default of 1 keeps every token, which is
      what this function did before the parameter existed.

  Returns:
    A vocabulary containing ``UNKNOWN_TOKEN`` as a special token and using its index
    as the default, so out-of-vocabulary tokens are mapped to it.
  """
  review_text_prefix = 'review/text: '  # loop-invariant, so defined once up front

  def yield_tokens():
    with io.open(dataset_path, encoding='utf-8') as f:
      for line in f:
        if line.startswith(review_text_prefix):
          # drop the prefix and hand the tokenized review to the vocab builder
          yield tokenizer(line[len(review_text_prefix):]) # TODO remove punctuation

  vocab = build_vocab_from_iterator(yield_tokens(), min_freq=min_freq, specials=[UNKNOWN_TOKEN])
  vocab.set_default_index(vocab[UNKNOWN_TOKEN])
  return vocab

In [4]:
# Build the vocabulary from every `review/text:` line of the dataset file.
vocab = create_vocab(RATEBEER_FILE)

##### Test `vocab`

In [5]:
review = 'On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.'
ided_review = vocab.lookup_indices(tokenizer(review))
print(f'id-ed review: {ided_review}')
# len(vocab) gives the size directly, without building the whole index-to-string list
word_count = len(vocab)
print(f'word count: {word_count}')
print(f'most common words: {vocab.get_itos()[:30]}')
# check that unknown tokens get an id (expected to be 0, the index of the unknown token)
print(f'{vocab.lookup_indices(["lubie", "hamburgefonsz"])}')
# TODO keep only the n most common words

id-ed review: [29, 123, 34, 5, 10719, 2, 2038, 1225, 1, 125, 3, 135, 4, 160, 75, 52, 138, 3, 183, 51, 45, 6, 3, 81, 398, 141, 32, 9, 1, 187, 139, 684, 4, 103, 315, 490, 1, 177, 7, 684, 2, 19, 39, 4, 3, 18, 19, 179, 26, 1, 15, 87, 145, 23, 923, 5, 47920, 1693, 79, 15, 488, 13, 65, 11, 107, 3, 18, 19, 70, 6, 3, 42, 95, 11, 16, 1, 19, 92, 392, 29, 5, 149, 1]
word count: 641357
most common words: ['<unk>', '.', ',', 'a', 'and', 'the', 'with', 'of', 'is', 'head', 'aroma', 'to', 'in', 'this', 'but', 'i', 'it', 'sweet', 'very', 'light', 'beer', 'some', 'flavor', 'not', 'malt', 'bottle', 'finish', 'nice', 'that', 'on']
[0, 0]


##### Test gradients

In [6]:
# Autograd smoke test: for loss = sum(v_i ** 2) the gradient with respect to v is 2 * v.
values = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
l2_loss = torch.sum(values ** 2)
l2_loss.backward()
print(values.grad)

tensor([2., 4., 6.])


## Model parameters (`theta`, `phi`) and review scoring

In [7]:
# Names of the rated aspects; they match the `review/<aspect>` fields shown in the dataset head.
aspects = ['appearance', 'aroma', 'palate', 'taste', 'overall']
# Number of aspects, used as the column count of the weight matrices.
aspect_count = len(aspects)

In [39]:
# theta: one row per vocabulary word, one column per aspect.
theta = torch.empty(word_count, aspect_count, requires_grad=True)
with torch.no_grad():
  torch.nn.init.kaiming_uniform_(theta)
  # enforce 1 initialization on aspect name (page 4)
  # NOTE(review): this sets the *whole row* of every aspect word to 1, i.e. each aspect
  # name gets weight 1 under all aspects, not only its own - confirm against the paper.
  aspect_ids = vocab.lookup_indices(aspects)
  theta[aspect_ids, :] = 1


# phi: same shape as theta, initialised uniformly at random.
phi = torch.rand((word_count, aspect_count), requires_grad=True)
with torch.no_grad():
  # normalize that sum across all words is 1 for a given aspect (eq. 7)
  # The division is done in place so `phi` stays the same leaf tensor: rebinding it to
  # `phi / phi.sum(dim=0)` under no_grad would yield a tensor with requires_grad=False.
  phi.div_(phi.sum(dim=0))

In [46]:
def review_text2ids(review_text: str):
  """Tokenize `review_text` and return the vocabulary index of every token."""
  tokens = tokenizer(review_text)
  return vocab.lookup_indices(tokens)

def review_text2weights(review_text: str):
  """Return the rows of `theta` and `phi` selected by the tokens of `review_text`.

  The result is a `(thetas, phis)` pair; each element holds one row per token,
  in token order.
  """
  token_ids = review_text2ids(review_text)
  return theta[token_ids], phi[token_ids]

def review_probability(review_text: str, theta_weight: float = 1.0, phi_weight: float = 1.0):
  """Score a review as the mean of its weighted `theta` and `phi` entries.

  Args:
    review_text: raw review text; it is tokenized and looked up in the vocabulary.
    theta_weight: multiplier applied to the `theta` rows of the review tokens.
    phi_weight: multiplier applied to the `phi` rows of the review tokens.
      Both weights default to 1.0, the values that used to be hard-coded.

  Returns:
    A scalar tensor: the mean over all tokens and aspects of
    `theta_weight * theta + phi_weight * phi`.
    Despite the function name this is not a normalized probability; `theta` may
    hold negative entries, so the result is not confined to [0, 1].
  """
  thetas, phis = review_text2weights(review_text)
  weighted_sum = theta_weight * thetas + phi_weight * phis
  return torch.mean(weighted_sum)

In [47]:
# Score a German sentence; tokens missing from the vocabulary fall back to the unknown-token index.
review_probability('Ich trinke Bier gern')

tensor(0.0300, grad_fn=<MeanBackward0>)

In [50]:
# Score a short English phrase.
review_probability('Amazing hops and taste')

tensor(0.4446, grad_fn=<MeanBackward0>)

In [48]:
# Score a text made only of the aspect names, whose theta rows were set to 1 at initialization.
review_probability('appearance aroma palate taste overall')

tensor(1.0000, grad_fn=<MeanBackward0>)

In [33]:
# Show the current values of theta.
print(theta)

tensor([[-0.3340,  0.3367, -0.9189, -0.6612, -0.2976],
        [ 0.0410,  0.4752,  0.4142,  0.0634,  1.0900],
        [ 0.6639,  0.3962, -0.9144,  0.8693, -0.1799],
        ...,
        [-0.3180, -0.9306, -0.2156, -0.8973, -0.6841],
        [-0.8311,  0.4176,  0.6423,  0.6981,  0.0203],
        [ 0.7947, -0.3581, -0.7585, -0.2399,  0.4761]], requires_grad=True)


In [34]:
# Vocabulary indices of the aspect names.
vocab.lookup_indices(aspects)

[201, 10, 90, 31, 137]