<a href="https://colab.research.google.com/github/madziejm/1e100-ibu/blob/master/1e100ibu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Loading dependencies

In [None]:
import torchtext
import io
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data import get_tokenizer

import torch

# Select the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select 'cuda' even on machines without a GPU.
dev = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
!pip install icecream
from icecream import ic



#### Downloading the dataset

In [None]:
RATEBEER_FILE = '/content/SNAP-Ratebeer.txt'
! export RATEBEER_FILE='/content/SNAP-Ratebeer.txt'
! [ -e $RATEBEER_FILE ] || gdown --id '12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T' # https://drive.google.com/file/d/12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T/view?usp=sharing
! echo "Dataset head (trailing newline makes entry end): "
! head -n 16 $RATEBEER_FILE
! iconv -f ISO-8859-1 -t UTF-8 $RATEBEER_FILE -o {RATEBEER_FILE}.new && mv {RATEBEER_FILE}.new $RATEBEER_FILE

Downloading...
From: https://drive.google.com/uc?id=12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T
To: /content/SNAP-Ratebeer.txt
100% 1.74G/1.74G [00:16<00:00, 108MB/s]
Dataset head (trailing newline makes entry end): 
beer/name: John Harvards Simcoe IPA
beer/beerId: 63836
beer/brewerId: 8481
beer/ABV: 5.4
beer/style: India Pale Ale &#40;IPA&#41;
review/appearance: 4/5
review/aroma: 6/10
review/palate: 3/5
review/taste: 6/10
review/overall: 13/20
review/time: 1157587200
review/profileName: hopdog
review/text: On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.

beer/name: John Harvards Simcoe IPA
beer/beerId: 63836


### Vocab representation

In [None]:
# Placeholder token; create_vocab registers it as a special token and as the default index.
UNKNOWN_TOKEN = "<unk>"

# torchtext's 'basic_english' tokenizer, used both to build the vocabulary and to encode reviews.
tokenizer = get_tokenizer('basic_english')
def create_vocab(dataset_path):
  """Build a torchtext vocabulary from the review texts stored in `dataset_path`.

  Only lines that start with 'review/text: ' contribute: the prefix is cut off and the
  remainder is split by the notebook-level `tokenizer`. UNKNOWN_TOKEN is registered as a
  special token and its index is installed as the vocabulary's default index.
  """
  text_marker = 'review/text: '

  def tokenized_reviews():
    # The file is read line by line; every review line yields one list of tokens.
    with io.open(dataset_path, encoding='utf-8') as dataset:
      for raw_line in dataset:
        if not raw_line.startswith(text_marker):
          continue
        yield tokenizer(raw_line[len(text_marker):])  # TODO remove punctuation

  built = build_vocab_from_iterator(tokenized_reviews(), specials=[UNKNOWN_TOKEN])
  unknown_index = built[UNKNOWN_TOKEN]
  built.set_default_index(unknown_index)
  return built

In [None]:
# Build the vocabulary from the review texts in the dataset file (reads the whole file).
vocab = create_vocab(RATEBEER_FILE)

##### Test `vocab`

In [None]:
# Encode a known review and look at the most frequent vocabulary entries.
review = 'On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.'
review_tokens = tokenizer(review)
ided_review = vocab.lookup_indices(review_tokens)
print(f'id-ed review: {ided_review}')

word_count = len(vocab.get_itos())
print(f'word count: {word_count}')

most_common = vocab.get_itos()[:30]
print(f'most common words: {most_common}')

# Words missing from the vocabulary should map to the default index
# (probably 0, the index of the unknown token).
unseen_ids = vocab.lookup_indices(["lubie", "hamburgefonsz"])
print(f'{unseen_ids}')
# TODO keep only the n most common words

id-ed review: [29, 123, 34, 5, 10719, 2, 2038, 1225, 1, 125, 3, 135, 4, 160, 75, 52, 138, 3, 183, 51, 45, 6, 3, 81, 398, 141, 32, 9, 1, 187, 139, 684, 4, 103, 315, 490, 1, 177, 7, 684, 2, 19, 39, 4, 3, 18, 19, 179, 26, 1, 15, 87, 145, 23, 923, 5, 47920, 1693, 79, 15, 488, 13, 65, 11, 107, 3, 18, 19, 70, 6, 3, 42, 95, 11, 16, 1, 19, 92, 392, 29, 5, 149, 1]
word count: 641357
most common words: ['<unk>', '.', ',', 'a', 'and', 'the', 'with', 'of', 'is', 'head', 'aroma', 'to', 'in', 'this', 'but', 'i', 'it', 'sweet', 'very', 'light', 'beer', 'some', 'flavor', 'not', 'malt', 'bottle', 'finish', 'nice', 'that', 'on']
[0, 0]


Test gradients

In [None]:
# Autograd sanity check: for loss = sum(v_i ** 2) the gradient with respect to v is 2 * v.
values = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
l2_loss = torch.sum(torch.square(values))
l2_loss.backward()

print(values.grad)

tensor([2., 4., 6.])


## Model parameters

In [None]:
# Rated aspects, in the order their `review/<aspect>` fields appear in a dataset entry.
aspects = ['appearance', 'aroma', 'palate', 'taste', 'overall']
aspect_count = len(aspects)

In [None]:
# theta[w, k]: weight of vocabulary word w as an indicator of aspect k.
theta = torch.rand((word_count, aspect_count), requires_grad=True)
with torch.no_grad():
  # In-place edits under no_grad keep `theta` a trainable leaf tensor.
  theta *= 0.9
  # enforce 1 initialization on aspect name (page 4): the word naming aspect k gets weight 1
  # for aspect k. NOTE(review): the previous `theta[aspect_ids, :] = 1` tied every aspect name
  # equally to *all* aspects, which carries no aspect information - confirm against the paper.
  aspect_ids = vocab.lookup_indices(aspects)
  theta[aspect_ids, list(range(aspect_count))] = 1

aspect_max_rating = [5, 10, 5, 10, 20]
# phis[k][w, v]: weight of word w when aspect k received rating v. One column per rating value
# 0..max, so a raw rating (including the maximum) is always a valid column index.
phis = {i : torch.rand((word_count, aspect_max_rating[i] + 1), requires_grad=True) for i in range(aspect_count)}

with torch.no_grad():
  # normalize that sum across all words is 1 for a given aspect (eq. 7).
  # Done in place: building new tensors under no_grad would silently drop requires_grad
  # and leave the phis untrainable.
  for phi in phis.values():
    phi /= phi.sum(dim=0, keepdim=True)

In [None]:
def review_text2ids(review_text: str):
  """Tokenize `review_text` and return the vocabulary index of every token."""
  tokens = tokenizer(review_text)
  return vocab.lookup_indices(tokens)

# def review_text2weights(review_text: str):
#   ids = review_text2ids(review_text)
#   thetas = theta[ids]
#   phis   = phi[ids]
#   return thetas, phis

# def review_likelihood(review_text: str):
#   theta_weight, phi_weight = 1.0, 1.0
#   thetas, phis = review_text2weights(review_text)
#   return torch.mean(
#       theta_weight * thetas +
#       phi_weight * phis
#   )

In [None]:
# FIXME: `review_likelihood` only exists as commented-out code in this notebook, so this call
# raised NameError on a fresh "Run All". Re-enable once the function is restored.
# review_likelihood('Ich trinke Bier gern')

In [None]:
# FIXME: `review_likelihood` only exists as commented-out code in this notebook, so this call
# raised NameError on a fresh "Run All". Re-enable once the function is restored.
# review_likelihood('Amazing hops and taste')

In [None]:
# FIXME: `review_likelihood` only exists as commented-out code in this notebook, so this call
# raised NameError on a fresh "Run All". Re-enable once the function is restored.
# review_likelihood('appearance aroma palate taste overall')

In [None]:
# Inspect the current word/aspect weights.
print(theta)

In [None]:
# Vocabulary indices of the aspect names, i.e. the theta rows that get the special initialization.
vocab.lookup_indices(aspects)

### Implementation of $(1)$

In [None]:
def ratebeer(review):
    """
    Parse one RateBeer entry into its review sentences and aspect ratings.

    get review like this

    beer/name: John Harvards Simcoe IPA
    beer/beerId: 63836
    beer/brewerId: 8481
    beer/ABV: 5.4
    beer/style: India Pale Ale &#40;IPA&#41;
    review/appearance: 4/5
    review/aroma: 6/10
    review/palate: 3/5
    review/taste: 6/10
    review/overall: 13/20
    review/time: 1157587200
    review/profileName: hopdog
    review/text: On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.

    return review text as list of sentences and aspect ratings

    Args:
        review: the entry, either as one multi-line string or as an iterable of its lines.

    Returns:
        ``(sentences, ratings)``: ``sentences`` is the list of non-empty, whitespace-stripped
        pieces of ``review/text`` split on '.', and ``ratings`` holds the integer numerators of
        the appearance, aroma, palate, taste and overall fields, in that order.

    Raises:
        ValueError: if one of the five rating fields is missing or is not numeric.
    """
    aspect_names = ('appearance', 'aroma', 'palate', 'taste', 'overall')
    lines = review.splitlines() if isinstance(review, str) else list(review)

    # Collect "key: value" pairs. Splitting on the FIRST ': ' only keeps review text that
    # itself contains ': ' intact.
    fields = {}
    for line in lines:
        key, separator, value = line.strip().partition(': ')
        if separator:
            fields[key] = value

    try:
        # A rating looks like "13/20"; only the numerator is the actual score.
        ratings = [int(fields['review/' + name].split('/')[0]) for name in aspect_names]
    except KeyError as missing:
        raise ValueError(f'review is missing the {missing.args[0]} field') from None

    text = fields.get('review/text', '')
    sentences = [sentence.strip() for sentence in text.split('.') if sentence.strip()]
    return sentences, ratings

In [None]:
# tokenizer = get_tokenizer('basic_english')

def yield_ided_review(dataset_path=RATEBEER_FILE, max_lines=1000000000):
    """Lazily parse the dataset and yield one ``(sentences, ratings)`` pair per review.

    Args:
        dataset_path: UTF-8 text file in which every entry is a run of ``key: value`` lines
            terminated by a blank line.
        max_lines: upper bound on the number of file lines that are read.

    Returns:
        A generator of ``(sentences, ratings)`` tuples. ``sentences`` is a list of token-id
        lists, one per non-empty '.'-separated sentence of the lower-cased ``review/text``
        (encoded with ``review_text2ids``). ``ratings`` is the list of integer numerators of the
        appearance, aroma, palate, taste and overall fields, in that order. Entries that lack
        any of these fields (for example one cut off by ``max_lines``) are skipped.
    """
    rating_keys = (
        'review/appearance',
        'review/aroma',
        'review/palate',
        'review/taste',
        'review/overall',
    )
    text_key = 'review/text'

    def parse_entry(fields):
        """Convert one entry's key -> value mapping to (sentences, ratings); None if incomplete."""
        if text_key not in fields or any(key not in fields for key in rating_keys):
            return None
        # A rating looks like "13/20"; only the numerator is the actual score.
        ratings = [int(fields[key].split('/')[0]) for key in rating_keys]
        sentences = [
            review_text2ids(sentence)
            for sentence in fields[text_key].lower().split('.')
            if sentence.strip()  # drop the empty pieces left by trailing or doubled dots
        ]
        return sentences, ratings

    def yield_tokens():
        with io.open(dataset_path, encoding='utf-8') as f:
            fields = {}
            for line_number, line in enumerate(f):
                if line_number >= max_lines:
                    break
                line = line.strip()
                if line:
                    # Split on the first ': ' only, so text containing ': ' is not truncated.
                    key, separator, value = line.partition(': ')
                    if separator:
                        fields[key] = value
                    continue
                # A blank line closes the current entry.
                parsed = parse_entry(fields)
                if parsed is not None:
                    yield parsed  # TODO remove punctuation
                fields = {}
            # The final entry may not be followed by a blank line.
            parsed = parse_entry(fields)
            if parsed is not None:
                yield parsed

    # Hand the generator back to the caller; without this the function returned None.
    return yield_tokens()



In [None]:
# Reviews found in the first two million lines of the dataset.
# The function defined above is `yield_ided_review` (singular); the plural spelling raised NameError.
ided_reviews = yield_ided_review(max_lines=int(2e6))

In [None]:
def sentence_aspects_likelihood_theta(sen_ids):
    """Return the theta part of a sentence's aspect scores.

    Selects the rows of the global `theta` indexed by `sen_ids` and adds them up along dim 0.
    """
    return theta[sen_ids].sum(dim=0)

def sentence_aspects_likelihood_phi(sen_ids, ratings):
    """Return the phi part of a sentence's aspect scores as a list with one entry per aspect.

    For every aspect index k the entries ``phis[k][sen_ids, ratings[k]]`` are selected and
    summed along dim 0.
    """
    per_aspect = []
    for k in range(aspect_count):
        # NOTE(review): ratings[k] is used directly as a column index into phis[k];
        # confirm that every rating value fits the width of that tensor.
        selected = phis[k][sen_ids, ratings[k]]
        per_aspect.append(selected.sum(0))
    return per_aspect

def sentence_aspects_likelihood(sen_ids, ratings):
    """Return the normalized aspect scores of one sentence; the result sums to 1.

    Args:
        sen_ids: vocabulary indices of the sentence's tokens.
        ratings: one rating per aspect, passed on to ``sentence_aspects_likelihood_phi``.

    Returns:
        ``softmax(theta_score + phi_score)`` over the aspect dimension.
    """
    theta_score = sentence_aspects_likelihood_theta(sen_ids)
    # The phi helper returns a plain Python list of scalar tensors, and `Tensor + list` raises
    # TypeError, so stack it into one tensor first. `list(...)` keeps this working should the
    # helper ever return a tensor instead.
    phi_score = torch.stack(list(sentence_aspects_likelihood_phi(sen_ids, ratings)))
    # softmax(x) equals exp(x) / exp(x).sum() but does not overflow for large scores.
    return torch.softmax(theta_score + phi_score, dim=0)

In [None]:
# def review_aspect_likelihoods(review_sentences: str):
#   theta_weight, phi_weight = 1.0, 1.0
#   thetas, phis = review_text2weights(review_text)
#     for s in review_sentences:

# #   return map(
# #       lambda s:,
# #       review_sentences
# #   )
#   return torch.mean(
#       theta_weight * thetas +
#       phi_weight * phis
#   )