<a href="https://colab.research.google.com/github/madziejm/1e100-ibu/blob/master/1e100ibu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Dependencies siorbing

In [2]:
import torchtext
import io
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data import get_tokenizer

import torch

dev = 'cuda' if torch.cuda.is_available else 'cpu'

In [3]:
!pip install icecream
from icecream import ic

Collecting icecream
  Downloading icecream-2.1.1-py2.py3-none-any.whl (8.1 kB)
Collecting executing>=0.3.1
  Downloading executing-0.8.2-py2.py3-none-any.whl (16 kB)
Collecting colorama>=0.3.9
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting asttokens>=2.0.1
  Downloading asttokens-2.0.5-py2.py3-none-any.whl (20 kB)
Installing collected packages: executing, colorama, asttokens, icecream
Successfully installed asttokens-2.0.5 colorama-0.4.4 executing-0.8.2 icecream-2.1.1


#### Dataset siorbing

In [4]:
RATEBEER_FILE = '/content/SNAP-Ratebeer.txt'
! export RATEBEER_FILE='/content/SNAP-Ratebeer.txt'
! [ -e $RATEBEER_FILE ] || gdown --id '12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T' # https://drive.google.com/file/d/12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T/view?usp=sharing
! echo "Dataset head (trailing newline makes entry end): "
! head -n 16 $RATEBEER_FILE
! iconv -f ISO-8859-1 -t UTF-8 $RATEBEER_FILE -o {RATEBEER_FILE}.new && mv {RATEBEER_FILE}.new $RATEBEER_FILE

Downloading...
From: https://drive.google.com/uc?id=12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T
To: /content/SNAP-Ratebeer.txt
100% 1.74G/1.74G [00:19<00:00, 91.2MB/s]
Dataset head (trailing newline makes entry end): 
beer/name: John Harvards Simcoe IPA
beer/beerId: 63836
beer/brewerId: 8481
beer/ABV: 5.4
beer/style: India Pale Ale &#40;IPA&#41;
review/appearance: 4/5
review/aroma: 6/10
review/palate: 3/5
review/taste: 6/10
review/overall: 13/20
review/time: 1157587200
review/profileName: hopdog
review/text: On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.

beer/name: John Harvards Simcoe IPA
beer/beerId: 63836


### Vocab representation

In [5]:
UNKNOWN_TOKEN = "<unk>"

tokenizer = get_tokenizer('basic_english')
def create_vocab(dataset_path):
  def yield_tokens():
    with io.open(dataset_path, encoding='utf-8') as f:
      for line in f:
        review_text_prefix = 'review/text: '
        if line.startswith(review_text_prefix):
          line = line[len(review_text_prefix):] # drop prefix
          yield tokenizer(line) # TODO remove punctuation
          # line = line.strip().lower().split() 
  vocab = build_vocab_from_iterator(yield_tokens(), specials=[UNKNOWN_TOKEN])
  vocab.set_default_index(vocab[UNKNOWN_TOKEN])
  return vocab

In [6]:
vocab = create_vocab(RATEBEER_FILE)

##### Test `vocab`

In [7]:
review = 'On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.'
ided_review = vocab.lookup_indices(tokenizer(review))
print(f'id-ed review: {ided_review}')
word_count = len(vocab.get_itos())
print(f'word count: {word_count}')
print(f'most common words: {vocab.get_itos()[:30]}')
# try whether we can get an id for unknown token (which will be probably 0 as it's index of unknown token)
print(f'{vocab.lookup_indices(["lubie", "hamburgefonsz"])}')
# TODO remove keep only n most common words

id-ed review: [29, 123, 34, 5, 10719, 2, 2038, 1225, 1, 125, 3, 135, 4, 160, 75, 52, 138, 3, 183, 51, 45, 6, 3, 81, 398, 141, 32, 9, 1, 187, 139, 684, 4, 103, 315, 490, 1, 177, 7, 684, 2, 19, 39, 4, 3, 18, 19, 179, 26, 1, 15, 87, 145, 23, 923, 5, 47920, 1693, 79, 15, 488, 13, 65, 11, 107, 3, 18, 19, 70, 6, 3, 42, 95, 11, 16, 1, 19, 92, 392, 29, 5, 149, 1]
word count: 641357
most common words: ['<unk>', '.', ',', 'a', 'and', 'the', 'with', 'of', 'is', 'head', 'aroma', 'to', 'in', 'this', 'but', 'i', 'it', 'sweet', 'very', 'light', 'beer', 'some', 'flavor', 'not', 'malt', 'bottle', 'finish', 'nice', 'that', 'on']
[0, 0]


Test gradients

In [8]:
values = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
l2_loss = values.square().sum()
# l2_loss.zero_grad()
l2_loss.backward()

print(values.grad)

tensor([2., 4., 6.])


## yyyy

In [9]:
aspects = ['appearance', 'aroma', 'palate', 'taste', 'overall']
aspect_count = len(aspects)

In [83]:
# theta = torch.empty(word_count, aspect_count, requires_grad=True)
theta = torch.rand((word_count, aspect_count), requires_grad=True)
with torch.no_grad():
    theta *= 0.9
#   torch.nn.init.kaiming_uniform_(theta)
  # enforce 1 initialization on aspect name (page 4)
    aspect_ids = vocab.lookup_indices(aspects)
    theta[aspect_ids, :] = 1

# phi = torch.rand((word_count, , aspect_count), requires_grad=True)

aspect_rating_count = [6, 11, 6, 11, 21]
phis = [torch.rand((word_count, aspect_rating_count[i]), requires_grad=True) for i in range(aspect_count)]

with torch.no_grad():
    # normalize that sum across all words is 1 for a given aspect (eq. 7)
    phis = [phi / phi.sum(dim=0) for phi in phis]
for phi in phis:
    phi.requires_grad = True


In [12]:
def review_text2ids(review_text: str):
  return vocab.lookup_indices(tokenizer(review_text))

# def review_text2weights(review_text: str):
#   ids = review_text2ids(review_text)
#   thetas = theta[ids]
#   phis   = phi[ids]
#   return thetas, phis

# def review_likelihood(review_text: str):
#   theta_weight, phi_weight = 1.0, 1.0
#   thetas, phis = review_text2weights(review_text)
#   return torch.mean(
#       theta_weight * thetas +
#       phi_weight * phis
#   )

In [None]:
review_likelihood('Ich trinke Bier gern')

In [None]:
review_likelihood('Amazing hops and taste')

In [None]:
review_likelihood('appearance aroma palate taste overall')

In [None]:
print(theta)

In [None]:
vocab.lookup_indices(aspects)

### Implementation of $(1)$

In [13]:
def ratebeer(review):
    """
    get review like this
    
    beer/name: John Harvards Simcoe IPA
    beer/beerId: 63836
    beer/brewerId: 8481
    beer/ABV: 5.4
    beer/style: India Pale Ale &#40;IPA&#41;
    review/appearance: 4/5
    review/aroma: 6/10
    review/palate: 3/5
    review/taste: 6/10
    review/overall: 13/20
    review/time: 1157587200
    review/profileName: hopdog
    review/text: On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.
    
    return review text as list of sentences and aspect ratings
    """

In [44]:
# tokenizer = get_tokenizer('basic_english')

def yield_ided_review(dataset_path=RATEBEER_FILE, max_lines=1000000000):
    with io.open(dataset_path, encoding='utf-8') as f:
        counter = 0
        review_lines = []
        for line, _ in zip(f, range(max_lines)):
            if counter < 13:
                review_lines.append(line)
                counter += 1
            else:
                review_aspects_scores = review_lines[5:10]
                review_aspects_scores = list(map(lambda s: int(s.split(': ')[1].split('/')[0]), review_aspects_scores))
                
                # wywalamy review/text
                review_sentences = review_lines[12].lower()[len("review/text: "):]
                if line.startswith("UPDATED:"):
                    review_sentences = review_sentences[len("UPDATED: APR 29, 2008"):] # drop prefix
                
                review_sentences = review_sentences.split('.')
                review_sentences = list(filter(lambda l: len(l) != 0, (map(review_text2ids, review_sentences))))
                result = (review_sentences, review_aspects_scores)
                # raise BaseException
                yield (review_sentences, review_aspects_scores) # TODO remove punctuation
                # line = line.strip().split()
                counter = 0
                review_lines = []



In [40]:
# ided_reviews = yield_ided_reviews(max_lines=int(2e6))

In [49]:
def sentence_aspects_likelihood_theta(sen_ids):
    theta_theta = theta[sen_ids]
    theta_score = theta_theta
    return theta_score

def sentence_aspects_likelihood_phi(sen_ids):
    phi_scores = [phis[aspect_idx][sen_ids, :] for aspect_idx in range(aspect_count)]
    return phi_scores

# def sentence_aspects_likelihood(sen_ids):
#     theta_score = sentence_aspects_likelihood_theta(sen_ids)
#     phi_scores = sentence_aspects_likelihood_phi(sen_ids)
#     ic(theta_score, phi_scores)

    # score = torch.exp( + sentence_aspects_likelihood_phi(sen_ids, ratings))
    # return score / score.sum()


In [107]:
for (review_sentences, review_aspects_scores) in yield_ided_review(max_lines=int(2e6)):
    for sen_ids in review_sentences:
        theta_scores = sentence_aspects_likelihood_theta(sen_ids)
        aspect_pred = int(torch.argmax(torch.nn.functional.softmax(theta_scores.sum(0))).item())

        pred_aspect_rating = review_aspects_scores[aspect_pred]
        phi_score = sentence_aspects_likelihood_phi(sen_ids)[aspect_pred][:, pred_aspect_rating]
        
        asd = (theta_scores[:, aspect_pred] + phi_score).sum()
        ic(asd)

        ic(torch.square(theta).sum())
        
        # ic(torch.hstack(list(torch.square(phi).sum() for phi in phis)).sum())
        
        loss = -asd
        loss.backward()
        ic(theta.grad)

        with torch.no_grad():
            theta -= 0.01 * theta.grad
            phis = [phi - 0.01 * phi.grad if phi.grad != None else phi for phi in phis ]
        raise BaseException



  after removing the cwd from sys.path.
ic| asd: tensor(5.7028, grad_fn=<SumBackward0>)
ic| torch.square(theta).sum(): tensor(865644., grad_fn=<SumBackward0>)
ic| theta.grad: tensor([[  0.,   0.,   0.,   0.,   0.],
                        [  0.,   0.,   0.,   0.,   0.],
                        [  0., -24.,   0.,   0.,   0.],
                        ...,
                        [  0.,   0.,   0.,   0.,   0.],
                        [  0.,   0.,   0.,   0.,   0.],
                        [  0.,   0.,   0.,   0.,   0.]])
ic| phis[0].requires_grad: True


BaseException: ignored

In [None]:
# def review_aspect_likelihoods(review_sentences: str):
#   theta_weight, phi_weight = 1.0, 1.0
#   thetas, phis = review_text2weights(review_text)
#     for s in review_sentences:

# #   return map(
# #       lambda s:,
# #       review_sentences
# #   )
#   return torch.mean(
#       theta_weight * thetas +
#       phi_weight * phis
#   )