<a href="https://colab.research.google.com/github/madziejm/1e100-ibu/blob/master/1e100ibu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preliminary

#### Dependencies

In [1]:
import torch
# Pick the compute device once: CUDA when PyTorch reports it as available, CPU otherwise.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'dev = {dev}')

dev = cuda


In [2]:
# Install icecream (provides the `ic` debug-print helper imported below); --quiet hides pip's progress output.
!pip install --quiet icecream
from icecream import ic

## Dataset representation

In [None]:
# Install spaCy pinned to the 3.2.x series, then download the small English model.
# NOTE(review): the dataset code visible in this notebook builds a blank English pipeline
# (spacy.util.get_lang_class('en')) and never loads en_core_web_sm - confirm the download is still needed.
!pip install 'spacy<3.3.0,>=3.2.0' --quiet
!python -m spacy download en_core_web_sm

[K     |████████████████████████████████| 6.0 MB 6.3 MB/s 
[K     |████████████████████████████████| 628 kB 39.3 MB/s 
[K     |████████████████████████████████| 181 kB 22.9 MB/s 
[K     |████████████████████████████████| 451 kB 31.3 MB/s 
[K     |████████████████████████████████| 10.1 MB 43.2 MB/s 
[K     |████████████████████████████████| 42 kB 943 kB/s 
[31mERROR: Operation cancelled by user[0m
[?25hCollecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 6.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
!pip show spacy | egrep Version
# Sanity check: the reported version should be 3.x, because the dataset code below
# uses the spaCy 3 style of adding pipeline components by name (nlp.add_pipe("sentencizer", ...)).

Version: 3.2.1


#### review example

In [None]:
#     """
#     beer/name: John Harvards Simcoe IPA
#     beer/beerId: 63836
#     beer/brewerId: 8481
#     beer/ABV: 5.4
#     beer/style: India Pale Ale &#40;IPA&#41;
#     review/appearance: 4/5
#     review/aroma: 6/10
#     review/palate: 3/5
#     review/taste: 6/10
#     review/overall: 13/20
#     review/time: 1157587200
#     review/profileName: hopdog
#     review/text: On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.
#     """

#### dataset representation

In [3]:
from collections import Counter
from torchtext._torchtext import (Vocab as VocabPybind) # make use of some hidden interface
from torchtext.vocab import Vocab, build_vocab_from_iterator
from tqdm.notebook import trange, tqdm
import gc # garbage collector interface
import io
import re
import spacy # nlp toolkit
import torch

class RateBeerReviews(torch.utils.data.Dataset):
    """RateBeer reviews exposed as ``(sentences, ratings)`` pairs.

    The raw file is a sequence of ``key: value`` lines; a blank line ends a
    review. For every review that has a non-empty text, the five aspect ratings
    (appearance, aroma, palate, taste, overall) and the text are kept. After
    construction each text is a tuple of sentences and each sentence is a list
    of vocabulary ids.

    Args:
        filepath: path to the UTF-8 encoded review dump.
        reviews_max: stop reading once this many reviews have been counted.
    """

    def __init__(self, filepath='/content/SNAP-Ratebeer.txt', reviews_max=float('inf')):
        self.aspects = ['appearance', 'aroma', 'palate', 'taste', 'overall']
        self.aspect_count = len(self.aspects)
        # number of distinct rating values per aspect (ratings are 0..max, hence the +1)
        self.aspect_max = [5 + 1, 10 + 1, 5 + 1, 10 + 1, 20 + 1]
        self._aspect_ratings = [ [] for _ in self.aspects ]
        self._texts = []
        self.unkn_tok = "<unk>" # unknown/out of vocabulary token
        self._len = 0
        self._fetch_data(filepath, reviews_max)
        self._post_process(max_word_count=20000) # 20K words should be okay

    def _fetch_data(self, filepath, reviews_max):
        """Read the raw dump and fill ``_aspect_ratings``, ``_texts`` and ``_len``.

        Reviews whose text is empty are dropped: the ratings already appended
        for them are popped again and the review is not counted.
        """
        # 'review/<aspect>: ' prefixes, in the same order as self.aspects / self._aspect_ratings
        rating_prefixes = [f'review/{aspect}: ' for aspect in self.aspects]
        text_prefix = 'review/text: '
        with io.open(filepath, encoding='utf-8') as f:
            for line in tqdm(f, total=(40938282 if reviews_max == float('inf') else reviews_max * 14), desc='Reading data'):
                if line == '\n': # separator
                    self._len += 1
                    if reviews_max <= self._len:
                        break
                    continue

                if line.startswith(text_prefix):
                    line = line[len(text_prefix):]
                    if line.startswith('UPDATED:'):
                        # drop the prefix: a fixed number of characters, the length of this sample prefix
                        line = line[len("UPDATED: APR 29, 2008"):]
                    # remove whitespace incl. trailing newline and tildes that can be found in data for some reason
                    line = re.sub('~', ' ', line.strip())
                    if line:
                        self._texts.append(line)
                    else: # some reviews do not have associated text; unwind (remove) their ratings for each aspect
                        for aspect_ratings in self._aspect_ratings:
                            aspect_ratings.pop()
                        # compensates for the increment the upcoming separator line will do
                        self._len -= 1
                    continue

                for aspect_idx, prefix in enumerate(rating_prefixes):
                    if line.startswith(prefix):
                        # value looks like '4/5': lhs of split by '/' is rating, rhs is max possible rating
                        rating = int(line[len(prefix):].split('/')[0])
                        self._aspect_ratings[aspect_idx].append(rating)
                        break

    def _post_process(self, min_word_freq=None, max_word_count=None):
        """Tokenize the texts, build the vocabulary and map every token to its id.

        Exactly one of the two arguments must be given:

        Args:
            min_word_freq: keep only words that occur at least this many times.
            max_word_count: keep only this many most frequent words.
        """
        assert (min_word_freq is not None) ^ (max_word_count is not None), "provide one of min_word_freq and max_word_count"
        nlp = spacy.util.get_lang_class('en')()
        nlp.add_pipe("sentencizer", config={"punct_chars": ['.', '?', '!']})
        nlp.Defaults.stop_words |= { '-', '+'}
        nlp.Defaults.stop_words -= {'mostly', 'whole', 'indeed', 'quite', 'ever', 'nothing', 'perhaps', 'not', 'no', 'only', 'well', 'really', 'except'}
        print("Spacy pipe (tokenization&sentence split)..")
        gc.collect() # force garbage collection
        # text -> tuple of sentences; sentence -> list of lower-cased tokens,
        # skipping stop words, punctuation, whitespace and tokens of length <= 2
        self._texts = [
            tuple(
                list(
                    tok.lower_
                    for tok in sent
                    if not tok.is_stop and not tok.is_punct and not tok.is_space and len(tok) > 2
                )
                for sent in doc.sents
            )
            for doc in nlp.pipe(self._texts)
        ]
        print("Building vocab (word-id mapping)..")
        gc.collect(generation=0) # force garbage collection
        gc.collect(generation=1) # force garbage collection
        gc.collect(generation=2) # force garbage collection
        sent_gen = (sent for text in self._texts for sent in text)
        if min_word_freq is not None:
            # the keyword is `min_freq`; `specials` already puts the unknown token at index 0,
            # so it must not be inserted a second time below
            self.vocab = build_vocab_from_iterator(sent_gen, min_freq=min_word_freq, specials=[self.unkn_tok])
        else:
            words = Counter()
            for tokens in sent_gen:
                words.update(tokens)
            words = [word for word, freq in words.most_common(max_word_count)] # list sorted by frequency yikees
            self.vocab = Vocab(VocabPybind(words, None))
            self.vocab.insert_token(self.unkn_tok, 0)
        self.vocab.set_default_index(self.vocab[self.unkn_tok]) # set index for out-of-vocabulary words
        print("Mapping words to ids..")
        gc.collect() # force garbage collection
        self._texts = [tuple(self.vocab.lookup_indices(sent) for sent in text) for text in self._texts]
        gc.collect() # force garbage collection

    def __getitem__(self, i):
        """Return ``(sentences, ratings)`` of review ``i``; ``ratings`` has one entry per aspect."""
        sentences = self._texts[i]
        ratings = tuple(self._aspect_ratings[a][i] for a in range(self.aspect_count))
        return (sentences, ratings)

    def __len__(self):
        """Number of reviews counted while reading the file."""
        return self._len

To build the dataset from the raw dataset file, set `FETCH_RATEBEER` to `true` and `RECREATE_PICKLE` to `True` in the cells below. If you leave them untouched, the dataset will be loaded from a serialized (pickled) `RateBeerReviews` object instead of being parsed from the text file.

In [4]:
%%bash

# Set to true to download the original text dataset instead of the prebuilt pickle.
export FETCH_RATEBEER=false
# Must be defined before it is tested below (an unset variable makes `[ -e ]` always succeed).
export RATEBEER_FILE='/content/SNAP-Ratebeer.txt'
if [ "$FETCH_RATEBEER" = true ]
then # original dataset
    if [ ! -e "$RATEBEER_FILE" ]
    then # download only when the file is not there yet, then convert it to UTF-8 exactly once
        gdown --id '12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T' # https://drive.google.com/file/d/12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T/view?usp=sharing
        iconv -f ISO-8859-1 -t UTF-8 "$RATEBEER_FILE" -o "${RATEBEER_FILE}.new" && mv "${RATEBEER_FILE}.new" "$RATEBEER_FILE"
    fi
    echo "Dataset head (trailing newline makes entry end): "
    head -n 16 "$RATEBEER_FILE"
else # pickle
    gdown --id '1VBDjyR4jpzAgzcDUGNQFguOfLC3rtOV_' # https://drive.google.com/file/d/1VBDjyR4jpzAgzcDUGNQFguOfLC3rtOV_/view?usp=sharing  # 20K words dataset
    # gdown --id '1ebDMDlOxtFh8B5i8lajR7q3kq-0hM02j' # https://drive.google.com/file/d/1ebDMDlOxtFh8B5i8lajR7q3kq-0hM02j/view?usp=sharing # min frequency 5 words dataset
fi

Downloading...
From: https://drive.google.com/uc?id=1VBDjyR4jpzAgzcDUGNQFguOfLC3rtOV_
To: /content/ratebeer-20K-vocab.pickle
  0%|          | 0.00/319M [00:00<?, ?B/s]  1%|1         | 4.72M/319M [00:00<00:13, 24.1MB/s]  7%|7         | 22.5M/319M [00:00<00:09, 32.5MB/s] 11%|#         | 34.1M/319M [00:00<00:06, 41.0MB/s] 13%|#3        | 42.5M/319M [00:00<00:05, 46.3MB/s] 21%|##        | 65.5M/319M [00:00<00:04, 60.9MB/s] 24%|##4       | 77.6M/319M [00:00<00:03, 66.5MB/s] 32%|###1      | 101M/319M [00:00<00:02, 81.2MB/s]  36%|###5      | 114M/319M [00:01<00:02, 87.3MB/s] 41%|####1     | 132M/319M [00:01<00:01, 103MB/s]  46%|####5     | 146M/319M [00:01<00:01, 107MB/s] 51%|#####1    | 163M/319M [00:01<00:01, 119MB/s] 55%|#####5    | 177M/319M [00:01<00:01, 116MB/s] 61%|######    | 193M/319M [00:01<00:01, 124MB/s] 66%|######5   | 209M/319M [00:01<00:00, 132MB/s] 70%|#######   | 224M/319M [00:01<00:00, 119MB/s] 77%|#######6  | 244M/319M [00:01<00:00, 131MB/s] 81%|########1 

In [None]:
import pickle # serialize lib
from google.colab import drive

# Mount Google Drive (the model later saves its weights under /drive).
drive.mount('/drive')

# Alternative pickle name: '/content/ratebeer.pickle'
DATASET_PICKLE='/content/ratebeer-20K-vocab.pickle'
# True: parse the raw text file and write a fresh pickle; False: load the existing pickle.
RECREATE_PICKLE = False

if RECREATE_PICKLE:
    # Build the dataset BEFORE opening the file for writing, so a failure while
    # parsing does not leave a truncated pickle behind (or destroy an existing one).
    rb = RateBeerReviews()
    print('Dumping..')
    with open(DATASET_PICKLE, 'wb') as f:
        pickle.dump(rb, f)
else:
    # NOTE: unpickling can execute arbitrary code - only load a pickle you trust.
    with open(DATASET_PICKLE, 'rb') as f:
        rb = pickle.load(f)

In [7]:
# Vocabulary size on the first line, then the last 1000 entries of the id-to-word list
# (the least common words when the vocabulary is ordered by frequency).
print(len(rb.vocab.get_itos()), rb.vocab.get_itos()[-1000:], sep='\n')

20001
['tractor', 'syd', 'chiefly', 'defiant', 'stripper', 'val', 'bierhaus', 'eyeball', 'sporatic', 'deviate', 'grapenuts', 'beervana', 'tarness', 'lenny', 'phenomenon', 'reek', 'usualy', 'bernard', 'barbecued', 'fruitbeer', 'enjoyit', 'rive', 'sas', 'flavorable', 'ecstatic', 'leopoldstoch', 'soulless', 'allagashs', 'mikrobryggeri', 'disgusted', 'heine', 'covey', 'respectively', 'scout', 'emitting', 'flemdawg', 'atom', 'brightens', 'pch', 'www.bierzwerg.de', 'oakwood', 'flagged', 'collor', 'rbnag-11', 'belter', 'ecstasy', 'deem', 'gust', 'creamyish', 'opq', 'fruti', 'accompagne', 'zunge', 'sinful', 'prunish', 'ingrediants', 'refill', 'rug', 'alc%', 'blase', 'paramount', 'oldrtybastrd', 'fininsh', 'sofort', 'sazz', 'echoing', 'prospect', 'contributor', 'crazily', 'rogueone', 'beer_hawk', 'presenza', 'overlain', 'hinteren', 'spur', 'suspicions', 'glendale', 'renowned', 'hover', '09/08/2008', 'flashback', 'degraded', 'vegetabley', 'sunburst', 'snug', 'beneficial', 'linkery', 'mtn', 'doub

### Training (implementation of $(1)$)

In [None]:
from torch.utils.data import random_split
import datetime
from scipy.optimize import linear_sum_assignment
from more_itertools import grouper

class Model():
    """Word-weight model over review sentences, trained with SGD.

    Holds one ``theta`` matrix (word x aspect) and one ``phi`` matrix per aspect
    (word x rating value). Each sentence of a review is assigned to an aspect and
    the negative log of the summed weights of its words is minimized.

    Args:
        dataset: a dataset exposing ``vocab``, ``aspects``, ``aspect_count`` and
            ``aspect_max`` and yielding ``(sentences, ratings)`` items.
        lr: SGD learning rate.
        weight_decay: SGD weight decay.
    """

    def __init__(self, dataset, lr=0.00000000000001, weight_decay=0.001):
        self.ds = dataset
        self._lr = lr
        self._weight_decay = weight_decay
        self.init_weights()
        self._optim = self._make_optimizer()

    def _make_optimizer(self):
        """Create an SGD optimizer over the current ``theta`` and ``phis`` tensors."""
        return torch.optim.SGD(
            params=(
                self.theta,
                *self.phis
            ),
            lr=self._lr,
            weight_decay=self._weight_decay
        )

    def init_weights(self):
        """Randomly initialize ``theta`` and ``phis`` on ``dev`` and enable gradients."""
        word_count = len(self.ds.vocab.get_itos())
        self.theta = torch.rand((word_count, self.ds.aspect_count)).to(dev)
        # scale to [0.0, 0.9], as we enforce this weight to 1.0 for some words later on
        self.theta = self.theta * 0.9
        # enforce 1 initialization on aspect name (page 4)
        # NOTE(review): this sets the whole row (every aspect) of each aspect-name word to 1;
        # confirm whether only the word's own aspect column was intended. An aspect name that
        # is missing from the vocabulary resolves to the vocabulary's default index - verify.
        aspect_ids = self.ds.vocab.lookup_indices(self.ds.aspects)
        self.theta[aspect_ids, :] = 1
        self.theta.requires_grad_()

        # introduce separate phi for each aspect
        self.phis = [torch.rand((word_count, self.ds.aspect_max[i])).to(dev) for i in range(self.ds.aspect_count)]
        # normalize that sum across all words is 1 for a given aspect (eq. 7)
        self.phis = [phi / phi.sum(dim=0) for phi in self.phis]
        for phi in self.phis: phi.requires_grad_()

    def rev_words_thetas(self, rev_sens_ids):
        """Return, per sentence, the ``theta`` rows of its words.

        Args:
            rev_sens_ids: sentences of one review, each a sequence of word ids.

        Returns:
            A list with one (words in sentence x aspect count) tensor per sentence.
        """
        return [self.theta[sen_ids] for sen_ids in rev_sens_ids]

    def rev_words_phis(self, rev_sens_ids):
        """Return, per sentence and per aspect, the ``phi`` rows of its words.

        Args:
            rev_sens_ids: sentences of one review, each a sequence of word ids.

        Returns:
            A list (one entry per sentence) of lists (one entry per aspect) of
            (words in sentence x rating values of that aspect) tensors.
        """
        return [[self.phis[aspect_idx][sen_ids, :] for aspect_idx in range(self.ds.aspect_count)] for sen_ids in rev_sens_ids]

    def dump_weights(self, dest_dir='/drive/MyDrive/Colab Notebooks/1e100ibu/saves/'):
        """Save ``theta`` and ``phis`` as ``<dest_dir><timestamp>-theta`` / ``-phis``.

        Both files share one timestamp so that they can be loaded as a pair.

        Returns:
            The common path prefix, usable as ``src_path`` of ``load_weights``.
        """
        prefix = f'{dest_dir}{datetime.datetime.now()}'
        torch.save(self.theta, f'{prefix}-theta')
        torch.save(self.phis,  f'{prefix}-phis')
        return prefix

    def load_weights(self, src_path):
        """Load ``theta`` and ``phis`` from ``<src_path>-theta`` / ``<src_path>-phis``.

        The optimizer is re-created afterwards, otherwise it would keep updating
        the tensors that were just replaced.
        """
        self.theta = torch.load(f'{src_path}-theta')
        self.phis  = torch.load(f'{src_path}-phis')
        self.theta.requires_grad_()
        for phi in self.phis: phi.requires_grad_()
        self._optim = self._make_optimizer()

    def _linear_assignement(self, costs):
        """Solve the assignment problem on ``costs``, maximizing the total score.

        Returns:
            ``(row_ind, col_ind)`` as returned by ``scipy.optimize.linear_sum_assignment``.
        """
        return linear_sum_assignment(costs, maximize=True)

    def aspecs_assignments(self):
        """Placeholder, not implemented."""
        pass

    def train(self, epoch_count=1):
        """Train on 80% of the dataset (fixed split seed), in batches of 5 reviews.

        For every review the sentences are scored against all aspects, each
        sentence gets an aspect (argmax, overridden by the linear assignment
        where that assigns one), and the negative log-likelihood of the
        sentences is minimized. Weights are dumped every 100 batches.
        A keyboard interrupt stops training without raising.

        Args:
            epoch_count: number of passes over the training split.
        """
        train_size = int(0.8 * len(self.ds))
        # train_size = 1000 # use latter; this is for debuggin' only
        test_size = len(self.ds) - train_size

        self.train_ds, self.test_ds = random_split(self.ds, [train_size, test_size], generator=torch.Generator().manual_seed(42)) # let's fix RNG seed for now

        batch_size = 5
        batch_total = -(-len(self.train_ds) // batch_size) # ceil division: the last batch may be partial
        device = self.theta.device
        aspect_range = range(self.ds.aspect_count)

        try:
            for epoch in range(epoch_count):
                ic(epoch)
                for i, batch in enumerate(tqdm(grouper(self.train_ds, batch_size), total=batch_total)):
                    sentence_ll_losses = []
                    for sample in batch:
                        if sample is None: # grouper pads the last, incomplete batch with None
                            continue
                        rev_sents_ids, review_aspects_scores = sample
                        # sentences without any token would contribute log(0) = -inf to the loss
                        rev_sents_ids = [sen_ids for sen_ids in rev_sents_ids if len(sen_ids) > 0]
                        if not rev_sents_ids:
                            continue

                        rev_thetas = self.rev_words_thetas(rev_sents_ids)
                        rev_phis   = self.rev_words_phis(rev_sents_ids)

                        # scores are only used to pick aspects, so no gradient is needed here
                        with torch.no_grad():
                            res_sents_scores = torch.stack(
                                [
                                rev_thetas[j].sum(dim=0) + torch.stack([rev_phis[j][a][:, review_aspects_scores[a]].sum() for a in aspect_range]) # 1 x aspect count
                                for j in range(len(rev_sents_ids))
                                ],
                            ) # sent count x aspect count

                        # (most likely) aspect assignment (5): argmax per sentence, overridden
                        # for the sentences that the linear assignment matched to an aspect
                        sents_aspect_preds = torch.argmax(res_sents_scores, dim=1)
                        row_ind, col_ind = self._linear_assignement(costs=res_sents_scores.cpu().numpy())
                        sents_aspect_preds[row_ind] = torch.from_numpy(col_ind).to(device)

                        for j, aspect_pred in enumerate(sents_aspect_preds.tolist()):
                            # sentence likelihood (6)
                            aspect_rating = review_aspects_scores[aspect_pred]
                            theta_score_ll = rev_thetas[j][:, aspect_pred].sum()
                            phi_score_ll = rev_phis[j][aspect_pred][:, aspect_rating].sum()
                            sentence_ll_losses.append(-torch.log(theta_score_ll + phi_score_ll))

                    if not sentence_ll_losses: # nothing usable in this batch
                        continue

                    ll_loss = torch.stack(sentence_ll_losses).sum()
                    self._optim.zero_grad()
                    ll_loss.backward()
                    self._optim.step()

                    if 0 == i % 100:
                        ic(i)
                        ic(ll_loss)
                        self.dump_weights()

        except KeyboardInterrupt:
            print('Interrupted.')


In [None]:
# Build the model on the loaded dataset; this initializes the weights and the optimizer.
model = Model(rb)

In [None]:
# Run training with the default of one epoch; weights are dumped periodically during training.
model.train()