<a href="https://colab.research.google.com/github/madziejm/1e100-ibu/blob/master/1e100ibu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preliminary

#### Dependencies

In [None]:
import torch  # imported here so this cell also runs on a fresh kernel (the main import cell comes later)

# Pick the compute device. `is_available` has to be *called*: the bare function
# object is always truthy, which made this expression evaluate to 'cuda' even on
# CPU-only runtimes.
dev = 'cuda' if torch.cuda.is_available() else 'cpu' # not used now TODO
# dev = 'cpu'
print(f'dev = {dev}')

dev = cuda


In [None]:
!pip install --quiet icecream
from icecream import ic

## Dataset representation

In [None]:
!pip install 'spacy<3.3.0,>=3.2.0' --quiet
!python -m spacy download en_core_web_sm

In [None]:
!pip show spacy | egrep Version
# we want SpaCy 3

Version: 3.2.1


#### review example

In [None]:
#     """
#     beer/name: John Harvards Simcoe IPA
#     beer/beerId: 63836
#     beer/brewerId: 8481
#     beer/ABV: 5.4
#     beer/style: India Pale Ale &#40;IPA&#41;
#     review/appearance: 4/5
#     review/aroma: 6/10
#     review/palate: 3/5
#     review/taste: 6/10
#     review/overall: 13/20
#     review/time: 1157587200
#     review/profileName: hopdog
#     review/text: On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.
#     """

In [None]:
import torch
from tqdm.notebook import trange, tqdm
from torchtext.vocab import build_vocab_from_iterator
import io
import spacy # nlp toolkit
import gc # garbage collector interface
import re

class RateBeerReviews(torch.utils.data.Dataset):
    """RateBeer reviews exposed as ``(sentences, ratings)`` items.

    After construction every item is a tuple ``(sentences, ratings)`` where
    ``sentences`` is a tuple of lists of vocabulary ids (one list per sentence)
    and ``ratings`` is a tuple holding one integer per entry of ``self.aspects``.

    Args:
        filepath: path of the UTF-8 encoded dataset text file.
        reviews_max: stop reading once this many reviews have been collected
            (``float('inf')`` reads the whole file).
    """

    def __init__(self, filepath='/content/SNAP-Ratebeer.txt', reviews_max=float('inf')):
        self.aspects = ['appearance', 'aroma', 'palate', 'taste', 'overall']
        self.aspect_count = len(self.aspects)
        # number of distinct rating values per aspect (0..max inclusive)
        self.aspect_max = [5 + 1, 10 + 1, 5 + 1, 10 + 1, 20 + 1]
        self._aspect_ratings = [ [] for _ in self.aspects ]
        self._texts = []
        self.unkn_tok = "<unk>" # unknown/out of vocabulary token
        self._len = 0
        self._fetch_data(filepath, reviews_max)
        self._post_process()

    @staticmethod
    def _clean_text(text):
        """Return the review text without the "UPDATED: <date>" prefix, trailing newline and tildes."""
        if text.startswith('UPDATED:'):
            # fixed-width cut, sized after a sample prefix such as "UPDATED: APR 29, 2008"
            text = text[len("UPDATED: APR 29, 2008"):] # drop prefix
        # remove trailing newline and tildes that can be found in data for some reason
        return text.rstrip('\n').replace('~', ' ')

    def _fetch_data(self, filepath, reviews_max):
        """Read ratings and raw review texts from ``filepath``.

        Fills ``self._aspect_ratings`` (one list per aspect), ``self._texts`` and
        ``self._len``. Reviews are separated by blank lines; reading stops at the
        first separator at which at least ``reviews_max`` reviews were collected.
        """
        # (original note, translated: "something was probably supposed to be here")
        text_prefix = 'review/text: '
        # one line prefix per aspect, in the same order as self._aspect_ratings
        rating_prefixes = [f'review/{aspect}: ' for aspect in self.aspects]
        with io.open(filepath, encoding='utf-8') as f:
            # 40938282 is presumably the line count of the full dataset file; it only drives the progress bar
            for line in tqdm(f, total=(40938282 if reviews_max == float('inf') else reviews_max * 14), desc='Reading data'):
                if line == '\n': # separator
                    # Count reviews that were actually parsed instead of blank lines, so
                    # doubled separators cannot make len() exceed the stored data.
                    self._len = len(self._texts)
                    if reviews_max <= self._len:
                        break
                    continue
                if line.startswith(text_prefix):
                    self._texts.append(self._clean_text(line[len(text_prefix):]))
                    continue
                for ratings, prefix in zip(self._aspect_ratings, rating_prefixes):
                    if line.startswith(prefix):
                        # ratings look like "13/20"; keep the numerator
                        ratings.append(int(line[len(prefix):].split('/')[0]))
                        break
        # also covers a final review that is not followed by a separator line
        self._len = len(self._texts)

    def _post_process(self):
        """Tokenize the raw texts, build the vocabulary and map tokens to ids.

        Replaces ``self._texts`` (list of strings) with a list of tuples of
        id lists and sets ``self.vocab``.
        """
        nlp = spacy.util.get_lang_class('en')()
        nlp.add_pipe("sentencizer", config={"punct_chars": ['.', '?', '!']})
        # the stop-word sets are updated in place on the language defaults
        nlp.Defaults.stop_words |= { '-', '+'}
        nlp.Defaults.stop_words -= {'mostly', 'whole', 'indeed', 'quite', 'ever', 'nothing', 'perhaps', 'not', 'no', 'only', 'well', 'really', 'except'}
        print("Spacy pipe (tokenization&sentence split)..")
        gc.collect() # force garbage collection
        # per document: tuple of sentences; per sentence: lower-cased tokens that are
        # not stop words / punctuation / whitespace and are longer than 2 characters
        self._texts = [tuple(list(tok.lower_ for tok in sent if not tok.is_stop and not tok.is_punct and not tok.is_space and len(tok) > 2) for sent in doc.sents) for doc in nlp.pipe(self._texts)]
        print("Building vocab (word-id mapping)..")
        gc.collect(generation=0) # force garbage collection
        gc.collect(generation=1) # force garbage collection
        gc.collect(generation=2) # force garbage collection
        self.vocab = build_vocab_from_iterator((sent for text in self._texts for sent in text), specials=[self.unkn_tok], min_freq=5)
        # tokens missing from the vocabulary map to the <unk> id
        self.vocab.set_default_index(self.vocab[self.unkn_tok])
        print("Mapping words to ids..")
        gc.collect() # force garbage collection
        self._texts = [tuple(self.vocab.lookup_indices(sent) for sent in text) for text in self._texts]
        gc.collect() # force garbage collection

    def __getitem__(self, i):
        """Return ``(sentences, ratings)`` of review ``i``."""
        sentences = self._texts[i]
        ratings = tuple(self._aspect_ratings[a][i] for a in range(self.aspect_count))
        return (sentences, ratings)

    def __len__(self):
        """Return the number of reviews that were read."""
        return self._len

To build the dataset from the raw dataset file, set `FETCH_RATEBEER` to `true` in the cell below and `RECREATE_PICKLE` to `True` in the cell after it. If you leave them untouched, the dataset will be read from a serialized `RateBeerReviews` object instead of being parsed from the text file.

In [None]:
%%bash

# Set FETCH_RATEBEER=true to download the raw dataset; otherwise the pre-built pickle is downloaded.
export FETCH_RATEBEER=false
# Must be assigned before it is tested below: with the variable unset, the unquoted
# `[ -e $RATEBEER_FILE ]` collapses to `[ -e ]`, which is always true.
export RATEBEER_FILE='/content/SNAP-Ratebeer.txt'
if [ "$FETCH_RATEBEER" = true ]
then # original dataset
    if [ ! -e "$RATEBEER_FILE" ]
    then
        gdown --id '12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T' # https://drive.google.com/file/d/12tEEYQcHZtg5aWyfIiWWVIDAJNT-5d_T/view?usp=sharing
        # re-encode exactly once, right after the download, so re-running the cell
        # does not convert an already converted file a second time
        iconv -f ISO-8859-1 -t UTF-8 "$RATEBEER_FILE" -o "${RATEBEER_FILE}.new" && mv "${RATEBEER_FILE}.new" "$RATEBEER_FILE"
    fi
    echo "Dataset head (trailing newline makes entry end): "
    head -n 16 "$RATEBEER_FILE"
else # pickle
    gdown --id '1ebDMDlOxtFh8B5i8lajR7q3kq-0hM02j' # https://drive.google.com/file/d/1ebDMDlOxtFh8B5i8lajR7q3kq-0hM02j/view?usp=sharing
fi

Downloading...
From: https://drive.google.com/uc?id=1ebDMDlOxtFh8B5i8lajR7q3kq-0hM02j
To: /content/ratebeer.pickle
  0%|          | 0.00/320M [00:00<?, ?B/s]  1%|1         | 4.72M/320M [00:00<00:06, 47.0MB/s]  8%|8         | 25.7M/320M [00:00<00:04, 60.5MB/s] 11%|#         | 34.1M/320M [00:00<00:04, 59.4MB/s] 19%|#8        | 59.8M/320M [00:00<00:03, 77.1MB/s] 22%|##2       | 71.8M/320M [00:00<00:03, 72.1MB/s] 29%|##9       | 93.3M/320M [00:00<00:02, 89.8MB/s] 33%|###3      | 107M/320M [00:00<00:02, 83.0MB/s]  41%|####      | 131M/320M [00:01<00:01, 103MB/s]  46%|####5     | 147M/320M [00:01<00:01, 95.2MB/s] 53%|#####2    | 168M/320M [00:01<00:01, 103MB/s]  57%|#####6    | 181M/320M [00:01<00:01, 86.0MB/s] 60%|######    | 192M/320M [00:01<00:02, 63.0MB/s] 63%|######2   | 201M/320M [00:02<00:01, 59.4MB/s] 65%|######5   | 209M/320M [00:02<00:01, 57.4MB/s] 68%|######7   | 217M/320M [00:02<00:01, 59.3MB/s] 70%|#######   | 224M/320M [00:02<00:01, 59.2MB/s] 72%|#######2  | 2

In [None]:
import pickle # serialize lib
from google.colab import drive

drive.mount('/drive')

DATASET_PICKLE='/content/ratebeer.pickle'
RECREATE_PICKLE = False  # True: parse the raw dataset file and overwrite the pickle

if RECREATE_PICKLE:
    # Build the dataset *before* opening the output file: opening with 'wb'
    # truncates it, so a failure while parsing would otherwise destroy an
    # existing pickle and leave an empty file behind.
    rb = RateBeerReviews()
    print('Dumping..')
    with open(DATASET_PICKLE, 'wb') as f:
        pickle.dump(rb, f)
else:
    # NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
    with open(DATASET_PICKLE, 'rb') as f:
        rb = pickle.load(f)

In [None]:
# vocabulary size on the first line, the last 1000 vocabulary entries on the second
print(len(rb.vocab.get_itos()), rb.vocab.get_itos()[-1000:], sep='\n')

111047
['tonto', 'too.<font', 'too^co2', 'tood', 'tooled', 'tooooooo', 'top-3', 'top.4', 'top3', 'topfermenting', 'toppd', 'topsail', 'torani', 'tord', 'torments', 'tornadoes', 'torok', 'toronto2010', 'toronto2011', 'torpedoes', 'torrada', 'torre', 'tortue', 'tosaty', 'tosay', 'tossin', 'tosta', 'tostados', 'totall', 'toughed', 'toughie', 'touhc', 'toungh', 'tourette', 'tournai', 'tournure', 'towner', 'towsend', 'tpa', 'tpa975', 'tpd', 'tpe', 'tpll', 'tpo', 'tracce', 'tracis', 'tradicionais', 'traditinal', 'traditiona', 'traight', 'trailblazer', 'trainstation', 'traipse', 'traitements', 'traitre', 'trajo', 'tramping', 'tranformed', 'tranforms', 'tranquillit', 'tranquillo', 'transaprent', 'transcendance', 'transcontinental', 'transfusion', 'translucense', 'translucient', 'trapani', 'trapissed', 'trappest', 'trappistl', 'trauben', 'travaill', 'trbung', 'treason', 'treaty', 'trembles', 'treo', 'trespass', 'trickster', 'trided', 'trinkable', 'tripelbock', 'tripels-', 'tripely', 'tripical',

### Training (implementation of $(1)$)

In [None]:
from torch.utils.data import random_split
import datetime

class Model():
    """Aspect/rating word-weight model trained with plain per-sentence SGD.

    Attributes:
        ds: the dataset; must provide ``vocab``, ``aspects``, ``aspect_count``
            and yield ``(sentences, ratings)`` items.
        theta: tensor of shape (word count, aspect count) with word-aspect weights.
        phis: list with one tensor per aspect, each of shape
            (word count, number of rating values of that aspect).
    """

    def __init__(self, dataset):
        self.ds = dataset
        self.init_weights()

    def init_weights(self):
        """Randomly initialise ``theta`` and ``phis`` as trainable leaf tensors."""
        word_count = len(self.ds.vocab.get_itos())
        self.theta = torch.rand((word_count, self.ds.aspect_count), requires_grad=True)
        # if dev == 'cuda': # TODO
        #     theta = theta
        with torch.no_grad():
            self.theta *= 0.9
            # enforce 1 initialization on aspect name (page 4)
            aspect_ids = self.ds.vocab.lookup_indices(self.ds.aspects)
            self.theta[aspect_ids, :] = 1
        self.theta.grad = torch.zeros_like(self.theta)

        aspect_rating_count = [6, 11, 6, 11, 21]

        # introduce separate phi for each aspect
        self.phis = []
        for aspect_idx in range(self.ds.aspect_count):
            phi = torch.rand((word_count, aspect_rating_count[aspect_idx]))
            # normalize that sum across all words is 1 for a given aspect (eq. 7).
            # This is done before gradients are enabled so that phi stays a *leaf*
            # tensor; normalising a tensor that already requires grad yields a
            # non-leaf whose .grad is never filled by backward().
            phi /= phi.sum(dim=0)
            phi.requires_grad_()
            phi.grad = torch.zeros_like(phi)
            self.phis.append(phi)

        # if dev == 'cuda': # TODO
        #     phis = [phi.cuda() for phi in phis]

    def sentence_aspects_likelihood_theta(self, sen_ids):
        """Return the ``theta`` rows of the given word ids."""
        return self.theta[sen_ids]

    def sentence_aspects_likelihood_phi(self, sen_ids):
        """Return, for every aspect, the ``phi`` rows of the given word ids."""
        phi_scores = [self.phis[aspect_idx][sen_ids, :] for aspect_idx in range(self.ds.aspect_count)]
        return phi_scores

    def dump_weights(self, dest_dir='/drive/MyDrive/Colab Notebooks/1e100ibu/saves/'):
        """Save ``theta`` and ``phis`` to ``<dest_dir><timestamp>-theta`` / ``-phis``.

        NOTE: earlier revisions wrote the two objects to each other's file and
        used two different timestamps; dumps produced by them are swapped.
        """
        # one timestamp for both files so that they share a common prefix
        prefix = f'{dest_dir}{datetime.datetime.now()}'
        torch.save(self.theta, f'{prefix}-theta')
        torch.save(self.phis,  f'{prefix}-phis')

    def load_weights(self, src_path):
        """Load weights written by ``dump_weights``.

        Args:
            src_path: common file prefix, i.e. the path without the
                ``-theta`` / ``-phis`` suffix.
        """
        self.theta = torch.load(f'{src_path}-theta')
        self.phis  = torch.load(f'{src_path}-phis')
        # make sure the loaded tensors can be trained further
        self.theta.requires_grad_()
        for phi in self.phis:
            phi.requires_grad_()

    # def sentence_likelihood(self, sen_ids): # TODO remove?
    #   pass

    def _train_step(self, sen_ids, review_aspects_scores, l2_weight):
        """Compute the loss of one sentence and back-propagate it.

        Returns the tuple ``(ll, l2_regularization_loss, loss)``.
        """
        theta_scores = self.sentence_aspects_likelihood_theta(sen_ids)
        # the sentence is assigned to the aspect with the highest summed theta score
        aspect_pred = int(torch.argmax(torch.nn.functional.softmax(theta_scores.sum(0), dim=-1)).item())
        aspect_rating = review_aspects_scores[aspect_pred]
        phi_score = self.sentence_aspects_likelihood_phi(sen_ids)[aspect_pred][:, aspect_rating]

        ll = torch.log(theta_scores[:, aspect_pred] + phi_score).sum()

        l2_regularization_loss = torch.mean(torch.square(self.theta))
        for phi in self.phis:
            l2_regularization_loss = l2_regularization_loss + torch.mean(torch.square(phi))

        loss = -ll + l2_regularization_loss * l2_weight
        loss.backward()
        return ll, l2_regularization_loss, loss

    def _apply_gradients(self, lr):
        """Take one SGD step on ``theta`` and every ``phi`` and reset the gradients."""
        with torch.no_grad():
            self.theta -= lr * self.theta.grad
            self.theta.grad.zero_()
            for phi in self.phis:
                if phi.grad is not None:
                    # in-place update keeps phi a leaf that still requires grad
                    phi -= lr * phi.grad
                    phi.grad.zero_()

    def train(self, epoch_count=1, lr=0.00001, l2_weight=0.1, log_every=10000):
        """Train on a fixed 80/20 split of the dataset.

        Args:
            epoch_count: number of passes over the training split.
            lr: SGD step size.
            l2_weight: factor applied to the L2 regularisation term.
            log_every: log the losses and dump the weights every this many reviews.

        Sets ``self.train_ds`` and ``self.test_ds``. A ``KeyboardInterrupt``
        stops the training loop without raising.
        """
        train_size = int(0.8 * len(self.ds))
        # train_size = 1000 # use latter; this is for debuggin' only
        test_size = len(self.ds) - train_size

        self.train_ds, self.test_ds = random_split(self.ds, [train_size, test_size], generator=torch.Generator().manual_seed(42)) # let's fix RNG seed for now

        try:
            for epoch in range(epoch_count):
                ic(epoch)
                for i, (review_sentences, review_aspects_scores) in enumerate(tqdm(self.train_ds)):
                    for sentence_idx, sen_ids in enumerate(review_sentences):
                        ll, l2_regularization_loss, loss = self._train_step(sen_ids, review_aspects_scores, l2_weight)
                        # log/dump once per review, not once per sentence of that review
                        if 0 == i % log_every and 0 == sentence_idx:
                            ic(ll)
                            ic(l2_regularization_loss)
                            ic(loss)
                            print()
                            self.dump_weights()
                        self._apply_gradients(lr)
        except KeyboardInterrupt:
            # manual interruption is the intended way to stop a long run early
            print('Training interrupted.')

    # def sentence_aspects_likelihood(sen_ids): # TODO not needed?
    #     theta_score = sentence_aspects_likelihood_theta(sen_ids)
    #     phi_scores = sentence_aspects_likelihood_phi(sen_ids)
    #     ic(theta_score, phi_scores)
        # score = torch.exp( + sentence_aspects_likelihood_phi(sen_ids, ratings))
        # return score / score.sum()


In [None]:
# build the model on the loaded dataset and run a single training epoch
model = Model(dataset=rb)
model.train(epoch_count=1)

In [None]:
# Number of word ids, i.e. the length of the dataset vocabulary's id-to-token list.
# ic() echoes the expression together with its value.
ic(len(rb.vocab.get_itos()))

ic| len(rb.vocab.get_itos()): 111047


111047