<a href="https://colab.research.google.com/github/madziejm/1e100-ibu/blob/master/1e100ibu_embedding_stars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preliminary

#### Dependencies

In [None]:
import torch
dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'dev = {dev}')

In [None]:
!pip install icecream torchtext sentence_transformers faiss-cpu faiss-gpu fasttext --quiet
!pip install spacy --upgrade --quiet
!python3 -m spacy download pl_core_news_md --quiet
!python3 -m spacy download en_core_web_sm

In [None]:
import gc # garbage collector interface
import io
import re
import spacy # nlp toolkit
import torch
import pickle
from collections import Counter
from torchtext._torchtext import (Vocab as VocabPybind) # make use of some hidden interface
from torchtext.vocab import Vocab, build_vocab_from_iterator
from tqdm.notebook import trange, tqdm
from icecream import ic


class BaseReviews(torch.utils.data.Dataset):
    def __init__(self, aspects, aspect_max, aspect_ratings, texts, unkn_tok, _len, anchor_words):
        self.aspects = aspects
        self.aspect_count = len(aspects)
        self.aspect_max = aspect_max
        self._aspect_ratings = aspect_ratings
        self._texts = texts
        self.unkn_tok = unkn_tok
        self._len = _len
        self.anchor_words = anchor_words
        self.vocab = None

    def dump(self, dest_path, filename):
        contents = {
            'aspects'        : self.aspects,
            'aspect_max'     : self.aspect_max,
            '_aspect_ratings': self._aspect_ratings,
            '_texts'         : self._texts,
            'unkn_tok'       : self.unkn_tok,
            '_len'           : self._len,
            'anchor_words'   : self.anchor_words,
            'vocab'          : self.vocab,
        }
        with open(f'{dest_path}/{filename}', 'wb') as f:
            pickle.dump(contents, f)
    
    def load(self, dest_path, filename):
        with open(f'{dest_path}/{filename}', 'rb') as f:
            contents = pickle.load(f)
            self.aspects        = contents['aspects']
            self.aspect_max     = contents['aspect_max']
            self._aspect_ratings = contents['_aspect_ratings']
            self._texts          = contents['_texts']
            self.unkn_tok       = contents['unkn_tok']
            self._len           = contents['_len']
            self.anchor_words   = contents['anchor_words']
            self.vocab          = contents['vocab']

    def __getitem__(self, i):
        # # 1 # python
        # sentences = tuple(sent for sent in self._texts[i])
        # ratings = tuple(self._aspect_ratings[a][i] for a in range(self.aspect_count))
        # 2 # tensor
        sentences = tuple(torch.LongTensor(sent) for sent in self._texts[i])
        ratings = torch.LongTensor(tuple(self._aspect_ratings[a][i] for a in range(self.aspect_count)))
        # # 3 # dev
        # sentences = tuple(torch.tensor(sent) for sent in self._texts[i])
        # ratings = torch.tensor(tuple(self._aspect_ratings[a][i] for a in range(self.aspect_count)))
        return (sentences, ratings)

    def __len__(self):
        return self._len

In [None]:
try: # mount user's Google Drive if on Colab to save training artifacts
    from google.colab import drive
    drive.mount('/drive')
    ROOT_DIR = '/content/'
    MODEL_ROOT_DIR = '/drive/MyDrive/1e100ibu/saves'
except ImportError:
    ROOT_DIR = './'
    MODEL_ROOT_DIR = './saves/'

Mounted at /drive


## ocen-piwo.pl representation

### SBERT

Class that maps ocen-piwo.pl reviews to sentence embeddings using [Sentence-BERT](https://arxiv.org/abs/1908.10084) from [sentence-transformers](https://www.sbert.net/).

Model used: `distiluse-base-multilingual-cased-v1` ("multilingual knowledge distilled version of multilingual Universal Sentence Encoder. Supports 15 languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish").
This model is common for both the datasets.

The sentence embeddings are then averaged or maxpooled depending on constructor parameter.


In [None]:
from collections import Counter
from torchtext._torchtext import (Vocab as VocabPybind) # make use of some hidden interface
from torchtext.vocab import Vocab, build_vocab_from_iterator
from tqdm.notebook import trange, tqdm
import gc # garbage collector interface
import io
import re
import spacy # nlp toolkit
import torch
import json
from tqdm.contrib.concurrent import thread_map
from typing import List
from sentence_transformers import SentenceTransformer, util

class OcenPiwoSBERTReviews(BaseReviews):
    def __init__(self, vec_agg='avg'):
        aspects = ['ogólny', 'smak', 'zapach', 'wygląd',]
        super().__init__(
            aspects        = aspects,
            aspect_max     = [10, 10, 10, 10],
            aspect_ratings = [ [] for _ in aspects ],
            texts          = [],
            unkn_tok       = '<unk>', # unknown/out of vocabulary token
            _len            = 0,
            anchor_words = {
                'ogólny'     : ('ogólnie'),
                'smak'       : ('smak'),
                'zapach'     : ('zapach'),
                'wygląd'     : ('wygląd', 'wygląda')
            },
        )
        self.pipe = None
        self.model = None
        self._vec_agg = vec_agg

    def dump(self, dest_path, filename): # override to use torch instead of pickle
        contents = {
            'aspects'        : self.aspects,
            'aspect_max'     : self.aspect_max,
            '_aspect_ratings': self._aspect_ratings,
            '_texts'         : self._texts,
            'unkn_tok'       : self.unkn_tok,
            '_len'           : self._len,
            'anchor_words'   : self.anchor_words,
            'vocab'          : self.vocab,
            'vec_agg'        : self._vec_agg
        }
        with open(f'{dest_path}/{filename}', 'wb') as f:
            torch.save(contents, f)
    
    def load(self, dest_path, filename): # override to use torch instead of pickle; additionaly save whether to apply avg or maxpool
        with open(f'{dest_path}/{filename}', 'rb') as f:
            contents = torch.load(f, map_location=torch.device(dev))
            self.aspects         = contents['aspects']
            self.aspect_max      = contents['aspect_max']
            self._aspect_ratings = contents['_aspect_ratings']
            self._texts          = contents['_texts']
            self.unkn_tok        = contents['unkn_tok']
            self._len            = contents['_len']
            self.anchor_words    = contents['anchor_words']
            self.vocab           = contents['vocab']
            self._vec_agg        = contents['vec_agg']

    def build(self, filepath=f'{ROOT_DIR}/ocen-piwo-utf8.json'):
        with io.open(filepath, encoding='utf-8') as f:
            json_dict = json.loads(f.read())

            for i, reviews in enumerate(json_dict.values()):
                for sentences, ratings in reviews:
                    self._len += 1
                    if self._len > 10: # TODO remove
                        break

                    for aspect in range(self.aspect_count):
                        self._aspect_ratings[aspect].append(int(ratings[aspect]))

                    self._texts.append(sentences)
        self._post_process()

    def _fetch_nlp_pipeline(self):
        if not self.pipe:
            nlp = spacy.load('pl_core_news_md')
            # we want sentencizer only, as tokenization is part of Transformer model we'll use
            for pipe_name in nlp.pipe_names:
                # if pipe_name != 'sentencizer':
                nlp.remove_pipe(pipe_name)
            nlp.add_pipe("sentencizer", config={"punct_chars": ['.', '?', '!']})
            self.pipe = nlp.pipe
    
    def _free_nlp_pipeline(self):
        self.pipe = None
    
    def _fetch_transformer_model(self):
        # self.model = model # TODO
        if not self.model:
            self.model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
    
    def _free_transformer_model(self):
        self.model = None

    def __getitem__(self, i):
        # # 1 # python
        # sentences = tuple(sent for sent in self._texts[i])
        # ratings = tuple(self._aspect_ratings[a][i] for a in range(self.aspect_count))
        # 2 # tensor
        sentences = self._texts[i]
        ratings = torch.LongTensor(tuple(int(self._aspect_ratings[a][i]) for a in range(self.aspect_count)))
        # # 3 # dev
        # sentences = tuple(torch.tensor(sent) for sent in self._texts[i])
        # ratings = torch.tensor(tuple(self._aspect_ratings[a][i] for a in range(self.aspect_count)))
        return (sentences, ratings)


    def text_embedding(self, text: str):
        self._fetch_transformer_model()
        self._fetch_nlp_pipeline()
        if 'avg' == self._vec_agg:
            return self.model.encode([sent.text for sent in next(self.pipe([text])).sents], convert_to_tensor=True).mean(dim=0)
        elif 'maxpool' == self._vec_agg:
            return self.model.encode([sent.text for sent in next(self.pipe([text])).sents], convert_to_tensor=True).max(dim=0)[0]
        else:
            assert False

    def sentences_avg_embedding(self, sentences: List[str]):
        return self.model.encode(sentences, convert_to_tensor=True).mean(dim=0)

    def sentences_maxpool_embedding(self, sentences: List[str]):
        return self.model.encode(sentences, convert_to_tensor=True).max(dim=0)[0]
    
    def closest_indices(self, text: str, top_k=20):
        text_emb = self.text_embedding(text)[None, :]
        result = util.semantic_search(query_embeddings=text_emb, corpus_embeddings=self._texts, top_k=top_k)
        return [(d['corpus_id'], d['score']) for d in result[0]]
    
    def knn_predict_rating(self, text: str, top_k=20):
        knn = self.closest_indices(text, top_k)
        indices, weights = [list(t) for t in zip(*knn)]
        weights = torch.tensor(weights)
        weights = weights / weights.sum()
        nearest_ratings = torch.stack(tuple(self[idx][1] for idx in indices)).to(torch.float)
        nearest_ratings *= weights[:, None]
        nearest_ratings = nearest_ratings.sum(dim=0)
        nearest_ratings.round_()
        return nearest_ratings
    
    def _post_process(self):
        print("Spacy pipe (sentence split)..")
        gc.collect() # force garbage collection
        self._fetch_nlp_pipeline()
        self._texts = [[sent.text for sent in doc.sents] for doc in self.pipe(self._texts)]
        for i, text in enumerate(self._texts):
            assert 0 != len(text) # make sure no empty reviews again (new could be introduced by removing stop words unfortunately)
        print("Mapping reviews to embeddings..")
        gc.collect() # force garbage collection
        self._fetch_transformer_model()
        if 'avg' == self._vec_agg:
            self._texts = torch.stack(thread_map(self.sentences_avg_embedding, self._texts, max_workers=1))
        elif 'maxpool' == self._vec_agg:
            self._texts = torch.stack(thread_map(self.sentences_maxpool_embedding, self._texts))
        else:
            assert False

In [None]:
op = OcenPiwoSBERTReviews(vec_agg='avg')
# op.pipe = pipe # kinda cache by using old pipe and model when prototyping using notebook
# op.model = model # same as above
op.load(dest_path=MODEL_ROOT_DIR, filename='ocen-piwo-avg-sbert-vecs.pt')
# op.build(filepath='scrap-ocen-piwo/ocen-piwo-utf8.json')
# op.dump(dest_path=MODEL_ROOT_DIR, filename='ocen-piwo-avg-sbert-vecs.pt')
print(op.closest_indices('Jak dla mnie podstawka lepsza.')) # check: this input is 0-th review from the dataset, so hope for 0 be the most similar
print(op.knn_predict_rating('Najgorsze piwo jakie piłem kiedykolwiek'))

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

[(34218, 0.7136959433555603), (39326, 0.686835527420044), (0, 0.6711326837539673), (3137, 0.65353924036026), (44595, 0.6387416124343872), (48338, 0.630121648311615), (47587, 0.5942672491073608), (50915, 0.5930118560791016), (50287, 0.5895595550537109), (15460, 0.5888326168060303), (29684, 0.5726549625396729), (50095, 0.571191132068634), (40925, 0.5640509128570557), (37764, 0.5573294162750244), (23545, 0.5545322895050049), (48275, 0.5462486147880554), (45120, 0.5414141416549683), (51075, 0.5395609140396118), (50259, 0.5389387607574463), (27861, 0.5347295999526978)]
tensor([7., 6., 6., 7.])


In [None]:
op = OcenPiwoSBERTReviews(vec_agg='maxpool')
# op.pipe = pipe # kinda cache by using old pipe and model when prototyping using notebook
# op.model = model # same as above
# op.build('scrap-ocen-piwo/ocen-piwo-utf8.json')
# op.dump(dest_path=MODEL_ROOT_DIR, filename='ocen-piwo-maxpool-sbert-vecs.pt')
op.load(dest_path=MODEL_ROOT_DIR, filename='ocen-piwo-maxpool-sbert-vecs.pt')
print(op.closest_indices('Jak dla mnie podstawka lepsza.')) # check: this input is 0-th review from the dataset, so hope for 0 be the most similar
print(op.knn_predict_rating('Najgorsze piwo jakie piłem kiedykolwiek'))

[(26590, -0.0023189131170511246), (26981, -0.002318914048373699), (30670, -0.002318916842341423), (38535, -0.0023189252242445946), (26830, -0.002318927086889744), (27893, -0.002318927086889744), (27548, -0.002318928949534893), (12812, -0.0023189298808574677), (21838, -0.0023189298808574677), (19831, -0.0023189308121800423), (156, -0.0023189326748251915), (19849, -0.002318933606147766), (32968, -0.002318933606147766), (27527, -0.0023189345374703407), (31864, -0.0023189345374703407), (13263, -0.0023189345374703407), (3333, -0.0023189345374703407), (33909, -0.0023189354687929153), (5588, -0.0023189354687929153), (13984, -0.0023189354687929153)]
tensor([7., 7., 7., 7.])


### FastText
Class that maps ocen-piwo.pl reviews to [FastText](https://fasttext.cc/) word embeddings using trained on Common Crawl: https://fasttext.cc/docs/en/crawl-vectors.html.

File: cc.pl.300.bin.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz

The word embeddings are then averaged or maxpooled depending on constructor parameter value.

In [None]:
import gc # garbage collector interface
import io
import re
import spacy # nlp toolkit
import torch
import json
import fasttext
import fasttext.util
import numpy as np
import faiss
from typing import List
from tqdm.contrib.concurrent import thread_map
from icecream import ic
from collections import Counter
from torchtext._torchtext import (Vocab as VocabPybind) # make use of some hidden interface
from torchtext.vocab import Vocab, build_vocab_from_iterator
from tqdm.notebook import trange, tqdm

class OcenPiwoFasttextEmbeddedReviews(BaseReviews):
    def __init__(self, vec_agg='avg'):
        aspects = ['ogólny', 'smak', 'zapach', 'wygląd',]
        super().__init__(
            aspects        = aspects,
            aspect_max     = [10, 10, 10, 10],
            aspect_ratings = [ [] for _ in aspects ],
            texts          = [],
            unkn_tok       = '<unk>', # unknown/out of vocabulary token
            _len            = 0,
            anchor_words = {
                'ogólny'     : ('ogólnie'),
                'smak'       : ('smak'),
                'zapach'     : ('zapach'),
                'wygląd'     : ('wygląd', 'wygląda')
            },
        )
        self.pipe = None
        self.model = None
        self._vec_agg = vec_agg
        self._index = None

    def dump(self, dest_path, filename): # override to use torch instead of pickle
        contents = {
            'aspects'        : self.aspects,
            'aspect_max'     : self.aspect_max,
            '_aspect_ratings': self._aspect_ratings,
            '_texts'         : self._texts,
            'unkn_tok'       : self.unkn_tok,
            '_len'           : self._len,
            'anchor_words'   : self.anchor_words,
            'vocab'          : self.vocab,
            'vec_agg'        : self._vec_agg
        }
        with open(f'{dest_path}/{filename}', 'wb') as f:
            torch.save(contents, f)
    
    def load(self, dest_path, filename): # override to use torch instead of pickle; additionaly save whether to apply avg or maxpool
        with open(f'{dest_path}/{filename}', 'rb') as f:
            contents = torch.load(f, map_location=torch.device(dev))
            self.aspects         = contents['aspects']
            self.aspect_max      = contents['aspect_max']
            self._aspect_ratings = contents['_aspect_ratings']
            self._texts          = contents['_texts']
            self.unkn_tok        = contents['unkn_tok']
            self._len            = contents['_len']
            self.anchor_words    = contents['anchor_words']
            self.vocab           = contents['vocab']
            self._vec_agg        = contents['vec_agg']

    def build(self, filepath=f'{ROOT_DIR}/ocen-piwo-utf8.json'):
        with io.open(filepath, encoding='utf-8') as f:
            json_dict = json.loads(f.read())

            for i, reviews in enumerate(json_dict.values()):
                for sentences, ratings in reviews:
                    self._len += 1
                    if self._len > 1000: # TODO remove
                        break

                    for aspect in range(self.aspect_count):
                        self._aspect_ratings[aspect].append(int(ratings[aspect]))

                    self._texts.append(sentences)
        self._post_process()

    def _fetch_nlp_pipeline(self):
        if not self.pipe:
            nlp = spacy.load('pl_core_news_md')
            # we want sentencizer only, as tokenization is part of Transformer model we'll use
            # ic(nlp.pipe_names) # 'tok2vec', 'morphologizer', 'parser', 'tagger', 'attribute_ruler', 'lemmatizer', 'ner'
            for pipe_name in nlp.pipe_names:
                if pipe_name not in ['tokenizer', 'lemmatizer']:
                    nlp.remove_pipe(pipe_name)
            nlp.add_pipe("sentencizer", config={"punct_chars": ['.', '?', '!']})
            self.pipe = lambda texts: [[tok.lemma_ for sent in doc.sents for tok in sent if not tok.is_punct and not tok.is_space] for doc in nlp.pipe(texts)] # TODO remove stop words? (not tok.is_stop)
    
    def _free_nlp_pipeline(self):
        self.pipe = None
    
    def _fetch_model(self):
        if not self.model:
            self.model = fasttext.load_model(f'{MODEL_ROOT_DIR}/../vectors/cc.pl.300.bin')
    
    def _free_model(self):
        self.model = None

    def __getitem__(self, i):
        # # 1 # python
        # sentences = tuple(sent for sent in self._texts[i])
        # ratings = tuple(self._aspect_ratings[a][i] for a in range(self.aspect_count))
        # 2 # tensor
        sentences = self._texts[i]
        ratings = torch.LongTensor(tuple(int(self._aspect_ratings[a][i]) for a in range(self.aspect_count)))
        # # 3 # dev
        # sentences = tuple(torch.tensor(sent) for sent in self._texts[i])
        # ratings = torch.tensor(tuple(self._aspect_ratings[a][i] for a in range(self.aspect_count)))
        return (sentences, ratings)


    def text_embedding(self, text: str):
        """tokenize text, and return mean/maxpool of the token vectors """
        self._fetch_nlp_pipeline()
        self._fetch_model()
        words = self.pipe([text])[0] # self.pipe is for list of texts and returns a list; we grab oth list of the result because of this
        vecs = np.vstack([self.model[word] for word in words])
        if 'avg' == self._vec_agg:
            return vecs.mean(axis=0)
        elif 'maxpool' == self._vec_agg:
            return vecs.max(axis=0)
        else:
            assert False

    def words_avg_embedding(self, words: List[str]):
        """ treat words as bag of words, return average of their vectors """
        vecs = np.vstack([self.model[word] for word in words])
        return vecs.mean(axis=0)

    def words_maxpool_embedding(self, words: List[str]):
        """ treat words as bag of words, return max of their vectors (position-wise) """
        vecs = np.vstack([self.model[word] for word in words])
        return vecs.max(axis=0)
    
    def closest_indices(self, text: str, top_k=20):
        self._set_up_texts_index()
        text_emb = self.text_embedding(text)[None, :] # introduce new dimension for vector count (here only 1 vector)
        result = self._index.search(text_emb, top_k)
        # result[0] is list of similiarities list, result[1] is list of list
        # we get [0] which means result for 0th vector passed to search (the single one we passed)
        result = (result[0][0], result[1][0])
        return result
    
    def knn_predict_rating(self, text: str, top_k=10):
        knn = self.closest_indices(text, top_k)
        weights, indices = knn
        weights = np.array(weights)
        weights = weights / weights.sum()
        nearest_ratings = np.vstack(tuple(self[idx][1] for idx in indices)).astype(np.float32)
        nearest_ratings *= weights[:, None]
        nearest_ratings = nearest_ratings.sum(axis=0)
        nearest_ratings = np.rint(nearest_ratings)
        return nearest_ratings
    
    def _set_up_texts_index(self):
        """set up index for vector-encoded corpora texts"""
        if self._index is None:
            if hasattr(faiss, 'StandardGpuResources'):
                # gpu mode
                res = faiss.StandardGpuResources()
                config = faiss.GpuIndexFlatConfig()
                config.device = 0
                self._index = faiss.GpuIndexFlatIP(res, self._texts.shape[1], config)
            else:
                # cpu mode
                self._index = faiss.IndexFlatIP(self._texts.shape[1])
            if True:
                faiss.normalize_L2(self._texts)
            self._index.add(self._texts)

    def _post_process(self):
        print("Spacy pipe (sentence split&tokenization)..")
        gc.collect() # force garbage collection
        self._fetch_nlp_pipeline()
        self._texts = self.pipe(self._texts)
        i = 0
        while True:
            if len(self._texts) <= i:
                break
            if 0 == len(self._texts[i]): # review with no tokens -> remove
                del self._texts[i]
                for a_idx, _ in enumerate(self.aspects):
                    del self._aspect_ratings[a_idx][i]
            else:
                i += 1
        for i, text in enumerate(self._texts):
            assert 0 != len(text) # make sure no empty reviews again (new could be introduced by removing stop words unfortunately)
        print("Mapping reviews to embeddings..")
        # gc.collect() # force garbage collection
        self._fetch_model()
        if 'avg' == self._vec_agg:
            self._texts = np.vstack(thread_map(self.words_avg_embedding, self._texts, max_workers=1))
        elif 'maxpool' == self._vec_agg:
            self._texts = np.vstack(thread_map(self.words_maxpool_embedding, self._texts))
        else:
            assert False
        self._set_up_texts_index()

In [None]:
op = OcenPiwoFasttextEmbeddedReviews(vec_agg='avg')
# op.pipe = pipe # kinda cache by using old pipe and model when prototyping using notebook
# op.model = model # same as above
# op.load(dest_path=ROOT_DIR, filename='ocen-piwo-avg-fasttext-vecs.pt')
# op.build(filepath=f'{MODEL_ROOT_DIR}/../ocen-piwo-utf8.json')
op.load(dest_path=MODEL_ROOT_DIR, filename='ocen-piwo-avg-fasttext-vecs.pt')
print(op.closest_indices('Jak dla mnie podstawka lepsza.')) # check: this input is 0-th review from the dataset, so hope for 0 be the most similar
print(op.knn_predict_rating('Najgorsze piwo jakie piłem kiedykolwiek'))

(array([0.67781323, 0.67781323, 0.67781323, 0.6574119 , 0.6559805 ,
       0.652663  , 0.644583  , 0.64412564, 0.6435037 , 0.63613886,
       0.6300584 , 0.6300584 , 0.6300584 , 0.62572926, 0.62572926,
       0.6112176 , 0.61084837, 0.60827816, 0.6010281 , 0.59375834],
      dtype=float32), array([43432, 43433, 45054, 45768, 43055, 34215, 11536, 13352,  8679,
       36913,  5345, 50258, 43930, 29681, 45117, 24531, 47999, 36136,
         400, 36196]))
[7. 7. 7. 7.]




In [None]:
pipe = op.pipe # kinda cache by using old pipe and model when prototyping using notebook
model = op.model # same as above

In [None]:
# pipe = op.pipe
op = OcenPiwoFasttextEmbeddedReviews(vec_agg='maxpool')
op.pipe = pipe # kinda cache by using old pipe and model when prototyping using notebook
op.model = model # same as above
# op.load(dest_path=ROOT_DIR, filename='ocen-piwo-maxpool-fasttext-vecs.pt')
# op.build(filepath=f'{MODEL_ROOT_DIR}/../ocen-piwo-utf8.json')
op.load(dest_path=MODEL_ROOT_DIR, filename='ocen-piwo-maxpool-fasttext-vecs.pt')
print(op.closest_indices('Jak dla mnie podstawka lepsza.')) # check: this input is 0-th review from the dataset, so hope for 0 be the most similar
print(op.knn_predict_rating('Najgorsze piwo jakie piłem kiedykolwiek'))

(array([8.013512 , 7.6783237, 7.6622777, 7.6002817, 7.5900207, 7.580925 ,
       7.5770082, 7.558087 , 7.524187 , 7.519915 , 7.4981885, 7.472593 ,
       7.440344 , 7.4395576, 7.417118 , 7.41661  , 7.41541  , 7.405194 ,
       7.391786 , 7.381966 ], dtype=float32), array([30667,  1191, 27869,    74, 49198,  5116,  8917, 39136,  2828,
       38532, 28121, 12981,   374, 48226, 27548, 21943, 31675, 13200,
       22314, 43310]))
[6. 6. 6. 6.]


In [None]:
op._texts.shape

In [None]:
op.pipe(['zdanie. zdanie więcej zdań'])

In [None]:
op.closest_indices('Fatalne piwo.')

In [None]:
op.knn_predict_rating('Fatalne piwo.')

In [None]:
for i in [ 58,   7, 811, 377, 528, 697, 400, 499,  19, 642, 359, 805, 773,
        352, 989, 898, 851, 849, 126, 208]:
# for i in [400, 605, 540, 269, 323, 560, 499,  19, 857, 624, 528, 234,   0,
#                     697, 581, 898, 781, 601, 322,  21]:
    print(_texts[i], '\n')

## SNAP RateBeer representation

### SBERT

Class that maps ocen-piwo.pl reviews to sentence embeddings using [Sentence-BERT](https://arxiv.org/abs/1908.10084) from [sentence-transformers](https://www.sbert.net/).

Model used: `distiluse-base-multilingual-cased-v1` ("multilingual knowledge distilled version of multilingual Universal Sentence Encoder. Supports 15 languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish").
This model is common for both the datasets.

The sentence embeddings are then averaged or maxpooled depending on constructor parameter.


In [None]:
class RateBeerSBERTReviews(OcenPiwoSBERTReviews):
    """
    beer/name: John Harvards Simcoe IPA
    beer/beerId: 63836
    beer/brewerId: 8481
    beer/ABV: 5.4
    beer/style: India Pale Ale &#40;IPA&#41;
    review/appearance: 4/5
    review/aroma: 6/10
    review/palate: 3/5
    review/taste: 6/10
    review/overall: 13/20
    review/time: 1157587200
    review/profileName: hopdog
    review/text: On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.
    """
    def __init__(self, vec_agg='avg'):
        aspects = ['appearance', 'aroma', 'palate', 'taste', 'overall']
        BaseReviews.__init__(
            self,
            aspects        = aspects,
            aspect_max     = [5, 10, 5, 10, 20],
            aspect_ratings = [ [] for _ in aspects ],
            texts          = [],
            unkn_tok       = '<unk>', # unknown/out of vocabulary token
            _len            = 0,
            anchor_words = {
                'appearance' : ('appearance', 'color'),
                'aroma'      : ('aroma'),
                'palate'     : ('palate', 'mouthfeel'),
                'taste'      : ('taste'),
                'overall'    : ('overall'),
            },
        )
        self.pipe = None
        self.model = None
        self._vec_agg = vec_agg

    def build(self, filepath=f'{ROOT_DIR}/SNAP-Ratebeer.txt', max_reviews=float('inf')):
        with io.open(filepath, encoding='utf-8') as f:
            for line in tqdm(f, total=(40938282 if max_reviews == float('inf') else max_reviews * 14), desc='Reading data'):
                if line == '\n': # separator
                    self._len += 1
                    if max_reviews <= self._len:
                        break
                elif line.startswith('review/appearance: '):
                    line = line[len('review/appearance: '):]
                    self._aspect_ratings[0].append(int(line.split('/')[0])) # lhs of split by '/' is rating, rhs is max possible rating
                elif line.startswith('review/aroma: '):
                    line = line[len('review/aroma: '):]
                    self._aspect_ratings[1].append(int(line.split('/')[0])) # lhs of split by '/' is rating, rhs is max possible rating
                elif line.startswith('review/palate: '):
                    line = line[len('review/palate: '):]
                    self._aspect_ratings[2].append(int(line.split('/')[0])) # lhs of split by '/' is rating, rhs is max possible rating
                elif line.startswith('review/taste: '):
                    line = line[len('review/taste: '):]
                    self._aspect_ratings[3].append(int(line.split('/')[0])) # lhs of split by '/' is rating, rhs is max possible rating
                elif line.startswith('review/overall: '):
                    line = line[len('review/overall: '):]
                    self._aspect_ratings[4].append(int(line.split('/')[0])) # lhs of split by '/' is rating, rhs is max possible rating
                elif line.startswith('review/text: '):
                    line = line[len('review/text: '):]
                    if line.startswith('UPDATED:'):
                        line = line[len("UPDATED: APR 29, 2008"):] # drop prefix
                    line = re.sub('~', ' ', line.strip()) # remove whitespace incl. trailing newline and tildes that can be found in data for some reason
                    if line:
                        self._texts.append(line)
                    else: # some reviews do not have associated text; unwind (remove) their ratings for each aspect
                        for aspect_ratings in self._aspect_ratings:
                            aspect_ratings.pop()
                        self._len -= 1
        gc.collect()
        self._post_process()
    
    def _fetch_nlp_pipeline(self):
        if not self.pipe:
            nlp = spacy.util.get_lang_class('en')()
            # we want sentencizer only, as tokenization is part of Transformer model we'll use
            for pipe_name in nlp.pipe_names:
                # if pipe_name != 'sentencizer':
                nlp.remove_pipe(pipe_name)
            nlp.add_pipe("sentencizer", config={"punct_chars": ['.', '?', '!']})
            self.pipe = nlp.pipe
    
    def _free_nlp_pipeline(self):
        self.nlp = None

In [None]:
del pipe
del model
gc.collect()

398

In [None]:
rb = RateBeerSBERTReviews(vec_agg='avg')
# rb.pipe = pipe # kinda cache by using old pipe and model when prototyping using notebook
# rb.model = model # same as above
# rb.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-avg-sbert-vecs.pt')
# rb.build(filepath='/drive/MyDrive/Colab Notebooks/1e100ibu/SNAP-Ratebeer.txt', max_reviews=1e6)
rb.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-avg-sbert-vecs.pt')
rb.closest_indices('Worst beer i have ever seen') # check: this input is 0-th review from the dataset, so hope for 0 be the most similar

[(620508, 0.8770755529403687),
 (444881, 0.8675256967544556),
 (912710, 0.859946608543396),
 (799504, 0.8599153757095337),
 (867250, 0.8585574626922607),
 (799580, 0.8368151187896729),
 (798760, 0.834177553653717),
 (805445, 0.8265045881271362),
 (809532, 0.8252588510513306),
 (805801, 0.8247320652008057),
 (913585, 0.8208469152450562),
 (444829, 0.8176125288009644),
 (446641, 0.8164716362953186),
 (799523, 0.815644383430481),
 (798185, 0.8151065111160278),
 (319360, 0.8117756247520447),
 (915462, 0.8114019632339478),
 (101608, 0.8098669648170471),
 (273363, 0.8085517883300781),
 (806253, 0.8084362745285034)]

In [None]:
print(rb.closest_indices('Worst beer i have ever seen')) # check: this input is 0-th review from the dataset, so hope for 0 be the most similar
print(rb.knn_predict_rating('Tastes best from bottle. Not so heap as one could think. Nice hoppy smell. I had not supposed it will be sour though. Beautiful smooth head.'))

[(620508, 0.8770755529403687), (444881, 0.8675256967544556), (912710, 0.859946608543396), (799504, 0.8599153757095337), (867250, 0.8585574626922607), (799580, 0.8368151187896729), (798760, 0.834177553653717), (805445, 0.8265045881271362), (809532, 0.8252588510513306), (805801, 0.8247320652008057), (913585, 0.8208469152450562), (444829, 0.8176125288009644), (446641, 0.8164716362953186), (799523, 0.815644383430481), (798185, 0.8151065111160278), (319360, 0.8117756247520447), (915462, 0.8114019632339478), (101608, 0.8098669648170471), (273363, 0.8085517883300781), (806253, 0.8084362745285034)]
tensor([ 3.,  6.,  3.,  6., 12.])


In [None]:
pipe = rb.pipe # kinda cache by using old pipe and model when prototyping using notebook
model = rb.model # same as above

In [None]:
rb = RateBeerSBERTReviews(vec_agg='maxpool')
# rb.pipe = pipe # kinda cache by using old pipe and model when prototyping using notebook
# rb.model = model # same as above
# rb.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-maxpool-sbert-vecs.pt')
# rb.build(filepath='/drive/MyDrive/Colab Notebooks/1e100ibu/SNAP-Ratebeer.txt', max_reviews=1e6)
rb.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-maxpool-sbert-vecs.pt')
print(rb.closest_indices('Worst beer i have ever seen')) # check: this input is 0-th review from the dataset, so hope for 0 be the most similar
print(rb.knn_predict_rating('Tastes best from bottle. Not so heap as one could think. Nice hoppy smell. I had not supposed it will be sour though. Beautiful smooth head.'))

[(620508, 0.8770755529403687), (444881, 0.8675256967544556), (912710, 0.859946608543396), (799504, 0.8599153757095337), (799580, 0.8368151187896729), (798760, 0.834177553653717), (805445, 0.8265045881271362), (444829, 0.8176125288009644), (446641, 0.8164716362953186), (319360, 0.8117756247520447), (915462, 0.8114019632339478), (273363, 0.8085517883300781), (806253, 0.8084362745285034), (403988, 0.8071106672286987), (248957, 0.8060749173164368), (866862, 0.8004712462425232), (443889, 0.7977471351623535), (911407, 0.7967453002929688), (867250, 0.7954858541488647), (815882, 0.7952635288238525)]
tensor([ 3.,  6.,  3.,  6., 12.])


### FastText
Class that maps SNAP Ratebeer reviews to [FastText](https://fasttext.cc/) word embeddings using trained on Common Crawl: https://fasttext.cc/docs/en/crawl-vectors.html.

File: cc.en.300.bin.gz https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

The word embeddings are then averaged or maxpooled depending on constructor parameter value.

In [None]:
class RateBeerFasttextReviews(OcenPiwoFasttextEmbeddedReviews):
    """
    beer/name: John Harvards Simcoe IPA
    beer/beerId: 63836
    beer/brewerId: 8481
    beer/ABV: 5.4
    beer/style: India Pale Ale &#40;IPA&#41;
    review/appearance: 4/5
    review/aroma: 6/10
    review/palate: 3/5
    review/taste: 6/10
    review/overall: 13/20
    review/time: 1157587200
    review/profileName: hopdog
    review/text: On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.
    """
    def __init__(self, vec_agg='avg'):
        aspects = ['appearance', 'aroma', 'palate', 'taste', 'overall']
        BaseReviews.__init__(
            self,
            aspects        = aspects,
            aspect_max     = [5, 10, 5, 10, 20],
            aspect_ratings = [ [] for _ in aspects ],
            texts          = [],
            unkn_tok       = '<unk>', # unknown/out of vocabulary token
            _len            = 0,
            anchor_words = {
                'appearance' : ('appearance', 'color'),
                'aroma'      : ('aroma'),
                'palate'     : ('palate', 'mouthfeel'),
                'taste'      : ('taste'),
                'overall'    : ('overall'),
            },
        )
        self.pipe = None
        self.model = None
        self._vec_agg = vec_agg
        self._index = None

    def build(self, filepath=f'{ROOT_DIR}/SNAP-Ratebeer.txt', max_reviews=float('inf')):
        with io.open(filepath, encoding='utf-8') as f:
            for line in tqdm(f, total=(40938282 if max_reviews == float('inf') else max_reviews * 14), desc='Reading data'):
                if line == '\n': # separator
                    self._len += 1
                    if max_reviews <= self._len:
                        break
                elif line.startswith('review/appearance: '):
                    line = line[len('review/appearance: '):]
                    self._aspect_ratings[0].append(int(line.split('/')[0])) # lhs of split by '/' is rating, rhs is max possible rating
                elif line.startswith('review/aroma: '):
                    line = line[len('review/aroma: '):]
                    self._aspect_ratings[1].append(int(line.split('/')[0])) # lhs of split by '/' is rating, rhs is max possible rating
                elif line.startswith('review/palate: '):
                    line = line[len('review/palate: '):]
                    self._aspect_ratings[2].append(int(line.split('/')[0])) # lhs of split by '/' is rating, rhs is max possible rating
                elif line.startswith('review/taste: '):
                    line = line[len('review/taste: '):]
                    self._aspect_ratings[3].append(int(line.split('/')[0])) # lhs of split by '/' is rating, rhs is max possible rating
                elif line.startswith('review/overall: '):
                    line = line[len('review/overall: '):]
                    self._aspect_ratings[4].append(int(line.split('/')[0])) # lhs of split by '/' is rating, rhs is max possible rating
                elif line.startswith('review/text: '):
                    line = line[len('review/text: '):]
                    if line.startswith('UPDATED:'):
                        line = line[len("UPDATED: APR 29, 2008"):] # drop prefix
                    line = re.sub('~', ' ', line.strip()) # remove whitespace incl. trailing newline and tildes that can be found in data for some reason
                    if line:
                        self._texts.append(line)
                    else: # some reviews do not have associated text; unwind (remove) their ratings for each aspect
                        for aspect_ratings in self._aspect_ratings:
                            aspect_ratings.pop()
                        self._len -= 1
        gc.collect()
        self._post_process()
    
    def _fetch_nlp_pipeline(self):
        if not self.pipe:
            nlp = spacy.load('en_core_web_sm')
            for pipe_name in nlp.pipe_names:
                if pipe_name not in ['tokenizer']:
                    nlp.remove_pipe(pipe_name)
            nlp.add_pipe("sentencizer", config={"punct_chars": ['.', '?', '!']})
            self.pipe = lambda texts: [[tok.lower_ for sent in doc.sents for tok in sent if not tok.is_punct and not tok.is_space] for doc in nlp.pipe(texts)] # TODO remove stop words? (not tok.is_stop); do not lowercase?
    
    def _fetch_model(self):
        if not self.model:
            self.model = fasttext.load_model(f'{MODEL_ROOT_DIR}/../vectors/cc.en.300.bin')

In [None]:
rb = RateBeerFasttextReviews(vec_agg='avg')
# rb.pipe = pipe # kinda cache by using old pipe and model when prototyping using notebook
# rb.model = model # same as above
# rb.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-avg-fasttext-vecs.pt')
# rb.build(filepath='/drive/MyDrive/Colab Notebooks/1e100ibu/SNAP-Ratebeer.txt', max_reviews=1e6)
rb.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-avg-fasttext-vecs.pt')
print(rb.closest_indices('Worst beer i have ever seen')) # check: this input is 0-th review from the dataset, so hope for 0 be the most similar
print(rb.knn_predict_rating('Tastes best from bottle. Not so heap as one could think. Nice hoppy smell. I had not supposed it will be sour though. Beautiful smooth head.'))



(array([1.744843 , 1.7353735, 1.7136729, 1.689487 , 1.6631922, 1.6536332,
       1.6488355, 1.6264031, 1.5831056, 1.5831056, 1.5075598, 1.5013322,
       1.4843978, 1.4728191, 1.4696563, 1.4696563, 1.4542516, 1.4179466,
       1.4006785, 1.3984987], dtype=float32), array([551847, 736133, 799523, 889229, 527777, 913742, 912466, 799479,
       915216, 914505,  33244, 620038, 747994, 443505, 248951, 403976,
       914359, 237131,  26372, 912573]))
[3. 4. 3. 4. 9.]


In [None]:
pipe = rb.pipe # reuse old pipe and model when prototyping using notebook
model = rb.model # same as above

In [None]:
rb = RateBeerFasttextReviews(vec_agg='maxpool')
# rb.pipe = pipe # kinda cache by using old pipe and model when prototyping using notebook
# rb.model = model # same as above
# rb.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-maxpool-fasttext-vecs.pt')
# rb.build(filepath='/drive/MyDrive/Colab Notebooks/1e100ibu/SNAP-Ratebeer.txt', max_reviews=1e6)
rb.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-maxpool-fasttext-vecs.pt')
print(rb.closest_indices('Worst beer i have ever seen')) # check: this input is 0-th review from the dataset, so hope for 0 be the most similar
print(rb.knn_predict_rating('Tastes best from bottle. Not so heap as one could think. Nice hoppy smell. I had not supposed it will be sour though. Beautiful smooth head.'))



(array([22.735153, 22.64056 , 22.516415, 22.23186 , 22.160046, 22.048971,
       21.934198, 21.886766, 21.850925, 21.694199, 21.594963, 21.365963,
       21.356918, 21.258148, 21.112421, 21.022198, 21.009058, 20.928476,
       20.841497, 20.830072], dtype=float32), array([361933, 304525, 918293, 258243, 871224, 441608, 425355,  16229,
       514749, 680937, 656630, 532489, 234424, 215026, 981224, 231571,
       395523, 623433, 431470, 188537]))
[ 4.  7.  4.  7. 14.]


## KNN cross-validation for FastText

In [None]:
def benchmark_fasttext(model, k, n_iter):
    predicted = []
    ground_truth = list(zip(*model._aspect_ratings))[:n_iter]
    correct_guesses = 0

    # very importando
    model._set_up_texts_index()

    for i in tqdm(range(n_iter), leave=None):
        review, rating = model[i]
        result = model._index.search(review[None, :], k)
        scores, indices = result[0][0], result[1][0]
        
        item_index = np.where(indices != i)
        
        scores = scores[item_index]
        indices = indices[item_index]

        scores = np.array(scores)
        nearest_ratings = np.vstack(tuple(model[idx][1] for idx in indices)).astype(np.float32)
        nearest_ratings = np.rint(np.average(nearest_ratings, weights=scores, axis=0))

        correct_guesses += (nearest_ratings == rating.numpy()).sum()

        predicted.append(nearest_ratings)

    predicted = np.vstack(predicted)
    ground_truth = np.vstack(ground_truth)
    mse = np.square(predicted - ground_truth).mean()

    print("Correct guesses: ", correct_guesses, ", out of: ", model.aspect_count * n_iter, ", percentage: ", correct_guesses / (model.aspect_count * n_iter))
    print("MSE: ", mse)

In [None]:
for k in tqdm([2, 3, 5, 10, 15, 30]):
    print('------------------------------')
    print("K: ", k)

    model = OcenPiwoFasttextEmbeddedReviews(vec_agg='maxpool')
    model.load(dest_path=MODEL_ROOT_DIR, filename='ocen-piwo-maxpool-fasttext-vecs.pt')
    print("Testing maxpool - ocenpiwo")
    benchmark_fasttext(model, k, 1000)
    print()

    model = OcenPiwoFasttextEmbeddedReviews(vec_agg='avg')
    model.load(dest_path=MODEL_ROOT_DIR, filename='ocen-piwo-avg-fasttext-vecs.pt')
    print("Testing avg - ocenpiwo")
    benchmark_fasttext(model, k, 1000)
    print()

    model = RateBeerFasttextReviews(vec_agg='avg')
    model.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-avg-fasttext-vecs.pt')
    print("Testing maxpool - ratebeer")
    benchmark_fasttext(model, k, 1000)
    print()

    model = RateBeerFasttextReviews(vec_agg='maxpool')
    model.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-maxpool-fasttext-vecs.pt')
    print("Testing avg - ratebeer")
    benchmark_fasttext(model, k, 1000)

  0%|          | 0/6 [00:00<?, ?it/s]

------------------------------
K:  2
Testing maxpool - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  808 , out of:  4000 , percentage:  0.202
MSE:  5.1845

Testing avg - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1041 , out of:  4000 , percentage:  0.26025
MSE:  4.052

Testing maxpool - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1721 , out of:  5000 , percentage:  0.3442
MSE:  3.0624

Testing avg - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1556 , out of:  5000 , percentage:  0.3112
MSE:  3.9036
------------------------------
K:  3
Testing maxpool - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  911 , out of:  4000 , percentage:  0.22775
MSE:  3.788

Testing avg - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1127 , out of:  4000 , percentage:  0.28175
MSE:  3.0355

Testing maxpool - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1859 , out of:  5000 , percentage:  0.3718
MSE:  2.3784

Testing avg - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1664 , out of:  5000 , percentage:  0.3328
MSE:  2.8254
------------------------------
K:  5
Testing maxpool - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1015 , out of:  4000 , percentage:  0.25375
MSE:  3.07775

Testing avg - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1192 , out of:  4000 , percentage:  0.298
MSE:  2.51175

Testing maxpool - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1996 , out of:  5000 , percentage:  0.3992
MSE:  1.9784

Testing avg - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1843 , out of:  5000 , percentage:  0.3686
MSE:  2.222
------------------------------
K:  10
Testing maxpool - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1019 , out of:  4000 , percentage:  0.25475
MSE:  2.82375

Testing avg - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1214 , out of:  4000 , percentage:  0.3035
MSE:  2.32475

Testing maxpool - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2097 , out of:  5000 , percentage:  0.4194
MSE:  1.7782

Testing avg - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1981 , out of:  5000 , percentage:  0.3962
MSE:  1.9586
------------------------------
K:  15
Testing maxpool - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  996 , out of:  4000 , percentage:  0.249
MSE:  2.73325

Testing avg - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1144 , out of:  4000 , percentage:  0.286
MSE:  2.31275

Testing maxpool - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2137 , out of:  5000 , percentage:  0.4274
MSE:  1.7366

Testing avg - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2061 , out of:  5000 , percentage:  0.4122
MSE:  1.8494
------------------------------
K:  30
Testing maxpool - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  989 , out of:  4000 , percentage:  0.24725
MSE:  2.70025

Testing avg - ocenpiwo


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1178 , out of:  4000 , percentage:  0.2945
MSE:  2.27925

Testing maxpool - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2175 , out of:  5000 , percentage:  0.435
MSE:  1.6756

Testing avg - ratebeer


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2079 , out of:  5000 , percentage:  0.4158
MSE:  1.819


## KNN cross-validation for SBERT

In [None]:
def benchmark_sbert(model, k, n_iter):
    predicted = []
    ground_truth = list(zip(*model._aspect_ratings))[:n_iter]
    correct_guesses = 0

    for i in tqdm(range(n_iter), leave=None):
        review, rating = model[i]

        result = util.semantic_search(query_embeddings=review[None, :], corpus_embeddings=model._texts, top_k=k)
        
        result = list(filter(lambda res: res['corpus_id'] != i, result[0]))

        scores = [res['score'] for res in result]
        indices = [res['corpus_id'] for res in result]

        scores = np.array(scores)
        nearest_ratings = np.vstack(tuple(model[idx][1] for idx in indices)).astype(np.float32)
        nearest_ratings = np.rint(np.average(nearest_ratings, weights=scores, axis=0))

        correct_guesses += (nearest_ratings == rating.numpy()).sum()

        predicted.append(nearest_ratings)

    predicted = np.vstack(predicted)
    ground_truth = np.vstack(ground_truth)
    mse = np.square(predicted - ground_truth).mean()

    print("Correct guesses: ", correct_guesses, ", out of: ", model.aspect_count * n_iter, ", percentage: ", correct_guesses / (model.aspect_count * n_iter))
    print("MSE: ", mse)

In [None]:
print("Testing avg - ocenpiwo")
print('------------------------------')
model = OcenPiwoSBERTReviews(vec_agg='avg')
model.load(dest_path=MODEL_ROOT_DIR, filename='ocen-piwo-avg-sbert-vecs.pt')
for k in tqdm([2, 3, 5, 10, 15, 30], leave=None):
    print("K: ", k)
    benchmark_sbert(model, k, 1000)
    print()

print("Testing maxpool - ocenpiwo")
print('------------------------------')
model = OcenPiwoSBERTReviews(vec_agg='maxpool')
model.load(dest_path=MODEL_ROOT_DIR, filename='ocen-piwo-maxpool-fasttext-vecs.pt')
for k in tqdm([2, 3, 5, 10, 15, 30], leave=None):
    print("K: ", k)
    benchmark_sbert(model, k, 1000)
    print()

print("Testing maxpool - ratebeer")
print('------------------------------')
model = RateBeerSBERTReviews(vec_agg='maxpool')
model.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-maxpool-sbert-vecs.pt')
for k in tqdm([2, 3, 5, 10, 15, 30], leave=None):
    print("K: ", k)
    benchmark_sbert(model, k, 1000)
    print()

print("Testing avg - ratebeer")
print('------------------------------')
model = RateBeerSBERTReviews(vec_agg='avg')
model.load(dest_path=MODEL_ROOT_DIR, filename='ratebeer-avg-sbert-vecs.pt')
for k in tqdm([2, 3, 5, 10, 15, 30], leave=None):
    print("K: ", k)
    benchmark_sbert(model, k, 1000)
    print()

Testing avg - ocenpiwo
------------------------------


  0%|          | 0/6 [00:00<?, ?it/s]

K:  2


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1180 , out of:  4000 , percentage:  0.295
MSE:  3.30075

K:  3


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1315 , out of:  4000 , percentage:  0.32875
MSE:  2.57275

K:  5


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1364 , out of:  4000 , percentage:  0.341
MSE:  2.15225

K:  10


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1411 , out of:  4000 , percentage:  0.35275
MSE:  1.9345

K:  15


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1422 , out of:  4000 , percentage:  0.3555
MSE:  1.89125

K:  30


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1386 , out of:  4000 , percentage:  0.3465
MSE:  1.93125

Testing maxpool - ocenpiwo
------------------------------


  0%|          | 0/6 [00:00<?, ?it/s]

K:  2


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  808 , out of:  4000 , percentage:  0.202
MSE:  5.1845

K:  3


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  911 , out of:  4000 , percentage:  0.22775
MSE:  3.788

K:  5


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1015 , out of:  4000 , percentage:  0.25375
MSE:  3.07775

K:  10


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1018 , out of:  4000 , percentage:  0.2545
MSE:  2.824

K:  15


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  996 , out of:  4000 , percentage:  0.249
MSE:  2.73325

K:  30


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  989 , out of:  4000 , percentage:  0.24725
MSE:  2.70025

Testing maxpool - ratebeer
------------------------------


  0%|          | 0/6 [00:00<?, ?it/s]

K:  2


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1768 , out of:  5000 , percentage:  0.3536
MSE:  3.0958

K:  3


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1906 , out of:  5000 , percentage:  0.3812
MSE:  2.4534

K:  5


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2033 , out of:  5000 , percentage:  0.4066
MSE:  2.1464

K:  10


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2069 , out of:  5000 , percentage:  0.4138
MSE:  1.9706

K:  15


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2068 , out of:  5000 , percentage:  0.4136
MSE:  1.9456

K:  30


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2084 , out of:  5000 , percentage:  0.4168
MSE:  1.8986

Testing avg - ratebeer
------------------------------


  0%|          | 0/6 [00:00<?, ?it/s]

K:  2


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1826 , out of:  5000 , percentage:  0.3652
MSE:  2.4442

K:  3


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  1938 , out of:  5000 , percentage:  0.3876
MSE:  1.9432

K:  5


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2086 , out of:  5000 , percentage:  0.4172
MSE:  1.6464

K:  10


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2233 , out of:  5000 , percentage:  0.4466
MSE:  1.4906

K:  15


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2257 , out of:  5000 , percentage:  0.4514
MSE:  1.4624

K:  30


  0%|          | 0/1000 [00:00<?, ?it/s]

Correct guesses:  2289 , out of:  5000 , percentage:  0.4578
MSE:  1.4378

