In [28]:
import pandas as pd
from collections import defaultdict


In [2]:
df = pd.read_csv("Reviews.csv") # Amazon Fine Food Reviews dataset from Kaggle

In [192]:
text = df.Text[:100]
text

In [213]:
corpus = [[byte for byte in line] for line in text]
vocab = set(byte for line in text for byte in line)
merge_rules = []
n_iters = 500

for _ in range(n_iters):
    # count frequency
    freq = defaultdict(int)
    for line in corpus:
        for i in range(len(line)-1):
            freq[(line[i], line[i+1])] += 1
    
    # identify most frequent pair as new merge rule
    new_merge = max(freq, key=freq.get)
    merge_rules.append(new_merge)

    # encode corpus with new merge rule
    for n, line in enumerate(corpus):
        new_line = []
        i = 0
        while i < len(line) - 1:
            if (line[i], line[i+1]) == new_merge:
                new_line.append(line[i] + line[i+1])
                i += 2
            else:
                new_line.append(line[i])
                i += 1
        if i == len(line) - 1:
            new_line.append(line[i])
        corpus[n] = new_line

    # update vocabulary
    new_vocab = vocab.union(set(byte for line in corpus for byte in line))
    
    if len(vocab) == len(new_vocab):
        break

vocab = list(new_vocab)

In [219]:
test = df.Text[200]
test

"Even with small containers, they don't fill them up.  These little tins are less than half filled and at the price charged it seems a rip-off. Is there some exotic ingredient as costly as gold contained in those tiny squares?  Or how about the cereal ploy, they were filled at the factory but settled in transport.<br />Can manufacturers be honest in their dealings?"

In [220]:
# encoding test line of text
line = [byte for byte in test]
for merge in merge_rules:
    new_line = []
    i = 0
    while i < len(line) - 1:
        if (line[i], line[i+1]) == merge:
            new_line.append(line[i] + line[i+1])
            i += 2
        else:
            new_line.append(line[i])
            i += 1
    if i == len(line) - 1:
        new_line.append(line[i])
    line = new_line

line

['E',
 'v',
 'en ',
 'with ',
 's',
 'm',
 'all ',
 'con',
 'ta',
 'in',
 'er',
 's, ',
 'they ',
 "don't ",
 'f',
 'i',
 'll ',
 'them ',
 'u',
 'p',
 '.  ',
 'Th',
 'ese ',
 'little ',
 't',
 'in',
 's ',
 'are ',
 'l',
 'ess ',
 'than ',
 'h',
 'al',
 'f ',
 'f',
 'il',
 'l',
 'ed ',
 'and ',
 'at ',
 'the ',
 'pr',
 'ice ',
 'ch',
 'ar',
 'g',
 'ed ',
 'it ',
 'se',
 'e',
 'm',
 's ',
 'a ',
 'ri',
 'p',
 '-',
 'off',
 '. I',
 's ',
 'th',
 'ere ',
 'some ',
 'ex',
 'ot',
 'ic ',
 'ing',
 're',
 'di',
 'ent ',
 'as ',
 'c',
 'o',
 'st',
 'ly ',
 'as ',
 'g',
 'ol',
 'd ',
 'con',
 'ta',
 'in',
 'ed ',
 'in ',
 'th',
 'o',
 'se ',
 't',
 'in',
 'y ',
 's',
 'qu',
 'a',
 're',
 's',
 '?',
 ' ',
 ' ',
 'O',
 'r ',
 'h',
 'ow',
 ' ',
 'about ',
 'the ',
 'c',
 'er',
 'e',
 'al ',
 'p',
 'lo',
 'y, ',
 'they ',
 'were ',
 'f',
 'il',
 'l',
 'ed ',
 'at ',
 'the ',
 'f',
 'ac',
 't',
 'or',
 'y ',
 'but ',
 's',
 'et',
 't',
 'l',
 'ed ',
 'in ',
 't',
 'r',
 'an',
 'sp',
 'or',
 't',
 '

In [221]:
# encode line into indices
encoded_line = [vocab.index(chunk) for chunk in line]
encoded_line

[202,
 471,
 38,
 9,
 529,
 415,
 267,
 105,
 373,
 42,
 39,
 59,
 168,
 419,
 459,
 543,
 484,
 203,
 438,
 261,
 372,
 189,
 34,
 369,
 548,
 42,
 100,
 518,
 152,
 204,
 178,
 358,
 403,
 262,
 459,
 477,
 152,
 159,
 225,
 112,
 329,
 84,
 21,
 386,
 151,
 527,
 159,
 172,
 170,
 196,
 415,
 100,
 465,
 392,
 261,
 50,
 475,
 72,
 100,
 558,
 388,
 118,
 25,
 297,
 485,
 398,
 255,
 237,
 335,
 507,
 340,
 86,
 273,
 156,
 507,
 527,
 76,
 512,
 105,
 373,
 42,
 159,
 283,
 558,
 86,
 102,
 548,
 42,
 526,
 529,
 233,
 12,
 255,
 529,
 61,
 311,
 311,
 333,
 148,
 358,
 88,
 311,
 63,
 329,
 340,
 39,
 196,
 104,
 261,
 348,
 78,
 168,
 228,
 459,
 477,
 152,
 159,
 112,
 329,
 459,
 147,
 548,
 257,
 526,
 517,
 529,
 258,
 548,
 152,
 159,
 283,
 548,
 167,
 62,
 205,
 257,
 548,
 208,
 380,
 123,
 415,
 62,
 438,
 459,
 147,
 548,
 493,
 39,
 100,
 15,
 358,
 185,
 478,
 283,
 560,
 543,
 148,
 48,
 403,
 398,
 529,
 61]

In [222]:
# decode line back to chunks
"".join([vocab[idx] for idx in encoded_line])

"Even with small containers, they don't fill them up.  These little tins are less than half filled and at the price charged it seems a rip-off. Is there some exotic ingredient as costly as gold contained in those tiny squares?  Or how about the cereal ploy, they were filled at the factory but settled in transport.<br />Can manufacturers be honest in their dealings?"