In [1]:
import os
from pathlib import Path

PROJECT_NAME = "compression-text-models"

# The data paths used in this notebook are relative to the project root, so
# switch to it no matter which sub-directory the kernel was started in.
# Path.parts splits on the platform's own separator instead of a literal "/".
curdir = Path.cwd().parts
if PROJECT_NAME not in curdir:
    raise ValueError(
        f"{PROJECT_NAME!r} is not a component of the working directory "
        f"{Path.cwd()}; start the notebook from inside the project tree."
    )
# index() returns the first (outermost) directory carrying the project name.
project_index = curdir.index(PROJECT_NAME)
os.chdir(Path(*curdir[:project_index + 1]))

In this notebook, we'll check what the outputs of `distilbert/scripts/binarized.py` and `distilbert/scripts/token_counts.py` are, and reimplement them using multiprocessing.

First, let's check the binarized data generated from the `data/clean/brwac-separated-sentences.txt` document.

This data was saved to `data/processed/brwac-tokenized/brwac-joined-paragraphs/tokenized-sentences.pickle`.

In [2]:
import multiprocessing

In [3]:
import pickle

# Pickle holding the binarized corpus described above. pickle.load can execute
# arbitrary code, so only point this at files produced by this project.
data_path = "data/processed/brwac-tokenized/brwac-joined-paragraphs/tokenized-sentences.pickle"

with open(data_path, mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

In [4]:
# Peek at the container type, the element type and the first element itself.
first_item = data[0]
type(data), type(first_item), first_item

(list,
 numpy.ndarray,
 array([  101,  8917, 21010, 22281,  1040,   120, 18506,   120,  1492,
         4301, 18169, 10429,   259, 11963,  7245,  3570,   710,  4907,
         1451,   202,  1508,  4549,  1095,  4484,   125,  2054, 15803,
          173,  8331,   171,  2285,   161, 22316,   177,  7639,  3559,
          125, 19768,   125,  8331,   171,  2285,   161, 22316,  4172,
         1790,   202,  7391,  4782,  1516,   148,  2974,  2532,  1430,
          180,  5642, 19700,   125, 11327, 22290,   842,   117,   179,
          253, 14833,   423,  7305,   367,  5523,  6704, 21304,   247,
          222,  1160, 18169,   179, 14660,   260, 15947,   180,   100,
         2156,  1508,  4549,  1095,  4484,   125,  2054, 15803,   125,
         8331,   171,  2285,   161, 22316,   117,   122,   146,  7245,
         3570,   710,  4907,  1451,  1023,   532, 11963, 13207,   119,
          231,  7245, 18360,   327,  3035,   202, 14157,   117,   644,
        19148,   120, 16394,   117,   123,  1018,   29

It seems our data was tokenized by the `neuralmind/bert-base-portuguese-cased` tokenizer.

The result is a list of `numpy.ndarray` containing the tokens of each sentence.

We can try to convert one of these sentences back to their original state to check if it matches with the original text

In [5]:
import transformers

# Load the tokenizer the ids appear to come from and map the first array back
# to text so it can be compared with the raw corpus.
tk = transformers.AutoTokenizer.from_pretrained(
    'neuralmind/bert-base-portuguese-cased'
)
first_decoded = tk.decode(data[0])
first_decoded

'[CLS] Páginas 17 / 07 / 13 Novo sorteio define os adversários Sport Clube São Joseense no Intermunicipal de Futsal em Santana do Seridó A Secretaria Municipal de Esportes de Santana do Seridó realizou hoje no Programa Panorama Esportivo da Rádio Rural de Parelhas, que é comandado pelo reporter Antonio Januario um novo sorteio que definiu as chaves da [UNK] Copa Intermunicipal de Futsal de Santana do Seridó, e o Sport Clube São Joseense teve seus adversários definidos. O Sport fará sua estreia no sábado, dia 03 / 08, a partir 20 : 50 horas jogando contra o Samboys de Juazeirinho, ainda fazem parte da chave do Sport o Bela Vista de Equador, Satnad do Povoado Cobra municipio de Parelhas e Olarias do Povoado Currais Novos municipio de Jardim do Seridó. Amanhã o Secretário de Esportes de Santana do Seridó vai repassar todas as equipes em suas referidas chaves e a [UNK] rodada da competição, pois o mesmo ainda tá fechando a tabela. Temos ainda como representante de São José do Sabugi, o Ate

Let's compare this to the original sentence. Unfortunately, `distilbert/scripts/binarized.py` shuffles the sentences, so we need to search for this paragraph among all of them.

In [6]:
sentences_path = 'data/clean/brwac-joined-sentences.txt'

# The corpus is Portuguese text with accented characters, so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(sentences_path, "r", encoding="utf-8") as sentences_file:
    sentences = sentences_file.readlines()  # each entry keeps its trailing newline

# Linear scan for the paragraph that was decoded from the pickle above.
sentence = [line for line in sentences if line.startswith("Páginas 17/07/13")]
sentence

['Páginas 17/07/13 Novo sorteio define os adversários Sport Clube São Joseense no Intermunicipal de Futsal em Santana do Seridó A Secretaria Municipal de Esportes de Santana do Seridó realizou hoje no Programa Panorama Esportivo da Rádio Rural de Parelhas, que é comandado pelo reporter Antonio Januario um novo sorteio que definiu as chaves da 9ª Copa Intermunicipal de Futsal de Santana do Seridó, e o Sport Clube São Joseense teve seus adversários definidos.O Sport fará sua estreia no sábado, dia 03/08, a partir 20:50 horas jogando contra o Samboys de Juazeirinho, ainda fazem parte da chave do Sport o Bela Vista de Equador, Satnad do Povoado Cobra municipio de Parelhas e Olarias do Povoado Currais Novos municipio de Jardim do Seridó. Amanhã o Secretário de Esportes de Santana do Seridó vai repassar todas as equipes em suas referidas chaves e a 1ª rodada da competição, pois o mesmo ainda tá fechando a tabela.Temos ainda como representante de São José do Sabugi, o Atenda do presidente Cíc

This looks good enough to prove that the script is just tokenizing our dataset.

The final data, then, is a list of arrays pickled into a file. I'll try to reimplement it so we can parallelize this process later.

In [7]:
import numpy as np

import tqdm

N_WORKERS = 12  # worker processes; adjust to the number of available cores


def f(x):
    """Tokenize one line of text with the notebook-level tokenizer ``tk``.

    Args:
        x: a single string taken from ``sentences``.

    Returns:
        A ``numpy.ndarray`` of dtype ``uint16`` holding the ids that
        ``tk.encode(x)`` returned.
    """
    return np.array(tk.encode(x), dtype=np.uint16)


# uint16 can only hold ids below 65536; fail loudly rather than store wrapped ids.
assert tk.vocab_size < (1 << 16), "token ids do not fit in uint16"

# NOTE(review): `f` and `tk` live in the notebook's namespace, so the workers
# presumably rely on the "fork" start method to see them -- confirm on
# platforms that default to "spawn".
# imap keeps the input order; a chunksize above the default of 1 batches the
# lines handed to each worker instead of sending them one at a time.
with multiprocessing.Pool(N_WORKERS) as p:
    result = list(
        tqdm.tqdm(p.imap(f, sentences, chunksize=1000), total=len(sentences))
    )

100%|██████████| 3530796/3530796 [39:05<00:00, 1505.52it/s] 


In [10]:
# First array from the multiprocessing run above, displayed for a visual check.
result[0]

array([  101, 11433, 22332,   243,  6240, 20697, 22341,  4790, 18471,
       22322,  9689, 22311,   187, 18394,  4790,  9545,   107,   248,
        7073, 22328, 22341, 22352, 22311,   213,  5054,  5476, 22309,
         107,  3199,   125,  4868,   125,  4155,   117,   977,   131,
       18506,   117,   240,  4735, 22280,   331,  7326,   255,  2224,
         118,   196,   409,  3185, 22279,  3370,  7127,   446, 12230,
         145,  8627,   191,  4759,   735,   119, 10247,  1088,  2303,
       14979,  1176,   299,  8718,  3549, 22327,   192, 22327,  9208,
       22322, 22447, 22320,  6162, 22317,   250, 22301,   290, 22327,
       18394, 15040,   213,  9008,  1431,   730,   809, 18430, 13389,
         376,  1183,  1859,   170,  5678,   229,  7592,   125, 11205,
         119,   503,  2947,   125, 11912,   740,  3379, 15221,   251,
         122,   346,  3196,   785,   123,  3855,   119,   409,  4397,
        3391, 22280,   171,   177, 12429,   117,   170,  1615,  8483,
         117,   740,

Now let's compare this result with the one generated by Hugging Face's code.

In [11]:
import pickle

# Reference tokenization to compare the multiprocessing result against.
tokenized_path = "data/processed/brwac-tokenized/brwac-joined-paragraphs/tokenized-sentences.pickle"

# The handle is deliberately not called `f`: that name is the tokenizing
# function defined earlier, and rebinding it here would leave `f` pointing at
# a closed file object.
with open(tokenized_path, 'rb') as reference_file:
    data = pickle.load(reference_file)

Unfortunately, the two datasets are not in the same order, so there is no elegant way to compare them quickly. Instead, we count how many times each token sequence appears across both of them.

In [31]:
from collections import defaultdict

# Tally how often each token sequence occurs across BOTH tokenizations.
# The string form of the id list serves as a hashable dictionary key.
# NOTE(review): a total of 2 is what a sequence found once on each side yields,
# but the same total also arises from two copies on one side and none on the
# other -- separate per-source tallies would be needed to tell these apart.
d = defaultdict(int)

for arrays in (result, data):
    for array in tqdm.tqdm(arrays):
        # str() of a list has no surrounding whitespace, so no strip() is needed.
        d[str(array.tolist())] += 1

100%|██████████| 3530796/3530796 [04:16<00:00, 13746.32it/s]
100%|██████████| 3530796/3530796 [04:15<00:00, 13814.56it/s]


In [32]:
from collections import Counter

# Histogram of the tallies: how many distinct sequences were seen once, twice, ...
# Counter consumes the values view directly; no intermediate list is needed.
Counter(d.values())

Counter({2: 3527035, 4: 1842, 6: 15, 1: 46, 10: 1, 8: 1})