In [20]:
import tensorflow as tf 
import pandas as pd 
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Model hyperparameters ---
vocab_size = 80_000  # number of token ids the tokenizer may emit
maxlen = 80          # maximum length of an input sequence
dim_model = 256      # model (embedding) dimension
num_heads = 8        # attention heads per block
ff_dim = 512         # width of the feed-forward layer
num_blocks = 4       # number of transformer blocks
dropout = 0.1        # dropout rate

# --- Training ---
batch_size = 128

# --- File locations ---
clearned_corpus = './data/clearned_corpus_00.txt.gz'
file_word_dict = './data/word_dict.pickle'
file_count_words = './data/count_words.parquet'
model_filename = './models/nano_gpt_v3_by_marcelo.keras'

# --- Vocabulary: take the `vocab_size - 1` rows with the highest counts ---
df_count_words = pd.read_parquet(file_count_words, engine='pyarrow')
df_count_words.info()
vocab = (
    df_count_words
    .sort_values('count', ascending=False)
    .head(vocab_size - 1)['word']
    .unique()
    .copy()
)
del df_count_words  # the full count table is not needed once the vocabulary is extracted
print(vocab.shape)

# --- Tokenizer: filters='' disables punctuation filtering; '<OOV>' is the out-of-vocabulary token ---
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    oov_token='<OOV>',
    filters='',
)
tokenizer.fit_on_texts(vocab)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1230424 entries, 0 to 1230423
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   word    1230424 non-null  object
 1   count   1230424 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 18.8+ MB
(79999,)


In [1]:
# Scratch check: take the last whitespace-separated token of a phrase.
a = 'a casa na fazenda'
a.rsplit(maxsplit=1)[-1]

'fazenda'

In [21]:
# Encode one sample sentence to token ids; in the recorded output the made-up
# word "asdfas" comes back as id 1.
sequences = tokenizer.texts_to_sequences(["eu amo o brasil asdfas"])
# Left-pad (padding="pre") to a fixed length of 10; the recorded output shows zeros as the fill value.
padded_sequences = pad_sequences(sequences, padding="pre", maxlen=10) 
padded_sequences

array([[    0,     0,     0,     0,     0,   644, 16336,     5,    93,
            1]], dtype=int32)

In [22]:
# Decode the padded ids back to text. Note from the recorded output that the
# padding id 0 is rendered as '<OOV>' too, so strip padding first if a clean
# round-trip is needed.
tokenizer.sequences_to_texts(padded_sequences)

['<OOV> <OOV> <OOV> <OOV> <OOV> eu amo o brasil <OOV>']

In [23]:
# Number of entries in the tokenizer's word index (80000 in the recorded output,
# i.e. one more than the 79999 vocabulary words fitted — presumably the '<OOV>' entry).
len(tokenizer.word_index)

80000

In [7]:
import gzip
import json
import re
import string
import gzip
import pickle
import pandas as pd
import glob
import pickle
import os

from multiprocessing import Pool

# Input locations. `filename` is a single sample file; note that the corpus
# loop further down reuses the name `filename` as its loop variable.
filename = './data/ptwiki-articles-text-cleaned/AA/wiki_00'
path_files = './data/ptwiki-articles-text/**/*'

# Output shards: `clearned_corpus` is a template filled with the shard number,
# `path_clearned_corpus` is the glob that finds every shard again.
clearned_corpus = './data/clearned_corpus_{:02d}.txt.gz'
path_clearned_corpus = './data/clearned_corpus_*.txt.gz'

# Vocabulary artefacts.
file_word_dict = './data/word_dict.pickle'
file_count_words = './data/count_words.parquet'

# Capture group matching exactly one punctuation character.
# The characters are escaped before being placed inside the [...] class:
# unescaped, the `\]` pair inside string.punctuation is parsed as an escaped `]`
# and the backslash itself is silently never matched.
regex = r"([" + re.escape(string.punctuation + '—–¿!') + r"])"

In [2]:
# Number of samples written to one output shard before rolling over to the next.
samples = 2_000_000
# samples = 5000
# presumably the 80-token input length plus one extra token — TODO confirm
maxlen = 80 + 1

In [None]:

# Quotes, brackets and bullets that are replaced by a space.
_STRIP_CHARS = re.compile('[' + re.escape('\'"()[]{}●') + ']')
# A (lowercase) letter immediately followed by one punctuation mark.
_LETTER_PUNCT = re.compile(
    '([qwertyuiopasdfghjklçzxcvbnmàèìòùáéíóúâêîôûãõñïüýỳŷỹ])([,.;:?¿!¡])'
)
# Two or more consecutive dots.
_DOT_RUN = re.compile(r'\.{2,}')


def remove_spaces(line):
  """Normalise one line of text into lowercase, space-separated tokens.

  Steps, in order:
    1. Lowercase the text. This happens first so that the letter/punctuation
       split below also applies to letters that were uppercase in the input
       (previously "EUA," stayed glued together while "eua," was split).
    2. Replace quotes, brackets, braces and bullets with a space.
    3. Insert a space between a letter and a directly following ``,.;:?¿!¡``
       so the punctuation mark becomes its own token. Digits are deliberately
       not in the letter set, so "1.5" is left intact.
    4. Collapse runs of dots into a single dot.
    5. Collapse every run of whitespace (spaces, tabs, newlines) into a single
       space and trim both ends.

  Args:
    line: Text to clean.

  Returns:
    The cleaned, lowercased string; empty if nothing but whitespace remains.
  """
  line = line.lower()
  line = _STRIP_CHARS.sub(' ', line)
  line = _LETTER_PUNCT.sub(r'\1 \2', line)
  line = _DOT_RUN.sub('.', line)
  # split()/join normalises all whitespace in one pass and cannot leave
  # leading, trailing or doubled spaces behind.
  return ' '.join(line.split())

# Build the training corpus: every sufficiently long input line is cleaned,
# truncated, and expanded into all of its prefixes (2 tokens, 3 tokens, ...),
# followed by the full line terminated with ' [eos]'. Output is split into
# gzip shards of roughly `samples` lines each.
# Relies on `maxlen`, `samples`, `path_files`, `clearned_corpus` and
# `remove_spaces` defined in other cells.
list_files = glob.glob(path_files)

count_fileout = 1               # number of the shard currently being written
count_filein = 0                # input files visited so far
len_filein = len(list_files)
count = 0                       # lines written to the current shard
f_out = None                    # current shard handle; opened lazily on first write
try:
  for filename in list_files:
    count_filein += 1
    with open(filename, 'r') as f:
      # Stream the file instead of loading it whole with readlines().
      for line in f:
        # Skip the <doc ...> / </doc> marker lines.
        if '<doc' in line or '</doc' in line:
          continue

        # Lines with fewer than `maxlen` tokens are discarded.
        # NOTE(review): the original appended to a `line_aux` buffer here, but
        # it cleaned `line_aux` instead of `line` and never read the buffer, so
        # short lines never reached the output. That behaviour is kept; confirm
        # whether short lines were meant to be merged into the next long line.
        if len(line.split()) < maxlen:
          continue

        _splited = remove_spaces(line).split()[:maxlen - 2]
        if not _splited:
          # Cleaning removed everything; do not emit a bare ' [eos]' sample.
          continue

        # Keep one handle open per shard. Re-opening the gzip file in append
        # mode for every input line creates one gzip member per line, which is
        # slow and compresses poorly.
        if f_out is None:
          f_out = gzip.open(clearned_corpus.format(count_fileout), 'at')

        for i in range(1, len(_splited)):
          f_out.write(' '.join(_splited[0:i + 1]) + '\n')
          count += 1
        f_out.write(' '.join(_splited) + ' [eos]\n')  # always write the full line last
        count += 1

        if count >= samples:
          print(f'Count file_in: {count_filein}/{len_filein} - Count file_out: {count_fileout}')
          count = 0
          count_fileout += 1
          f_out.close()
          f_out = None
finally:
  # Make sure the last shard is flushed even if the loop is interrupted.
  if f_out is not None:
    f_out.close()

In [5]:
def list_of_words(file: str):
  """Count the whitespace-separated tokens of one gzip-compressed text file.

  The file is streamed line by line, so memory use is bounded by the
  vocabulary size rather than by the decompressed file size.

  Args:
    file: Path of a gzip file, read in text mode.

  Returns:
    A tuple ``(unique_words, word_counts)``: the set of distinct tokens and a
    plain dict mapping each token to its number of occurrences.
  """
  # Imported locally so this cell does not depend on another import cell.
  from collections import Counter

  list_count_words_file = Counter()
  with gzip.open(file, 'rt') as f_in:
    for line in f_in:
      list_count_words_file.update(line.split())

  # The distinct words are exactly the keys of the counter.
  list_unique_word_file = set(list_count_words_file)
  print(f'Size of Dict for file {file}: {len(list_unique_word_file)}')
  return list_unique_word_file, dict(list_count_words_file)

In [None]:
# Count words in every corpus shard in parallel, then merge the per-file results.
input_filenames = glob.glob(path_clearned_corpus)
print('files:', len(input_filenames))

with Pool(processes=os.cpu_count()) as pool:
  list_unique_word_all = set()
  list_count_words_all = {}

  # Submit all files up front; apply_async returns immediately with a handle.
  processes = [pool.apply_async(list_of_words, [input_file]) for input_file in input_filenames]

  # Collect results in submission order (get() blocks until that file is done).
  for p in processes:
    result_unique_word_file, result_count_word_file = p.get()

    # Distinct words seen across all files.
    list_unique_word_all.update(result_unique_word_file)

    # Total number of occurrences of each word across all files.
    for word in result_count_word_file:
      list_count_words_all[word] = list_count_words_all.get(word, 0) + result_count_word_file[word]

  print(f'Size of Dict for ALL files: {len(list_unique_word_all)}')

In [None]:
# Build the index -> word mapping. The words are sorted first: iterating a set
# of strings directly yields an order that can change from one interpreter run
# to the next, which would assign different ids to the same word on every run.
word_dict = dict(enumerate(sorted(list_unique_word_all)))
print('size of word_dict:', len(word_dict))
# Store data (serialize)
with open(file_word_dict, 'wb') as handle:
  pickle.dump(word_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
  print(f'word_dict save to file: {file_word_dict}')

In [None]:
import pandas as pd

# One row per word: the aggregated (word, count) pairs as a two-column frame.
df = pd.DataFrame(
    list(list_count_words_all.items()),
    columns=['word', 'count'],
)
df.info()

In [None]:
# Preview the 100 rows with the highest counts.
df.sort_values(by='count', ascending=False).iloc[:100]

In [12]:
# Persist the word-count table as a gzip-compressed parquet file at `file_count_words`.
df.to_parquet(file_count_words, engine='pyarrow', compression='gzip')