## Tokenizador

In [13]:
import json
import re
import nltk
from nltk import word_tokenize

In [14]:
# The dump is read line by line; every line is parsed as one JSON document
# and the parsed documents are collected, in file order, into `data`.
with open('dump_small_clean.jsonln', 'r', encoding="utf8") as file:
    data = [json.loads(line) for line in file]

In [15]:
# Tokenize the 'body' field of the first loaded document as a quick sanity check.
texto = data[0]['body']
tokens = word_tokenize(texto)
# tokens

In [16]:
# Keep only tokens made up entirely of word characters (drops punctuation).
# The pattern is a raw string so that `\w` reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
tokens = [w for w in tokens if re.fullmatch(r'\w+', w)]
# tokens

In [17]:
def minusculas(tokens):
    """Return a new list with every token converted to lower case."""
    return list(map(str.lower, tokens))

def remove_digitos(tokens):
    """Return the tokens that contain no digit character.

    Tokens are kept in their original order. An empty token contains no
    digit and is therefore kept.
    """
    # Raw string: `\d` is a regex escape, not a Python string escape, and a
    # non-raw '\d' triggers an invalid-escape-sequence warning.
    return [token for token in tokens if re.search(r'\d', token) is None]

def pega_palavras(tokens):
    """Return the tokens made up entirely of word characters.

    A token is kept only when the whole token matches `\\w+`, so empty
    tokens and tokens containing punctuation or whitespace are dropped.
    """
    # Raw string: `\w` is a regex escape, not a Python string escape, and a
    # non-raw '\w' triggers an invalid-escape-sequence warning.
    return [token for token in tokens if re.fullmatch(r'\w+', token)]
    
def limpa_tokens(tokens):
    """Clean a token list: lower-case it, drop tokens with digits, keep only words.

    The three steps run in that order, each on the output of the previous one.
    """
    return pega_palavras(remove_digitos(minusculas(tokens)))

In [18]:
from tqdm import tqdm

# Tokenize and clean the body of every document, accumulating all the
# resulting tokens into one flat list (tqdm only displays progress).
all_words = []
for item in tqdm(data):
    texto = item['body']
    tokens = limpa_tokens(word_tokenize(texto))
    all_words.extend(tokens)

100%|███████████████████████████████████████████████████████████████████████████| 11225/11225 [00:42<00:00, 264.47it/s]


In [19]:
# Total number of cleaned tokens accumulated in all_words.
len(all_words)

3866876

In [85]:
def stopwords(all_words):
    """Return the words of `all_words` that are not stop words.

    The stop-word list is the union of NLTK's Portuguese and English
    stop-word lists. The order and the repetitions of the remaining words
    are preserved.
    """
    # A set makes each membership test O(1). With a list, every word of the
    # corpus would be compared against the whole stop-word list.
    stop = set(nltk.corpus.stopwords.words('portuguese'))
    stop.update(nltk.corpus.stopwords.words('english'))
    return [w for w in all_words if w not in stop]

# Display the corpus words that remain after stop-word removal.
stopwords(all_words)

['alexandre',
 'prenome',
 'popular',
 'língua',
 'portuguesa',
 'cognato',
 'nome',
 'alexander',
 'língua',
 'inglesa',
 'países',
 'lusófonos',
 'pessoas',
 'chamadas',
 'alexandre',
 'normalmente',
 'apelidadas',
 'alex',
 'origem',
 'nome',
 'deriva',
 'latim',
 'alexander',
 'romanização',
 'nome',
 'grego',
 'αλέξανδρος',
 'aléksandros',
 'etimologicamente',
 'nome',
 'composto',
 'verbo',
 'ἀλέξειν',
 'aléksein',
 'defender',
 'substantivo',
 'ἀνδρός',
 'andrós',
 'genitivo',
 'ἀνήρ',
 'anēr',
 'homem',
 'assim',
 'pode',
 'ser',
 'traduzido',
 'protetor',
 'humanidade',
 'termo',
 'tipo',
 'raro',
 'composto',
 'tatpurusha',
 'invertido',
 'modificante',
 'segunda',
 'posição',
 'tatpurusha',
 'cognato',
 'sânscrito',
 'sendo',
 'nararakṣa',
 'cf',
 'ramayana',
 'equivalente',
 'exato',
 'sânscrito',
 'rakṣinara',
 'pie',
 'hleks',
 'hnros',
 'composto',
 'gasto',
 'tipo',
 'terpsimbrotos',
 'cujo',
 'significado',
 'original',
 'protege',
 'homens',
 'primeiro',
 'registro',


In [26]:
from collections import Counter
# Count how often each non-stop-word occurs in the corpus.
word_counts = Counter(stopwords(all_words))
word_counts_list = list(word_counts.items())
# Rank the (word, count) pairs: highest count first, ties in alphabetical order.
word_counts_list_sorted = sorted(
    word_counts_list,
    key=lambda par: (-par[1], par[0]),
)
#word_counts_list_sorted

In [33]:
# The vocabulary maps each of the first 10000 ranked words to its count.
vocab = dict(word_counts_list_sorted[:10000])

In [34]:
# Unaccented lower-case letters 'a' through 'z'.
LOWERCASE = list("abcdefghijklmnopqrstuvwxyz")

# Accented lower-case letters and the cedilla (the list is not exhaustive).
# Reference: https://www.ascii-codes.com/cp860.html
LOWERCASE_OTHERS = list("çáâãàéíóúêîôûõ")

# Alphabet used when generating insertions and substitutions.
LETTERS = LOWERCASE + LOWERCASE_OTHERS

In [35]:
def edit1(text):
    """Return the set of strings that are one edit away from `text`.

    Three kinds of edit are generated: deleting one character, inserting
    one letter from LETTERS at any position, and replacing one character
    with a different letter from LETTERS. The empty string is never
    included in the result.
    """
    size = len(text)

    # Deletions: drop the character at each position (an empty result is discarded).
    deletions = {text[:i] + text[i + 1:] for i in range(size)}
    deletions.discard("")

    # Insertions: place every letter at every position, including both ends.
    insertions = {
        text[:i] + letter + text[i:]
        for i in range(size + 1)
        for letter in LETTERS
    }

    # Substitutions: swap each character for every *other* letter.
    substitutions = {
        text[:i] + letter + text[i + 1:]
        for i in range(size)
        for letter in LETTERS
        if letter != text[i]
    }

    return deletions | insertions | substitutions

def edit2(text):
    """Return the strings reached by applying edit1 twice to `text`.

    Strings already produced by a single edit1 step, and `text` itself,
    are excluded from the result.
    """
    one_step = edit1(text)
    two_steps = set()
    for neighbour in one_step:
        # Keep only what a second edit adds beyond the one-step set.
        two_steps |= edit1(neighbour) - one_step
    two_steps.discard(text)
    return two_steps


In [36]:
def candidates(word):
    """Return the list of possible corrections for `word`.

    A word found in `vocab` is its own only candidate. Otherwise the result
    holds the vocabulary words produced by edit1, followed by those produced
    by edit2, followed by `word` itself.
    """
    if word in vocab:
        return [word]

    one_edit = [w for w in edit1(word) if w in vocab]
    two_edits = [w for w in edit2(word) if w in vocab]
    return one_edit + two_edits + [word]

# Example: list the correction candidates for "banane".
print(candidates("banane"))

['banana', 'anne', 'baiano', 'brangane', 'baiana', 'banane']


In [79]:
def probabilidade(word, n=sum(vocab.values())):
    """Return the relative frequency of `word` in `vocab`.

    `n` defaults to the sum of all counts in `vocab`; as a default argument
    it is evaluated once, when the function is defined. Words missing from
    `vocab` get 0.
    """
    if word not in vocab:
        return 0
    return vocab[word] / n



In [80]:
def corretor(word):
    """Return the candidate correction of `word` with the highest probability."""
    opcoes = candidates(word)
    return max(opcoes, key=probabilidade)

In [88]:
def corrigeFrase(frase):
    """Return `frase` with every non-stop-word replaced by its correction.

    The sentence is split on whitespace. Portuguese stop words are copied
    unchanged; every other word goes through `corretor`. Each output word is
    followed by a single space, so a non-empty result ends with a trailing
    space. An empty or blank sentence yields an empty string.
    """
    # Fetch the stop-word list once per call, not once per token, and
    # use a set so each membership test is O(1).
    stop_pt = set(nltk.corpus.stopwords.words('portuguese'))
    corrigidas = [
        palavra if palavra in stop_pt else corretor(palavra)
        for palavra in frase.split()
    ]
    # join avoids the repeated string concatenation of the word-by-word build.
    return "".join(palavra + " " for palavra in corrigidas)

# Example: correct a short sentence word by word.
corrigeFrase("andri na paça")

'andré na data '