### Importação dos metadados

In [1]:
from src import grouper
import pandas as pd
import nltk
%load_ext autoreload
%autoreload 2
df = pd.read_csv("dados/corpus_tratado/metadados.csv")

### Filtrando os ids por assuntos

In [2]:
corte = 50
df_validos = pd.DataFrame(df.groupby("Assunto").size()).reset_index()
df_validos.columns = ["assunto", "quant"]
df_validos = df_validos.loc[df_validos.quant >= corte]
display(df_validos.quant.sum())
display(df_validos.assunto.nunique())
documentos_validos = df[df.Assunto.isin(df_validos.assunto)][["id", "Assunto"]].reset_index().drop('index', axis = 1)
documentos_validos

35027

125

Unnamed: 0,id,Assunto
0,1500075-43.2019.8.26.0569,Tráfico de Drogas e Condutas Afins
1,1019468-32.2019.8.26.0562,Alienação Fiduciária
2,0000043-68.2019.8.26.0616,Furto
3,1007259-02.2018.8.26.0292,DIREITO PREVIDENCIÁRIO
4,1000425-53.2019.8.26.0128,Rural (Art. 48/51)
...,...,...
35022,0001233-88.2018.8.26.0426,Cheque
35023,1514316-66.2019.8.26.0037,Impostos
35024,1008737-24.2019.8.26.0320,Indenização por Dano Material
35025,0000206-22.2019.8.26.0075,IPTU/ Imposto Predial e Territorial Urbano


### Separando grupos de treino e teste de maneira que as distribuições de assuntos permaneçam similares

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
X = documentos_validos.id
y = documentos_validos.Assunto
caminho_corpus = "dados/corpus_tratado/"

#index[0] são os indices de treino, e index[1] são os de teste
#i é o código do experimento
for i, index in enumerate(sss.split(X, y)):    
    exp = i+1
    X_treino, X_teste = X[index[0]], X[index[1]]
    y_treino, y_teste = y[index[0]], y[index[1]]    

### Instanciando o corpus do conjunto de treinamento

In [4]:
from tqdm.notebook import tqdm
import os
diretorio = "dados/corpus_tratado/"
stopwords = nltk.corpus.stopwords.words('portuguese')

X_treino = pd.DataFrame(X_treino)
X_treino['id'] = X_treino.id + '.txt'

print("criando base de treino para o experimento "+str(exp))
if not os.path.exists('dados/experimento_'+str(exp)):
    os.makedirs('dados/experimento_'+str(exp))

#a base de treino para o word2vec e fasttext deve ter uma frase por linha
#a base de treino para o glove deve ter um documento por linha
base_treino = open('dados/experimento_'+str(exp)+'/base_treino.txt', 'w+', encoding='utf8')
base_treino_glv = open('dados/experimento_'+str(exp)+'/base_treino_glv.txt', 'w+', encoding='utf8')
tokens = 0
for documento in tqdm(X_treino.id.values):
    doc = open(diretorio + documento, 'r', encoding='utf8')
    for frase in doc:
        base_treino.write(frase)
        tokens += len(frase.split(" "))
    doc.close()
    
    doc = open(diretorio + documento, 'r', encoding='utf8')
    teor_completo = doc.read().replace('\n', '')
    teor_completo = grouper.remover_stopwords(teor_completo, stopwords)
    base_treino_glv.write(teor_completo + '\n')
    doc.close()
    
base_treino.close()
base_treino_glv.close()
print(str(tokens)+ " tokens copiados com sucesso")
print("preparando documentos para extração do vocabulário:")
X_treino['teores'] = [grouper.recuperar_teor(x, diretorio) for x in tqdm(X_treino.id)]
X_treino['assunto'] = y_treino
X_treino.shape

criando base de treino para o experimento 1


HBox(children=(IntProgress(value=0, max=28021), HTML(value='')))


16042036 tokens copiados com sucesso
preparando documentos para extração do vocabulário:


HBox(children=(IntProgress(value=0, max=28021), HTML(value='')))




(28021, 3)

### Vocabulário

In [5]:
freq_min = 100
vocab = grouper.extrair_vocabulario(X_treino, freq_min, stopwords)

extraindo termos com base no ICA


100%|██████████| 125/125 [00:00<00:00, 314.94it/s]


-processando strings do corpus
-treinando vetorizador
-ICA processado
extraindo termos com base na frequência - geralmente leva menos de 4 minutos
extraindo termos do tesauro
extração de vocabulário concluída!


### Treinamento dos modelos

In [6]:
w2v_jur = grouper.treinar_word2vec('dados/experimento_'+str(exp)+'/base_treino.txt', exp)

treinando modelo word2vec


2021-02-24 00:01:30,015 : INFO : collecting all words and their counts
2021-02-24 00:01:30,021 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-24 00:01:30,090 : INFO : PROGRESS: at sentence #10000, processed 235769 words, keeping 13479 word types
2021-02-24 00:01:30,162 : INFO : PROGRESS: at sentence #20000, processed 478506 words, keeping 18827 word types
2021-02-24 00:01:30,234 : INFO : PROGRESS: at sentence #30000, processed 716990 words, keeping 22433 word types
2021-02-24 00:01:30,304 : INFO : PROGRESS: at sentence #40000, processed 956912 words, keeping 25453 word types
2021-02-24 00:01:30,374 : INFO : PROGRESS: at sentence #50000, processed 1198453 words, keeping 27868 word types
2021-02-24 00:01:30,444 : INFO : PROGRESS: at sentence #60000, processed 1434600 words, keeping 30340 word types
2021-02-24 00:01:30,515 : INFO : PROGRESS: at sentence #70000, processed 1678430 words, keeping 32461 word types
2021-02-24 00:01:30,585 : INFO : PROGRESS: 

2021-02-24 00:01:35,392 : INFO : estimated required memory for 32596 words and 100 dimensions: 61932400 bytes
2021-02-24 00:01:35,393 : INFO : resetting layer weights
2021-02-24 00:01:35,711 : INFO : training model with 12 workers on 32596 vocabulary and 100 features, using sg=1 hs=1 sample=1e-05 negative=5 window=5
2021-02-24 00:01:36,722 : INFO : EPOCH 1 - PROGRESS: at 9.26% examples, 334972 words/s, in_qsize 0, out_qsize 0
2021-02-24 00:01:37,728 : INFO : EPOCH 1 - PROGRESS: at 18.28% examples, 331639 words/s, in_qsize 1, out_qsize 0
2021-02-24 00:01:38,730 : INFO : EPOCH 1 - PROGRESS: at 25.69% examples, 313549 words/s, in_qsize 0, out_qsize 0
2021-02-24 00:01:39,732 : INFO : EPOCH 1 - PROGRESS: at 34.17% examples, 312994 words/s, in_qsize 2, out_qsize 0
2021-02-24 00:01:40,734 : INFO : EPOCH 1 - PROGRESS: at 41.63% examples, 306512 words/s, in_qsize 0, out_qsize 0
2021-02-24 00:01:41,735 : INFO : EPOCH 1 - PROGRESS: at 48.81% examples, 300297 words/s, in_qsize 23, out_qsize 0
2021

2021-02-24 00:02:14,880 : INFO : EPOCH 4 - PROGRESS: at 26.82% examples, 324842 words/s, in_qsize 1, out_qsize 0
2021-02-24 00:02:15,886 : INFO : EPOCH 4 - PROGRESS: at 35.06% examples, 319651 words/s, in_qsize 19, out_qsize 0
2021-02-24 00:02:16,894 : INFO : EPOCH 4 - PROGRESS: at 44.05% examples, 323155 words/s, in_qsize 16, out_qsize 1
2021-02-24 00:02:17,899 : INFO : EPOCH 4 - PROGRESS: at 53.54% examples, 327532 words/s, in_qsize 0, out_qsize 1
2021-02-24 00:02:18,903 : INFO : EPOCH 4 - PROGRESS: at 61.88% examples, 324696 words/s, in_qsize 0, out_qsize 0
2021-02-24 00:02:19,903 : INFO : EPOCH 4 - PROGRESS: at 70.03% examples, 322143 words/s, in_qsize 7, out_qsize 0
2021-02-24 00:02:20,909 : INFO : EPOCH 4 - PROGRESS: at 78.64% examples, 321106 words/s, in_qsize 0, out_qsize 2
2021-02-24 00:02:21,926 : INFO : EPOCH 4 - PROGRESS: at 87.02% examples, 319208 words/s, in_qsize 0, out_qsize 0
2021-02-24 00:02:22,932 : INFO : EPOCH 4 - PROGRESS: at 95.56% examples, 318792 words/s, in_qs

2021-02-24 00:02:56,326 : INFO : EPOCH 7 - PROGRESS: at 86.53% examples, 318069 words/s, in_qsize 0, out_qsize 0
2021-02-24 00:02:57,329 : INFO : EPOCH 7 - PROGRESS: at 94.72% examples, 316954 words/s, in_qsize 15, out_qsize 1
2021-02-24 00:02:57,808 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-02-24 00:02:57,815 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-02-24 00:02:57,816 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-02-24 00:02:57,817 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-02-24 00:02:57,820 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-24 00:02:57,821 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-02-24 00:02:57,822 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-02-24 00:02:57,823 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-24 00:02:57,832 : INFO : worker thr

2021-02-24 00:03:31,928 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-02-24 00:03:31,930 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-24 00:03:31,934 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-24 00:03:31,940 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-24 00:03:31,948 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-24 00:03:31,952 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-24 00:03:31,953 : INFO : EPOCH - 10 : training on 15387847 raw words (3697305 effective words) took 11.3s, 327196 effective words/s
2021-02-24 00:03:31,955 : INFO : training on a 153878470 raw words (36974759 effective words) took 116.2s, 318084 effective words/s
2021-02-24 00:03:31,956 : INFO : saving Word2Vec object under dados/experimento_1/w2v_jur.model, separately None
2021-02-24 00:03:31,957 : INFO : not storing attribute vectors_norm
2

In [7]:
ftt_jur = grouper.treinar_fasttext('dados/experimento_'+str(exp)+'/base_treino.txt', exp)

treinando modelo word2vec


2021-02-24 00:03:32,952 : INFO : resetting layer weights
2021-02-24 00:03:37,921 : INFO : collecting all words and their counts
2021-02-24 00:03:37,922 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-24 00:03:37,990 : INFO : PROGRESS: at sentence #10000, processed 235769 words, keeping 13479 word types
2021-02-24 00:03:38,061 : INFO : PROGRESS: at sentence #20000, processed 478506 words, keeping 18827 word types
2021-02-24 00:03:38,133 : INFO : PROGRESS: at sentence #30000, processed 716990 words, keeping 22433 word types
2021-02-24 00:03:38,209 : INFO : PROGRESS: at sentence #40000, processed 956912 words, keeping 25453 word types
2021-02-24 00:03:38,279 : INFO : PROGRESS: at sentence #50000, processed 1198453 words, keeping 27868 word types
2021-02-24 00:03:38,350 : INFO : PROGRESS: at sentence #60000, processed 1434600 words, keeping 30340 word types
2021-02-24 00:03:38,421 : INFO : PROGRESS: at sentence #70000, processed 1678430 words, keeping 324

2021-02-24 00:03:43,717 : INFO : estimated required memory for 32596 words, 138020 buckets and 100 dimensions: 125922344 bytes
2021-02-24 00:03:43,721 : INFO : resetting layer weights
2021-02-24 00:03:47,289 : INFO : training model with 12 workers on 32596 vocabulary and 100 features, using sg=1 hs=1 sample=0.001 negative=5 window=5
2021-02-24 00:04:39,906 : INFO : EPOCH 1 - PROGRESS: at 8.34% words, 17576 words/s, in_qsize -1, out_qsize 1
2021-02-24 00:04:39,907 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-02-24 00:04:39,960 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-02-24 00:04:40,162 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-02-24 00:04:40,427 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-02-24 00:04:41,035 : INFO : EPOCH 1 - PROGRESS: at 41.68% words, 86009 words/s, in_qsize -1, out_qsize 1
2021-02-24 00:04:41,037 : INFO : worker thread finished; awaiting finish of 7 mo

2021-02-24 00:08:25,074 : INFO : not storing attribute vectors_ngrams_norm
2021-02-24 00:08:25,075 : INFO : not storing attribute buckets_word
2021-02-24 00:08:25,076 : INFO : storing np array 'vectors_ngrams_lockf' to dados/experimento_1/ftt_jur.model.trainables.vectors_ngrams_lockf.npy
2021-02-24 00:08:32,801 : INFO : saved dados/experimento_1/ftt_jur.model


In [9]:
glv_jur = grouper.treinar_glove(exp)

2021-02-24 00:12:50,826 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-24 00:12:50,828 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)


treinando modelo glove
mkdir -p build
tokenizando corpus
$ build/vocab_count -min-count 5 -verbose 2 < ../mestrado/experimentos_mestrado/dados/experimento_1/base_treino_glv.txt > ../mestrado/experimentos_mestrado/dados/experimento_1/glove_vocab.txt
criando matriz de coocorrencia
$ build/cooccur -memory 4.0 -vocab-file ../mestrado/experimentos_mestrado/dados/experimento_1/glove_vocab.txt -verbose 2 -window-size 15 < ../mestrado/experimentos_mestrado/dados/experimento_1/base_treino_glv.txt > ../mestrado/experimentos_mestrado/dados/experimento_1/glv_concurrence.bin
$ build/shuffle -memory 4.0 -verbose 2 < ../mestrado/experimentos_mestrado/dados/experimento_1/glv_concurrence.bin > ../mestrado/experimentos_mestrado/dados/experimento_1/glv_concurrence_shuf.bin
$ build/glove -save-file ../mestrado/experimentos_mestrado/dados/experimento_1/glv_jur -threads 8 -input-file ../mestrado/experimentos_mestrado/dados/experimento_1/glv_concurrence_shuf.bin -x-max 10 -iter 15 -vector-size 100 -binary 2 

Merging cooccurrence files: processed 0 lines.100000 lines.200000 lines.300000 lines.400000 lines.500000 lines.600000 lines.700000 lines.800000 lines.900000 lines.1000000 lines.1100000 lines.1200000 lines.1300000 lines.1400000 lines.1500000 lines.1600000 lines.1700000 lines.1800000 lines.1900000 lines.2000000 lines.2100000 lines.2200000 lines.2300000 lines.2400000 lines.2500000 lines.2600000 lines.2700000 lines.2800000 lines.2900000 lines.3000000 lines.3100000 lines.3200000 lines.3300000 lines.3400000 lines.3500000 lines.3600000 lines.3700000 lines.3800000 lines.3900000 lines.4000000 lines.4100000 lines.4200000 lines.4300000 lines.4400000 lines.4500000 lines.4600000 lines.4700000 lines.4800000 lines.4900000 lines.5000000 lines.5100000 lines.5200000 lines.5300000 lines.5400000 lines.5500000 lines.5600000 lines.5700000 lines.5800000 lines.5900000 lines.6000000 lines.6100000 lines.6200000 lines.6300000 lines.6400000 lines.6500000 lines.6600000 lines.6700000 lines.6800000 lines.6900000 lin

2021-02-24 00:16:06,923 : INFO : converting 32432 vectors from dados/experimento_1/glv_jur.txt to C:\Users\CRISTI~1\AppData\Local\Temp\test_word2vec.txt
2021-02-24 00:16:07,006 : INFO : loading projection weights from C:\Users\CRISTI~1\AppData\Local\Temp\test_word2vec.txt
2021-02-24 00:16:09,851 : INFO : loaded (32432, 100) matrix from C:\Users\CRISTI~1\AppData\Local\Temp\test_word2vec.txt
