### Importação dos metadados

In [1]:
from src import grouper
import pandas as pd
import nltk
%load_ext autoreload
%autoreload 2
df = pd.read_csv("dados/corpus_tratado/metadados.csv")

### Filtrando os ids por assuntos

In [2]:
corte = 50
df_validos = pd.DataFrame(df.groupby("Assunto").size()).reset_index()
df_validos.columns = ["assunto", "quant"]
df_validos = df_validos.loc[df_validos.quant >= corte]
display(df_validos.quant.sum())
display(df_validos.assunto.nunique())
documentos_validos = df[df.Assunto.isin(df_validos.assunto)][["id", "Assunto"]].reset_index().drop('index', axis = 1)
documentos_validos

35027

125

Unnamed: 0,id,Assunto
0,1500075-43.2019.8.26.0569,Tráfico de Drogas e Condutas Afins
1,1019468-32.2019.8.26.0562,Alienação Fiduciária
2,0000043-68.2019.8.26.0616,Furto
3,1007259-02.2018.8.26.0292,DIREITO PREVIDENCIÁRIO
4,1000425-53.2019.8.26.0128,Rural (Art. 48/51)
...,...,...
35022,0001233-88.2018.8.26.0426,Cheque
35023,1514316-66.2019.8.26.0037,Impostos
35024,1008737-24.2019.8.26.0320,Indenização por Dano Material
35025,0000206-22.2019.8.26.0075,IPTU/ Imposto Predial e Territorial Urbano


### Separando grupos de treino e teste de maneira que as distribuições de assuntos permaneçam similares

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
X = documentos_validos.id
y = documentos_validos.Assunto
caminho_corpus = "dados/corpus_tratado/"

#index[0] são os indices de treino, e index[1] são os de teste
#i é o código do experimento
for i, index in enumerate(sss.split(X, y)):    
    exp = i+1
    X_treino, X_teste = X[index[0]], X[index[1]]
    y_treino, y_teste = y[index[0]], y[index[1]]    

### Instanciando o corpus do conjunto de treinamento

In [4]:
from tqdm.notebook import tqdm
import os
diretorio = "dados/corpus_tratado/"
X_treino = pd.DataFrame(X_treino)
X_treino['id'] = X_treino.id + '.txt'

print("criando base de treino para o experimento "+str(exp))
if not os.path.exists('dados/experimento_'+str(exp)):
    os.makedirs('dados/experimento_'+str(exp))

#a base de treino para o word2vec e fasttext deve ter uma frase por linha
#a base de treino para o glove deve ter um documento por linha
base_treino = open('dados/experimento_'+str(exp)+'/base_treino.txt', 'w+', encoding='utf8')
base_treino_glv = open('dados/experimento_'+str(exp)+'/base_treino_glv.txt', 'w+', encoding='utf8')
tokens = 0
for documento in tqdm(X_treino.id.values):
    doc = open(diretorio + documento, 'r', encoding='utf8')
    for frase in doc:
        base_treino.write(frase)
        tokens += len(frase.split(" "))
    doc.close()
    
    doc = open(diretorio + documento, 'r', encoding='utf8')
    teor_completo = doc.read().replace('\n', '')
    base_treino_glv.write(teor_completo + '\n')
    
base_treino.close()
base_treino_glv.close()
print(str(tokens)+ " tokens copiados com sucesso")
print("preparando documentos para extração do vocabulário:")
X_treino['teores'] = [grouper.recuperar_teor(x, diretorio) for x in tqdm(X_treino.id)]
X_treino['assunto'] = y_treino
X_treino.shape

criando base de treino para o experimento 1


HBox(children=(IntProgress(value=0, max=28021), HTML(value='')))


16042036 tokens copiados com sucesso
preparando documentos para extração do vocabulário:


HBox(children=(IntProgress(value=0, max=28021), HTML(value='')))




(28021, 3)

### Vocabulário

In [5]:
freq_min = 100
stopwords = nltk.corpus.stopwords.words('portuguese')
vocab = grouper.extrair_vocabulario(X_treino, freq_min, stopwords)

extraindo termos com base no ICA


100%|██████████| 125/125 [00:00<00:00, 321.23it/s]


-processando strings do corpus
-treinando vetorizador
-ICA processado
extraindo termos com base na frequência - geralmente leva menos de 4 minutos
extraindo termos do tesauro
extração de vocabulário concluída!


### Treinamento dos modelos

In [6]:
w2v_jur = grouper.treinar_word2vec('dados/experimento_'+str(exp)+'/base_treino.txt', exp)

treinando modelo word2vec


2021-02-23 20:17:49,660 : INFO : collecting all words and their counts
2021-02-23 20:17:49,666 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-23 20:17:49,733 : INFO : PROGRESS: at sentence #10000, processed 235769 words, keeping 13479 word types
2021-02-23 20:17:49,803 : INFO : PROGRESS: at sentence #20000, processed 478506 words, keeping 18827 word types
2021-02-23 20:17:49,870 : INFO : PROGRESS: at sentence #30000, processed 716990 words, keeping 22433 word types
2021-02-23 20:17:49,938 : INFO : PROGRESS: at sentence #40000, processed 956912 words, keeping 25453 word types
2021-02-23 20:17:50,005 : INFO : PROGRESS: at sentence #50000, processed 1198453 words, keeping 27868 word types
2021-02-23 20:17:50,074 : INFO : PROGRESS: at sentence #60000, processed 1434600 words, keeping 30340 word types
2021-02-23 20:17:50,143 : INFO : PROGRESS: at sentence #70000, processed 1678430 words, keeping 32461 word types
2021-02-23 20:17:50,212 : INFO : PROGRESS: 

2021-02-23 20:17:55,077 : INFO : estimated required memory for 32596 words and 100 dimensions: 61932400 bytes
2021-02-23 20:17:55,078 : INFO : resetting layer weights
2021-02-23 20:17:55,405 : INFO : training model with 12 workers on 32596 vocabulary and 100 features, using sg=1 hs=1 sample=1e-05 negative=5 window=5
2021-02-23 20:17:56,415 : INFO : EPOCH 1 - PROGRESS: at 9.26% examples, 335491 words/s, in_qsize 0, out_qsize 0
2021-02-23 20:17:57,421 : INFO : EPOCH 1 - PROGRESS: at 18.15% examples, 329441 words/s, in_qsize 0, out_qsize 1
2021-02-23 20:17:58,439 : INFO : EPOCH 1 - PROGRESS: at 25.93% examples, 315058 words/s, in_qsize 5, out_qsize 1
2021-02-23 20:17:59,445 : INFO : EPOCH 1 - PROGRESS: at 34.48% examples, 314471 words/s, in_qsize 1, out_qsize 1
2021-02-23 20:18:00,453 : INFO : EPOCH 1 - PROGRESS: at 42.64% examples, 312703 words/s, in_qsize 0, out_qsize 0
2021-02-23 20:18:01,463 : INFO : EPOCH 1 - PROGRESS: at 50.89% examples, 311287 words/s, in_qsize 7, out_qsize 0
2021-

2021-02-23 20:18:35,825 : INFO : EPOCH 4 - PROGRESS: at 32.25% examples, 295302 words/s, in_qsize 14, out_qsize 2
2021-02-23 20:18:36,837 : INFO : EPOCH 4 - PROGRESS: at 41.19% examples, 302191 words/s, in_qsize 0, out_qsize 1
2021-02-23 20:18:37,862 : INFO : EPOCH 4 - PROGRESS: at 49.45% examples, 302112 words/s, in_qsize 23, out_qsize 0
2021-02-23 20:18:38,871 : INFO : EPOCH 4 - PROGRESS: at 58.72% examples, 307469 words/s, in_qsize 1, out_qsize 0
2021-02-23 20:18:39,885 : INFO : EPOCH 4 - PROGRESS: at 67.64% examples, 310138 words/s, in_qsize 0, out_qsize 1
2021-02-23 20:18:40,899 : INFO : EPOCH 4 - PROGRESS: at 76.53% examples, 311657 words/s, in_qsize 0, out_qsize 4
2021-02-23 20:18:41,906 : INFO : EPOCH 4 - PROGRESS: at 85.59% examples, 313313 words/s, in_qsize 0, out_qsize 0
2021-02-23 20:18:42,967 : INFO : EPOCH 4 - PROGRESS: at 94.40% examples, 312761 words/s, in_qsize 14, out_qsize 1
2021-02-23 20:18:43,486 : INFO : worker thread finished; awaiting finish of 11 more threads
2

2021-02-23 20:19:18,119 : INFO : EPOCH 7 - PROGRESS: at 97.89% examples, 325988 words/s, in_qsize 0, out_qsize 0
2021-02-23 20:19:18,303 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-02-23 20:19:18,319 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-02-23 20:19:18,328 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-02-23 20:19:18,329 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-02-23 20:19:18,331 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-23 20:19:18,341 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-02-23 20:19:18,343 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-02-23 20:19:18,354 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-23 20:19:18,357 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-23 20:19:18,359 : INFO : worker thread finished; awaiting 

2021-02-23 20:19:52,536 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-23 20:19:52,543 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-23 20:19:52,550 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-23 20:19:52,554 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-23 20:19:52,556 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-23 20:19:52,556 : INFO : EPOCH - 10 : training on 15387847 raw words (3698278 effective words) took 11.3s, 326325 effective words/s
2021-02-23 20:19:52,558 : INFO : training on a 153878470 raw words (36976985 effective words) took 117.2s, 315635 effective words/s
2021-02-23 20:19:52,559 : INFO : saving Word2Vec object under dados/experimento_1/w2v_jur.model, separately None
2021-02-23 20:19:52,560 : INFO : not storing attribute vectors_norm
2021-02-23 20:19:52,561 : INFO : not storing attribute cum_table
2021-02-23 20:19:53,631 : I

In [7]:
ftt_jur = grouper.treinar_fasttext('dados/experimento_'+str(exp)+'/base_treino.txt', exp)

treinando modelo word2vec


2021-02-23 20:19:53,710 : INFO : resetting layer weights
2021-02-23 20:19:58,709 : INFO : collecting all words and their counts
2021-02-23 20:19:58,710 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-23 20:19:58,776 : INFO : PROGRESS: at sentence #10000, processed 235769 words, keeping 13479 word types
2021-02-23 20:19:58,848 : INFO : PROGRESS: at sentence #20000, processed 478506 words, keeping 18827 word types
2021-02-23 20:19:58,917 : INFO : PROGRESS: at sentence #30000, processed 716990 words, keeping 22433 word types
2021-02-23 20:19:58,986 : INFO : PROGRESS: at sentence #40000, processed 956912 words, keeping 25453 word types
2021-02-23 20:19:59,054 : INFO : PROGRESS: at sentence #50000, processed 1198453 words, keeping 27868 word types
2021-02-23 20:19:59,125 : INFO : PROGRESS: at sentence #60000, processed 1434600 words, keeping 30340 word types
2021-02-23 20:19:59,194 : INFO : PROGRESS: at sentence #70000, processed 1678430 words, keeping 324

2021-02-23 20:20:04,192 : INFO : estimated required memory for 32596 words, 138020 buckets and 100 dimensions: 125922344 bytes
2021-02-23 20:20:04,197 : INFO : resetting layer weights
2021-02-23 20:20:07,706 : INFO : training model with 12 workers on 32596 vocabulary and 100 features, using sg=1 hs=1 sample=0.001 negative=5 window=5
2021-02-23 20:21:01,488 : INFO : EPOCH 1 - PROGRESS: at 8.33% words, 17192 words/s, in_qsize -1, out_qsize 1
2021-02-23 20:21:01,490 : INFO : worker thread finished; awaiting finish of 11 more threads
2021-02-23 20:21:01,648 : INFO : worker thread finished; awaiting finish of 10 more threads
2021-02-23 20:21:01,891 : INFO : worker thread finished; awaiting finish of 9 more threads
2021-02-23 20:21:02,085 : INFO : worker thread finished; awaiting finish of 8 more threads
2021-02-23 20:21:02,162 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-23 20:21:02,235 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-02-

2021-02-23 20:24:49,400 : INFO : saving FastText object under dados/experimento_1/ftt_jur.model, separately None
2021-02-23 20:24:49,402 : INFO : storing np array 'vectors_ngrams' to dados/experimento_1/ftt_jur.model.wv.vectors_ngrams.npy
2021-02-23 20:24:56,570 : INFO : not storing attribute vectors_norm
2021-02-23 20:24:56,571 : INFO : not storing attribute vectors_vocab_norm
2021-02-23 20:24:56,572 : INFO : not storing attribute vectors_ngrams_norm
2021-02-23 20:24:56,573 : INFO : not storing attribute buckets_word
2021-02-23 20:24:56,573 : INFO : storing np array 'vectors_ngrams_lockf' to dados/experimento_1/ftt_jur.model.trainables.vectors_ngrams_lockf.npy
2021-02-23 20:25:04,309 : INFO : saved dados/experimento_1/ftt_jur.model


In [50]:
import subprocess
shellscript = subprocess.Popen(["src/glove.sh"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True, text=True)
shellscript.wait()
returncode = shellscript.returncode

In [52]:
stdout

''

In [51]:
returncode

1

In [77]:
corpus="D:\\projetos\\mestrado\\experimentos_mestrado\\dados\\experimento_"+str(exp)+"\\base_treino_glv.txt"
vocab_file="D:\\projetos\\mestrado\\experimentos_mestrado\\dados\\experimento_"+str(exp)+"\\glove_vocab.txt"
coocurrence_file="D:\\projetos\\mestrado\\experimentos_mestrado\\dados\\experimento_"+str(exp)+"\\glv_concurrence.bin"
coocurrence_shuf_file="D:\\projetos\\mestrado\\experimentos_mestrado\\dados\\experimento_"+str(exp)+"\\glv_concurrence_shuf.bin"
bulddir="D:\\projetos\\gl\\GloVe-master\\GloVe-master\\build\\"
save_file="D:\\projetos\\mestrado\\experimentos_mestrado\\dados\\experimento_"+str(exp)+"\\glv_jur.model"
verbose=2
memory=4.0
vocab_min_count=5
vector_size=100
max_iter=15
window_size=15
binary=2
num_threads=8
x_max=10
shsc_vocabulario = "echo $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"

In [89]:
print("echo "+ bulddir+"vocab_count.exe "+"-min-count "+str(vocab_min_count)+" -verbose "+str(verbose)+" "+corpus+" "+vocab_file)

echo D:\projetos\gl\GloVe-master\GloVe-master\build\vocab_count.exe -min-count 5 -verbose 2 D:\projetos\mestrado\experimentos_mestrado\dados\experimento_1\base_treino_glv.txt D:\projetos\mestrado\experimentos_mestrado\dados\experimento_1\glove_vocab.txt


In [109]:
useless_cat_call = subprocess.Popen(["bash", "src/glove.sh", corpus], 
                                    stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True)
output, errors = useless_cat_call.communicate()
useless_cat_call.wait()
print(output)
print(errors)

mkdir -p build

$ build/vocab_count -min-count 5 -verbose 2 < text8 > vocab.txt

src/glove.sh: line 29: text8: No such file or directory



In [84]:
useless_cat_call = subprocess.Popen(["bash src/glove.sh"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True,shell = True)
output, errors = useless_cat_call.communicate(input="Hello from the other side!")
useless_cat_call.wait()
print(output)
print(errors)


O sistema não pode encontrar o caminho especificado.



In [66]:
useless_cat_call = subprocess.run(["cat"], stdout=subprocess.PIPE, text=True, input="Hello from the other side", shell=True)
print(useless_cat_call.stdout)  # Hello from the other side


