In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from ast import literal_eval

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
THEMES = [5, 6, 26, 33, 139, 163, 232, 313, 339, 350, 406, 409, 555, 589,
          597, 634, 660, 695, 729, 766, 773, 793, 800, 810, 852, 895, 951, 975]
TRAIN_DATA_PATH = '../train.csv'
TEST_DATA_PATH = '../test.csv'
VALIDATION_DATA_PATH = '../validation.csv'

In [3]:
def get_data(path, preds=None, key=None):
    data = pd.read_csv(path)
    data = data.rename(columns={ 'pages': 'page'})
    data.body = data.body.str.strip('{}"')
    data = groupby_process(data)
    data.themes = data.themes.apply(lambda x: literal_eval(x))
    return data

In [4]:
def groupby_process(df):
    new_df = df.sort_values(['process_id', 'page'])
    new_df = new_df.groupby(
                ['process_id', 'themes'],
                group_keys=False
            ).apply(lambda x: x.body.str.cat(sep=' ')).reset_index()
    new_df = new_df.rename(index=str, columns={0: "body"})
    return new_df

In [5]:
train_data = get_data(TRAIN_DATA_PATH)
test_data = get_data(TEST_DATA_PATH)
validation_data = get_data(VALIDATION_DATA_PATH)

In [6]:
train_data.themes = train_data.themes.apply(lambda x: list(set(sorted([i if i in THEMES else 0 for i in x]))))
test_data.themes = test_data.themes.apply(lambda x: list(set(sorted([i if i in THEMES else 0 for i in x]))))
validation_data.themes = validation_data.themes.apply(lambda x: list(set(sorted([i if i in THEMES else 0 for i in x]))))

In [7]:
len(train_data), len(validation_data), len(test_data)

(31851, 6842, 6839)

In [8]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield([x for x in sentence.split(" ") if len(x) > 1])

train_words = list(sent_to_words(train_data.body.tolist()))


print(train_words[:1])

[['çft', 'manê', 'ado', 'intimação', 'extraído', 'relação', 'das', 'decisões', 'dos', 'despachos', 'publicados', 'diário', 'justiça', 'ordem', 'seqzetáriq', 'judiqário', 'eletn', 'jnicode', 'de', 'março', 'forma', 'abaixo', 'supremo', 'federal', 'tribunal', '__', 'que', 'oficial', 'justiça', 'intime', 'rocurndoria', 'gernl', 'fazenda', 'nacional', 'pessoa', 'procurndora', 'gernl', 'fazenda', 'jacional', 'doutora', 'adriana', 'queiroz', 'carvalho', 'na', 'quem', 'suas', 'vezes', 'fizer', 'corllendereço', 'sas', 'quadrn', 'lotes', 'bloco', 'andar', 'nesta', 'capital', 'inteiro', 'teorda', 'decisão', 'ões', 'do', 'despacho', 'referente', 's', 'processo', 'eletn', 'jnico', 'abaixo', 'relacionado', 'recurso', 'com', 'agravo', 'recurso', 'extraordinário', 'com', 'agravo', 'agra', 'de', 'instrumento', 'recurso', 'recurso', 'recurso', 'recurso', 'extraordinário', 'recurso', 'ário', 'reqjrso', 'com', 'agravo', 'recurso', 'com', 'agravo', 'total', 'dado', 'assado', 'nesta', 'secretapa', 'supremo

In [9]:
len(train_words)

31851

In [10]:
# # Build the bigram and trigram models
# bigram = gensim.models.Phrases(train_words, min_count=50, threshold=100) # higher threshold fewer phrases.
# trigram = gensim.models.Phrases(bigram[train_words], min_count=5, threshold=100)  

# # Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)

# # See trigram example
# print(trigram_mod[bigram_mod[train_words[0]]])

In [11]:
# def make_bigrams(texts):
#     return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [12]:
# train_words = make_bigrams(train_words)
# train_words = make_trigrams(train_words)

In [14]:
# Create Dictionary
# id2word = corpora.Dictionary(train_words)
id2word = corpora.Dictionary.load("dicts/big_dict")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [15]:
def save_dic(dic, filename="dic"):
    with open(filename, "wb") as handle:
        dic.save(handle)

In [16]:
# save_dic(id2word, "big_dict")

In [17]:
len(id2word.dfs), id2word.dfs

(81418,
 {9824: 50,
  318: 4457,
  3766: 6461,
  2638: 5105,
  7557: 2121,
  2894: 15575,
  111: 3824,
  6541: 14614,
  5040: 11730,
  3831: 15882,
  2969: 1859,
  339: 3630,
  7654: 2960,
  1382: 11765,
  3986: 13366,
  8454: 1770,
  5690: 2011,
  1210: 4164,
  622: 13391,
  1340: 12364,
  9014: 64,
  9880: 14155,
  5238: 66,
  7938: 7572,
  412: 2244,
  9814: 1896,
  2307: 15804,
  875: 105,
  5780: 973,
  4160: 4024,
  2148: 12749,
  5306: 10230,
  9331: 12375,
  3119: 233,
  1847: 5262,
  8484: 128,
  1393: 11974,
  5099: 342,
  5781: 4319,
  9057: 542,
  234: 2894,
  5820: 15158,
  2019: 3914,
  4981: 14997,
  6250: 8031,
  4696: 3370,
  5933: 97,
  4268: 14739,
  8844: 11699,
  7954: 14943,
  3149: 8096,
  4264: 7741,
  428: 2368,
  468: 9032,
  1325: 6466,
  4085: 12416,
  6070: 6003,
  392: 2546,
  9193: 6904,
  6054: 9640,
  35: 6169,
  7526: 8563,
  1278: 7596,
  8333: 11001,
  9754: 6328,
  2199: 3228,
  7757: 110,
  2096: 9144,
  4170: 4977,
  6859: 7589,
  9520: 2516,
  58

In [18]:
# from copy import deepcopy

# copy_dict = deepcopy(id2word)
# copy_dict.filter_extremes(no_below=50, no_above=.5, keep_n=None)

In [19]:
# len(copy_dict.dfs), copy_dict.dfs

NameError: name 'copy_dict' is not defined

In [None]:
# id2word = deepcopy(copy_dict)
# del(copy_dict)

In [20]:
[(id2word[x], y) for (x, y) in sorted(id2word.dfs.items(), key=lambda x: x[1], reverse=True)]

[('deverá', 15922),
 ('ela', 15920),
 ('regra', 15920),
 ('brasileiro', 15906),
 ('ARTIGO_543', 15900),
 ('controvérsia', 15900),
 ('tribunais', 15898),
 ('exercício', 15887),
 ('demonstrado', 15886),
 ('secretário', 15884),
 ('fazenda', 15882),
 ('nada', 15866),
 ('âmbito', 15851),
 ('devida', 15845),
 ('eventual', 15834),
 ('atividade', 15830),
 ('serem', 15818),
 ('impossibilidade', 15807),
 ('dado', 15804),
 ('cujo', 15804),
 ('obrigação', 15799),
 ('regional', 15784),
 ('falta', 15763),
 ('contudo', 15761),
 ('inexistência', 15735),
 ('tipo', 15730),
 ('correção', 15730),
 ('procedente', 15723),
 ('realizada', 15718),
 ('recebido', 15705),
 ('tão', 15692),
 ('portaria', 15668),
 ('sim', 15667),
 ('sempre', 15667),
 ('verdade', 15655),
 ('tratar', 15644),
 ('expressamente', 15627),
 ('serão', 15627),
 ('ponto', 15617),
 ('certo', 15594),
 ('verbis', 15593),
 ('contrário', 15593),
 ('diário', 15575),
 ('região', 15567),
 ('restou', 15544),
 ('filho', 15520),
 ('limites', 15520),
 ('

In [21]:
# save_dic(id2word, "big_dict")

In [22]:
train_corpus = [id2word.doc2bow(text) for text in train_words]

In [24]:
from gensim.models import HdpModel

hdp = HdpModel(train_corpus, id2word)

In [26]:
pprint(hdp.print_topics(num_words=10))

[(0,
  '0.005*eletronicamente + 0.004*recebido + 0.003*evento + 0.002*original + '
  '0.002*contribuição + 0.002*fazenda + 0.002*benefício + 0.002*procuradoria + '
  '0.002*usuário + 0.002*página'),
 (1,
  '0.017*jam + 0.009*ento + 0.009*eletronicamente + 0.008*recebido + '
  '0.008*empresa + 0.007*recolhido + 0.007*nâo + 0.007*credito + 0.006*ente + '
  '0.004*ro'),
 (2,
  '0.014*ipi + 0.012*saúde + 0.009*evento + 0.009*icms + 0.008*nfe + '
  '0.007*ltda + 0.006*eletronicamente + 0.005*imposto + 0.005*medicamentos + '
  '0.005*serviços'),
 (3,
  '0.015*servidores + 0.014*gratificação + 0.013*desempenho + 0.009*avaliação '
  '+ 0.008*servidor + 0.008*atividade + 0.007*inativos + 0.005*pontos + '
  '0.005*ativos + 0.005*proventos'),
 (4,
  '0.055*original + 0.037*cópia + 0.037*informe + 0.037*acesse + 0.037*site + '
  '0.022*recebido + 0.019*acessar + 0.018*conferir + 0.018*eletronicamente + '
  '0.010*protocolado'),
 (5,
  '0.011*gerais + 0.011*minas + 0.010*aposentadoria + 0.008*ativi

In [28]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=train_corpus,
                                           id2word=id2word,
                                           num_topics=50, 
                                           random_state=42,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [29]:
pprint(lda_model.print_topics())

[(6,
  '0.085*"ipi" + 0.077*"icms" + 0.046*"ltda" + 0.038*"importação" + '
  '0.034*"imposto" + 0.030*"produtos" + 0.023*"produto" + 0.021*"aduaneiro" + '
  '0.018*"uso" + 0.017*"cumulatividade"'),
 (14,
  '0.104*"paraná" + 0.090*"curitiba" + 0.041*"projudi" + 0.041*"página" + '
  '0.038*"juntada" + 0.036*"ref" + 0.028*"centro" + 0.026*"arq" + '
  '0.022*"movimentação" + 0.019*"londrina"'),
 (29,
  '0.048*"art" + 0.038*"crea" + 0.026*"conselho" + 0.023*"engenharia" + '
  '0.021*"técnica" + 0.020*"taxa" + 0.018*"mútua" + 0.017*"responsabilidade" + '
  '0.017*"pgto" + 0.016*"anotação"'),
 (10,
  '0.299*"cnpj" + 0.028*"créditos" + 0.025*"meses" + 0.024*"receita" + '
  '0.022*"débito" + 0.020*"crédito" + 0.017*"pis" + 0.016*"saldo" + '
  '0.015*"ano" + 0.011*"corrente"'),
 (12,
  '0.067*"folha" + 0.061*"horas" + 0.060*"fgts" + 0.036*"inss" + '
  '0.035*"salário" + 0.026*"extras" + 0.025*"repouso" + 0.022*"férias" + '
  '0.019*"salario" + 0.017*"ferias"'),
 (11,
  '0.082*"saúde" + 0.023*"me

In [30]:
lda_model.save("lda_big_30")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [32]:
del(hdp)
del(train_data)
del(validation_data)
del(test_data)
del(train_words)

In [33]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, train_corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
import os
os.environ.update({'MALLET_HOME':"/home/isis/Davi_Alves/data/parts/topic_modeling/mallet/mallet-2.0.8"})

mallet_path = "/home/isis/Davi_Alves/data/parts/topic_modeling/mallet/mallet-2.0.8/bin/mallet" # update this path

ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=train_corpus, num_topics=10, id2word=id2word, workers=23)

In [None]:
pprint(ldamallet.print_topics())

In [None]:
with open("models/ldamallet_big", "wb") as handle:
    ldamallet.save(handle)