In [1]:
import re
from unicodedata import normalize

import gensim
import spacy
from gensim import corpora
from gensim import models
from nltk import word_tokenize
from nltk.corpus import stopwords

import pandas as pd

In [2]:
nlp = spacy.load('es_core_news_sm')

### Apertura archivo y preprocesamiento

In [3]:
with open("data/nc_ch.txt", "r") as file:
    data = file.read()
    data = data.lower()
    data = data.replace("\n", " ")
    data = re.sub("\d+", "", data)
    data = normalize('NFC', re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1", normalize("NFD", data), 0, re.I))
    data = data.replace("articulo  ", " ")
    data = data.replace("articulo   ", " ")
    data = data.replace("  ", " ")
    data = data.replace("   ", " ")
    data = data.replace("       ", " ")

    data = data.replace("   constitucion politica de la republica constitucion politica de la republica  ", " ")
    data = data.replace("constitucion politica de la republica de chile", " ")
    data = data.replace("capitulo i – principios generales", " ")
    data = data.replace("capitulo i principios y disposiciones generales", " ")
    data = data.replace("capitulo ii – derechos fundamentales y garantias", " ")
    data = data.replace("capitulo ii derechos fundamentales y garantias", " ")
    data = data.replace("capitulo iii – naturaleza y medioambiente", " ")
    data = data.replace("capitulo iii naturaleza y medioambiente", " ")
    data = data.replace("capitulo iv participacion democratica", " ")
    data = data.replace("capitulo v buen gobierno y funcion publica", " ")
    data = data.replace("capitulo v – buen gobierno y funcion publica", " ")
    data = data.replace("capitulo iv – participacion democratica", " ")
    data = data.replace("capitulo vi – estado regional y organizacion territorial", " ")
    data = data.replace("capitulo vi estado regional y organizacion territorial", " ")
    data = data.replace("capitulo vii poder legislativo", " ")
    data = data.replace("capitulo vii poder legislativo congreso de diputadas y diputados", " ")
    data = data.replace(" capitulo vii poder legislativo congreso de diputadas y diputados  ", " ")
    data = data.replace("capitulo vii – poder legislativo", " ")
    data = data.replace("capitulo viii poder ejecutivo", " ")
    data = data.replace("capitulo viii – poder ejecutivo", " ")
    data = data.replace("capitulo ix – sistemas de justicia", " ")
    data = data.replace("capitulo ix sistemas de justicia", " ")
    data = data.replace("capitulo x organos autonomos constitucionales", " ")
    data = data.replace("capitulo x – organos autonomos constitucionales", " ")
    data = data.replace("capitulo xi – reforma y reemplazo de la constitucion", " ")
    data = data.replace("capitulo xi – reforma y reemplazo de la constitucion", " ")
    data = data.replace("capitulo xi reforma y reemplazo de la constitucion reforma constitucional", " ")
    data = data.replace("capitulo xi reforma y reemplazo de la constitucion reforma constitucional  ", " ")
    data = data.translate(str.maketrans(' ', ' ', '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'))
    data = data.replace(". . ", ". ")
    data = " ".join(data.split())


In [4]:
esp_stop = stopwords.words('spanish')
esp_stop = esp_stop + ['ser','tener','deber']
esp_stop.remove('estado')

In [5]:
def lemmatizer(text):
  doc = nlp(text)
  return ' '.join([word.lemma_ for word in doc if word.lemma_ not in esp_stop])

In [6]:
l = []
for i in data.split(". "):
    if i != '':
        l.append(i)

In [7]:
corpus = [word_tokenize(lemmatizer(i)) for i in l]

In [None]:
dict_corpus = corpora.Dictionary(corpus)

dtm_gensim = [dict_corpus.doc2bow(i) for i in corpus]
tfidf = models.TfidfModel(dtm_gensim)  # para transformar a tf-idf
dtm_gensim_tfidf = tfidf[dtm_gensim]  # para transformar a tf-idf

In [9]:
lda_gensim = gensim.models.ldamodel.LdaModel
ldamodel_gensim = lda_gensim(dtm_gensim_tfidf, num_topics=19, id2word=dict_corpus, random_state=0, eval_every=None)

In [10]:
ldamodel_gensim.print_topics(num_words=20)

[(0,
  '0.042*"convencion" + 0.038*"referir" + 0.026*"texto" + 0.023*"legal" + 0.022*"entenderar" + 0.020*"tercero" + 0.020*"abogada" + 0.020*"mencionado" + 0.017*"elegir" + 0.017*"presentar" + 0.017*"financiamiento" + 0.016*"dos" + 0.016*"cuarto" + 0.016*"sucesora" + 0.015*"secretaria" + 0.014*"creado" + 0.014*"inciso" + 0.014*"funcionario" + 0.013*"funcionaria" + 0.012*"proyecto"'),
 (1,
  '0.004*"ejercicio" + 0.003*"constitucion" + 0.003*"ley" + 0.003*"integrante" + 0.002*"determinar" + 0.002*"estado" + 0.002*"indigena" + 0.002*"consejo" + 0.002*"derecho" + 0.002*"asegurar" + 0.002*"persona" + 0.002*"pueblo" + 0.002*"incompatibilidad" + 0.002*"republica" + 0.002*"organo" + 0.002*"dema" + 0.002*"publico" + 0.002*"procedimiento" + 0.002*"podra" + 0.002*"integrado"'),
 (2,
  '0.053*"despacho" + 0.039*"biblioteca" + 0.037*"tendra" + 0.033*"prever" + 0.033*"ingreso" + 0.029*"plazo" + 0.023*"legislativo" + 0.018*"normativa" + 0.018*"poder" + 0.018*"propio" + 0.014*"dos" + 0.014*"legal" + 

In [11]:
df_topic_term = pd.DataFrame(ldamodel_gensim.get_topics(), columns=ldamodel_gensim.id2word.values(), index=[f'topic{i}' for i in range(ldamodel_gensim.num_topics)])
df_topic_term

Unnamed: 0,acordado,chile,conformado,constitucion,democratico,diverso,libremente,nación,nosotra,otorgar,...,constituyo,mayo,convencional,ejemplar,s.e,autentica,impreso,numerada,reservar,santiago
topic0,0.000182,0.006212,0.000297,0.010922,0.000158,0.000388,0.000182,0.000254,0.000207,0.000163,...,6.7e-05,6.7e-05,6.6e-05,6.6e-05,6.6e-05,6.6e-05,6.6e-05,6.6e-05,6.6e-05,6.6e-05
topic1,0.000185,0.000484,0.000185,0.003235,0.000353,0.000418,0.000185,0.00107,0.000185,0.000185,...,0.000185,0.000185,0.000185,0.000185,0.000185,0.000185,0.000185,0.000185,0.000185,0.000185
topic2,0.000101,0.000101,0.000337,0.010721,0.000101,0.000305,0.000101,0.000101,0.000101,0.000101,...,0.000101,0.000101,0.000101,0.000101,0.000101,0.000101,0.000101,0.000101,0.000102,0.000101
topic3,6.9e-05,0.000435,0.000162,0.008869,0.00021,0.02955,0.000173,6.9e-05,6.9e-05,0.000345,...,6.9e-05,6.9e-05,6.9e-05,6.9e-05,6.9e-05,6.9e-05,6.9e-05,6.9e-05,6.9e-05,6.9e-05
topic4,0.000205,0.000384,0.000205,0.001137,0.000205,0.000709,0.000205,0.001283,0.000205,0.000315,...,0.000205,0.000205,0.000205,0.000205,0.000205,0.000205,0.000205,0.000205,0.000205,0.000205
topic5,9.9e-05,0.000259,9.9e-05,0.005499,0.000281,0.000335,0.000286,9.9e-05,9.9e-05,9.9e-05,...,9.9e-05,9.9e-05,9.9e-05,9.9e-05,9.9e-05,9.9e-05,9.9e-05,9.9e-05,9.9e-05,9.9e-05
topic6,0.000314,0.000687,0.000103,0.001432,0.000246,0.000192,0.000103,0.000661,0.000103,0.000103,...,0.000103,0.000103,0.000103,0.000103,0.000103,0.000105,0.000105,0.000105,0.000103,0.000103
topic7,0.000134,0.007367,0.000454,0.001173,0.000134,0.000474,0.000134,0.001237,0.000134,0.000134,...,0.000134,0.000134,0.000134,0.000134,0.000134,0.000134,0.000134,0.000134,0.000134,0.000134
topic8,0.00019,0.000659,0.000661,0.001587,0.000469,0.000561,0.00019,0.000733,0.00019,0.000656,...,0.00019,0.00019,0.00019,0.00019,0.00019,0.00019,0.00019,0.00019,0.00019,0.00019
topic9,0.000114,0.000429,0.000114,0.003477,0.000114,0.000294,0.000114,0.000114,0.000114,0.000114,...,0.000114,0.000114,0.000114,0.000114,0.000114,0.000114,0.000114,0.000114,0.000114,0.000114


In [12]:
df_topic_term.loc['topic1'].sort_values(ascending=False)

ejercicio             0.003889
constitucion          0.003235
ley                   0.002773
integrante            0.002685
determinar            0.002366
                        ...   
independiente         0.000185
relevante             0.000185
transdisciplinario    0.000185
ecosistema            0.000185
santiago              0.000185
Name: topic1, Length: 3438, dtype: float32