In [10]:
import re
from unicodedata import normalize

import gensim
import spacy
from gensim import corpora
from gensim import models
from nltk import word_tokenize
from nltk.corpus import stopwords

import pandas as pd

In [11]:
# Load the spaCy pipeline 'es_core_news_sm'; `nlp` is the callable that the
# lemmatizer cell later applies to each article.
nlp = spacy.load('es_core_news_sm')

### Apertura del archivo y preprocesamiento

In [73]:
# Read the whole corpus first; the cleaning below runs after the `with` so the
# file handle is released as soon as the text is in memory.
# NOTE(review): the encoding is now explicit instead of locale-dependent;
# UTF-8 is assumed for this file — confirm.
with open("data/nc_ch.txt", "r", encoding="utf-8") as file:
    data = file.read()

# Flatten to a single line, drop every run of digits, and lowercase.
data = data.replace("\n", " ")
data = re.sub(r"\d+", "", data)  # raw string: "\d" is an invalid escape otherwise
data = data.lower()

# One list entry per article. The split happens after lowercasing, so the
# marker is matched regardless of how it is capitalized in the file.
list_art = data.split("--articulo--")

In [74]:
# One row per raw article text, in a single column named "article".
df_art = pd.DataFrame({"article": list_art})

In [89]:
import string

def preprocess(df: pd.DataFrame, min_words: int = 3) -> pd.DataFrame:
  """Clean the ``article`` column and drop articles that are too short.

  Each article goes through these steps, in order:

  1. ASCII punctuation (``string.punctuation``) is deleted.
  2. Every run of whitespace, newlines included, collapses to one space and
     the ends are trimmed.
  3. Diacritics are stripped while the tilde of ñ/Ñ is kept. The text is
     decomposed to NFD for the stripping and recomposed to NFC afterwards, so
     "ñ" comes back as a single code point instead of "n" + U+0303.

  The input frame is left untouched; all work happens on a copy.

  Args:
    df: Frame with a string column named ``article``.
    min_words: Minimum number of whitespace-separated words an article must
      have after cleaning to be kept. The default of 3 is equivalent to the
      former hard-coded ``> 2`` rule.

  Returns:
    A new DataFrame with the cleaned ``article`` column, containing only the
    rows that pass the length filter, re-indexed from 0.
  """
  drop_punctuation = str.maketrans('', '', string.punctuation)
  # A base character followed by its combining marks; an n/N whose only mark
  # is U+0303 (combining tilde) is skipped so that ñ survives.
  accents = re.compile(
      r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
      flags=re.I,
  )

  def strip_accents(text):
    # Decompose, remove the marks, then recompose what is left (i.e. ñ).
    return normalize("NFC", accents.sub(r"\1", normalize("NFD", text)))

  # Copy first: assigning into `df` directly would silently rewrite the
  # caller's frame and make re-running the calling cell non-idempotent.
  articles = df.copy()
  cleaned = (
      articles['article']
      .str.translate(drop_punctuation)
      # split()/join collapses spaces, tabs and newlines and trims both ends.
      .apply(lambda text: strip_accents(" ".join(text.split())))
  )
  articles['article'] = cleaned

  # Boolean array (not a plain list) so an empty frame is still treated as a
  # row mask rather than as an empty column selection.
  keep = pd.Series(
      [len(str(text).split()) >= min_words for text in cleaned],
      dtype=bool,
  ).to_numpy()
  return articles.loc[keep].reset_index(drop=True)

In [93]:
# Cleaned articles: punctuation and accents removed, very short entries dropped.
df_art_p = preprocess(df_art)

In [94]:
# NLTK's Spanish stopword list with 'estado' taken out, so that word is NOT
# filtered from the articles (presumably because it is a meaningful noun in
# this corpus — confirm). Filtering instead of list.remove() means the cell
# does not raise if 'estado' is ever missing from the NLTK list.
esp_stop = [word for word in stopwords.words('spanish') if word != 'estado']

In [95]:
def lemmatizer(text):
  """Return the lemmas of ``text`` joined by single spaces, minus stopwords.

  The text is run through the spaCy pipeline ``nlp``; a token is dropped when
  its lemma appears in ``esp_stop``.
  """
  kept_lemmas = []
  for token in nlp(text):
    lemma = token.lemma_
    if lemma in esp_stop:
      continue
    kept_lemmas.append(lemma)
  return ' '.join(kept_lemmas)

In [96]:
# One token list per article: lemmatize the cleaned text, then split the
# resulting lemma string with NLTK's word tokenizer.
corpus = [word_tokenize(lemmas) for lemmas in map(lemmatizer, df_art_p["article"])]

In [99]:
# Vocabulary built from the tokenized articles.
dict_corpus = corpora.Dictionary(corpus)

# Bag-of-words vector for every article, then the same vectors re-weighted
# by a TF-IDF model fitted on them.
dtm_gensim = list(map(dict_corpus.doc2bow, corpus))
dtm_gensim_tfidf = models.TfidfModel(corpus=dtm_gensim)[dtm_gensim]

In [100]:
# Short alias for the gensim LDA class.
lda_gensim = gensim.models.ldamodel.LdaModel

# NOTE(review): the model is fitted on the TF-IDF-weighted vectors rather than
# on the raw bag-of-words counts — confirm this is intended.
ldamodel_gensim = lda_gensim(
    corpus = dtm_gensim_tfidf,
    id2word = dict_corpus,
    num_topics = 50,
    passes = 100,
    alpha = 'auto',
    eval_every = None,
    random_state = 0  # fixed seed so repeated runs give the same topics
)

In [104]:
# Show 30 of the fitted topics, each described by its 15 highest-weighted terms.
ldamodel_gensim.print_topics(num_topics = 30, num_words = 15)

[(12,
  '0.005*"sobreviniente" + 0.005*"incapacidad" + 0.004*"remocion" + 0.003*"merecer" + 0.003*"setento" + 0.003*"aflictivo" + 0.003*"constatado" + 0.003*"aceptar" + 0.003*"mental" + 0.003*"condena" + 0.003*"cesarar" + 0.003*"renunciar" + 0.002*"renuncia" + 0.002*"delito" + 0.000*"integrar"'),
 (10,
  '0.004*"devolucion" + 0.004*"modificada" + 0.004*"derogada" + 0.004*"enviara" + 0.004*"continuar" + 0.004*"terminado" + 0.003*"despachado" + 0.002*"tratar" + 0.002*"aprobado" + 0.002*"momento" + 0.002*"presidencia" + 0.002*"efecto" + 0.000*"votacion" + 0.000*"tramitacion" + 0.000*"miembro"'),
 (39,
  '0.006*"corrupcion" + 0.004*"perseguir" + 0.003*"atentar" + 0.003*"sector" + 0.003*"contrario" + 0.003*"sancion" + 0.002*"democratico" + 0.001*"estudio" + 0.000*"sanción" + 0.000*"competente" + 0.000*"coordinar" + 0.000*"aplicacion" + 0.000*"velarar" + 0.000*"penitenciario" + 0.000*"pena"'),
 (17,
  '0.008*"compuesto" + 0.005*"orientación" + 0.005*"sexoafectiva" + 0.004*"afirmativo" + 0.00

In [102]:
# Topic-term matrix as a labelled frame: one row per topic ("topic0",
# "topic1", ...) and one column per vocabulary term.
df_topic_term = pd.DataFrame(
    data = ldamodel_gensim.get_topics(),
    index = [f'topic{i}' for i in range(ldamodel_gensim.num_topics)],
    columns = ldamodel_gensim.id2word.values(),
)
df_topic_term

Unnamed: 0,actividad,asegurar,bien,chile,colectivo,condición,constituir,cultural,deber,democracia,...,quincuagesima,quincuagesimo,rebajar,recaer,reclamacion,reconstruccion,rever,segunir,tipificado,transitarar
topic0,0.000231,0.000231,0.000231,0.000231,0.000231,0.000231,0.000231,0.000231,0.000231,0.000231,...,0.000231,0.000231,0.000231,0.000231,0.000231,0.000231,0.000231,0.000231,0.000231,0.000231
topic1,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.011262,...,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133,0.000133
topic2,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262,...,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262
topic3,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.000353,...,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026
topic4,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,...,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179,0.000179
topic5,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,...,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223,0.000223
topic6,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,...,0.000958,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05
topic7,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,...,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247,0.000247
topic8,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,...,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218,0.000218
topic9,0.000254,0.000254,0.000254,0.000254,0.000254,0.000254,0.000254,0.000254,0.000254,0.000254,...,0.000254,0.000254,0.000254,0.000254,0.000254,0.000254,0.000254,0.000254,0.000254,0.000254


In [105]:
# Terms of topic1 ranked by their weight in that topic, highest first.
df_topic_term.loc['topic1'].sort_values(ascending=False)

financiero      0.016182
adecuado        0.015892
menos           0.015625
incluir         0.013468
coordinacion    0.013318
                  ...   
llevar          0.000133
monto           0.000133
posesion        0.000133
precio          0.000133
transitarar     0.000133
Name: topic1, Length: 3448, dtype: float32