In [1]:
# Run in terminal or command prompt
# python3 -m spacy download en



import numpy as np
import pandas as pd
import sklearn
import re, nltk, spacy, gensim
import os
import json

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
import gensim
from gensim.models import CoherenceModel

# Plotting tools
!pip install pyLDAvis
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

os.chdir("/content/drive/MyDrive/Grupo MIDAS/LDA Horus")



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
#collection data is being loaded from
collection =  "lda_info"
#documents data is being loaded from
documents = [
  "_id",
  "dc:description:eng",
  "dc:title:eng"
]#read file
f = open("avois_words.txt", "r")
#this is a list of words that are not relevant to the topic
stopwords = f.read().splitlines()
f.close()

In [3]:
# Opening JSON file
f = f=open("lda_info.json","r",encoding='utf-8')

# returns JSON object as 
# a dictionary
data = f.read().splitlines()

data_full=[]
for i in data:
  data_full.append(json.loads(i))
f.close()

In [4]:
data = []
cnt = 0
for document in data_full:
    new_doc = []
    for(key, value) in document.items():
      if(key in documents):
        new_doc.append(key + ":" + str(value))
    data.append(new_doc)

In [5]:
#TOKENIZE WORDS
def tokenize_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
#Tokenizer de Stanford
data_words = list(tokenize_words(data))

print(data_words[:1])

[['numberint', 'dc', 'title', 'eng', 'systematic', 'evaluation', 'of', 'jatropha', 'curcas', 'oil', 'obtention', 'by', 'mechanical', 'and', 'solvent', 'extraction', 'dc', 'description', 'eng', 'current', 'biodiesel', 'production', 'relies', 'mostly', 'on', 'edible', 'oils', 'soybean', 'palm', 'rapeseed', 'which', 'is', 'major', 'drawback', 'for', 'the', 'process', 'as', 'raw', 'materials', 'represent', 'the', 'major', 'part', 'of', 'the', 'final', 'costs', 'of', 'the', 'biofuel', 'this', 'also', 'brings', 'out', 'concerns', 'about', 'food', 'and', 'feed', 'security', 'in', 'the', 'last', 'years', 'jatropha', 'curca', 'oil', 'has', 'obtained', 'attention', 'as', 'an', 'alternative', 'oleochemical', 'feedstock', 'because', 'it', 'is', 'perennial', 'crop', 'able', 'to', 'produce', 'up', 'to', 'gallon', 'per', 'hectare', 'year', 'of', 'non', 'edible', 'oil', 'even', 'though', 'several', 'authors', 'have', 'reported', 'process', 'conditions', 'and', 'yields', 'on', 'variety', 'of', 'extract

In [6]:
# Remove long words

def remove_long_words(words):
  for word in words:
    aux_word=[]
    for character in word:
      if len(character)>3:
          aux_word.append(character)
    yield aux_word


short_words=list(remove_long_words(data_words))

Se podría pintar el histograma de frecuencias

In [7]:
#LEMMATIZATION
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(short_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])



In [8]:
#Create a words dictionary with tokenized, clean and lemmatized elements
dictionary = gensim.corpora.Dictionary(data_lemmatized)

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

#For each document we create an dictionary that show many words and the number of times repeated 
bow_corpus = [dictionary.doc2bow(doc) for doc in data_lemmatized]

We create a tf-idf model object using models.TfidfModel from "bow_corpus" and place it in tfidf, then apply the transformation to the entire corpus and call it corpus_tfidf. Finally, we preview the TF-IDF scores for our first document.

In [9]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.04712022481724221),
 (1, 0.03017857062411565),
 (2, 0.062394614181510016),
 (3, 0.05603770123450342),
 (4, 0.04147885214562807),
 (5, 0.05909957685270562),
 (6, 0.07549608164697481),
 (7, 0.057005479083418464),
 (8, 0.22648824494092443),
 (9, 0.058024215550057816),
 (10, 0.1257867236087987),
 (11, 0.06412151395401403),
 (12, 0.05035297592891162),
 (13, 0.05181416213577583),
 (14, 0.04712022481724221),
 (15, 0.06412151395401403),
 (16, 0.08768345080163738),
 (17, 0.06208262741732341),
 (18, 0.09669861201188158),
 (19, 0.06412151395401403),
 (20, 0.032272668393402815),
 (21, 0.04147885214562807),
 (22, 0.060833708683168175),
 (23, 0.038400707906124425),
 (24, 0.5909957685270562),
 (25, 0.07307398409491288),
 (26, 0.06485184681290412),
 (27, 0.04834930600594079),
 (28, 0.13796966223776375),
 (29, 0.11710885490769656),
 (30, 0.10070595185782324),
 (31, 0.06341774461871613),
 (32, 0.14393576336699937),
 (33, 0.06023823019508788),
 (34, 0.05557137688923049),
 (35, 0.026879139324744502

We are going to train our LDA model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [10]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [11]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

Topic: 0 Word: 0.014*"image" + 0.007*"student" + 0.006*"search" + 0.006*"visual" + 0.006*"representation" + 0.006*"method" + 0.006*"operator" + 0.006*"cluster" + 0.005*"collection" + 0.005*"network"
Topic: 1 Word: 0.006*"system" + 0.006*"software" + 0.005*"project" + 0.005*"framework" + 0.005*"course" + 0.005*"process" + 0.005*"ethanol" + 0.005*"design" + 0.005*"development" + 0.005*"model"
Topic: 2 Word: 0.007*"image" + 0.007*"immune" + 0.006*"software" + 0.006*"architecture" + 0.006*"system" + 0.006*"management" + 0.005*"agent" + 0.005*"network" + 0.005*"research" + 0.005*"model"
Topic: 3 Word: 0.009*"multimodal" + 0.007*"cluster" + 0.007*"operator" + 0.006*"company" + 0.006*"self" + 0.005*"factorization" + 0.005*"fuzzy" + 0.005*"datum" + 0.005*"space" + 0.005*"numberint"
Topic: 4 Word: 0.012*"code" + 0.011*"fault" + 0.009*"source" + 0.008*"emission" + 0.007*"model" + 0.006*"developer" + 0.006*"knowledge" + 0.005*"system" + 0.005*"process" + 0.005*"software"
Topic: 5 Word: 0.007*"clu

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

**Tareas a implementar**

- Usar perplejidad para determinar la cantidad de clusters
- Ver gráfica de frecuencias en palabras
- Visualización dentro de Horus (Tipo cluseters e individuos)
- Utilizar temática para la busqueda (se pueden utilizar medidas como la de cos2) -> Sistema de recuperación de informació basado en conceptos 
- Filtro de grupo
- Filtro por concepto

**Model evaluation**

In [12]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
        coherence_values = []
        model_list = []
        topics=[]
        dictionary_aux=[]
        for num_topics in range(start, limit, step):
            model=gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())
            topics.append(num_topics)
            dictionary_aux.append({"modelo":model,"n_topicos":num_topics,"coherencia":coherencemodel.get_coherence()})

        return dictionary_aux

In [13]:
diccionario_prueba=compute_coherence_values(dictionary=dictionary,corpus=corpus_tfidf,texts=data_lemmatized,limit=35,step=1)

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(E

Are selected the number of topics with higher coherence

In [21]:
topic_index=pd.DataFrame(diccionario_prueba).drop("modelo",axis=1).sort_values(by="coherencia",ascending=False).query("n_topicos<10").head(1).index.to_list()[0]

In [22]:
mejor_modelo=diccionario_prueba[topic_index]["modelo"]

In [23]:
for idx, topic in mejor_modelo.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.007*"fault" + 0.006*"network" + 0.006*"system" + 0.006*"knowledge" + 0.006*"image" + 0.005*"student" + 0.005*"model" + 0.004*"management" + 0.004*"base" + 0.004*"design"
Topic: 1 Word: 0.009*"image" + 0.006*"classification" + 0.005*"network" + 0.005*"system" + 0.005*"agent" + 0.005*"model" + 0.005*"theory" + 0.004*"innovation" + 0.004*"analysis" + 0.004*"feature"
Topic: 2 Word: 0.007*"system" + 0.006*"network" + 0.006*"service" + 0.006*"emission" + 0.005*"model" + 0.004*"cluster" + 0.004*"communication" + 0.004*"mode" + 0.004*"production" + 0.004*"chain"
Topic: 3 Word: 0.007*"image" + 0.006*"design" + 0.006*"student" + 0.005*"network" + 0.005*"learn" + 0.005*"architecture" + 0.005*"model" + 0.005*"structure" + 0.005*"system" + 0.004*"problem"
Topic: 4 Word: 0.009*"cluster" + 0.007*"network" + 0.007*"operator" + 0.006*"software" + 0.005*"architecture" + 0.005*"problem" + 0.005*"model" + 0.005*"column" + 0.005*"evolutionary" + 0.004*"feature"
Topic: 5 Word: 0.008*"image"

In [24]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(mejor_modelo, corpus_tfidf, dictionary)
vis

  default_term_info = default_term_info.sort_values(


In [18]:
jnsajasncsa

NameError: ignored

In [None]:
datos=pd.concat([pd.DataFrame(b),pd.DataFrame(c)],axis=1)
datos.columns=["Topicos","Coherencia"]

In [None]:
print(datos)

In [None]:
b

In [None]:
# Create document word matrix
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words=stopwords,             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)
print(data_vectorized.shape)

In [None]:
#SPARSE DATA
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

In [None]:
#GRID SEARCH TO FIND BEST n_components
#TARDA 11 min maomenos
# Define Search Param
search_params = {'n_components': [4], 
                 'learning_decay': [0.5],
                 'max_iter':[100],            # Max learning iterations
                 'evaluate_every': [20],
                 'batch_size':[128],          # n docs in each learning iter
                 'n_jobs':[-1],               # Use all available CPUs
                 }

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(estimator=lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

In [None]:
#GET BEST MODEL
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))