# Trabajando con Gensim - LDA

## Preparación de documentos

In [1]:
# Five short sample documents that make up the toy corpus.
doc_complete = [
    "Sugar is bad to consume. My sister likes to have sugar, but not my father.",
    "My father spends a lot of time driving my sister around to dance practice.",
    "Doctors suggest that driving may cause increased stress and blood pressure.",
    "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.",
    "Health experts say that Sugar is not good for your lifestyle.",
]
# Keep the individual document names bound as well.
doc1, doc2, doc3, doc4, doc5 = doc_complete


## Pre-procesamiento

In [2]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
# English stop-word list from NLTK, held as a set for fast membership tests.
stop = set(stopwords.words('english'))
# Set of the punctuation characters that clean() strips from the text.
exclude = set(string.punctuation) 
# Shared WordNet lemmatizer instance used by clean().
lemma = WordNetLemmatizer()
def clean(doc):
    """Normalize one raw document into a space-separated string of lemmas.

    Each whitespace-separated token is lower-cased, filtered against the
    stop-word set, stripped of punctuation characters and lemmatized.

    Stop words are checked both before and after punctuation removal, so a
    stop word glued to punctuation (e.g. ``"to,"`` or ``"(the"``) is dropped
    too, while a contraction that the stop list contains verbatim is still
    matched in its original form.

    Relies on the module-level ``stop``, ``exclude`` and ``lemma`` objects.

    Args:
        doc: Raw document text.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    kept = []
    for token in doc.lower().split():
        if token in stop:
            continue
        bare = ''.join(ch for ch in token if ch not in exclude)
        # Skip tokens that were pure punctuation, or that only turn out to be
        # stop words once the surrounding punctuation is gone.
        if not bare or bare in stop:
            continue
        kept.append(lemma.lemmatize(bare))
    return " ".join(kept)

# Clean every document and split the result into a list of tokens.
doc_clean = [clean(doc).split() for doc in doc_complete]  
# Bare expression: shows the tokenized corpus as the notebook cell output.
doc_clean

[[u'sugar', u'bad', u'consume', u'sister', u'like', u'sugar', u'father'],
 ['father',
  'spends',
  'lot',
  'time',
  'driving',
  'sister',
  'around',
  'dance',
  'practice'],
 [u'doctor',
  u'suggest',
  u'driving',
  u'may',
  u'cause',
  u'increased',
  u'stress',
  u'blood',
  u'pressure'],
 ['sometimes',
  'feel',
  'pressure',
  'perform',
  'well',
  'school',
  'father',
  'never',
  'seems',
  'drive',
  'sister',
  'better'],
 [u'health', u'expert', u'say', u'sugar', u'good', u'lifestyle']]

## Matriz término-documento

In [5]:
import gensim
from gensim import corpora

# Build the term dictionary of the corpus; every unique token gets an integer id.
dictionary = corpora.Dictionary(doc_clean)

# Turn each tokenized document into its bag-of-words vector using the dictionary.
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))
print(dictionary)
doc_term_matrix

Dictionary(35 unique tokens: [u'expert', u'consume', u'dance', u'seems', u'say']...)


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)],
 [(2, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(8, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1)],
 [(2, 1),
  (4, 1),
  (18, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(5, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]]

## Ejecutar modelo LDA

In [6]:
# Alias for gensim's LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train a 3-topic LDA model on the document-term matrix (50 passes).
ldamodel = Lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=3,
    passes=50,
)

In [7]:
# Print the 3 topics, each described by its 3 top-weighted words.
print(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, u'0.029*"sister" + 0.029*"father" + 0.029*"driving"'), (1, u'0.099*"sugar" + 0.040*"suggest" + 0.040*"blood"'), (2, u'0.072*"father" + 0.072*"sister" + 0.041*"pressure"')]


# Caso de estudio de corpus de noticias

## Leer el corpus y extraer características

In [10]:
import pickle
import gensim
from sklearn.feature_extraction.text import CountVectorizer

# Load the list of documents.
# NOTE: pickle can execute arbitrary code while loading; only open a
# 'newsgroups' file that comes from a trusted source.
with open('newsgroups', 'rb') as f:
    newsgroup_data = pickle.load(f)

# Use CountVectorizer to keep tokens of at least three word characters and
# drop English stop words. Rare and overly common tokens are filtered out:
#   min_df=20  -> ignore tokens that appear in fewer than 20 documents
#   max_df=0.2 -> ignore tokens that appear in more than 20% of the documents
vect = CountVectorizer(min_df=20, max_df=0.2, stop_words='english',
                       token_pattern='(?u)\\b\\w\\w\\w+\\b')
# Fit the vocabulary and build the document-term count matrix.
X = vect.fit_transform(newsgroup_data)

# Convert the sparse matrix to a gensim corpus (rows of X are the documents).
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Map word id -> word (vocabulary_ maps word -> id); passed to LdaModel as id2word.
id_map = {word_id: word for word, word_id in vect.vocabulary_.items()}
print(X.shape)

(2000, 901)


## Modelo LDA

In [12]:
# Estimate the LDA parameters on the corpus with gensim's LdaModel and keep
# the fitted model in the variable `ldamodel`.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id_map,
    num_topics=10,
    passes=25,
    random_state=34,
)

In [13]:
def lda_topics():
    """Return the 10 topics of `ldamodel`, each with its 10 most significant words."""
    return ldamodel.print_topics(num_words=10, num_topics=10)


lda_topics()

[(0,
  '0.056*"edu" + 0.043*"com" + 0.033*"thanks" + 0.022*"mail" + 0.021*"know" + 0.020*"does" + 0.014*"info" + 0.012*"monitor" + 0.010*"looking" + 0.010*"don"'),
 (1,
  '0.024*"ground" + 0.018*"current" + 0.018*"just" + 0.013*"want" + 0.013*"use" + 0.011*"using" + 0.011*"used" + 0.010*"power" + 0.010*"speed" + 0.010*"output"'),
 (2,
  '0.061*"drive" + 0.042*"disk" + 0.033*"scsi" + 0.030*"drives" + 0.028*"hard" + 0.028*"controller" + 0.027*"card" + 0.020*"rom" + 0.018*"floppy" + 0.017*"bus"'),
 (3,
  '0.023*"time" + 0.015*"atheism" + 0.014*"list" + 0.013*"left" + 0.012*"alt" + 0.012*"faq" + 0.012*"probably" + 0.011*"know" + 0.011*"send" + 0.010*"months"'),
 (4,
  '0.025*"car" + 0.016*"just" + 0.014*"don" + 0.014*"bike" + 0.012*"good" + 0.011*"new" + 0.011*"think" + 0.010*"year" + 0.010*"cars" + 0.010*"time"'),
 (5,
  '0.030*"game" + 0.027*"team" + 0.023*"year" + 0.017*"games" + 0.016*"play" + 0.012*"season" + 0.012*"players" + 0.012*"win" + 0.011*"hockey" + 0.011*"good"'),
 (6,
  '0.0

## Distribución de tópicos

In [23]:
def topic_distribution():
    """Return the topic distribution that `ldamodel` assigns to `new_doc`.

    Uses the module-level `vect`, `ldamodel` and `new_doc`.
    """
    # Vectorize the document(s) with the vectorizer fitted on the corpus.
    doc_counts = vect.transform(new_doc)

    # Wrap the sparse matrix as a gensim corpus (one document per row).
    bow_corpus = gensim.matutils.Sparse2Corpus(doc_counts, documents_columns=False)

    # Apply the model and keep the result for the first document only.
    per_document = list(ldamodel[bow_corpus])
    return per_document[0]

# New, unseen document whose topic distribution is inferred.
new_doc = [
    "\n\nIt's my understanding that the freezing will start to occur because "
    "of the\ngrowing distance of Pluto and Charon from the Sun, due to it's\n"
    "elliptical orbit. It is not due to shadowing effects. \n\n\n"
    "Pluto can shadow Charon, and vice-versa.\n\nGeorge Krumins\n-- "
]

# Bare call: the returned topic distribution is shown as the notebook cell output.
topic_distribution()


[(0, 0.02000183),
 (1, 0.020002047),
 (2, 0.020000001),
 (3, 0.49662152),
 (4, 0.020002762),
 (5, 0.020002853),
 (6, 0.020001693),
 (7, 0.020001365),
 (8, 0.020001847),
 (9, 0.34336406)]