In [None]:
import os
import ssl

import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords

from sklearn.datasets import fetch_20newsgroups


def remove_stopwords(texts, stop_words):
    """Tokenize every document and drop the tokens listed in ``stop_words``.

    Args:
        texts: iterable of documents; each one is passed through ``str()``
            and then tokenized with gensim's ``simple_preprocess``.
        stop_words: container of tokens to discard (tested with ``in``).

    Returns:
        A list with one list of surviving tokens per input document.
    """
    filtered_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        filtered_docs.append([tok for tok in tokens if tok not in stop_words])
    return filtered_docs

def make_bigrams(texts, bigram_mod):
    """Apply a phrase model to every tokenized document.

    Args:
        texts: iterable of tokenized documents.
        bigram_mod: object that transforms a document when indexed with it
            (``bigram_mod[doc]``), e.g. a fitted gensim phrase model.

    Returns:
        A list holding the transformed version of each document, in order.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs


def make_trigrams(texts, trigram_mod, bigram_mod=None):
    """Apply a bigram model and then a trigram model to every document.

    Args:
        texts: iterable of tokenized documents.
        trigram_mod: object indexed with the bigram-transformed document
            (``trigram_mod[...]``), e.g. a fitted gensim phrase model.
        bigram_mod: object indexed with each raw document
            (``bigram_mod[doc]``). Optional for backward compatibility: when
            omitted, the notebook-level variable ``bigram_mod`` is used, which
            is what the function previously relied on implicitly.

    Returns:
        A list with the trigram-transformed version of each document.
        (The previous version built this list but never returned it.)

    Raises:
        NameError: if ``bigram_mod`` is not passed and no module-level
            ``bigram_mod`` exists.
    """
    if bigram_mod is None:
        # Fall back to the global the original implementation read implicitly.
        try:
            bigram_mod = globals()["bigram_mod"]
        except KeyError:
            raise NameError(
                "name 'bigram_mod' is not defined; pass bigram_mod explicitly"
            ) from None
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


def lemmatization(texts,
                  nlp,
                  allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: iterable of tokenized documents (each a sequence of strings).
        nlp: callable that takes the space-joined document text and returns
            an iterable of tokens exposing ``lemma_`` and ``pos_``.
        allowed_postags: part-of-speech tags whose tokens are kept.

    Returns:
        A list with, for each document, the lemmas of the retained tokens.
    """
    def _kept_lemmas(tokens):
        # The pipeline works on plain text, so glue the tokens back together.
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(tokens) for tokens in texts]

# !wget https://github.com/khider/INF549/tree/master/In-class%20exercises/Practicum-7/20news-bydate_py3.pkz

Import a corpus of documents and build a dictionary out of it

In [None]:
# Small sample corpus bundled with gensim's test utilities; it is used below
# as a list of documents, each document being a list of tokens.
from gensim.test.utils import common_texts

# Show the raw token lists before any encoding.
print(common_texts)

In [None]:
# Build the vocabulary: every distinct token gets an integer id.
common_dictionary = corpora.Dictionary(common_texts)

# Show the (id, token) pairs of the vocabulary.
print(list(common_dictionary.items()))

In [None]:
# Alternatively, inspect the reverse mapping: token -> integer id.
print(common_dictionary.token2id)

Build BOW representation of corpus of common texts

In [None]:
# Encode each document as a bag of words: a list of (token_id, count) pairs.
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
print(common_corpus)

Show bow representation of corpus with actual words

In [None]:
print("* BOW corpus with actual words")
# Translate every (token_id, count) pair back into (token, count) so the
# encoded corpus is human-readable.
id_words = [
    [(common_dictionary[token_id], occurrences) for token_id, occurrences in bow]
    for bow in common_corpus
]
print(id_words)

Build LDA model with 10 topics

In [None]:
# Fit a 10-topic LDA model on the toy corpus.
# LDA training is stochastic; a fixed random_state keeps the learned topics
# (and every output derived from them) reproducible across kernel restarts.
# 100 is the same seed the larger model further down uses.
lda = LdaModel(corpus=common_corpus, id2word=common_dictionary,
               num_topics=10, random_state=100)

Display learned topics

In [None]:
# Alternative ways of inspecting the topics, left disabled:
# for topic in lda.get_topics():
#     print(topic)

# display words numerically encoded
# for topic in lda.show_topics():
#     print(topic)

# Display the topics; num_words=len(common_dictionary) requests as many words
# per topic as there are entries in the vocabulary.
pprint(lda.show_topics(num_words=len(common_dictionary)))

Consider new documents

In [None]:
# Three additional documents, each already split into a list of tokens.
new_documents = [
        ['computer', 'time', 'graph'],
        ['survey', 'response', 'eps'],
        ['human', 'system', 'computer']
    ]
# Encode them with the dictionary that was built from common_texts.
new_corpus = [common_dictionary.doc2bow(text) for text in new_documents]

print(new_corpus)

Get BOW representation of first new document

In [None]:
# Bag-of-words vector of the first new document
# (the same encoding that was stored as new_corpus[0]).
bow_doc = common_dictionary.doc2bow(new_documents[0])

print(bow_doc)

Embed document in space: express bow representation as convex combination of topics

In [None]:
print(">>>> topic prob distribution: lda[bow_doc]")
# Indexing the trained model with a BOW vector yields that document's topic mixture.
print(lda[bow_doc])

# or, spelled as an explicit method call:
# print(lda.get_document_topics(bow_doc))

Update LDA model with new texts

In [None]:
# Continue training the existing model on the new documents,
# then show the topics again to see how they changed.
lda.update(new_corpus)
pprint(lda.show_topics(num_words=len(common_dictionary)))

In [None]:
# Consider a new corpus

In [None]:
    # One-time setup (uncomment on first use):
    # import nltk
    # nltk.download("stopwords")
    # When PYTHONHTTPSVERIFY is unset/empty and this Python build exposes
    # ssl._create_unverified_context, install it as the default HTTPS context.
    # NOTE(review): this switches off TLS certificate verification for code that
    # relies on ssl's default HTTPS context for the rest of the kernel session;
    # confirm it is still required before keeping it.
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
            getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context

    # define stop words (a small hand-picked list)
    stop_words = ['stop', 'the', 'to', 'and', 'a', 'in', 'it',
                  'is', 'I', 'that', 'had', 'on', 'for', 'were', 'was',
                  'from', 'subject', 're', 'edu', 'use']

    # Alternative: NLTK's English stop-word list. It is wrapped in a set here,
    # so extra words must be added with update() (a set has no extend()).
    # nltk.download('stopwords')
    # stop_words = set(stopwords.words('english'))
    # stop_words.update(['from', 'subject', 're', 'edu', 'use'])

    # read corpus: training split of 20 Newsgroups, with "./" as scikit-learn's data_home
    newsgroups_train = fetch_20newsgroups(subset='train', data_home="./")
    data = newsgroups_train.data
    
    # Peek at the first four raw documents.
    print(data[:4])

Preprocess data

In [None]:
# The patterns are raw strings: '\S' and '\s' inside a plain string literal are
# invalid escape sequences, which Python warns about and intends to reject.
# The compiled regular expressions are unchanged.

# Remove e-mail addresses (any run of non-space characters containing '@').
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# Collapse every run of whitespace, newlines included, into a single space.
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Drop single quotes.
data = [re.sub(r"'", "", sent) for sent in data]

# Show the cleaned text that goes into stop-word removal.
print(data[:4])

Remove stop words and lemmatize texts

In [None]:
# Phrases must be trained on tokenized sentences (lists of words). `data` holds
# plain strings, and iterating a string yields single characters, so training
# on it directly would count character pairs and never learn a real bigram.
data_words = [simple_preprocess(str(doc)) for doc in data]

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
#trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)

# Tokenize + drop stop words, then merge detected bigrams into single tokens.
data_words_nostops = remove_stopwords(data, stop_words)
data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

# Parser and NER are disabled in the spaCy pipeline; only lemma_ and pos_ are read.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=[
    'NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:4])  # it will print the lemmatized data.

Build LDA Model

In [None]:
# Vocabulary and bag-of-words encoding of the lemmatized documents.
texts = data_lemmatized
id2word = corpora.Dictionary(texts)
corpus = list(map(id2word.doc2bow, texts))

print("* corpus in BOW format with actual words")
# Preview the first four documents as (token, frequency) pairs rather than
# (token_id, frequency) pairs.
print([
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:4]
])

print("\n* Build LDA model")
# Same class as gensim.models.ldamodel.LdaModel, via the name imported at the top.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha='auto',
    per_word_topics=True,
    random_state=100,
)

# Topic assignments for the whole training corpus, followed by the topics themselves.
doc_lda = lda_model[corpus]
pprint(lda_model.print_topics())

In [None]:
# Persist the trained model under the name "big_lda_model"
# (a path relative to the current working directory).
lda_model.save("big_lda_model")

# To restore it in a later session:
# lda = LdaModel.load("big_lda_model")