In [2]:
import re
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import nltk
# Fetch the NLTK stop-word corpus that the `stopwords` reader in the next cell loads.
nltk.download('stopwords')

In [5]:
from nltk.corpus import stopwords

# English stop words from NLTK, followed by a handful of extra tokens that
# should also be filtered out. Built as one fresh list so re-running the cell
# always produces the same value.
stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 're', 'edu', 'use',
]

In [53]:
# Read the whole input file. The context manager guarantees the handle is
# closed once the read finishes (the previous bare open() never closed it).
with open("doc1.txt", "r") as f:
    text = f.read()

# NOTE(review): the entire file is wrapped as ONE document, so every later
# step (phrase detection, dictionary, LDA) sees a single-document corpus.
# Confirm this is intended; splitting the text into multiple documents may be
# needed for the topic model to separate topics.
data = [text]

In [34]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.

    Args:
        sentences: iterable of raw texts.

    Yields:
        The token list produced by ``simple_preprocess`` for each item, in order.
    """
    for raw in sentences:
        # deacc=True strips accent marks; punctuation is already dropped by
        # simple_preprocess's tokenization.
        tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
        yield tokens

# Materialize the generator: one token list per entry in `data`.
data_words = list(sent_to_words(data))

In [33]:
# Bigram model trained on the tokenized documents.
# A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
# Phraser wrapper that make_bigrams()/make_trigrams() index into.
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigram model trained on the bigram-transformed token stream.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Phraser wrapper used by make_trigrams().
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [13]:
def remove_stopwords(texts):
    """Re-tokenize each document and drop tokens found in ``stop_words``.

    Each element of ``texts`` is converted with ``str()`` and run through
    ``simple_preprocess`` before filtering.

    Args:
        texts: iterable of documents (token lists or strings).

    Returns:
        A list with one token list per input document, stop words removed.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # against the module-level list scans it for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to every document.

    Args:
        texts: iterable of token lists.

    Returns:
        A list holding ``bigram_mod[doc]`` for each document, in order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Args:
        texts: iterable of token lists.

    Returns:
        A list holding ``trigram_mod[bigram_mod[doc]]`` for each document, in order.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Relies on the module-level spaCy pipeline ``nlp``, which is looked up at
    call time and therefore must be defined before this function is called.

    Args:
        texts: iterable of token lists; each list is joined with single spaces
            and parsed by ``nlp``.
        allowed_postags: ``token.pos_`` values to keep.

    Returns:
        A list with one list of ``token.lemma_`` strings per input document.
    """
    def _kept_lemmas(tokens):
        # Parse the space-joined tokens and keep lemmas of the allowed tags.
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(tokens) for tokens in texts]

In [32]:
# Load the spaCy English pipeline first; lemmatization() reads the global
# `nlp` at call time. The parser and NER components are disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Stop-word removal -> bigram merging -> POS-filtered lemmatization.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

In [31]:
# The lemmatized token lists are the texts fed to the model.
texts = data_lemmatized

# Dictionary built from those token lists.
id2word = corpora.Dictionary(texts)

# Convert every document to its bag-of-words form via the dictionary.
corpus = list(map(id2word.doc2bow, texts))

In [22]:
# Train the LDA topic model on the bag-of-words corpus.
# random_state is fixed so repeated runs are reproducible.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [39]:
# Apply the trained model to the corpus it was trained on.
doc_lda = lda_model[corpus]

In [55]:
# Print every topic (-1 requests all of them) with its weighted top terms.
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} Word: {topic}')

Topic: 0 Word: 0.010*"precaution" + 0.010*"program" + 0.010*"people" + 0.010*"poorly" + 0.010*"possibility" + 0.010*"oral" + 0.010*"really" + 0.010*"publication" + 0.010*"open" + 0.010*"public"
Topic: 1 Word: 0.013*"say" + 0.012*"transmission" + 0.012*"evidence" + 0.011*"emerge" + 0.011*"regard" + 0.011*"have" + 0.011*"look" + 0.011*"virus" + 0.011*"people" + 0.011*"need"
Topic: 2 Word: 0.010*"say" + 0.010*"transmission" + 0.010*"evidence" + 0.010*"field" + 0.010*"have" + 0.010*"look" + 0.010*"letter" + 0.010*"tell" + 0.010*"people" + 0.010*"covid"
Topic: 3 Word: 0.010*"precaution" + 0.010*"program" + 0.010*"people" + 0.010*"poorly" + 0.010*"possibility" + 0.010*"oral" + 0.010*"really" + 0.010*"publication" + 0.010*"open" + 0.010*"public"
Topic: 4 Word: 0.010*"precaution" + 0.010*"program" + 0.010*"people" + 0.010*"poorly" + 0.010*"possibility" + 0.010*"oral" + 0.010*"really" + 0.010*"publication" + 0.010*"open" + 0.010*"public"
Topic: 5 Word: 0.010*"precaution" + 0.010*"program" + 0.0