# *Unsupervised learning: Latent Dirichlet allocation (LDA) topic modeling*

In [None]:
## Install a Python package for LDA
# http://pythonhosted.org/lda/getting_started.html

!pip3 install lda

In [None]:
## Importing basic packages

import os
import numpy as np

In [None]:
## Downloading 'Essays' by Ralph Waldo Emerson
# NOTE(review): '/sharedfolder/' is a hardcoded absolute path; os.chdir raises
# if that directory does not exist on the machine running the notebook.

# Relative paths used after this point (including the download below) resolve here.
os.chdir('/sharedfolder/')

# wget -N enables timestamping: the file is only re-fetched if the remote copy is newer.
!wget -N http://www.gutenberg.org/cache/epub/16643/pg16643.txt

In [None]:
## Loading the text
# The file is opened in a `with` block so the handle is closed as soon as the
# contents are read. The encoding is given explicitly instead of relying on the
# platform default; 'utf-8-sig' also drops a leading byte-order mark if present.
# NOTE(review): assumes the downloaded file is UTF-8 encoded — confirm.

text_path = 'pg16643.txt'

with open(text_path, encoding='utf-8-sig') as text_file:
    text_data = text_file.read()

In [None]:
## Dividing the document into segments, with the aim of extracting individual essays
# Splits on each occurrence of five consecutive newline characters and
# displays how many segments result.

len(text_data.split('\n\n\n\n\n'))

In [None]:
## Viewing the beginning of each segment to determine which ones to keep
# For every segment, prints a separator, the segment's index, and its first
# 80 characters. enumerate() supplies the index instead of a hand-maintained counter.

for counter, item in enumerate(text_data.split('\n\n\n\n\n')):
    print('-----')
    print(counter)
    print(item[:80])

In [None]:
## Creating a list of essays
# Segments 9 through 19 of the five-newline split are kept as the essays.

essay_slice = slice(9, 20)
document_list = text_data.split('\n\n\n\n\n')[essay_slice]

print(len(document_list))

In [None]:
## Creating a vectorized representation of each essay in the list

from sklearn.feature_extraction.text import CountVectorizer

# Default settings are used; no arguments are passed to the vectorizer.
vectorizer = CountVectorizer()

# X is indexed by essay in later cells (X.toarray()[3] is treated as one essay's vector).
X = vectorizer.fit_transform(document_list)

In [None]:
## Viewing a single essay's vector
# Only row 3 is converted to a dense array; the rest of the matrix stays sparse.
# ravel() flattens the single-row result into a 1-D vector.

sample_essay_vector = X[3].toarray().ravel()

print(len(sample_essay_vector))

sample_essay_vector

In [None]:
## Creating a vocabulary list corresponding to the vectors we created above
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); the old method is used only when the new one is absent.
# The result is converted to a list so `vocabulary` has the same type either way.

if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

vocabulary[8950:8980]

In [None]:
## Viewing the 10 most frequent words in a single essay
# argsort returns positions in ascending order of count, so the reversed order
# starts with the most frequent word; the first 10 positions are kept.

top_indices = np.argsort(sample_essay_vector)[::-1][:10]

print(np.array(vocabulary)[top_indices])

# Index the count vector with those positions to show the actual counts
# (printing the positions themselves would show vocabulary indices, not frequencies).
print(sample_essay_vector[top_indices]) # corresponding frequency values

In [None]:
## Initializing an LDA model: 10 topics and 1000 iterations
# random_state is given a fixed value (1) rather than left unset.

import lda

model = lda.LDA(
    n_topics=10,
    n_iter=1000,
    random_state=1,
)

In [None]:
## Fitting the model using our list of vectors
# X is the matrix returned by vectorizer.fit_transform in an earlier cell.

model.fit(X)

In [None]:
## Viewing the top 50 words in each 'topic'

topic_word = model.topic_word_

n_top_words = 50

# Convert the vocabulary once, outside the loop, so it can be indexed with index arrays.
vocabulary_array = np.array(vocabulary)

for i, topic_distribution in enumerate(topic_word):
    # argsort is ascending; reverse it and keep the n_top_words highest-valued positions.
    top_indices = np.argsort(topic_distribution)[::-1][:n_top_words]
    topic_words = vocabulary_array[top_indices]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    print()

### Repeating the process, removing stop words and punctuation first

In [None]:
## Trying out NLTK's word tokenizer on a sample sentence

from nltk.tokenize import word_tokenize

# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be installed — confirm.
word_tokenize('We are symbols, and inhabit symbols.')

In [None]:
## Importing NLTK stop words
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.

from nltk.corpus import stopwords

# Stored as a set; later cells test tokens for membership in it.
stop_words = set(stopwords.words('english'))

stop_words

In [None]:
## Importing Python punctuation set
# string.punctuation is a single string of ASCII punctuation characters,
# displayed here for reference.

import string

string.punctuation

In [None]:
## Testing tokenization + stop word removal
# A token is dropped when it is a stop word or when every character in it is
# punctuation. Checking character by character also removes multi-character
# punctuation tokens (e.g. '--'), which a plain substring test against
# string.punctuation lets through. Logical `and`/`not` replace the bitwise `&`.

sentence = 'We are symbols, and inhabit symbols.'.lower()

token_list = word_tokenize(sentence)

sentence_filtered = [
    item for item in token_list
    if item not in stop_words
    and not all(char in string.punctuation for char in item)
]

sentence_filtered

In [None]:
## Tokenizing and removing stop words from our list of essays
# Each essay is lowercased and tokenized; tokens that are stop words or consist
# entirely of punctuation characters are dropped, and the remaining tokens are
# re-joined with single spaces. Checking character by character also removes
# multi-character punctuation tokens (e.g. '--'), which a plain substring test
# against string.punctuation lets through. Logical `and`/`not` replace the bitwise `&`.

documents_filtered = []

for document in document_list:
    token_list = word_tokenize(document.lower())
    tokens_filtered = [
        item for item in token_list
        if item not in stop_words
        and not all(char in string.punctuation for char in item)
    ]
    documents_filtered.append(' '.join(tokens_filtered))

In [None]:
## Viewing a segment of a preprocessed essay
# Shows characters 2000-2099 of the filtered essay at index 3.

documents_filtered[3][2000:2100]

In [None]:
## Vectorizing preprocessed essays
# NOTE: this rebinds `vectorizer` and `X`, replacing the objects built from the
# unfiltered essays earlier in the notebook.

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(documents_filtered)

In [None]:
## Creating a vocabulary list corresponding to the vectors we created above
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); the old method is used only when the new one is absent.
# The result is converted to a list so `vocabulary` has the same type either way.

if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

vocabulary[1140:1160]

In [None]:
## Initializing an LDA model: 10 topics and 1000 iterations
# A fresh model object is created with the same settings as before;
# random_state is given a fixed value (1) rather than left unset.

model = lda.LDA(
    n_topics=10,
    n_iter=1000,
    random_state=1,
)

In [None]:
## Fitting the model using our list of vectors
# X here is whatever the most recently run vectorizing cell assigned.

model.fit(X)

In [None]:
## Viewing the top 50 words in each 'topic'

topic_word = model.topic_word_

n_top_words = 50

# Convert the vocabulary once, outside the loop, so it can be indexed with index arrays.
vocabulary_array = np.array(vocabulary)

for i, topic_distribution in enumerate(topic_word):
    # argsort is ascending; reverse it and keep the n_top_words highest-valued positions.
    top_indices = np.argsort(topic_distribution)[::-1][:n_top_words]
    topic_words = vocabulary_array[top_indices]
    print('Topic ' + str(i) + ':')
    print(' '.join(topic_words))
    print()

### ▷Assignment

    Modify the code above: Apply a stemming step to each word before vectorizing the text.
    See example stemming code in the following cell.

In [None]:
## Stemming example
# Prints the Porter stem of three related words, one per line.

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

for word in ('nature', 'natural', 'naturalism'):
    print(stemmer.stem(word))