# *Unsupervised learning: Latent Dirichlet allocation (LDA) topic modeling*

In [1]:
## Install a Python package for LDA
# http://pythonhosted.org/lda/getting_started.html
# NOTE(review): the install is unpinned and goes through the shell's `pip3`, which
# may not be the environment the kernel is running in -- confirm for your setup.

!pip3 install lda

In [2]:
## Importing basic packages

import os
import numpy as np

In [3]:
## Downloading 'Essays' by Ralph Waldo Emerson

# Change the working directory first so the download lands in the shared folder;
# later cells open the file by its bare name relative to this directory.
os.chdir('/sharedfolder/')

# -N turns on wget's timestamping mode.
!wget -N http://www.gutenberg.org/cache/epub/16643/pg16643.txt

In [4]:
## Loading the text

text_path = 'pg16643.txt'

# Read the whole book into one string; the context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open(text_path) as text_file:
    text_data = text_file.read()

In [5]:
## Dividing the document into segments, with the aim of extracting individual essays

# Segments are separated by a run of five newlines.
text_segments = text_data.split('\n\n\n\n\n')

# Number of segments found
len(text_segments)

22

In [7]:
## Creating a list of essays

# Split on the five-newline separator, then keep segments 9 through 19.
all_segments = text_data.split('\n\n\n\n\n')
document_list = all_segments[9:20]

essay_count = len(document_list)
print(essay_count)

11


In [8]:
## Importing NLTK stop words

from nltk.corpus import stopwords

# Fetch the English stop word list, then convert it to a set for fast membership tests.
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

#stop_words

In [9]:
## Importing Python punctuation set

import string

#string.punctuation

### ▷Assignment

    Apply a stemming step to each word before vectorizing the text.
    See example stemming code in the following cell.

In [10]:
## Tokenizing, stemming, and removing stop words from our list of essays

from nltk.tokenize import word_tokenize

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Set of individual punctuation characters. Testing `token in string.punctuation`
# directly is a substring test, which lets multi-character punctuation tokens
# such as "``", "''" or "--" slip through.
punctuation_chars = set(string.punctuation)

documents_filtered = []

for document in document_list:
    # Lowercase before tokenizing so tokens match the lowercase stop word list.
    token_list = word_tokenize(document.lower())
    # Keep a token only if it is not a stop word and is not made up entirely
    # of punctuation characters (this also drops empty tokens).
    tokens_filtered = [
        item for item in token_list
        if item not in stop_words
        and not all(char in punctuation_chars for char in item)
    ]
    tokens_stemmed = [stemmer.stem(item) for item in tokens_filtered]     ###  <--- STEMMING STEP
    # Re-join into a single space-separated string, one per essay.
    documents_filtered.append(' '.join(tokens_stemmed))

In [11]:
## Vectorizing preprocessed essays

from sklearn.feature_extraction.text import CountVectorizer

# Default settings are used; no custom tokenizer or vocabulary is supplied.
vectorizer = CountVectorizer()

# Learn the vocabulary from the preprocessed essays and vectorize them in one step.
X = vectorizer.fit_transform(documents_filtered) 

In [12]:
## Creating a vocabulary list corresponding to the vectors we created above

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` when the installed version provides it and
# fall back to the old method otherwise; either way `vocabulary` is a list of str.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

#vocabulary[1140:1160]

In [13]:
## Initializing an LDA model: 10 topics and 1000 iterations

import lda

# random_state is fixed to a constant rather than left unset.
model = lda.LDA(n_topics=10, n_iter=1000, random_state=1)

In [14]:
## Fitting the model using our list of vectors
# X is the matrix returned by vectorizer.fit_transform() for the preprocessed essays.
# Progress is reported through INFO log lines (see the cell output).

model.fit(X)

INFO:lda:n_documents: 11
INFO:lda:vocab_size: 6723
INFO:lda:n_words: 33319
INFO:lda:n_topics: 10
INFO:lda:n_iter: 1000
INFO:lda:<0> log likelihood: -386710
INFO:lda:<10> log likelihood: -306919
INFO:lda:<20> log likelihood: -300273
INFO:lda:<30> log likelihood: -297565
INFO:lda:<40> log likelihood: -295762
INFO:lda:<50> log likelihood: -295375
INFO:lda:<60> log likelihood: -294941
INFO:lda:<70> log likelihood: -295211
INFO:lda:<80> log likelihood: -295572
INFO:lda:<90> log likelihood: -295578
INFO:lda:<100> log likelihood: -294955
INFO:lda:<110> log likelihood: -294397
INFO:lda:<120> log likelihood: -294157
INFO:lda:<130> log likelihood: -294274
INFO:lda:<140> log likelihood: -294048
INFO:lda:<150> log likelihood: -293620
INFO:lda:<160> log likelihood: -293353
INFO:lda:<170> log likelihood: -293277
INFO:lda:<180> log likelihood: -292928
INFO:lda:<190> log likelihood: -292638
INFO:lda:<200> log likelihood: -292372
INFO:lda:<210> log likelihood: -292941
INFO:lda:<220> log likelihood: -29

<lda.lda.LDA at 0x7fe2f5bd2b70>

In [15]:
## Viewing the top 50 words in each 'topic'

topic_word = model.topic_word_

n_top_words = 50

# Build the vocabulary array once instead of on every loop iteration.
vocabulary_array = np.array(vocabulary)

for i, topic_distribution in enumerate(topic_word):
    # argsort is ascending: reverse it, then keep the indices of the
    # n_top_words largest weights before looking the words up.
    top_indices = np.argsort(topic_distribution)[::-1][:n_top_words]
    topic_words = vocabulary_array[top_indices]
    print('Topic ' + str(i) + ':')
    print(' '.join(topic_words))
    print()

Topic 0:
man men world must shall know soul good like come god alway without well love noth say part feel self look present whole right univers act said stand fear perfect divin take show individu lose reason highest face particular pleasur seek wise hear offic believ someth valu measur element moment

Topic 1:
one man thought time see mind find life day new action truth book geniu hand think true last human state work art light ever learn read becom write fact age lie scholar hope live alreadi past privat begin feet influenc eye scienc thousand event literatur inspir draw longer young lost

Topic 2:
live work appear done somewhat follow given suffer nation opinion line hero prayer take voic seem war heroism precis lord reli teach percept interest need prais travel home brave die number temper ear tri custom heroic gave wife associ asham color standard scorn enter conform hospit text palac truli hast

Topic 3:
poet shakspear histori origin peopl power men king whether import account pi

In [16]:
## Viewing the breakdown of topics present in a single document (in this case, document #8)

document_index = 8

# First 500 characters of the chosen document, for orientation
document_preview = document_list[document_index][:500]
print(document_preview)

# Per-topic values the fitted model holds for this document
model.doc_topic_[document_index]

SHAKSPEARE;[525] OR, THE POET

[Transcriber's Note: Shakspeare is spelled as "Shakspeare" as well as
"Shakespeare" in this book. The original spellings have been retained.]


1. Great men are more distinguished by range and extent, than by
originality. If we require the originality which consists in weaving,
like a spider, their web from their own bowels; in finding clay, and
making bricks, and building the house; no great men are original. Nor
does valuable originality consist in unlikeness to 


array([ 0.07895359,  0.14812297,  0.08368312,  0.28912208,  0.05530594,
        0.03668342,  0.10999113,  0.07924919,  0.06979013,  0.04909843])

In [17]:
## Viewing topics present in a single document alongside text labels for each topic

topic_values = model.doc_topic_[8]

# Generate one label per topic from the data itself, so the labels stay in
# step with the model if the number of topics is changed (a hard-coded list
# would be silently truncated or padded by zip()).
topic_titles = ['Topic ' + str(i) for i in range(len(topic_values))]

list(zip(topic_titles, topic_values))

[('Topic 0', 0.078953591486845998),
 ('Topic 1', 0.14812296778007686),
 ('Topic 2', 0.083683121489801962),
 ('Topic 3', 0.2891220809932013),
 ('Topic 4', 0.05530594147206621),
 ('Topic 5', 0.036683417085427134),
 ('Topic 6', 0.10999113213124447),
 ('Topic 7', 0.079249187112030745),
 ('Topic 8', 0.069790127106118829),
 ('Topic 9', 0.049098433343186518)]

In [18]:
## Viewing a ranked list of the most prominent topics present in a document

from operator import itemgetter

# Sort (title, value) pairs by value in ascending order, then flip the list
# so the most prominent topic comes first.
ranked_topics = sorted(zip(topic_titles, topic_values), key=itemgetter(1))
ranked_topics.reverse()

ranked_topics

[('Topic 3', 0.2891220809932013),
 ('Topic 1', 0.14812296778007686),
 ('Topic 6', 0.10999113213124447),
 ('Topic 2', 0.083683121489801962),
 ('Topic 7', 0.079249187112030745),
 ('Topic 0', 0.078953591486845998),
 ('Topic 8', 0.069790127106118829),
 ('Topic 4', 0.05530594147206621),
 ('Topic 9', 0.049098433343186518),
 ('Topic 5', 0.036683417085427134)]

# *Supervised learning: Naive Bayes classification*

In [19]:
## Download sample text corpora from GitHub, then unzip.

# Work from the shared folder; the corpus-loading cells below use absolute
# paths under /sharedfolder/Sample_corpora/.
os.chdir('/sharedfolder/')

## Uncomment the lines below if you need to re-download test corpora we used last week.

#!wget -N https://github.com/pcda18/pcda18.github.io/blob/master/week/8/Sample_corpora.zip?raw=true -O Sample_corpora.zip
#!unzip -o Sample_corpora.zip

In [20]:
## Loading Melville novels

os.chdir('/sharedfolder/Sample_corpora/Herman_Melville/')

melville_texts = []

# NOTE: os.listdir() returns entries in arbitrary order, and other cells pick
# novels out of this list by position, so the order is deliberately left as-is.
for filename in os.listdir('./'):
    # Skip anything that is not a regular file (e.g. a subdirectory).
    if not os.path.isfile(filename):
        continue
    # The context manager closes each file as soon as it has been read.
    with open(filename) as text_file:
        # Flatten line breaks so each novel is one long string.
        text_data = text_file.read().replace('\n', ' ')
    melville_texts.append(text_data)

print(len(melville_texts))

13


In [21]:
## Loading Austen novels

os.chdir('/sharedfolder/Sample_corpora/Jane_Austen/')

austen_texts = []

# NOTE: os.listdir() returns entries in arbitrary order, and other cells pick
# novels out of this list by position, so the order is deliberately left as-is.
for filename in os.listdir('./'):
    # Skip anything that is not a regular file (e.g. a subdirectory).
    if not os.path.isfile(filename):
        continue
    # The context manager closes each file as soon as it has been read.
    with open(filename) as text_file:
        # Flatten line breaks so each novel is one long string.
        text_data = text_file.read().replace('\n', ' ')
    austen_texts.append(text_data)

print(len(austen_texts))

6


In [22]:
## Removing the last novel from each list so we can use it to test our classifier

# Held-out novels: the final entry of each author's list
melville_test_text, austen_test_text = melville_texts[-1], austen_texts[-1]

# Training novels: everything except that final entry
melville_train_texts, austen_train_texts = melville_texts[:-1], austen_texts[:-1]

In [23]:
## Creating a master list of Melville sentences

from nltk.tokenize import sent_tokenize

# Concatenate all training novels into one string (space-separated) ...
melville_combined_texts = ' '.join(melville_train_texts)

# ... and split that string into sentences.
melville_sentences = sent_tokenize(melville_combined_texts)

print(len(melville_sentences))

# Spot-check one sentence from the list
melville_sentences[9999]

51940


'He discoursed imperially with his chiefs; nodded his sovereign will to his pages; called for another gourd of wine; in all respects carrying his royalty bravely.'

In [24]:
## Extracting 2000 Melville sentences at random for use as a training set

import random

# A dedicated, seeded generator makes the sample reproducible across kernel
# restarts without altering the state of the global `random` module.
melville_rng = random.Random(0)

# Cap the sample size at the number of available sentences, since
# random.sample() raises ValueError when asked for more items than exist.
melville_sample_size = min(2000, len(melville_sentences))

melville_train_sentences = melville_rng.sample(melville_sentences, melville_sample_size)

In [25]:
## Creating a list of Melville sentences for our test set

# Sentence-split the single held-out Melville novel.
melville_test_sentences = sent_tokenize(melville_test_text)

print(len(melville_test_sentences))

# Spot-check one sentence from the list
melville_test_sentences[997]

6224


'With brown sugar, taken from the mess-chests, and hot water begged from the galley-cooks, the men made all manner of punches, toddies, and cocktails, letting fall therein a small drop of tar, like a bit of brown toast, by way of imparting a flavour.'

In [26]:
## Creating a master list of Austen sentences

# Concatenate all training novels into one string (space-separated) ...
austen_combined_texts = ' '.join(austen_train_texts)

# ... and split that string into sentences.
austen_sentences = sent_tokenize(austen_combined_texts)

print(len(austen_sentences))

# Spot-check one sentence from the list
austen_sentences[8979]

23803


'From that time Mr. Crawford sat down likewise.'

In [27]:
## Extracting 2000 Austen sentences at random for use as a training set

# A dedicated, seeded generator makes the sample reproducible across kernel
# restarts without altering the state of the global `random` module.
austen_rng = random.Random(1)

# Cap the sample size at the number of available sentences, since
# random.sample() raises ValueError when asked for more items than exist.
austen_sample_size = min(2000, len(austen_sentences))

austen_train_sentences = austen_rng.sample(austen_sentences, austen_sample_size)

In [28]:
## Creating a list of Austen sentences for our test set

# Sentence-split the single held-out Austen novel.
austen_test_sentences = sent_tokenize(austen_test_text)

print(len(austen_test_sentences))

# Spot-check one sentence from the list
austen_test_sentences[1000]

4835


'Mrs. Smith has this morning exercised the privilege of riches upon a poor dependent cousin, by sending me on business to London.'

In [29]:
## Combining training data: Melville sentences first, then Austen sentences
combined_texts = melville_train_sentences + austen_train_sentences

## Creating list of associated class values, in the same order as combined_texts:
## 0 for Melville, 1 for Austen
melville_labels = [0 for _ in melville_train_sentences]
austen_labels = [1 for _ in austen_train_sentences]
y = melville_labels + austen_labels

In [30]:
## Creating vectorized training set using our combined sentence list

from sklearn.feature_extraction.text import CountVectorizer

# A fresh vectorizer: its vocabulary is learned from the training sentences only.
vectorizer = CountVectorizer()

# Keep the sparse matrix returned by fit_transform() rather than calling
# .toarray(): a dense copy allocates rows x vocabulary cells that are almost
# all zero, and MultinomialNB accepts sparse input directly.
X = vectorizer.fit_transform(combined_texts)

# (number of sentences, vocabulary size)
X.shape

(4000, 10629)

In [31]:
## Training a multinomial naive Bayes classifier
## X is the vectorized form of the combined Melville and Austen sentences (2000 sentences from each)
## y is a list of classes (0 or 1), in the same order as the rows of X

from sklearn.naive_bayes import MultinomialNB

# The result of .fit() is kept as the trained classifier used by the cells below.
classifier = MultinomialNB().fit(X, y)

### ▷Assignment

    Write a script that prints Austen-like sentences written 
    by Melville, and Melville-like sentences written by Austen.

In [32]:
## Classifying all sentences in our Austen test set
# Recall that 0 means Melville & 1 means Austen
# transform() (not fit_transform()) is used so the vocabulary fitted on the training set is reused.

austen_sentence_vectors = vectorizer.transform(austen_test_sentences)   ## Converting a list of strings to the same
                                                                        ## vector format we used for our training set

austen_sentence_classifications = classifier.predict(austen_sentence_vectors)   ## Classifying every sentence in the novel

austen_sentence_classifications[:20]                                    ## Viewing the first 20 sentence classifications

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [33]:
## Printing all misclassified Austen sentences
# A predicted class of 0 means the classifier attributed the sentence to Melville.

for sentence, predicted_class in zip(austen_test_sentences, austen_sentence_classifications):
    if predicted_class == 0:
        print(sentence)
        print()

Three thousand pounds!

They encouraged each other now in the violence of their affliction.

The agony of grief which overpowered them at first, was voluntarily renewed, was sought for, was created again and again.

"Fifteen years!

It may be very inconvenient some years to spare a hundred, or even fifty pounds from our own expenses."

"Yes; and the set of breakfast china is twice as handsome as what belongs to this house.

But Edward had no turn for great men or barouches.

"Like him!"

We shall live within a few miles of each other, and shall meet every day of our lives.

I have the highest opinion in the world of Edward's heart.

His eyes want all that spirit, that fire, which at once announce virtue and intelligence.

Like him!

So far from hence!

It chiefly consisted of household linen, plate, china, and books, with a handsome pianoforte of Marianne's.

Many were the tears shed by them in their last adieus to a place so much beloved.

After winding along it for more than a mile, 

In [34]:
## Classifying all sentences in our Melville test set
# Recall that 0 means Melville & 1 means Austen
# transform() (not fit_transform()) is used so the vocabulary fitted on the training set is reused.

melville_sentence_vectors = vectorizer.transform(melville_test_sentences)   ## Converting a list of strings to the same
                                                                            ## vector format we used for our training set

melville_sentence_classifications = classifier.predict(melville_sentence_vectors)   ## Classifying every sentence in the novel

melville_sentence_classifications[:20]                                      ## Viewing the first 20 sentence classifications

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [37]:
## Printing all misclassified Melville sentences
# A predicted class of 1 means the classifier attributed the sentence to Austen.

for sentence, predicted_class in zip(melville_test_sentences, melville_sentence_classifications):
    if predicted_class == 1:
        print(sentence)
        print()

NOTE.

IX.

THE PURSUIT OF POETRY UNDER DIFFICULTIES.

XII.

XIII.

XIV.

XVI.

XXVII.

XXXIII.

FLOGGING NOT LAWFUL.

FLOGGING NOT NECESSARY.

"THE PEOPLE" ARE GIVEN "LIBERTY."

"SINK, BURN, AND DESTROY."

The gash being made, a metamorphosis took place, transcending any related by Ovid.

And my shroud it afterward came very near proving, as he who reads further will find.

Very true; and that thought very early occurred to me; for no idea had I of scudding round Cape Horn in my shirt; for _that_ would have been almost scudding under bare poles, indeed.

No, my dear friend; and that was the deuce of it.

Waterproof it was not, no more than a sponge.

No, no; up you go: fat or lean: Lambert or Edson: never mind how much avoirdupois you might weigh.

It had been my intention to make it thoroughly impervious, by giving it a coating of paint, But bitter fate ever overtakes us unfortunates.

A rather serious circumstance growing out of such a case will be related in some future chapter.

B

xiii.]

Nor is this the only instance where right and salutary principles, in themselves almost self-evident and infallible, have been advanced in justification of things, which in themselves are just as self-evidently wrong and pernicious.

Indifferent as to who may be the parties concerned, I but desire to see wrong things righted, and equal justice administered to all.

Had this gentleman had his way, our checker-boards would very soon have been pitched out of the ports.

But quite as often as the law against smuggling spirits is transgressed by the unscrupulous sailors, the statutes against cards and dice are evaded.

But this is not all.

Now, in addition to having an eye on the master-at-arms and his aids, the day-gamblers must see to it, that every person suspected of being a _white-mouse_ or _fancy-man_, is like-wise dogged wherever he goes.

Additional scouts are retained constantly to snuff at their trail.

I say, Pounce, has any one been scouting around _you_ this morning?"


# *Creating a text concordance with NLTK*

In [38]:
import nltk
from nltk.tokenize import word_tokenize

# NOTE(review): index 6 selects a novel by its position in melville_texts, which
# follows the order os.listdir() returned the files in -- confirm it is the
# intended novel on your machine.
tokens = word_tokenize(melville_texts[6])

# Wrap the token list in an nltk.Text object, which provides concordance().
nltk_text = nltk.Text(tokens)

# Show up to 100 matches for 'whale'.
nltk_text.concordance('whale', lines=100)

Displaying 100 of 1073 matches:
                                     WHALE . By Herman Melville ETYMOLOGY . ( S
t which is not true. ” —Hackluyt . “ WHALE . * * * Sw. and Dan . hval . This an
ulted. ” —Webster ’ s Dictionary . “ WHALE . * * * It is more immediately from 
ish . WAL , Dutch . HWAL , Swedish . WHALE , Icelandic . WHALE , English . BALE
HWAL , Swedish . WHALE , Icelandic . WHALE , English . BALEINE , French . BALLE
t least , take the higgledy-piggledy whale statements , however authentic , in 
 dreadful gulf of this monster ’ s ( whale ’ s ) mouth , are immediately lost a
of patient Job. ” —Rabelais . “ This whale ’ s liver was two cartloads. ” —Stow
 Touching that monstrous bulk of the whale or ork we have received nothing cert
restless paine , Like as the wounded whale to shore flies thro ’ the maine. ” —
. Of Sperma Ceti and the Sperma Ceti Whale . Vide his V. E. “ Like Spencer ’ s 
en ’ s Annus Mirabilis . “ While the whale is floating at the stern of the ship
onas-in-