In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD,NMF
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from gensim import models, matutils

In [3]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``; each row is treated as one topic
        and must provide ``argsort()``.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    no_top_words : int
        How many top terms to print per topic.
    topic_names : sequence or mapping, optional
        Labels looked up by topic index. A missing, empty, or falsy label
        makes the topic fall back to its numeric header.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    for ix, topic in enumerate(model.components_):
        # A label list shorter than the number of topics used to raise
        # IndexError; treat an absent label like an empty one instead.
        try:
            label = topic_names[ix] if topic_names else None
        except (IndexError, KeyError):
            label = None

        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")

        # argsort is ascending, so walk it backwards to get the largest weights first
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_indices]))

In [1]:
# Load the corpus and keep only the 'clean' text column as a Series
test = pd.read_csv('eng_clean.csv')['clean']

In [5]:
# Custom stop-word list: uninformative, nonsense, or over-common words.
# Each word is listed once (the earlier duplicates of 'tis' and 'south' were dropped).
stops = [
    'mama', 'aunt', 'cousin', 'uncle', 'ii', 'iv', 'iii', 'vi', 'vii', 'papa', 'mamma', 'madam', 'you', 'nay', 'ay',
    'tis', 'vol', 'pp', 'de', 'ne', 'ere', 'her', 'ha', 'dollar', 'lord', 'prince', 'princess', 'lady', 'king', 'queen',
    'thou', 'thy', 'thee', 'wherefore', 'ms', 'er', 'ye', 'hath', 'doth', 'yea', 'saith', 'ed', 'unto', 'hast', 'hence', 'shalt',
    'devil', 'thine', 'alas',
    'male', 'female', 'poem', 'electricity', 'behold', 'discharge', 'south', 'troop', 'army', 'sex', 'sexual', 'edition', 'north',
    'reader', 'mode', 'acid', 'shew', 'text',
    'sin', 'righteousness', 'mercy', 'sinner', 'holy', 'saint', 'gospel', 'flesh', 'hell', 'scripture', 'salvation', 'eternal',
    'thereof', 'colour', 'somebody', 'somehow', 'grey', 'title', 'author', 'gray', 'anybody', 'american', 'british', 'french',
    'irish', 'lad', 'publish', 'nod', 'stair',
    'that', 'which', 'the', 'is', 'those', 'be', 'of', 'toward',
]

In [None]:
# Bag-of-words counts; float min_df/max_df are document-frequency proportions,
# and the custom stop words are excluded from the vocabulary.
cv = CountVectorizer(
    stop_words=stops,
    min_df=.05,
    max_df=.65,
)
cv_test = cv.fit_transform(test)

# LSA
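Not doing so hot: the components overlap heavily (Topic 0, for example, mixes religious, military, and scientific terms).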

In [85]:
# LSA: truncated SVD of the count matrix, keeping 11 components
lsa = TruncatedSVD(n_components=11, n_iter=10, random_state=8)
doc_topic = lsa.fit_transform(cv_test)
# Cell output: total explained-variance ratio across the retained components
sum(lsa.explained_variance_ratio_)

0.4094519600067894

In [86]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older installs.
lsa_terms = cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names()
display_topics(lsa, lsa_terms, 15)


Topic  0
specie, honour, principle, officer, prayer, captain, nation, conscience, doctrine, island, government, worship, religion, curse, angel

Topic  1
prayer, righteous, conscience, angel, throne, curse, worship, doctrine, covenant, wicked, apostle, iniquity, holiness, godly, kingdom

Topic  2
specie, variety, selection, seed, doctrine, distinct, righteous, variation, breed, covenant, prayer, conscience, structure, angel, differ

Topic  3
principle, government, science, proposition, phenomenon, political, poet, system, theory, modern, writer, social, capital, nation, induction

Topic  4
officer, gun, government, vessel, current, island, captain, camp, capture, battery, cavalry, division, principle, phenomenon, regiment

Topic  5
current, plate, wire, surface, experiment, particle, pole, electric, induction, chemical, ball, gas, phenomenon, substance, solution

Topic  6
current, doctor, officer, plate, wire, marriage, experiment, carriage, captain, particle, ball, electric, honour, 

# NMF
Much better: the NMF topics are far more coherent than the LSA components above.

In [87]:
# Non-negative matrix factorization of the same count matrix into 14 topics
nmf = NMF(
    n_components=14,
    shuffle=True,
    random_state=8,
)
doc_topic = nmf.fit_transform(cv_test)

In [88]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older installs.
nmf_terms = cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names()
display_topics(nmf, nmf_terms, 15)


Topic  0
doctor, marriage, carriage, honour, tea, stair, drawing, dine, shop, lawyer, wine, lover, mistress, card, visitor

Topic  1
specie, selection, variety, variation, structure, island, distinct, organ, breed, individual, differ, modification, insect, modify, genus

Topic  2
prayer, doctrine, conscience, righteous, angel, worship, curse, throne, wicked, covenant, kingdom, apostle, iniquity, holiness, godly

Topic  3
proposition, phenomenon, science, attribute, theory, process, principle, observation, induction, individual, sensation, definition, conception, substance, scientific

Topic  4
officer, gun, regiment, camp, cavalry, prisoner, capture, division, garrison, infantry, retreat, column, fort, military, artillery

Topic  5
current, plate, wire, experiment, particle, electric, pole, surface, chemical, ball, gas, conductor, positive, metal, solution

Topic  6
poet, literature, poetry, century, literary, modern, writer, verse, novel, artist, ideal, criticism, genius, critic, col

# LDA
Beyond help. To be fair, it never should have worked: all of my texts are multitopic.

In [89]:
# Sparse2Corpus reads documents from columns by default, so transpose the
# vectorizer output before wrapping it as a gensim corpus.
cvt = cv_test.T
corpus = matutils.Sparse2Corpus(cvt)
# Invert the vectorizer's term -> column-index mapping into index -> term
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [91]:
# Fit a 12-topic LDA model; 100 passes over the corpus, seeded for repeatability
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=12,
    passes=100,
    random_state=8,
)

In [92]:
# Show each topic's top-weighted words (library defaults, spelled out)
lda.print_topics(num_topics=20, num_words=10)

[(0,
  '0.004*"honour" + 0.003*"etc" + 0.003*"prayer" + 0.002*"poet" + 0.002*"virtue" + 0.002*"nation" + 0.002*"curse" + 0.002*"verse" + 0.002*"scarce" + 0.002*"sword"'),
 (1,
  '0.004*"marriage" + 0.003*"doctor" + 0.003*"carriage" + 0.003*"honour" + 0.002*"lover" + 0.002*"lawyer" + 0.002*"mistress" + 0.002*"countenance" + 0.002*"drawing" + 0.002*"favour"'),
 (2,
  '0.005*"shop" + 0.004*"tea" + 0.003*"baby" + 0.003*"bottle" + 0.003*"kitchen" + 0.003*"doctor" + 0.003*"boot" + 0.003*"pipe" + 0.002*"stair" + 0.002*"candle"'),
 (3,
  '0.015*"captain" + 0.011*"deck" + 0.010*"island" + 0.009*"vessel" + 0.006*"sailor" + 0.006*"crew" + 0.006*"mate" + 0.006*"canoe" + 0.005*"coast" + 0.005*"gun"'),
 (4,
  '0.008*"knight" + 0.008*"sword" + 0.006*"priest" + 0.006*"god" + 0.005*"palace" + 0.005*"castle" + 0.004*"forest" + 0.004*"slave" + 0.004*"slay" + 0.004*"spear"'),
 (5,
  '0.018*"specie" + 0.008*"variety" + 0.006*"surface" + 0.006*"seed" + 0.006*"distinct" + 0.005*"selection" + 0.005*"structure