In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD,NMF
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from gensim import models, matutils

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``, an iterable with one weight
        vector per topic; each vector must support ``argsort()``.
    feature_names : sequence indexable by the positions in a weight vector,
        giving the term for each position.
    no_top_words : int
        Number of terms to print per topic.
    topic_names : sequence, optional
        Label per topic. A missing sequence or a falsy entry falls back to
        printing the topic's index.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)
        # argsort() is ascending, so step backwards from the end to take the
        # no_top_words largest weights, largest first.
        top_positions = weights.argsort()[:-no_top_words - 1:-1]
        print(", ".join(feature_names[pos] for pos in top_positions))

In [4]:
# Load the corpus once, then split out the columns used downstream.
books = pd.read_csv('eng_clean.csv')
authors = books['author']
titles = books['title']
# `test` holds the 'clean' text column that is fed to the vectorizer.
test = books['clean']

In [5]:
# Build one "<title>; by <author>" label per row; used for the Flask app and
# later as the index of the document-topic table.
ids = titles + '; by ' + authors

In [7]:
# Custom stop words handed to TfidfVectorizer via `stop_words`.
# Order and contents are kept as originally collected ('dollar' is listed
# twice, which is harmless for a stop-word list).
stops = [
    # family terms and forms of address
    'papa', 'cousin', 'uncle', 'aunt', 'mamma', 'madam', 'lordship',
    'ladyship', 'lord', 'prince', 'queen', 'niece', 'monsieur',
    # archaic forms and what look like dialect / token fragments
    'thou', 'thy', 'thee', 'thine', 'hast', 'st', 'hath', 'tis', 'er', 'ye',
    'ses', 'wot', 'se', 'ad', 'fre', 'arter', 'nay', 'ere',
    'doth', 'ead', 'ave', 'agin', 'ow', 'ome', 'afore', 'ay', 'ha', 'de',
    'wi', 'yer', 'ap', 'ard', 'hae', 'ouse', 'ud', 'ope', 'author',
    # miscellaneous words excluded from the topics
    'chap', 'vol', 'fer', 'git', 'american', 'fat', 'funny', 'dollar',
    'alas', 'dost', 'morn', 'somehow', 'eh', 'thoughtfully', 'anybody',
    'pal', 'somebody', 'gray', 'male', 'female', 'lovely', 'fun', 'egg',
    'apple', 'mouse', 'cake',
    'poet', 'illustration', 'gal', 'colour', 'lad', 'grey', 'anyone', 'nod',
    'french', 'lunch', 'maybe', 'dollar', 'cent', 'color', 'tail',
    'car', 'publish', 'anyway', 'her', 'stair',
    'realise', 'shrug', 'recognise', 'realize', 'softly', 'recognize',
    'fust', 'pore', 'boot', 'shoe', 'pie', 'shew', 'jolly',
    'toward', 'whilst', 'unto', 'amongst', 'ii',
]

In [8]:
# TF-IDF weighting: drop terms appearing in fewer than 5% or more than 65% of
# documents, plus the custom stop words.
tf = TfidfVectorizer(
    min_df=0.05,
    max_df=0.65,
    stop_words=stops,
)
tf_test = tf.fit_transform(test)

# LSA Model
LSA yields a few reasonable topics, but with noticeable overlap between them.

In [10]:
# 15-component truncated SVD (LSA); seeded so the decomposition is repeatable.
lsa = TruncatedSVD(n_components=15, n_iter=10, random_state=8)

In [11]:
# Fit LSA on the TF-IDF matrix and keep the transformed documents.
# NOTE: `doc_topic` is reassigned by the NMF cell further down.
doc_topic = lsa.fit_transform(tf_test)

In [12]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
feature_names = (tf.get_feature_names_out()
                 if hasattr(tf, 'get_feature_names_out')
                 else tf.get_feature_names())
# Show the 15 highest-weighted terms of each LSA component.
display_topics(lsa, feature_names, 15)


Topic  0
doctor, honour, captain, officer, marriage, snow, tea, hotel, army, gun, carriage, island, reader, camp, sword

Topic  1
poem, principle, modern, government, nation, political, science, literature, century, writer, religion, system, literary, doctrine, social

Topic  2
army, officer, troop, captain, island, sword, gun, vessel, canoe, camp, knight, nation, government, prisoner, god

Topic  3
poem, knight, sword, god, sin, weep, priest, holy, behold, honour, maiden, slay, castle, fairy, palace

Topic  4
officer, army, honour, troop, marriage, knight, prisoner, regiment, castle, carriage, sword, captain, mistress, favour, doctor

Topic  5
captain, deck, skipper, mate, vessel, island, crew, sailor, schooner, pirate, seaman, voyage, ashore, brig, anchor

Topic  6
poem, officer, literary, literature, poetry, army, skipper, verse, deck, captain, troop, critic, novel, german, regiment

Topic  7
sin, skipper, nation, god, government, science, priest, slave, religion, kingdom, doctrine

# NMF Model
What I used for topics! Good separation and variety.

In [13]:
# 13-topic NMF; seeded so the factorisation is repeatable.
nmf = NMF(n_components=13, shuffle=True, random_state=8)
# Document-topic weights from NMF (replaces the earlier LSA `doc_topic`).
doc_topic = nmf.fit_transform(tf_test)

In [14]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
feature_names = (tf.get_feature_names_out()
                 if hasattr(tf, 'get_feature_names_out')
                 else tf.get_feature_names())
# Show the 15 highest-weighted terms of each NMF topic.
display_topics(nmf, feature_names, 15)


Topic  0
marriage, honour, carriage, doctor, lover, drawing, favour, mistress, lawyer, endeavour, countenance, clergyman, engagement, widow, inquiry

Topic  1
specie, science, hypothesis, scientific, phenomenon, structure, physical, theory, variety, process, development, modification, organic, modern, surface

Topic  2
officer, army, troop, regiment, gun, cavalry, infantry, camp, british, prisoner, colonel, garrison, fort, battery, military

Topic  3
sin, god, weep, behold, holy, poem, woe, mortal, priest, angel, prayer, dew, shalt, lover, divine

Topic  4
trail, snow, camp, ranch, reckon, cabin, prairie, wagon, team, hotel, pine, valley, miner, saddle, rifle

Topic  5
captain, deck, vessel, sailor, crew, island, cabin, mate, schooner, pirate, anchor, brig, coast, ashore, voyage

Topic  6
poem, literature, literary, poetry, verse, reader, writer, novel, critic, artist, modern, genius, criticism, prose, century

Topic  7
skipper, mate, wharf, cook, pint, watchman, deck, beer, landlord,

In [15]:
# Save off with a title-author index.
# Hand-assigned labels, one per column of `doc_topic`, in column order.
topic_columns = [
    'romdram', 'science', 'war', 'religion',
    'wild', 'boats_adv', 'lit', 'boats_life',
    'crime', 'folk', 'politics', 'e_america',
    'knights',
]
topicd = pd.DataFrame(doc_topic.round(5), index=ids, columns=topic_columns)

In [18]:
# Write the labelled document-topic table to disk (index column included).
topicd.to_csv('topic_en.csv')

# LDA
LDA struggles here: most topics collapse onto the same near-zero-weight words. I did not tune it any further, since I did not expect it to work on this input anyway.

In [None]:
# Sparse2Corpus reads documents from columns by default, so hand it the
# transposed matrix.
tft = tf_test.T
corpus = matutils.Sparse2Corpus(tft)
# Invert the vectorizer's term -> column mapping into column -> term.
id2word = {column: term for term, column in tf.vocabulary_.items()}

In [114]:
# 12-topic LDA over the gensim corpus; 100 passes, seeded for repeatability.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=12,
    passes=100,
    random_state=8,
)

In [115]:
# Display the weighted top terms of the fitted LDA topics as the cell output.
lda.print_topics()

[(0,
  '0.000*"moccasin" + 0.000*"sullenness" + 0.000*"yawned" + 0.000*"outwork" + 0.000*"bestride" + 0.000*"rudest" + 0.000*"dispirited" + 0.000*"incapacitate" + 0.000*"denude" + 0.000*"phalanx"'),
 (1,
  '0.000*"moccasin" + 0.000*"sullenness" + 0.000*"yawned" + 0.000*"outwork" + 0.000*"bestride" + 0.000*"rudest" + 0.000*"dispirited" + 0.000*"incapacitate" + 0.000*"denude" + 0.000*"phalanx"'),
 (2,
  '0.001*"captain" + 0.001*"doctor" + 0.001*"officer" + 0.001*"honour" + 0.001*"snow" + 0.001*"gun" + 0.001*"army" + 0.001*"camp" + 0.001*"island" + 0.001*"marriage"'),
 (3,
  '0.000*"moccasin" + 0.000*"sullenness" + 0.000*"yawned" + 0.000*"outwork" + 0.000*"bestride" + 0.000*"rudest" + 0.000*"dispirited" + 0.000*"incapacitate" + 0.000*"denude" + 0.000*"phalanx"'),
 (4,
  '0.000*"moccasin" + 0.000*"sullenness" + 0.000*"yawned" + 0.000*"outwork" + 0.000*"bestride" + 0.000*"rudest" + 0.000*"dispirited" + 0.000*"incapacitate" + 0.000*"denude" + 0.000*"phalanx"'),
 (5,
  '0.000*"moccasin" + 0.0